classifier 1.4.4 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CLAUDE.md +77 -0
- data/README.md +274 -0
- data/ext/classifier/classifier_ext.c +25 -0
- data/ext/classifier/extconf.rb +15 -0
- data/ext/classifier/linalg.h +64 -0
- data/ext/classifier/matrix.c +387 -0
- data/ext/classifier/svd.c +208 -0
- data/ext/classifier/vector.c +319 -0
- data/lib/classifier/bayes.rb +294 -60
- data/lib/classifier/errors.rb +16 -0
- data/lib/classifier/extensions/vector.rb +42 -26
- data/lib/classifier/extensions/word_hash.rb +8 -1
- data/lib/classifier/lsi/content_node.rb +30 -9
- data/lib/classifier/lsi/word_list.rb +12 -1
- data/lib/classifier/lsi.rb +479 -125
- data/lib/classifier/storage/base.rb +50 -0
- data/lib/classifier/storage/file.rb +51 -0
- data/lib/classifier/storage/memory.rb +49 -0
- data/lib/classifier/storage.rb +9 -0
- data/lib/classifier.rb +2 -0
- data/sig/vendor/fast_stemmer.rbs +9 -0
- data/sig/vendor/gsl.rbs +27 -0
- data/sig/vendor/json.rbs +4 -0
- data/sig/vendor/matrix.rbs +26 -0
- data/sig/vendor/mutex_m.rbs +16 -0
- data/test/test_helper.rb +13 -1
- metadata +71 -10
- data/lib/classifier/extensions/vector_serialize.rb +0 -18
data/lib/classifier/lsi.rb
CHANGED
|
@@ -1,20 +1,60 @@
|
|
|
1
|
+
# rbs_inline: enabled
|
|
2
|
+
|
|
1
3
|
# Author:: David Fayram (mailto:dfayram@lensmen.net)
|
|
2
4
|
# Copyright:: Copyright (c) 2005 David Fayram II
|
|
3
5
|
# License:: LGPL
|
|
4
6
|
|
|
7
|
+
module Classifier
|
|
8
|
+
class LSI
|
|
9
|
+
# Backend options: :native, :ruby
|
|
10
|
+
# @rbs @backend: Symbol
|
|
11
|
+
@backend = :ruby
|
|
12
|
+
|
|
13
|
+
class << self
|
|
14
|
+
# @rbs @backend: Symbol
|
|
15
|
+
attr_accessor :backend
|
|
16
|
+
|
|
17
|
+
# Check if using native C extension
|
|
18
|
+
# @rbs () -> bool
|
|
19
|
+
def native_available?
|
|
20
|
+
backend == :native
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
# Get the Vector class for the current backend
|
|
24
|
+
# @rbs () -> Class
|
|
25
|
+
def vector_class
|
|
26
|
+
backend == :native ? Classifier::Linalg::Vector : ::Vector
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
# Get the Matrix class for the current backend
|
|
30
|
+
# @rbs () -> Class
|
|
31
|
+
def matrix_class
|
|
32
|
+
backend == :native ? Classifier::Linalg::Matrix : ::Matrix
|
|
33
|
+
end
|
|
34
|
+
end
|
|
35
|
+
end
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
# Backend detection: native extension > pure Ruby
|
|
39
|
+
# Set NATIVE_VECTOR=true to force pure Ruby implementation
|
|
40
|
+
|
|
5
41
|
begin
|
|
6
|
-
# to test the native vector class, try `rake test NATIVE_VECTOR=true`
|
|
7
42
|
raise LoadError if ENV['NATIVE_VECTOR'] == 'true'
|
|
8
43
|
|
|
9
|
-
require '
|
|
10
|
-
|
|
11
|
-
$GSL = true
|
|
44
|
+
require 'classifier/classifier_ext'
|
|
45
|
+
Classifier::LSI.backend = :native
|
|
12
46
|
rescue LoadError
|
|
13
|
-
|
|
14
|
-
|
|
47
|
+
# Fall back to pure Ruby implementation
|
|
48
|
+
unless ENV['SUPPRESS_LSI_WARNING'] == 'true'
|
|
49
|
+
warn 'Notice: for 5-10x faster LSI, install the classifier gem with native extensions. ' \
|
|
50
|
+
'Set SUPPRESS_LSI_WARNING=true to hide this.'
|
|
51
|
+
end
|
|
52
|
+
Classifier::LSI.backend = :ruby
|
|
15
53
|
require 'classifier/extensions/vector'
|
|
16
54
|
end
|
|
17
55
|
|
|
56
|
+
require 'json'
|
|
57
|
+
require 'mutex_m'
|
|
18
58
|
require 'classifier/lsi/word_list'
|
|
19
59
|
require 'classifier/lsi/content_node'
|
|
20
60
|
require 'classifier/lsi/summary'
|
|
@@ -24,26 +64,62 @@ module Classifier
|
|
|
24
64
|
# data based on underlying semantic relations. For more information on the algorithms used,
|
|
25
65
|
# please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
|
|
26
66
|
class LSI
|
|
27
|
-
|
|
28
|
-
|
|
67
|
+
include Mutex_m
|
|
68
|
+
|
|
69
|
+
# @rbs @auto_rebuild: bool
|
|
70
|
+
# @rbs @word_list: WordList
|
|
71
|
+
# @rbs @items: Hash[untyped, ContentNode]
|
|
72
|
+
# @rbs @version: Integer
|
|
73
|
+
# @rbs @built_at_version: Integer
|
|
74
|
+
# @rbs @singular_values: Array[Float]?
|
|
75
|
+
# @rbs @dirty: bool
|
|
76
|
+
# @rbs @storage: Storage::Base?
|
|
77
|
+
|
|
78
|
+
attr_reader :word_list, :singular_values
|
|
79
|
+
attr_accessor :auto_rebuild, :storage
|
|
29
80
|
|
|
30
81
|
# Create a fresh index.
|
|
31
82
|
# If you want to call #build_index manually, use
|
|
32
|
-
# Classifier::LSI.new :
|
|
83
|
+
# Classifier::LSI.new auto_rebuild: false
|
|
33
84
|
#
|
|
85
|
+
# @rbs (?Hash[Symbol, untyped]) -> void
|
|
34
86
|
def initialize(options = {})
|
|
87
|
+
super()
|
|
35
88
|
@auto_rebuild = true unless options[:auto_rebuild] == false
|
|
36
89
|
@word_list = WordList.new
|
|
37
90
|
@items = {}
|
|
38
91
|
@version = 0
|
|
39
92
|
@built_at_version = -1
|
|
93
|
+
@dirty = false
|
|
94
|
+
@storage = nil
|
|
40
95
|
end
|
|
41
96
|
|
|
42
97
|
# Returns true if the index needs to be rebuilt. The index needs
|
|
43
98
|
# to be built after all information is added, but before you start
|
|
44
99
|
# using it for search, classification and cluster detection.
|
|
100
|
+
#
|
|
101
|
+
# @rbs () -> bool
|
|
45
102
|
def needs_rebuild?
|
|
46
|
-
(@items.keys.size > 1) && (@version != @built_at_version)
|
|
103
|
+
synchronize { (@items.keys.size > 1) && (@version != @built_at_version) }
|
|
104
|
+
end
|
|
105
|
+
|
|
106
|
+
# @rbs () -> Array[Hash[Symbol, untyped]]?
|
|
107
|
+
def singular_value_spectrum
|
|
108
|
+
return nil unless @singular_values
|
|
109
|
+
|
|
110
|
+
total = @singular_values.sum
|
|
111
|
+
return nil if total.zero?
|
|
112
|
+
|
|
113
|
+
cumulative = 0.0
|
|
114
|
+
@singular_values.map.with_index do |value, i|
|
|
115
|
+
cumulative += value
|
|
116
|
+
{
|
|
117
|
+
dimension: i,
|
|
118
|
+
value: value,
|
|
119
|
+
percentage: value / total,
|
|
120
|
+
cumulative_percentage: cumulative / total
|
|
121
|
+
}
|
|
122
|
+
end
|
|
47
123
|
end
|
|
48
124
|
|
|
49
125
|
# Adds an item to the index. item is assumed to be a string, but
|
|
@@ -59,10 +135,14 @@ module Classifier
|
|
|
59
135
|
# ar = ActiveRecordObject.find( :all )
|
|
60
136
|
# lsi.add_item ar, *ar.categories { |x| ar.content }
|
|
61
137
|
#
|
|
138
|
+
# @rbs (String, *String | Symbol) ?{ (String) -> String } -> void
|
|
62
139
|
def add_item(item, *categories, &block)
|
|
63
140
|
clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
|
|
64
|
-
|
|
65
|
-
|
|
141
|
+
synchronize do
|
|
142
|
+
@items[item] = ContentNode.new(clean_word_hash, *categories)
|
|
143
|
+
@version += 1
|
|
144
|
+
@dirty = true
|
|
145
|
+
end
|
|
66
146
|
build_index if @auto_rebuild
|
|
67
147
|
end
|
|
68
148
|
|
|
@@ -70,30 +150,42 @@ module Classifier
|
|
|
70
150
|
# you are passing in a string with no categories. item
|
|
71
151
|
# will be duck typed via to_s .
|
|
72
152
|
#
|
|
153
|
+
# @rbs (String) -> void
|
|
73
154
|
def <<(item)
|
|
74
155
|
add_item(item)
|
|
75
156
|
end
|
|
76
157
|
|
|
77
158
|
# Returns the categories for a given indexed item. You are free to add and remove
|
|
78
159
|
# items from this as you see fit. It does not invalidate an index to change its categories.
|
|
160
|
+
#
|
|
161
|
+
# @rbs (String) -> Array[String | Symbol]
|
|
79
162
|
def categories_for(item)
|
|
80
|
-
|
|
163
|
+
synchronize do
|
|
164
|
+
return [] unless @items[item]
|
|
81
165
|
|
|
82
|
-
|
|
166
|
+
@items[item].categories
|
|
167
|
+
end
|
|
83
168
|
end
|
|
84
169
|
|
|
85
170
|
# Removes an item from the database, if it is indexed.
|
|
86
171
|
#
|
|
172
|
+
# @rbs (String) -> void
|
|
87
173
|
def remove_item(item)
|
|
88
|
-
|
|
174
|
+
removed = synchronize do
|
|
175
|
+
next false unless @items.key?(item)
|
|
89
176
|
|
|
90
|
-
|
|
91
|
-
|
|
177
|
+
@items.delete(item)
|
|
178
|
+
@version += 1
|
|
179
|
+
@dirty = true
|
|
180
|
+
true
|
|
181
|
+
end
|
|
182
|
+
build_index if removed && @auto_rebuild
|
|
92
183
|
end
|
|
93
184
|
|
|
94
185
|
# Returns an array of items that are indexed.
|
|
186
|
+
# @rbs () -> Array[untyped]
|
|
95
187
|
def items
|
|
96
|
-
@items.keys
|
|
188
|
+
synchronize { @items.keys }
|
|
97
189
|
end
|
|
98
190
|
|
|
99
191
|
# This function rebuilds the index if needs_rebuild? returns true.
|
|
@@ -110,34 +202,33 @@ module Classifier
|
|
|
110
202
|
# cutoff parameter tells the indexer how many of these values to keep.
|
|
111
203
|
# A value of 1 for cutoff means that no semantic analysis will take place,
|
|
112
204
|
# turning the LSI class into a simple vector search engine.
|
|
205
|
+
#
|
|
206
|
+
# @rbs (?Float) -> void
|
|
113
207
|
def build_index(cutoff = 0.75)
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
208
|
+
validate_cutoff!(cutoff)
|
|
209
|
+
|
|
210
|
+
synchronize do
|
|
211
|
+
return unless needs_rebuild_unlocked?
|
|
212
|
+
|
|
213
|
+
make_word_list
|
|
214
|
+
|
|
215
|
+
doc_list = @items.values
|
|
216
|
+
tda = doc_list.collect { |node| node.raw_vector_with(@word_list) }
|
|
217
|
+
|
|
218
|
+
if self.class.native_available?
|
|
219
|
+
# Convert vectors to arrays for matrix construction
|
|
220
|
+
tda_arrays = tda.map { |v| v.respond_to?(:to_a) ? v.to_a : v }
|
|
221
|
+
tdm = self.class.matrix_class.alloc(*tda_arrays).trans
|
|
222
|
+
ntdm = build_reduced_matrix(tdm, cutoff)
|
|
223
|
+
assign_native_ext_lsi_vectors(ntdm, doc_list)
|
|
224
|
+
else
|
|
225
|
+
tdm = Matrix.rows(tda).trans
|
|
226
|
+
ntdm = build_reduced_matrix(tdm, cutoff)
|
|
227
|
+
assign_ruby_lsi_vectors(ntdm, doc_list)
|
|
129
228
|
end
|
|
130
|
-
else
|
|
131
|
-
tdm = Matrix.rows(tda).trans
|
|
132
|
-
ntdm = build_reduced_matrix(tdm, cutoff)
|
|
133
229
|
|
|
134
|
-
|
|
135
|
-
doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
|
|
136
|
-
doc_list[col].lsi_norm = ntdm.column(col).normalize if doc_list[col]
|
|
137
|
-
end
|
|
230
|
+
@built_at_version = @version
|
|
138
231
|
end
|
|
139
|
-
|
|
140
|
-
@built_at_version = @version
|
|
141
232
|
end
|
|
142
233
|
|
|
143
234
|
# This method returns max_chunks entries, ordered by their average semantic rating.
|
|
@@ -148,13 +239,17 @@ module Classifier
|
|
|
148
239
|
# your dataset's general content. For example, if you were to use categorize on the
|
|
149
240
|
# results of this data, you could gather information on what your dataset is generally
|
|
150
241
|
# about.
|
|
242
|
+
#
|
|
243
|
+
# @rbs (?Integer) -> Array[String]
|
|
151
244
|
def highest_relative_content(max_chunks = 10)
|
|
152
|
-
|
|
245
|
+
synchronize do
|
|
246
|
+
return [] if needs_rebuild_unlocked?
|
|
153
247
|
|
|
154
|
-
|
|
155
|
-
|
|
248
|
+
avg_density = {}
|
|
249
|
+
@items.each_key { |x| avg_density[x] = proximity_array_for_content_unlocked(x).sum { |pair| pair[1] } }
|
|
156
250
|
|
|
157
|
-
|
|
251
|
+
avg_density.keys.sort_by { |x| avg_density[x] }.reverse[0..(max_chunks - 1)].map
|
|
252
|
+
end
|
|
158
253
|
end
|
|
159
254
|
|
|
160
255
|
# This function is the primitive that find_related and classify
|
|
@@ -169,20 +264,10 @@ module Classifier
|
|
|
169
264
|
# The parameter doc is the content to compare. If that content is not
|
|
170
265
|
# indexed, you can pass an optional block to define how to create the
|
|
171
266
|
# text data. See add_item for examples of how this works.
|
|
267
|
+
#
|
|
268
|
+
# @rbs (String) ?{ (String) -> String } -> Array[[String, Float]]
|
|
172
269
|
def proximity_array_for_content(doc, &block)
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
content_node = node_for_content(doc, &block)
|
|
176
|
-
result =
|
|
177
|
-
@items.keys.collect do |item|
|
|
178
|
-
val = if $GSL
|
|
179
|
-
content_node.search_vector * @items[item].search_vector.col
|
|
180
|
-
else
|
|
181
|
-
(Matrix[content_node.search_vector] * @items[item].search_vector)[0]
|
|
182
|
-
end
|
|
183
|
-
[item, val]
|
|
184
|
-
end
|
|
185
|
-
result.sort_by { |x| x[1] }.reverse
|
|
270
|
+
synchronize { proximity_array_for_content_unlocked(doc, &block) }
|
|
186
271
|
end
|
|
187
272
|
|
|
188
273
|
# Similar to proximity_array_for_content, this function takes similar
|
|
@@ -190,20 +275,10 @@ module Classifier
|
|
|
190
275
|
# calculated vectors instead of their full versions. This is useful when
|
|
191
276
|
# you're trying to perform operations on content that is much smaller than
|
|
192
277
|
# the text you're working with. search uses this primitive.
|
|
278
|
+
#
|
|
279
|
+
# @rbs (String) ?{ (String) -> String } -> Array[[String, Float]]
|
|
193
280
|
def proximity_norms_for_content(doc, &block)
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
content_node = node_for_content(doc, &block)
|
|
197
|
-
result =
|
|
198
|
-
@items.keys.collect do |item|
|
|
199
|
-
val = if $GSL
|
|
200
|
-
content_node.search_norm * @items[item].search_norm.col
|
|
201
|
-
else
|
|
202
|
-
(Matrix[content_node.search_norm] * @items[item].search_norm)[0]
|
|
203
|
-
end
|
|
204
|
-
[item, val]
|
|
205
|
-
end
|
|
206
|
-
result.sort_by { |x| x[1] }.reverse
|
|
281
|
+
synchronize { proximity_norms_for_content_unlocked(doc, &block) }
|
|
207
282
|
end
|
|
208
283
|
|
|
209
284
|
# This function allows for text-based search of your index. Unlike other functions
|
|
@@ -213,12 +288,16 @@ module Classifier
|
|
|
213
288
|
#
|
|
214
289
|
# While this may seem backwards compared to the other functions that LSI supports,
|
|
215
290
|
# it is actually the same algorithm, just applied on a smaller document.
|
|
291
|
+
#
|
|
292
|
+
# @rbs (String, ?Integer) -> Array[String]
|
|
216
293
|
def search(string, max_nearest = 3)
|
|
217
|
-
|
|
294
|
+
synchronize do
|
|
295
|
+
return [] if needs_rebuild_unlocked?
|
|
218
296
|
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
297
|
+
carry = proximity_norms_for_content_unlocked(string)
|
|
298
|
+
result = carry.collect { |x| x[0] }
|
|
299
|
+
result[0..(max_nearest - 1)]
|
|
300
|
+
end
|
|
222
301
|
end
|
|
223
302
|
|
|
224
303
|
# This function takes content and finds other documents
|
|
@@ -230,11 +309,15 @@ module Classifier
|
|
|
230
309
|
# This is particularly useful for identifying clusters in your document space.
|
|
231
310
|
# For example you may want to identify several "What's Related" items for weblog
|
|
232
311
|
# articles, or find paragraphs that relate to each other in an essay.
|
|
312
|
+
#
|
|
313
|
+
# @rbs (String, ?Integer) ?{ (String) -> String } -> Array[String]
|
|
233
314
|
def find_related(doc, max_nearest = 3, &block)
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
315
|
+
synchronize do
|
|
316
|
+
carry =
|
|
317
|
+
proximity_array_for_content_unlocked(doc, &block).reject { |pair| pair[0] == doc }
|
|
318
|
+
result = carry.collect { |x| x[0] }
|
|
319
|
+
result[0..(max_nearest - 1)]
|
|
320
|
+
end
|
|
238
321
|
end
|
|
239
322
|
|
|
240
323
|
# This function uses a voting system to categorize documents, based on
|
|
@@ -242,30 +325,23 @@ module Classifier
|
|
|
242
325
|
# find_related function to find related documents, then returns the
|
|
243
326
|
# most obvious category from this list.
|
|
244
327
|
#
|
|
245
|
-
#
|
|
246
|
-
# text. A cutoff of 1 means that every document in the index votes on
|
|
247
|
-
# what category the document is in. This may not always make sense.
|
|
248
|
-
#
|
|
328
|
+
# @rbs (String, ?Float) ?{ (String) -> String } -> String | Symbol
|
|
249
329
|
def classify(doc, cutoff = 0.30, &block)
|
|
250
|
-
|
|
330
|
+
validate_cutoff!(cutoff)
|
|
331
|
+
|
|
332
|
+
synchronize do
|
|
333
|
+
votes = vote_unlocked(doc, cutoff, &block)
|
|
251
334
|
|
|
252
|
-
|
|
253
|
-
|
|
335
|
+
ranking = votes.keys.sort_by { |x| votes[x] }
|
|
336
|
+
ranking[-1]
|
|
337
|
+
end
|
|
254
338
|
end
|
|
255
339
|
|
|
340
|
+
# @rbs (String, ?Float) ?{ (String) -> String } -> Hash[String | Symbol, Float]
|
|
256
341
|
def vote(doc, cutoff = 0.30, &block)
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
votes = {}
|
|
261
|
-
carry.each do |pair|
|
|
262
|
-
categories = @items[pair[0]].categories
|
|
263
|
-
categories.each do |category|
|
|
264
|
-
votes[category] ||= 0.0
|
|
265
|
-
votes[category] += pair[1]
|
|
266
|
-
end
|
|
267
|
-
end
|
|
268
|
-
votes
|
|
342
|
+
validate_cutoff!(cutoff)
|
|
343
|
+
|
|
344
|
+
synchronize { vote_unlocked(doc, cutoff, &block) }
|
|
269
345
|
end
|
|
270
346
|
|
|
271
347
|
# Returns the same category as classify() but also returns
|
|
@@ -278,59 +354,337 @@ module Classifier
|
|
|
278
354
|
# category = nil
|
|
279
355
|
# end
|
|
280
356
|
#
|
|
281
|
-
#
|
|
282
357
|
# See classify() for argument docs
|
|
358
|
+
# @rbs (String, ?Float) ?{ (String) -> String } -> [String | Symbol | nil, Float?]
|
|
283
359
|
def classify_with_confidence(doc, cutoff = 0.30, &block)
|
|
284
|
-
|
|
285
|
-
votes_sum = votes.values.inject(0.0) { |sum, v| sum + v }
|
|
286
|
-
return [nil, nil] if votes_sum.zero?
|
|
360
|
+
validate_cutoff!(cutoff)
|
|
287
361
|
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
362
|
+
synchronize do
|
|
363
|
+
votes = vote_unlocked(doc, cutoff, &block)
|
|
364
|
+
votes_sum = votes.values.sum
|
|
365
|
+
return [nil, nil] if votes_sum.zero?
|
|
366
|
+
|
|
367
|
+
ranking = votes.keys.sort_by { |x| votes[x] }
|
|
368
|
+
winner = ranking[-1]
|
|
369
|
+
vote_share = votes[winner] / votes_sum.to_f
|
|
370
|
+
[winner, vote_share]
|
|
371
|
+
end
|
|
292
372
|
end
|
|
293
373
|
|
|
294
374
|
# Prototype, only works on indexed documents.
|
|
295
375
|
# I have no clue if this is going to work, but in theory
|
|
296
376
|
# it's supposed to.
|
|
377
|
+
# @rbs (String, ?Integer) -> Array[Symbol]
|
|
297
378
|
def highest_ranked_stems(doc, count = 3)
|
|
298
|
-
|
|
379
|
+
synchronize do
|
|
380
|
+
raise 'Requested stem ranking on non-indexed content!' unless @items[doc]
|
|
381
|
+
|
|
382
|
+
arr = node_for_content_unlocked(doc).lsi_vector.to_a
|
|
383
|
+
top_n = arr.sort.reverse[0..(count - 1)]
|
|
384
|
+
top_n.collect { |x| @word_list.word_for_index(arr.index(x)) }
|
|
385
|
+
end
|
|
386
|
+
end
|
|
387
|
+
|
|
388
|
+
# Custom marshal serialization to exclude mutex state
|
|
389
|
+
# @rbs () -> Array[untyped]
|
|
390
|
+
def marshal_dump
|
|
391
|
+
[@auto_rebuild, @word_list, @items, @version, @built_at_version, @dirty]
|
|
392
|
+
end
|
|
393
|
+
|
|
394
|
+
# Custom marshal deserialization to recreate mutex
|
|
395
|
+
# @rbs (Array[untyped]) -> void
|
|
396
|
+
def marshal_load(data)
|
|
397
|
+
mu_initialize
|
|
398
|
+
@auto_rebuild, @word_list, @items, @version, @built_at_version, @dirty = data
|
|
399
|
+
@storage = nil
|
|
400
|
+
end
|
|
401
|
+
|
|
402
|
+
# Returns a hash representation of the LSI index.
|
|
403
|
+
# Only source data (word_hash, categories) is included, not computed vectors.
|
|
404
|
+
# This can be converted to JSON or used directly.
|
|
405
|
+
#
|
|
406
|
+
# @rbs () -> untyped
|
|
407
|
+
def as_json(*)
|
|
408
|
+
items_data = @items.transform_values do |node|
|
|
409
|
+
{
|
|
410
|
+
word_hash: node.word_hash.transform_keys(&:to_s),
|
|
411
|
+
categories: node.categories.map(&:to_s)
|
|
412
|
+
}
|
|
413
|
+
end
|
|
414
|
+
|
|
415
|
+
{
|
|
416
|
+
version: 1,
|
|
417
|
+
type: 'lsi',
|
|
418
|
+
auto_rebuild: @auto_rebuild,
|
|
419
|
+
items: items_data
|
|
420
|
+
}
|
|
421
|
+
end
|
|
422
|
+
|
|
423
|
+
# Serializes the LSI index to a JSON string.
|
|
424
|
+
# Only source data (word_hash, categories) is serialized, not computed vectors.
|
|
425
|
+
# On load, the index will be rebuilt automatically.
|
|
426
|
+
#
|
|
427
|
+
# @rbs () -> String
|
|
428
|
+
def to_json(*)
|
|
429
|
+
as_json.to_json
|
|
430
|
+
end
|
|
431
|
+
|
|
432
|
+
# Loads an LSI index from a JSON string or Hash created by #to_json or #as_json.
|
|
433
|
+
# The index will be rebuilt after loading.
|
|
434
|
+
#
|
|
435
|
+
# @rbs (String | Hash[String, untyped]) -> LSI
|
|
436
|
+
def self.from_json(json)
|
|
437
|
+
data = json.is_a?(String) ? JSON.parse(json) : json
|
|
438
|
+
raise ArgumentError, "Invalid classifier type: #{data['type']}" unless data['type'] == 'lsi'
|
|
439
|
+
|
|
440
|
+
# Create instance with auto_rebuild disabled during loading
|
|
441
|
+
instance = new(auto_rebuild: false)
|
|
442
|
+
|
|
443
|
+
# Restore items (categories stay as strings, matching original storage)
|
|
444
|
+
data['items'].each do |item_key, item_data|
|
|
445
|
+
word_hash = item_data['word_hash'].transform_keys(&:to_sym)
|
|
446
|
+
categories = item_data['categories']
|
|
447
|
+
instance.instance_variable_get(:@items)[item_key] = ContentNode.new(word_hash, *categories)
|
|
448
|
+
instance.instance_variable_set(:@version, instance.instance_variable_get(:@version) + 1)
|
|
449
|
+
end
|
|
299
450
|
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
451
|
+
# Restore auto_rebuild setting and rebuild index
|
|
452
|
+
instance.auto_rebuild = data['auto_rebuild']
|
|
453
|
+
instance.build_index
|
|
454
|
+
instance
|
|
455
|
+
end
|
|
456
|
+
|
|
457
|
+
# Saves the LSI index to the configured storage.
|
|
458
|
+
# Raises ArgumentError if no storage is configured.
|
|
459
|
+
#
|
|
460
|
+
# @rbs () -> void
|
|
461
|
+
def save
|
|
462
|
+
raise ArgumentError, 'No storage configured. Use save_to_file(path) or set storage=' unless storage
|
|
463
|
+
|
|
464
|
+
storage.write(to_json)
|
|
465
|
+
@dirty = false
|
|
466
|
+
end
|
|
467
|
+
|
|
468
|
+
# Saves the LSI index to a file (legacy API).
|
|
469
|
+
#
|
|
470
|
+
# @rbs (String) -> Integer
|
|
471
|
+
def save_to_file(path)
|
|
472
|
+
result = File.write(path, to_json)
|
|
473
|
+
@dirty = false
|
|
474
|
+
result
|
|
475
|
+
end
|
|
476
|
+
|
|
477
|
+
# Reloads the LSI index from the configured storage.
|
|
478
|
+
# Raises UnsavedChangesError if there are unsaved changes.
|
|
479
|
+
# Use reload! to force reload and discard changes.
|
|
480
|
+
#
|
|
481
|
+
# @rbs () -> self
|
|
482
|
+
def reload
|
|
483
|
+
raise ArgumentError, 'No storage configured' unless storage
|
|
484
|
+
raise UnsavedChangesError, 'Unsaved changes would be lost. Call save first or use reload!' if @dirty
|
|
485
|
+
|
|
486
|
+
data = storage.read
|
|
487
|
+
raise StorageError, 'No saved state found' unless data
|
|
488
|
+
|
|
489
|
+
restore_from_json(data)
|
|
490
|
+
@dirty = false
|
|
491
|
+
self
|
|
492
|
+
end
|
|
493
|
+
|
|
494
|
+
# Force reloads the LSI index from storage, discarding any unsaved changes.
|
|
495
|
+
#
|
|
496
|
+
# @rbs () -> self
|
|
497
|
+
def reload!
|
|
498
|
+
raise ArgumentError, 'No storage configured' unless storage
|
|
499
|
+
|
|
500
|
+
data = storage.read
|
|
501
|
+
raise StorageError, 'No saved state found' unless data
|
|
502
|
+
|
|
503
|
+
restore_from_json(data)
|
|
504
|
+
@dirty = false
|
|
505
|
+
self
|
|
506
|
+
end
|
|
507
|
+
|
|
508
|
+
# Returns true if there are unsaved changes.
|
|
509
|
+
#
|
|
510
|
+
# @rbs () -> bool
|
|
511
|
+
def dirty?
|
|
512
|
+
@dirty
|
|
513
|
+
end
|
|
514
|
+
|
|
515
|
+
# Loads an LSI index from the configured storage.
|
|
516
|
+
# The storage is set on the returned instance.
|
|
517
|
+
#
|
|
518
|
+
# @rbs (storage: Storage::Base) -> LSI
|
|
519
|
+
def self.load(storage:)
|
|
520
|
+
data = storage.read
|
|
521
|
+
raise StorageError, 'No saved state found' unless data
|
|
522
|
+
|
|
523
|
+
instance = from_json(data)
|
|
524
|
+
instance.storage = storage
|
|
525
|
+
instance
|
|
526
|
+
end
|
|
527
|
+
|
|
528
|
+
# Loads an LSI index from a file (legacy API).
|
|
529
|
+
#
|
|
530
|
+
# @rbs (String) -> LSI
|
|
531
|
+
def self.load_from_file(path)
|
|
532
|
+
from_json(File.read(path))
|
|
303
533
|
end
|
|
304
534
|
|
|
305
535
|
private
|
|
306
536
|
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
537
|
+
# Restores LSI state from a JSON string (used by reload)
|
|
538
|
+
# @rbs (String) -> void
|
|
539
|
+
def restore_from_json(json)
|
|
540
|
+
data = JSON.parse(json)
|
|
541
|
+
raise ArgumentError, "Invalid classifier type: #{data['type']}" unless data['type'] == 'lsi'
|
|
542
|
+
|
|
543
|
+
synchronize do
|
|
544
|
+
# Recreate the items
|
|
545
|
+
@items = {}
|
|
546
|
+
data['items'].each do |item_key, item_data|
|
|
547
|
+
word_hash = item_data['word_hash'].transform_keys(&:to_sym)
|
|
548
|
+
categories = item_data['categories']
|
|
549
|
+
@items[item_key] = ContentNode.new(word_hash, *categories)
|
|
550
|
+
end
|
|
310
551
|
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
552
|
+
# Restore settings
|
|
553
|
+
@auto_rebuild = data['auto_rebuild']
|
|
554
|
+
@version += 1
|
|
555
|
+
@built_at_version = -1
|
|
556
|
+
@word_list = WordList.new
|
|
557
|
+
@dirty = false
|
|
315
558
|
end
|
|
316
|
-
|
|
317
|
-
|
|
559
|
+
|
|
560
|
+
# Rebuild the index
|
|
561
|
+
build_index
|
|
562
|
+
end
|
|
563
|
+
|
|
564
|
+
# @rbs (Float) -> void
|
|
565
|
+
def validate_cutoff!(cutoff)
|
|
566
|
+
return if cutoff.positive? && cutoff < 1
|
|
567
|
+
|
|
568
|
+
raise ArgumentError, "cutoff must be between 0 and 1 (exclusive), got #{cutoff}"
|
|
569
|
+
end
|
|
570
|
+
|
|
571
|
+
# Assigns LSI vectors using native C extension
|
|
572
|
+
# @rbs (untyped, Array[ContentNode]) -> void
|
|
573
|
+
def assign_native_ext_lsi_vectors(ntdm, doc_list)
|
|
574
|
+
ntdm.size[1].times do |col|
|
|
575
|
+
vec = self.class.vector_class.alloc(ntdm.column(col).to_a).row
|
|
576
|
+
doc_list[col].lsi_vector = vec
|
|
577
|
+
doc_list[col].lsi_norm = vec.normalize
|
|
578
|
+
end
|
|
579
|
+
end
|
|
580
|
+
|
|
581
|
+
# Assigns LSI vectors using pure Ruby Matrix
|
|
582
|
+
# @rbs (untyped, Array[ContentNode]) -> void
|
|
583
|
+
def assign_ruby_lsi_vectors(ntdm, doc_list)
|
|
584
|
+
ntdm.column_size.times do |col|
|
|
585
|
+
next unless doc_list[col]
|
|
586
|
+
|
|
587
|
+
column = ntdm.column(col)
|
|
588
|
+
next unless column
|
|
589
|
+
|
|
590
|
+
doc_list[col].lsi_vector = column
|
|
591
|
+
doc_list[col].lsi_norm = column.normalize
|
|
592
|
+
end
|
|
593
|
+
end
|
|
594
|
+
|
|
595
|
+
# Unlocked version of needs_rebuild? for internal use when lock is already held
|
|
596
|
+
# @rbs () -> bool
|
|
597
|
+
def needs_rebuild_unlocked?
|
|
598
|
+
(@items.keys.size > 1) && (@version != @built_at_version)
|
|
318
599
|
end
|
|
319
600
|
|
|
320
|
-
|
|
601
|
+
# Unlocked version of proximity_array_for_content for internal use
|
|
602
|
+
# @rbs (String) ?{ (String) -> String } -> Array[[String, Float]]
|
|
603
|
+
def proximity_array_for_content_unlocked(doc, &)
|
|
604
|
+
return [] if needs_rebuild_unlocked?
|
|
605
|
+
|
|
606
|
+
content_node = node_for_content_unlocked(doc, &)
|
|
607
|
+
result =
|
|
608
|
+
@items.keys.collect do |item|
|
|
609
|
+
val = if self.class.native_available?
|
|
610
|
+
content_node.search_vector * @items[item].search_vector.col
|
|
611
|
+
else
|
|
612
|
+
(Matrix[content_node.search_vector] * @items[item].search_vector)[0]
|
|
613
|
+
end
|
|
614
|
+
[item, val]
|
|
615
|
+
end
|
|
616
|
+
result.sort_by { |x| x[1] }.reverse
|
|
617
|
+
end
|
|
618
|
+
|
|
619
|
+
# Unlocked version of proximity_norms_for_content for internal use
|
|
620
|
+
# @rbs (String) ?{ (String) -> String } -> Array[[String, Float]]
|
|
621
|
+
def proximity_norms_for_content_unlocked(doc, &)
|
|
622
|
+
return [] if needs_rebuild_unlocked?
|
|
623
|
+
|
|
624
|
+
content_node = node_for_content_unlocked(doc, &)
|
|
625
|
+
result =
|
|
626
|
+
@items.keys.collect do |item|
|
|
627
|
+
val = if self.class.native_available?
|
|
628
|
+
content_node.search_norm * @items[item].search_norm.col
|
|
629
|
+
else
|
|
630
|
+
(Matrix[content_node.search_norm] * @items[item].search_norm)[0]
|
|
631
|
+
end
|
|
632
|
+
[item, val]
|
|
633
|
+
end
|
|
634
|
+
result.sort_by { |x| x[1] }.reverse
|
|
635
|
+
end
|
|
636
|
+
|
|
637
|
+
# Unlocked version of vote for internal use
|
|
638
|
+
# @rbs (String, ?Float) ?{ (String) -> String } -> Hash[String | Symbol, Float]
|
|
639
|
+
def vote_unlocked(doc, cutoff = 0.30, &)
|
|
640
|
+
icutoff = (@items.size * cutoff).round
|
|
641
|
+
carry = proximity_array_for_content_unlocked(doc, &)
|
|
642
|
+
carry = carry[0..(icutoff - 1)]
|
|
643
|
+
votes = {}
|
|
644
|
+
carry.each do |pair|
|
|
645
|
+
categories = @items[pair[0]].categories
|
|
646
|
+
categories.each do |category|
|
|
647
|
+
votes[category] ||= 0.0
|
|
648
|
+
votes[category] += pair[1]
|
|
649
|
+
end
|
|
650
|
+
end
|
|
651
|
+
votes
|
|
652
|
+
end
|
|
653
|
+
|
|
654
|
+
# Unlocked version of node_for_content for internal use
|
|
655
|
+
# @rbs (String) ?{ (String) -> String } -> ContentNode
|
|
656
|
+
def node_for_content_unlocked(item, &block)
|
|
321
657
|
return @items[item] if @items[item]
|
|
322
658
|
|
|
323
659
|
clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
|
|
660
|
+
cn = ContentNode.new(clean_word_hash, &block)
|
|
661
|
+
cn.raw_vector_with(@word_list) unless needs_rebuild_unlocked?
|
|
662
|
+
cn
|
|
663
|
+
end
|
|
664
|
+
|
|
665
|
+
# @rbs (untyped, ?Float) -> untyped
|
|
666
|
+
def build_reduced_matrix(matrix, cutoff = 0.75)
|
|
667
|
+
# TODO: Check that M>=N on these dimensions! Transpose helps assure this
|
|
668
|
+
u, v, s = matrix.SV_decomp
|
|
324
669
|
|
|
325
|
-
|
|
670
|
+
@singular_values = s.sort.reverse
|
|
326
671
|
|
|
327
|
-
|
|
328
|
-
|
|
672
|
+
s_cutoff_index = [(s.size * cutoff).round - 1, 0].max
|
|
673
|
+
s_cutoff = @singular_values[s_cutoff_index]
|
|
674
|
+
s.size.times do |ord|
|
|
675
|
+
s[ord] = 0.0 if s[ord] < s_cutoff
|
|
329
676
|
end
|
|
677
|
+
# Reconstruct the term document matrix, only with reduced rank
|
|
678
|
+
result = u * self.class.matrix_class.diag(s) * v.trans
|
|
330
679
|
|
|
331
|
-
|
|
680
|
+
# SVD may return transposed dimensions when row_size < column_size
|
|
681
|
+
# Ensure result matches input dimensions
|
|
682
|
+
result = result.trans if result.row_size != matrix.row_size
|
|
683
|
+
|
|
684
|
+
result
|
|
332
685
|
end
|
|
333
686
|
|
|
687
|
+
# @rbs () -> void
|
|
334
688
|
def make_word_list
|
|
335
689
|
@word_list = WordList.new
|
|
336
690
|
@items.each_value do |node|
|