classifier 1.4.4 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CLAUDE.md +77 -0
- data/README.md +274 -0
- data/ext/classifier/classifier_ext.c +25 -0
- data/ext/classifier/extconf.rb +15 -0
- data/ext/classifier/linalg.h +64 -0
- data/ext/classifier/matrix.c +387 -0
- data/ext/classifier/svd.c +208 -0
- data/ext/classifier/vector.c +319 -0
- data/lib/classifier/bayes.rb +294 -60
- data/lib/classifier/errors.rb +16 -0
- data/lib/classifier/extensions/vector.rb +42 -26
- data/lib/classifier/extensions/word_hash.rb +8 -1
- data/lib/classifier/lsi/content_node.rb +30 -9
- data/lib/classifier/lsi/word_list.rb +12 -1
- data/lib/classifier/lsi.rb +479 -125
- data/lib/classifier/storage/base.rb +50 -0
- data/lib/classifier/storage/file.rb +51 -0
- data/lib/classifier/storage/memory.rb +49 -0
- data/lib/classifier/storage.rb +9 -0
- data/lib/classifier.rb +2 -0
- data/sig/vendor/fast_stemmer.rbs +9 -0
- data/sig/vendor/gsl.rbs +27 -0
- data/sig/vendor/json.rbs +4 -0
- data/sig/vendor/matrix.rbs +26 -0
- data/sig/vendor/mutex_m.rbs +16 -0
- data/test/test_helper.rb +13 -1
- metadata +71 -10
- data/lib/classifier/extensions/vector_serialize.rb +0 -18
data/lib/classifier/bayes.rb
CHANGED
|
@@ -1,39 +1,68 @@
|
|
|
1
|
+
# rbs_inline: enabled
|
|
2
|
+
|
|
1
3
|
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
|
|
2
4
|
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
|
3
5
|
# License:: LGPL
|
|
4
6
|
|
|
7
|
+
require 'json'
|
|
8
|
+
require 'mutex_m'
|
|
9
|
+
|
|
5
10
|
module Classifier
|
|
6
11
|
class Bayes
|
|
12
|
+
include Mutex_m
|
|
13
|
+
|
|
14
|
+
# @rbs @categories: Hash[Symbol, Hash[Symbol, Integer]]
|
|
15
|
+
# @rbs @total_words: Integer
|
|
16
|
+
# @rbs @category_counts: Hash[Symbol, Integer]
|
|
17
|
+
# @rbs @category_word_count: Hash[Symbol, Integer]
|
|
18
|
+
# @rbs @cached_training_count: Float?
|
|
19
|
+
# @rbs @cached_vocab_size: Integer?
|
|
20
|
+
# @rbs @dirty: bool
|
|
21
|
+
# @rbs @storage: Storage::Base?
|
|
22
|
+
|
|
23
|
+
attr_accessor :storage
|
|
24
|
+
|
|
7
25
|
# The class can be created with one or more categories, each of which will be
|
|
8
26
|
# initialized and given a training method. E.g.,
|
|
9
27
|
# b = Classifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
|
|
28
|
+
# @rbs (*String | Symbol) -> void
|
|
10
29
|
def initialize(*categories)
|
|
30
|
+
super()
|
|
11
31
|
@categories = {}
|
|
12
32
|
categories.each { |category| @categories[category.prepare_category_name] = {} }
|
|
13
33
|
@total_words = 0
|
|
14
34
|
@category_counts = Hash.new(0)
|
|
15
35
|
@category_word_count = Hash.new(0)
|
|
36
|
+
@cached_training_count = nil
|
|
37
|
+
@cached_vocab_size = nil
|
|
38
|
+
@dirty = false
|
|
39
|
+
@storage = nil
|
|
16
40
|
end
|
|
17
41
|
|
|
18
|
-
#
|
|
19
42
|
# Provides a general training method for all categories specified in Bayes#new
|
|
20
43
|
# For example:
|
|
21
44
|
# b = Classifier::Bayes.new 'This', 'That', 'the_other'
|
|
22
45
|
# b.train :this, "This text"
|
|
23
46
|
# b.train "that", "That text"
|
|
24
47
|
# b.train "The other", "The other text"
|
|
48
|
+
#
|
|
49
|
+
# @rbs (String | Symbol, String) -> void
|
|
25
50
|
def train(category, text)
|
|
26
51
|
category = category.prepare_category_name
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
@
|
|
31
|
-
@
|
|
32
|
-
|
|
52
|
+
word_hash = text.word_hash
|
|
53
|
+
synchronize do
|
|
54
|
+
invalidate_caches
|
|
55
|
+
@dirty = true
|
|
56
|
+
@category_counts[category] += 1
|
|
57
|
+
word_hash.each do |word, count|
|
|
58
|
+
@categories[category][word] ||= 0
|
|
59
|
+
@categories[category][word] += count
|
|
60
|
+
@total_words += count
|
|
61
|
+
@category_word_count[category] += count
|
|
62
|
+
end
|
|
33
63
|
end
|
|
34
64
|
end
|
|
35
65
|
|
|
36
|
-
#
|
|
37
66
|
# Provides an untraining method for all categories specified in Bayes#new
|
|
38
67
|
# Be very careful with this method.
|
|
39
68
|
#
|
|
@@ -41,54 +70,179 @@ module Classifier
|
|
|
41
70
|
# b = Classifier::Bayes.new 'This', 'That', 'the_other'
|
|
42
71
|
# b.train :this, "This text"
|
|
43
72
|
# b.untrain :this, "This text"
|
|
73
|
+
#
|
|
74
|
+
# @rbs (String | Symbol, String) -> void
|
|
44
75
|
def untrain(category, text)
|
|
45
76
|
category = category.prepare_category_name
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
@categories[category]
|
|
55
|
-
|
|
77
|
+
word_hash = text.word_hash
|
|
78
|
+
synchronize do
|
|
79
|
+
invalidate_caches
|
|
80
|
+
@dirty = true
|
|
81
|
+
@category_counts[category] -= 1
|
|
82
|
+
word_hash.each do |word, count|
|
|
83
|
+
next unless @total_words >= 0
|
|
84
|
+
|
|
85
|
+
orig = @categories[category][word] || 0
|
|
86
|
+
@categories[category][word] ||= 0
|
|
87
|
+
@categories[category][word] -= count
|
|
88
|
+
if @categories[category][word] <= 0
|
|
89
|
+
@categories[category].delete(word)
|
|
90
|
+
count = orig
|
|
91
|
+
end
|
|
92
|
+
@category_word_count[category] -= count if @category_word_count[category] >= count
|
|
93
|
+
@total_words -= count
|
|
56
94
|
end
|
|
57
|
-
@category_word_count[category] -= count if @category_word_count[category] >= count
|
|
58
|
-
@total_words -= count
|
|
59
95
|
end
|
|
60
96
|
end
|
|
61
97
|
|
|
62
|
-
#
|
|
63
98
|
# Returns the scores in each category for the provided +text+. E.g.,
|
|
64
99
|
# b.classifications "I hate bad words and you"
|
|
65
100
|
# => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
|
|
66
101
|
# The largest of these scores (the one closest to 0) is the one picked out by #classify
|
|
102
|
+
#
|
|
103
|
+
# @rbs (String) -> Hash[String, Float]
|
|
67
104
|
def classifications(text)
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
105
|
+
words = text.word_hash.keys
|
|
106
|
+
synchronize do
|
|
107
|
+
training_count = cached_training_count
|
|
108
|
+
vocab_size = cached_vocab_size
|
|
109
|
+
|
|
110
|
+
@categories.to_h do |category, category_words|
|
|
111
|
+
smoothed_total = ((@category_word_count[category] || 0) + vocab_size).to_f
|
|
112
|
+
|
|
113
|
+
# Laplace smoothing with α = 1: P(word|category) = (count + α) / (total + α * V)
|
|
114
|
+
word_score = words.sum { |w| Math.log(((category_words[w] || 0) + 1) / smoothed_total) }
|
|
115
|
+
prior_score = Math.log((@category_counts[category] || 0.1) / training_count)
|
|
116
|
+
|
|
117
|
+
[category.to_s, word_score + prior_score]
|
|
77
118
|
end
|
|
78
|
-
# now add prior probability for the category
|
|
79
|
-
s = @category_counts.key?(category) ? @category_counts[category] : 0.1
|
|
80
|
-
score[category.to_s] += Math.log(s / training_count)
|
|
81
119
|
end
|
|
82
|
-
score
|
|
83
120
|
end
|
|
84
121
|
|
|
85
|
-
#
|
|
86
122
|
# Returns the classification of the provided +text+, which is one of the
|
|
87
123
|
# categories given in the initializer. E.g.,
|
|
88
124
|
# b.classify "I hate bad words and you"
|
|
89
125
|
# => 'Uninteresting'
|
|
126
|
+
#
|
|
127
|
+
# @rbs (String) -> String
|
|
90
128
|
def classify(text)
|
|
91
|
-
|
|
129
|
+
best = classifications(text).min_by { |a| -a[1] }
|
|
130
|
+
raise StandardError, 'No classifications available' unless best
|
|
131
|
+
|
|
132
|
+
best.first.to_s
|
|
133
|
+
end
|
|
134
|
+
|
|
135
|
+
# Returns a hash representation of the classifier state.
|
|
136
|
+
# This can be converted to JSON or used directly.
|
|
137
|
+
#
|
|
138
|
+
# @rbs () -> untyped
|
|
139
|
+
def as_json(*)
|
|
140
|
+
{
|
|
141
|
+
version: 1,
|
|
142
|
+
type: 'bayes',
|
|
143
|
+
categories: @categories.transform_keys(&:to_s).transform_values { |v| v.transform_keys(&:to_s) },
|
|
144
|
+
total_words: @total_words,
|
|
145
|
+
category_counts: @category_counts.transform_keys(&:to_s),
|
|
146
|
+
category_word_count: @category_word_count.transform_keys(&:to_s)
|
|
147
|
+
}
|
|
148
|
+
end
|
|
149
|
+
|
|
150
|
+
# Serializes the classifier state to a JSON string.
|
|
151
|
+
# This can be saved to a file and later loaded with Bayes.from_json.
|
|
152
|
+
#
|
|
153
|
+
# @rbs () -> String
|
|
154
|
+
def to_json(*)
|
|
155
|
+
as_json.to_json
|
|
156
|
+
end
|
|
157
|
+
|
|
158
|
+
# Loads a classifier from a JSON string or a Hash created by #to_json or #as_json.
|
|
159
|
+
#
|
|
160
|
+
# @rbs (String | Hash[String, untyped]) -> Bayes
|
|
161
|
+
def self.from_json(json)
|
|
162
|
+
data = json.is_a?(String) ? JSON.parse(json) : json
|
|
163
|
+
raise ArgumentError, "Invalid classifier type: #{data['type']}" unless data['type'] == 'bayes'
|
|
164
|
+
|
|
165
|
+
instance = allocate
|
|
166
|
+
instance.send(:restore_state, data)
|
|
167
|
+
instance
|
|
168
|
+
end
|
|
169
|
+
|
|
170
|
+
# Saves the classifier to the configured storage.
|
|
171
|
+
# Raises ArgumentError if no storage is configured.
|
|
172
|
+
#
|
|
173
|
+
# @rbs () -> void
|
|
174
|
+
def save
|
|
175
|
+
raise ArgumentError, 'No storage configured. Use save_to_file(path) or set storage=' unless storage
|
|
176
|
+
|
|
177
|
+
storage.write(to_json)
|
|
178
|
+
@dirty = false
|
|
179
|
+
end
|
|
180
|
+
|
|
181
|
+
# Saves the classifier state to a file (legacy API).
|
|
182
|
+
#
|
|
183
|
+
# @rbs (String) -> Integer
|
|
184
|
+
def save_to_file(path)
|
|
185
|
+
result = File.write(path, to_json)
|
|
186
|
+
@dirty = false
|
|
187
|
+
result
|
|
188
|
+
end
|
|
189
|
+
|
|
190
|
+
# Reloads the classifier from the configured storage.
|
|
191
|
+
# Raises UnsavedChangesError if there are unsaved changes.
|
|
192
|
+
# Use reload! to force reload and discard changes.
|
|
193
|
+
#
|
|
194
|
+
# @rbs () -> self
|
|
195
|
+
def reload
|
|
196
|
+
raise ArgumentError, 'No storage configured' unless storage
|
|
197
|
+
raise UnsavedChangesError, 'Unsaved changes would be lost. Call save first or use reload!' if @dirty
|
|
198
|
+
|
|
199
|
+
data = storage.read
|
|
200
|
+
raise StorageError, 'No saved state found' unless data
|
|
201
|
+
|
|
202
|
+
restore_from_json(data)
|
|
203
|
+
@dirty = false
|
|
204
|
+
self
|
|
205
|
+
end
|
|
206
|
+
|
|
207
|
+
# Force reloads the classifier from storage, discarding any unsaved changes.
|
|
208
|
+
#
|
|
209
|
+
# @rbs () -> self
|
|
210
|
+
def reload!
|
|
211
|
+
raise ArgumentError, 'No storage configured' unless storage
|
|
212
|
+
|
|
213
|
+
data = storage.read
|
|
214
|
+
raise StorageError, 'No saved state found' unless data
|
|
215
|
+
|
|
216
|
+
restore_from_json(data)
|
|
217
|
+
@dirty = false
|
|
218
|
+
self
|
|
219
|
+
end
|
|
220
|
+
|
|
221
|
+
# Returns true if there are unsaved changes.
|
|
222
|
+
#
|
|
223
|
+
# @rbs () -> bool
|
|
224
|
+
def dirty?
|
|
225
|
+
@dirty
|
|
226
|
+
end
|
|
227
|
+
|
|
228
|
+
# Loads a classifier from the configured storage.
|
|
229
|
+
# The storage is set on the returned instance.
|
|
230
|
+
#
|
|
231
|
+
# @rbs (storage: Storage::Base) -> Bayes
|
|
232
|
+
def self.load(storage:)
|
|
233
|
+
data = storage.read
|
|
234
|
+
raise StorageError, 'No saved state found' unless data
|
|
235
|
+
|
|
236
|
+
instance = from_json(data)
|
|
237
|
+
instance.storage = storage
|
|
238
|
+
instance
|
|
239
|
+
end
|
|
240
|
+
|
|
241
|
+
# Loads a classifier from a file (legacy API).
|
|
242
|
+
#
|
|
243
|
+
# @rbs (String) -> Bayes
|
|
244
|
+
def self.load_from_file(path)
|
|
245
|
+
from_json(File.read(path))
|
|
92
246
|
end
|
|
93
247
|
|
|
94
248
|
#
|
|
@@ -100,32 +254,30 @@ module Classifier
|
|
|
100
254
|
# b.untrain_that "That text"
|
|
101
255
|
# b.train_the_other "The other text"
|
|
102
256
|
def method_missing(name, *args)
|
|
257
|
+
return super unless name.to_s =~ /(un)?train_(\w+)/
|
|
258
|
+
|
|
103
259
|
category = name.to_s.gsub(/(un)?train_(\w+)/, '\2').prepare_category_name
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
raise StandardError, "No such category: #{category}"
|
|
114
|
-
else
|
|
115
|
-
super
|
|
116
|
-
end
|
|
260
|
+
raise StandardError, "No such category: #{category}" unless @categories.key?(category)
|
|
261
|
+
|
|
262
|
+
method = name.to_s.start_with?('untrain_') ? :untrain : :train
|
|
263
|
+
args.each { |text| send(method, category, text) }
|
|
264
|
+
end
|
|
265
|
+
|
|
266
|
+
# @rbs (Symbol, ?bool) -> bool
|
|
267
|
+
def respond_to_missing?(name, include_private = false)
|
|
268
|
+
!!(name.to_s =~ /(un)?train_(\w+)/) || super
|
|
117
269
|
end
|
|
118
270
|
|
|
119
|
-
#
|
|
120
271
|
# Provides a list of category names
|
|
121
272
|
# For example:
|
|
122
273
|
# b.categories
|
|
123
274
|
# => ['This', 'That', 'the_other']
|
|
124
|
-
|
|
125
|
-
|
|
275
|
+
#
|
|
276
|
+
# @rbs () -> Array[String]
|
|
277
|
+
def categories
|
|
278
|
+
synchronize { @categories.keys.collect(&:to_s) }
|
|
126
279
|
end
|
|
127
280
|
|
|
128
|
-
#
|
|
129
281
|
# Allows you to add categories to the classifier.
|
|
130
282
|
# For example:
|
|
131
283
|
# b.add_category "Not spam"
|
|
@@ -134,13 +286,34 @@ module Classifier
|
|
|
134
286
|
# result in an undertrained category that will tend to match
|
|
135
287
|
# more criteria than the trained selective categories. In short,
|
|
136
288
|
# try to initialize your categories at initialization.
|
|
289
|
+
#
|
|
290
|
+
# @rbs (String | Symbol) -> Hash[Symbol, Integer]
|
|
137
291
|
def add_category(category)
|
|
138
|
-
|
|
292
|
+
synchronize do
|
|
293
|
+
invalidate_caches
|
|
294
|
+
@dirty = true
|
|
295
|
+
@categories[category.prepare_category_name] = {}
|
|
296
|
+
end
|
|
139
297
|
end
|
|
140
298
|
|
|
141
299
|
alias append_category add_category
|
|
142
300
|
|
|
143
|
-
#
|
|
301
|
+
# Custom marshal serialization to exclude mutex state
|
|
302
|
+
# @rbs () -> Array[untyped]
|
|
303
|
+
def marshal_dump
|
|
304
|
+
[@categories, @total_words, @category_counts, @category_word_count, @dirty]
|
|
305
|
+
end
|
|
306
|
+
|
|
307
|
+
# Custom marshal deserialization to recreate mutex
|
|
308
|
+
# @rbs (Array[untyped]) -> void
|
|
309
|
+
def marshal_load(data)
|
|
310
|
+
mu_initialize
|
|
311
|
+
@categories, @total_words, @category_counts, @category_word_count, @dirty = data
|
|
312
|
+
@cached_training_count = nil
|
|
313
|
+
@cached_vocab_size = nil
|
|
314
|
+
@storage = nil
|
|
315
|
+
end
|
|
316
|
+
|
|
144
317
|
# Allows you to remove categories from the classifier.
|
|
145
318
|
# For example:
|
|
146
319
|
# b.remove_category "Spam"
|
|
@@ -148,15 +321,76 @@ module Classifier
|
|
|
148
321
|
# WARNING: Removing categories from a trained classifier will
|
|
149
322
|
# result in the loss of all training data for that category.
|
|
150
323
|
# Make sure you really want to do this before calling this method.
|
|
324
|
+
#
|
|
325
|
+
# @rbs (String | Symbol) -> void
|
|
151
326
|
def remove_category(category)
|
|
152
327
|
category = category.prepare_category_name
|
|
153
|
-
|
|
328
|
+
synchronize do
|
|
329
|
+
raise StandardError, "No such category: #{category}" unless @categories.key?(category)
|
|
330
|
+
|
|
331
|
+
invalidate_caches
|
|
332
|
+
@dirty = true
|
|
333
|
+
@total_words -= @category_word_count[category].to_i
|
|
154
334
|
|
|
155
|
-
|
|
335
|
+
@categories.delete(category)
|
|
336
|
+
@category_counts.delete(category)
|
|
337
|
+
@category_word_count.delete(category)
|
|
338
|
+
end
|
|
339
|
+
end
|
|
340
|
+
|
|
341
|
+
private
|
|
342
|
+
|
|
343
|
+
# Restores classifier state from a JSON string (used by reload)
|
|
344
|
+
# @rbs (String) -> void
|
|
345
|
+
def restore_from_json(json)
|
|
346
|
+
data = JSON.parse(json)
|
|
347
|
+
raise ArgumentError, "Invalid classifier type: #{data['type']}" unless data['type'] == 'bayes'
|
|
348
|
+
|
|
349
|
+
synchronize do
|
|
350
|
+
restore_state(data)
|
|
351
|
+
end
|
|
352
|
+
end
|
|
353
|
+
|
|
354
|
+
# Restores classifier state from a hash (used by from_json)
|
|
355
|
+
# @rbs (Hash[String, untyped]) -> void
|
|
356
|
+
def restore_state(data)
|
|
357
|
+
mu_initialize
|
|
358
|
+
@categories = {} #: Hash[Symbol, Hash[Symbol, Integer]]
|
|
359
|
+
@total_words = data['total_words']
|
|
360
|
+
@category_counts = Hash.new(0) #: Hash[Symbol, Integer]
|
|
361
|
+
@category_word_count = Hash.new(0) #: Hash[Symbol, Integer]
|
|
362
|
+
@cached_training_count = nil
|
|
363
|
+
@cached_vocab_size = nil
|
|
364
|
+
@dirty = false
|
|
365
|
+
@storage = nil
|
|
366
|
+
|
|
367
|
+
data['categories'].each do |cat_name, words|
|
|
368
|
+
@categories[cat_name.to_sym] = words.transform_keys(&:to_sym)
|
|
369
|
+
end
|
|
370
|
+
|
|
371
|
+
data['category_counts'].each do |cat_name, count|
|
|
372
|
+
@category_counts[cat_name.to_sym] = count
|
|
373
|
+
end
|
|
374
|
+
|
|
375
|
+
data['category_word_count'].each do |cat_name, count|
|
|
376
|
+
@category_word_count[cat_name.to_sym] = count
|
|
377
|
+
end
|
|
378
|
+
end
|
|
379
|
+
|
|
380
|
+
# @rbs () -> void
|
|
381
|
+
def invalidate_caches
|
|
382
|
+
@cached_training_count = nil
|
|
383
|
+
@cached_vocab_size = nil
|
|
384
|
+
end
|
|
385
|
+
|
|
386
|
+
# @rbs () -> Float
|
|
387
|
+
def cached_training_count
|
|
388
|
+
@cached_training_count ||= @category_counts.values.sum.to_f
|
|
389
|
+
end
|
|
156
390
|
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
@
|
|
391
|
+
# @rbs () -> Integer
|
|
392
|
+
def cached_vocab_size
|
|
393
|
+
@cached_vocab_size ||= [@categories.values.flat_map(&:keys).uniq.size, 1].max
|
|
160
394
|
end
|
|
161
395
|
end
|
|
162
396
|
end
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# rbs_inline: enabled
|
|
2
|
+
|
|
3
|
+
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
|
|
4
|
+
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
|
5
|
+
# License:: LGPL
|
|
6
|
+
|
|
7
|
+
module Classifier
|
|
8
|
+
# Base error class for all Classifier errors
|
|
9
|
+
class Error < StandardError; end
|
|
10
|
+
|
|
11
|
+
# Raised when reload would discard unsaved changes
|
|
12
|
+
class UnsavedChangesError < Error; end
|
|
13
|
+
|
|
14
|
+
# Raised when a storage operation fails
|
|
15
|
+
class StorageError < Error; end
|
|
16
|
+
end
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
# rbs_inline: enabled
|
|
2
|
+
|
|
1
3
|
# Author:: Ernest Ellingson
|
|
2
4
|
# Copyright:: Copyright (c) 2005
|
|
3
5
|
|
|
@@ -5,30 +7,41 @@
|
|
|
5
7
|
|
|
6
8
|
require 'matrix'
|
|
7
9
|
|
|
10
|
+
# @rbs skip
|
|
8
11
|
class Array
|
|
9
|
-
def sum_with_identity(identity = 0.0, &
|
|
12
|
+
def sum_with_identity(identity = 0.0, &)
|
|
10
13
|
return identity unless size.to_i.positive?
|
|
14
|
+
return map(&).sum_with_identity(identity) if block_given?
|
|
11
15
|
|
|
12
|
-
|
|
13
|
-
map(&block).sum_with_identity(identity)
|
|
14
|
-
else
|
|
15
|
-
compact.reduce(:+).to_f || identity.to_f
|
|
16
|
-
end
|
|
16
|
+
compact.reduce(identity, :+).to_f
|
|
17
17
|
end
|
|
18
18
|
end
|
|
19
19
|
|
|
20
|
-
|
|
20
|
+
# @rbs skip
|
|
21
|
+
class Vector
|
|
22
|
+
EPSILON = 1e-10
|
|
23
|
+
|
|
24
|
+
# Cache magnitude since Vector is immutable after creation
|
|
25
|
+
# Note: We undefine the matrix gem's normalize method first, then redefine it
|
|
26
|
+
# to provide a more robust implementation that handles zero vectors
|
|
27
|
+
undef_method :normalize if method_defined?(:normalize)
|
|
28
|
+
|
|
21
29
|
def magnitude
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
sum_of_squares
|
|
30
|
+
# Cache magnitude since Vector is immutable after creation
|
|
31
|
+
@magnitude ||= begin
|
|
32
|
+
sum_of_squares = 0.to_r
|
|
33
|
+
size.times do |i|
|
|
34
|
+
sum_of_squares += self[i]**2.to_r
|
|
35
|
+
end
|
|
36
|
+
Math.sqrt(sum_of_squares.to_f)
|
|
25
37
|
end
|
|
26
|
-
Math.sqrt(sum_of_squares.to_f)
|
|
27
38
|
end
|
|
28
39
|
|
|
29
40
|
def normalize
|
|
41
|
+
magnitude_value = magnitude
|
|
42
|
+
return Vector[*Array.new(size, 0.0)] if magnitude_value <= 0.0
|
|
43
|
+
|
|
30
44
|
normalized_values = []
|
|
31
|
-
magnitude_value = magnitude.to_r
|
|
32
45
|
size.times do |i|
|
|
33
46
|
normalized_values << (self[i] / magnitude_value)
|
|
34
47
|
end
|
|
@@ -36,10 +49,7 @@ module VectorExtensions
|
|
|
36
49
|
end
|
|
37
50
|
end
|
|
38
51
|
|
|
39
|
-
|
|
40
|
-
include VectorExtensions
|
|
41
|
-
end
|
|
42
|
-
|
|
52
|
+
# @rbs skip
|
|
43
53
|
class Matrix
|
|
44
54
|
def self.diag(diagonal_elements)
|
|
45
55
|
Matrix.diagonal(*diagonal_elements)
|
|
@@ -61,14 +71,19 @@ class Matrix
|
|
|
61
71
|
|
|
62
72
|
loop do
|
|
63
73
|
iteration_count += 1
|
|
64
|
-
(0...q_rotation_matrix.row_size - 1).each do |row|
|
|
65
|
-
(1..q_rotation_matrix.row_size - 1).each do |col|
|
|
74
|
+
(0...(q_rotation_matrix.row_size - 1)).each do |row|
|
|
75
|
+
(1..(q_rotation_matrix.row_size - 1)).each do |col|
|
|
66
76
|
next if row == col
|
|
67
77
|
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
78
|
+
numerator = 2.0 * q_rotation_matrix[row, col]
|
|
79
|
+
denominator = q_rotation_matrix[row, row] - q_rotation_matrix[col, col]
|
|
80
|
+
|
|
81
|
+
angle = if denominator.abs < Vector::EPSILON
|
|
82
|
+
numerator >= 0 ? Math::PI / 4.0 : -Math::PI / 4.0
|
|
83
|
+
else
|
|
84
|
+
Math.atan(numerator / denominator) / 2.0
|
|
85
|
+
end
|
|
86
|
+
|
|
72
87
|
cosine = Math.cos(angle)
|
|
73
88
|
sine = Math.sin(angle)
|
|
74
89
|
rotation_matrix = Matrix.identity(q_rotation_matrix.row_size)
|
|
@@ -92,11 +107,12 @@ class Matrix
|
|
|
92
107
|
break if (sum_of_differences <= 0.001 && iteration_count > 1) || iteration_count >= max_sweeps
|
|
93
108
|
end
|
|
94
109
|
|
|
95
|
-
singular_values =
|
|
96
|
-
|
|
97
|
-
singular_values << Math.sqrt(q_rotation_matrix[r, r].to_f)
|
|
110
|
+
singular_values = q_rotation_matrix.row_size.times.map do |r|
|
|
111
|
+
Math.sqrt([q_rotation_matrix[r, r].to_f, 0.0].max)
|
|
98
112
|
end
|
|
99
|
-
|
|
113
|
+
|
|
114
|
+
safe_singular_values = singular_values.map { |v| [v, Vector::EPSILON].max }
|
|
115
|
+
u_matrix = (row_size >= column_size ? self : trans) * v_matrix * Matrix.diagonal(*safe_singular_values).inverse
|
|
100
116
|
[u_matrix, v_matrix, singular_values]
|
|
101
117
|
end
|
|
102
118
|
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
# rbs_inline: enabled
|
|
2
|
+
|
|
1
3
|
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
|
|
2
4
|
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
|
3
5
|
# License:: LGPL
|
|
@@ -11,12 +13,14 @@ class String
|
|
|
11
13
|
# E.g.,
|
|
12
14
|
# "Hello (greeting's), with {braces} < >...?".without_punctuation
|
|
13
15
|
# => "Hello greetings with braces "
|
|
16
|
+
# @rbs () -> String
|
|
14
17
|
def without_punctuation
|
|
15
|
-
tr(',?.!;:"@#$%^&*()_=+[]{}
|
|
18
|
+
tr(',?.!;:"@#$%^&*()_=+[]{}|<>/`~', ' ').tr("'-", '')
|
|
16
19
|
end
|
|
17
20
|
|
|
18
21
|
# Return a Hash of symbols => ints. Each word in the string is stemmed,
|
|
19
22
|
# interned, and mapped to its frequency in the document.
|
|
23
|
+
# @rbs () -> Hash[Symbol, Integer]
|
|
20
24
|
def word_hash
|
|
21
25
|
word_hash = clean_word_hash
|
|
22
26
|
symbol_hash = word_hash_for_symbols(gsub(/\w/, ' ').split)
|
|
@@ -24,12 +28,14 @@ class String
|
|
|
24
28
|
end
|
|
25
29
|
|
|
26
30
|
# Return a word hash without extra punctuation or short symbols, just stemmed words
|
|
31
|
+
# @rbs () -> Hash[Symbol, Integer]
|
|
27
32
|
def clean_word_hash
|
|
28
33
|
word_hash_for_words gsub(/[^\w\s]/, '').split
|
|
29
34
|
end
|
|
30
35
|
|
|
31
36
|
private
|
|
32
37
|
|
|
38
|
+
# @rbs (Array[String]) -> Hash[Symbol, Integer]
|
|
33
39
|
def word_hash_for_words(words)
|
|
34
40
|
d = Hash.new(0)
|
|
35
41
|
words.each do |word|
|
|
@@ -39,6 +45,7 @@ class String
|
|
|
39
45
|
d
|
|
40
46
|
end
|
|
41
47
|
|
|
48
|
+
# @rbs (Array[String]) -> Hash[Symbol, Integer]
|
|
42
49
|
def word_hash_for_symbols(words)
|
|
43
50
|
d = Hash.new(0)
|
|
44
51
|
words.each do |word|
|