classifier 1.4.4 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,39 +1,68 @@
1
+ # rbs_inline: enabled
2
+
1
3
  # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
4
  # Copyright:: Copyright (c) 2005 Lucas Carlson
3
5
  # License:: LGPL
4
6
 
7
+ require 'json'
8
+ require 'mutex_m'
9
+
5
10
  module Classifier
6
11
  class Bayes
12
+ include Mutex_m
13
+
14
+ # @rbs @categories: Hash[Symbol, Hash[Symbol, Integer]]
15
+ # @rbs @total_words: Integer
16
+ # @rbs @category_counts: Hash[Symbol, Integer]
17
+ # @rbs @category_word_count: Hash[Symbol, Integer]
18
+ # @rbs @cached_training_count: Float?
19
+ # @rbs @cached_vocab_size: Integer?
20
+ # @rbs @dirty: bool
21
+ # @rbs @storage: Storage::Base?
22
+
23
+ attr_accessor :storage
24
+
7
25
  # The class can be created with one or more categories, each of which will be
8
26
  # initialized and given a training method. E.g.,
9
27
  # b = Classifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
28
+ # @rbs (*String | Symbol) -> void
10
29
  def initialize(*categories)
30
+ super()
11
31
  @categories = {}
12
32
  categories.each { |category| @categories[category.prepare_category_name] = {} }
13
33
  @total_words = 0
14
34
  @category_counts = Hash.new(0)
15
35
  @category_word_count = Hash.new(0)
36
+ @cached_training_count = nil
37
+ @cached_vocab_size = nil
38
+ @dirty = false
39
+ @storage = nil
16
40
  end
17
41
 
18
- #
19
42
  # Provides a general training method for all categories specified in Bayes#new
20
43
  # For example:
21
44
  # b = Classifier::Bayes.new 'This', 'That', 'the_other'
22
45
  # b.train :this, "This text"
23
46
  # b.train "that", "That text"
24
47
  # b.train "The other", "The other text"
48
+ #
49
+ # @rbs (String | Symbol, String) -> void
25
50
  def train(category, text)
26
51
  category = category.prepare_category_name
27
- @category_counts[category] += 1
28
- text.word_hash.each do |word, count|
29
- @categories[category][word] ||= 0
30
- @categories[category][word] += count
31
- @total_words += count
32
- @category_word_count[category] += count
52
+ word_hash = text.word_hash
53
+ synchronize do
54
+ invalidate_caches
55
+ @dirty = true
56
+ @category_counts[category] += 1
57
+ word_hash.each do |word, count|
58
+ @categories[category][word] ||= 0
59
+ @categories[category][word] += count
60
+ @total_words += count
61
+ @category_word_count[category] += count
62
+ end
33
63
  end
34
64
  end
35
65
 
36
- #
37
66
  # Provides an untraining method for all categories specified in Bayes#new
38
67
  # Be very careful with this method.
39
68
  #
@@ -41,54 +70,179 @@ module Classifier
41
70
  # b = Classifier::Bayes.new 'This', 'That', 'the_other'
42
71
  # b.train :this, "This text"
43
72
  # b.untrain :this, "This text"
73
+ #
74
+ # @rbs (String | Symbol, String) -> void
44
75
  def untrain(category, text)
45
76
  category = category.prepare_category_name
46
- @category_counts[category] -= 1
47
- text.word_hash.each do |word, count|
48
- next unless @total_words >= 0
49
-
50
- orig = @categories[category][word] || 0
51
- @categories[category][word] ||= 0
52
- @categories[category][word] -= count
53
- if @categories[category][word] <= 0
54
- @categories[category].delete(word)
55
- count = orig
77
+ word_hash = text.word_hash
78
+ synchronize do
79
+ invalidate_caches
80
+ @dirty = true
81
+ @category_counts[category] -= 1
82
+ word_hash.each do |word, count|
83
+ next unless @total_words >= 0
84
+
85
+ orig = @categories[category][word] || 0
86
+ @categories[category][word] ||= 0
87
+ @categories[category][word] -= count
88
+ if @categories[category][word] <= 0
89
+ @categories[category].delete(word)
90
+ count = orig
91
+ end
92
+ @category_word_count[category] -= count if @category_word_count[category] >= count
93
+ @total_words -= count
56
94
  end
57
- @category_word_count[category] -= count if @category_word_count[category] >= count
58
- @total_words -= count
59
95
  end
60
96
  end
61
97
 
62
- #
63
98
  # Returns the scores in each category for the provided +text+. E.g.,
64
99
  # b.classifications "I hate bad words and you"
65
100
  # => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
66
101
  # The largest of these scores (the one closest to 0) is the one picked out by #classify
102
+ #
103
+ # @rbs (String) -> Hash[String, Float]
67
104
  def classifications(text)
68
- score = {}
69
- word_hash = text.word_hash
70
- training_count = @category_counts.values.inject { |x, y| x + y }.to_f
71
- @categories.each do |category, category_words|
72
- score[category.to_s] = 0
73
- total = (@category_word_count[category] || 1).to_f
74
- word_hash.each_key do |word|
75
- s = category_words.key?(word) ? category_words[word] : 0.1
76
- score[category.to_s] += Math.log(s / total)
105
+ words = text.word_hash.keys
106
+ synchronize do
107
+ training_count = cached_training_count
108
+ vocab_size = cached_vocab_size
109
+
110
+ @categories.to_h do |category, category_words|
111
+ smoothed_total = ((@category_word_count[category] || 0) + vocab_size).to_f
112
+
113
+ # Laplace smoothing: P(word|category) = (count + α) / (total + α * V)
114
+ word_score = words.sum { |w| Math.log(((category_words[w] || 0) + 1) / smoothed_total) }
115
+ prior_score = Math.log((@category_counts[category] || 0.1) / training_count)
116
+
117
+ [category.to_s, word_score + prior_score]
77
118
  end
78
- # now add prior probability for the category
79
- s = @category_counts.key?(category) ? @category_counts[category] : 0.1
80
- score[category.to_s] += Math.log(s / training_count)
81
119
  end
82
- score
83
120
  end
84
121
 
85
- #
86
122
  # Returns the classification of the provided +text+, which is one of the
87
123
  # categories given in the initializer. E.g.,
88
124
  # b.classify "I hate bad words and you"
89
125
  # => 'Uninteresting'
126
+ #
127
+ # @rbs (String) -> String
90
128
  def classify(text)
91
- (classifications(text).sort_by { |a| -a[1] })[0][0]
129
+ best = classifications(text).min_by { |a| -a[1] }
130
+ raise StandardError, 'No classifications available' unless best
131
+
132
+ best.first.to_s
133
+ end
134
+
135
+ # Returns a hash representation of the classifier state.
136
+ # This can be converted to JSON or used directly.
137
+ #
138
+ # @rbs () -> untyped
139
+ def as_json(*)
140
+ {
141
+ version: 1,
142
+ type: 'bayes',
143
+ categories: @categories.transform_keys(&:to_s).transform_values { |v| v.transform_keys(&:to_s) },
144
+ total_words: @total_words,
145
+ category_counts: @category_counts.transform_keys(&:to_s),
146
+ category_word_count: @category_word_count.transform_keys(&:to_s)
147
+ }
148
+ end
149
+
150
+ # Serializes the classifier state to a JSON string.
151
+ # This can be saved to a file and later loaded with Bayes.from_json.
152
+ #
153
+ # @rbs () -> String
154
+ def to_json(*)
155
+ as_json.to_json
156
+ end
157
+
158
+ # Loads a classifier from a JSON string or a Hash created by #to_json or #as_json.
159
+ #
160
+ # @rbs (String | Hash[String, untyped]) -> Bayes
161
+ def self.from_json(json)
162
+ data = json.is_a?(String) ? JSON.parse(json) : json
163
+ raise ArgumentError, "Invalid classifier type: #{data['type']}" unless data['type'] == 'bayes'
164
+
165
+ instance = allocate
166
+ instance.send(:restore_state, data)
167
+ instance
168
+ end
169
+
170
+ # Saves the classifier to the configured storage.
171
+ # Raises ArgumentError if no storage is configured.
172
+ #
173
+ # @rbs () -> void
174
+ def save
175
+ raise ArgumentError, 'No storage configured. Use save_to_file(path) or set storage=' unless storage
176
+
177
+ storage.write(to_json)
178
+ @dirty = false
179
+ end
180
+
181
+ # Saves the classifier state to a file (legacy API).
182
+ #
183
+ # @rbs (String) -> Integer
184
+ def save_to_file(path)
185
+ result = File.write(path, to_json)
186
+ @dirty = false
187
+ result
188
+ end
189
+
190
+ # Reloads the classifier from the configured storage.
191
+ # Raises UnsavedChangesError if there are unsaved changes.
192
+ # Use reload! to force reload and discard changes.
193
+ #
194
+ # @rbs () -> self
195
+ def reload
196
+ raise ArgumentError, 'No storage configured' unless storage
197
+ raise UnsavedChangesError, 'Unsaved changes would be lost. Call save first or use reload!' if @dirty
198
+
199
+ data = storage.read
200
+ raise StorageError, 'No saved state found' unless data
201
+
202
+ restore_from_json(data)
203
+ @dirty = false
204
+ self
205
+ end
206
+
207
+ # Force reloads the classifier from storage, discarding any unsaved changes.
208
+ #
209
+ # @rbs () -> self
210
+ def reload!
211
+ raise ArgumentError, 'No storage configured' unless storage
212
+
213
+ data = storage.read
214
+ raise StorageError, 'No saved state found' unless data
215
+
216
+ restore_from_json(data)
217
+ @dirty = false
218
+ self
219
+ end
220
+
221
+ # Returns true if there are unsaved changes.
222
+ #
223
+ # @rbs () -> bool
224
+ def dirty?
225
+ @dirty
226
+ end
227
+
228
+ # Loads a classifier from the configured storage.
229
+ # The storage is set on the returned instance.
230
+ #
231
+ # @rbs (storage: Storage::Base) -> Bayes
232
+ def self.load(storage:)
233
+ data = storage.read
234
+ raise StorageError, 'No saved state found' unless data
235
+
236
+ instance = from_json(data)
237
+ instance.storage = storage
238
+ instance
239
+ end
240
+
241
+ # Loads a classifier from a file (legacy API).
242
+ #
243
+ # @rbs (String) -> Bayes
244
+ def self.load_from_file(path)
245
+ from_json(File.read(path))
92
246
  end
93
247
 
94
248
  #
@@ -100,32 +254,30 @@ module Classifier
100
254
  # b.untrain_that "That text"
101
255
  # b.train_the_other "The other text"
102
256
  def method_missing(name, *args)
257
+ return super unless name.to_s =~ /(un)?train_(\w+)/
258
+
103
259
  category = name.to_s.gsub(/(un)?train_(\w+)/, '\2').prepare_category_name
104
- if @categories.key?(category)
105
- args.each do |text|
106
- if name.to_s.start_with?('untrain_')
107
- untrain(category, text)
108
- else
109
- train(category, text)
110
- end
111
- end
112
- elsif name.to_s =~ /(un)?train_(\w+)/
113
- raise StandardError, "No such category: #{category}"
114
- else
115
- super
116
- end
260
+ raise StandardError, "No such category: #{category}" unless @categories.key?(category)
261
+
262
+ method = name.to_s.start_with?('untrain_') ? :untrain : :train
263
+ args.each { |text| send(method, category, text) }
264
+ end
265
+
266
+ # @rbs (Symbol, ?bool) -> bool
267
+ def respond_to_missing?(name, include_private = false)
268
+ !!(name.to_s =~ /(un)?train_(\w+)/) || super
117
269
  end
118
270
 
119
- #
120
271
  # Provides a list of category names
121
272
  # For example:
122
273
  # b.categories
123
274
  # => ['This', 'That', 'the_other']
124
- def categories # :nodoc:
125
- @categories.keys.collect(&:to_s)
275
+ #
276
+ # @rbs () -> Array[String]
277
+ def categories
278
+ synchronize { @categories.keys.collect(&:to_s) }
126
279
  end
127
280
 
128
- #
129
281
  # Allows you to add categories to the classifier.
130
282
  # For example:
131
283
  # b.add_category "Not spam"
@@ -134,13 +286,34 @@ module Classifier
134
286
  # result in an undertrained category that will tend to match
135
287
  # more criteria than the trained selective categories. In short,
136
288
  # try to initialize your categories at initialization.
289
+ #
290
+ # @rbs (String | Symbol) -> Hash[Symbol, Integer]
137
291
  def add_category(category)
138
- @categories[category.prepare_category_name] = {}
292
+ synchronize do
293
+ invalidate_caches
294
+ @dirty = true
295
+ @categories[category.prepare_category_name] = {}
296
+ end
139
297
  end
140
298
 
141
299
  alias append_category add_category
142
300
 
143
- #
301
+ # Custom marshal serialization to exclude mutex state
302
+ # @rbs () -> Array[untyped]
303
+ def marshal_dump
304
+ [@categories, @total_words, @category_counts, @category_word_count, @dirty]
305
+ end
306
+
307
+ # Custom marshal deserialization to recreate mutex
308
+ # @rbs (Array[untyped]) -> void
309
+ def marshal_load(data)
310
+ mu_initialize
311
+ @categories, @total_words, @category_counts, @category_word_count, @dirty = data
312
+ @cached_training_count = nil
313
+ @cached_vocab_size = nil
314
+ @storage = nil
315
+ end
316
+
144
317
  # Allows you to remove categories from the classifier.
145
318
  # For example:
146
319
  # b.remove_category "Spam"
@@ -148,15 +321,76 @@ module Classifier
148
321
  # WARNING: Removing categories from a trained classifier will
149
322
  # result in the loss of all training data for that category.
150
323
  # Make sure you really want to do this before calling this method.
324
+ #
325
+ # @rbs (String | Symbol) -> void
151
326
  def remove_category(category)
152
327
  category = category.prepare_category_name
153
- raise StandardError, "No such category: #{category}" unless @categories.key?(category)
328
+ synchronize do
329
+ raise StandardError, "No such category: #{category}" unless @categories.key?(category)
330
+
331
+ invalidate_caches
332
+ @dirty = true
333
+ @total_words -= @category_word_count[category].to_i
154
334
 
155
- @total_words -= @category_word_count[category].to_i
335
+ @categories.delete(category)
336
+ @category_counts.delete(category)
337
+ @category_word_count.delete(category)
338
+ end
339
+ end
340
+
341
+ private
342
+
343
+ # Restores classifier state from a JSON string (used by reload)
344
+ # @rbs (String) -> void
345
+ def restore_from_json(json)
346
+ data = JSON.parse(json)
347
+ raise ArgumentError, "Invalid classifier type: #{data['type']}" unless data['type'] == 'bayes'
348
+
349
+ synchronize do
350
+ restore_state(data)
351
+ end
352
+ end
353
+
354
+ # Restores classifier state from a hash (used by from_json)
355
+ # @rbs (Hash[String, untyped]) -> void
356
+ def restore_state(data)
357
+ mu_initialize
358
+ @categories = {} #: Hash[Symbol, Hash[Symbol, Integer]]
359
+ @total_words = data['total_words']
360
+ @category_counts = Hash.new(0) #: Hash[Symbol, Integer]
361
+ @category_word_count = Hash.new(0) #: Hash[Symbol, Integer]
362
+ @cached_training_count = nil
363
+ @cached_vocab_size = nil
364
+ @dirty = false
365
+ @storage = nil
366
+
367
+ data['categories'].each do |cat_name, words|
368
+ @categories[cat_name.to_sym] = words.transform_keys(&:to_sym)
369
+ end
370
+
371
+ data['category_counts'].each do |cat_name, count|
372
+ @category_counts[cat_name.to_sym] = count
373
+ end
374
+
375
+ data['category_word_count'].each do |cat_name, count|
376
+ @category_word_count[cat_name.to_sym] = count
377
+ end
378
+ end
379
+
380
+ # @rbs () -> void
381
+ def invalidate_caches
382
+ @cached_training_count = nil
383
+ @cached_vocab_size = nil
384
+ end
385
+
386
+ # @rbs () -> Float
387
+ def cached_training_count
388
+ @cached_training_count ||= @category_counts.values.sum.to_f
389
+ end
156
390
 
157
- @categories.delete(category)
158
- @category_counts.delete(category)
159
- @category_word_count.delete(category)
391
+ # @rbs () -> Integer
392
+ def cached_vocab_size
393
+ @cached_vocab_size ||= [@categories.values.flat_map(&:keys).uniq.size, 1].max
160
394
  end
161
395
  end
162
396
  end
@@ -0,0 +1,16 @@
1
+ # rbs_inline: enabled
2
+
3
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
4
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
5
+ # License:: LGPL
6
+
7
+ module Classifier
8
+ # Base error class for all Classifier errors
9
+ class Error < StandardError; end
10
+
11
+ # Raised when reload would discard unsaved changes
12
+ class UnsavedChangesError < Error; end
13
+
14
+ # Raised when a storage operation fails
15
+ class StorageError < Error; end
16
+ end
@@ -1,3 +1,5 @@
1
+ # rbs_inline: enabled
2
+
1
3
  # Author:: Ernest Ellingson
2
4
  # Copyright:: Copyright (c) 2005
3
5
 
@@ -5,30 +7,41 @@
5
7
 
6
8
  require 'matrix'
7
9
 
10
+ # @rbs skip
8
11
  class Array
9
- def sum_with_identity(identity = 0.0, &block)
12
+ def sum_with_identity(identity = 0.0, &)
10
13
  return identity unless size.to_i.positive?
14
+ return map(&).sum_with_identity(identity) if block_given?
11
15
 
12
- if block_given?
13
- map(&block).sum_with_identity(identity)
14
- else
15
- compact.reduce(:+).to_f || identity.to_f
16
- end
16
+ compact.reduce(identity, :+).to_f
17
17
  end
18
18
  end
19
19
 
20
- module VectorExtensions
20
+ # @rbs skip
21
+ class Vector
22
+ EPSILON = 1e-10
23
+
24
+ # Cache magnitude since Vector is immutable after creation
25
+ # Note: We undefine the matrix gem's normalize method first, then redefine it
26
+ # to provide a more robust implementation that handles zero vectors
27
+ undef_method :normalize if method_defined?(:normalize)
28
+
21
29
  def magnitude
22
- sum_of_squares = 0.to_r
23
- size.times do |i|
24
- sum_of_squares += self[i]**2.to_r
30
+ # Cache magnitude since Vector is immutable after creation
31
+ @magnitude ||= begin
32
+ sum_of_squares = 0.to_r
33
+ size.times do |i|
34
+ sum_of_squares += self[i]**2.to_r
35
+ end
36
+ Math.sqrt(sum_of_squares.to_f)
25
37
  end
26
- Math.sqrt(sum_of_squares.to_f)
27
38
  end
28
39
 
29
40
  def normalize
41
+ magnitude_value = magnitude
42
+ return Vector[*Array.new(size, 0.0)] if magnitude_value <= 0.0
43
+
30
44
  normalized_values = []
31
- magnitude_value = magnitude.to_r
32
45
  size.times do |i|
33
46
  normalized_values << (self[i] / magnitude_value)
34
47
  end
@@ -36,10 +49,7 @@ module VectorExtensions
36
49
  end
37
50
  end
38
51
 
39
- class Vector
40
- include VectorExtensions
41
- end
42
-
52
+ # @rbs skip
43
53
  class Matrix
44
54
  def self.diag(diagonal_elements)
45
55
  Matrix.diagonal(*diagonal_elements)
@@ -61,14 +71,19 @@ class Matrix
61
71
 
62
72
  loop do
63
73
  iteration_count += 1
64
- (0...q_rotation_matrix.row_size - 1).each do |row|
65
- (1..q_rotation_matrix.row_size - 1).each do |col|
74
+ (0...(q_rotation_matrix.row_size - 1)).each do |row|
75
+ (1..(q_rotation_matrix.row_size - 1)).each do |col|
66
76
  next if row == col
67
77
 
68
- angle = Math.atan((2.to_r * q_rotation_matrix[row,
69
- col]) / (q_rotation_matrix[row,
70
- row] - q_rotation_matrix[col,
71
- col])) / 2.0
78
+ numerator = 2.0 * q_rotation_matrix[row, col]
79
+ denominator = q_rotation_matrix[row, row] - q_rotation_matrix[col, col]
80
+
81
+ angle = if denominator.abs < Vector::EPSILON
82
+ numerator >= 0 ? Math::PI / 4.0 : -Math::PI / 4.0
83
+ else
84
+ Math.atan(numerator / denominator) / 2.0
85
+ end
86
+
72
87
  cosine = Math.cos(angle)
73
88
  sine = Math.sin(angle)
74
89
  rotation_matrix = Matrix.identity(q_rotation_matrix.row_size)
@@ -92,11 +107,12 @@ class Matrix
92
107
  break if (sum_of_differences <= 0.001 && iteration_count > 1) || iteration_count >= max_sweeps
93
108
  end
94
109
 
95
- singular_values = []
96
- q_rotation_matrix.row_size.times do |r|
97
- singular_values << Math.sqrt(q_rotation_matrix[r, r].to_f)
110
+ singular_values = q_rotation_matrix.row_size.times.map do |r|
111
+ Math.sqrt([q_rotation_matrix[r, r].to_f, 0.0].max)
98
112
  end
99
- u_matrix = (row_size >= column_size ? self : trans) * v_matrix * Matrix.diagonal(*singular_values).inverse
113
+
114
+ safe_singular_values = singular_values.map { |v| [v, Vector::EPSILON].max }
115
+ u_matrix = (row_size >= column_size ? self : trans) * v_matrix * Matrix.diagonal(*safe_singular_values).inverse
100
116
  [u_matrix, v_matrix, singular_values]
101
117
  end
102
118
 
@@ -1,3 +1,5 @@
1
+ # rbs_inline: enabled
2
+
1
3
  # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
4
  # Copyright:: Copyright (c) 2005 Lucas Carlson
3
5
  # License:: LGPL
@@ -11,12 +13,14 @@ class String
11
13
  # E.g.,
12
14
  # "Hello (greeting's), with {braces} < >...?".without_punctuation
13
15
  # => "Hello greetings with braces "
16
+ # @rbs () -> String
14
17
  def without_punctuation
15
- tr(',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', ' ').tr("'\-", '')
18
+ tr(',?.!;:"@#$%^&*()_=+[]{}|<>/`~', ' ').tr("'-", '')
16
19
  end
17
20
 
18
21
  # Return a Hash of symbols => ints. Each word in the string is stemmed,
19
22
  # interned, and mapped to its frequency in the document.
23
+ # @rbs () -> Hash[Symbol, Integer]
20
24
  def word_hash
21
25
  word_hash = clean_word_hash
22
26
  symbol_hash = word_hash_for_symbols(gsub(/\w/, ' ').split)
@@ -24,12 +28,14 @@ class String
24
28
  end
25
29
 
26
30
  # Return a word hash without extra punctuation or short symbols, just stemmed words
31
+ # @rbs () -> Hash[Symbol, Integer]
27
32
  def clean_word_hash
28
33
  word_hash_for_words gsub(/[^\w\s]/, '').split
29
34
  end
30
35
 
31
36
  private
32
37
 
38
+ # @rbs (Array[String]) -> Hash[Symbol, Integer]
33
39
  def word_hash_for_words(words)
34
40
  d = Hash.new(0)
35
41
  words.each do |word|
@@ -39,6 +45,7 @@ class String
39
45
  d
40
46
  end
41
47
 
48
+ # @rbs (Array[String]) -> Hash[Symbol, Integer]
42
49
  def word_hash_for_symbols(words)
43
50
  d = Hash.new(0)
44
51
  words.each do |word|