lda-ruby 0.3.9 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54):
  1. checksums.yaml +5 -13
  2. data/CHANGELOG.md +16 -0
  3. data/Gemfile +9 -0
  4. data/README.md +126 -3
  5. data/VERSION.yml +3 -3
  6. data/docs/modernization-handoff.md +233 -0
  7. data/docs/porting-strategy.md +148 -0
  8. data/docs/precompiled-platform-policy.md +81 -0
  9. data/docs/precompiled-target-evaluation.md +67 -0
  10. data/docs/release-runbook.md +192 -0
  11. data/docs/rust-orchestration-guardrails.md +50 -0
  12. data/ext/lda-ruby/cokus.c +10 -11
  13. data/ext/lda-ruby/cokus.h +3 -3
  14. data/ext/lda-ruby/extconf.rb +10 -6
  15. data/ext/lda-ruby/lda-inference.c +23 -7
  16. data/ext/lda-ruby/utils.c +8 -0
  17. data/ext/lda-ruby-rust/Cargo.toml +12 -0
  18. data/ext/lda-ruby-rust/README.md +73 -0
  19. data/ext/lda-ruby-rust/extconf.rb +135 -0
  20. data/ext/lda-ruby-rust/include/strings.h +35 -0
  21. data/ext/lda-ruby-rust/src/lib.rs +1263 -0
  22. data/lda-ruby.gemspec +0 -0
  23. data/lib/lda-ruby/backends/base.rb +133 -0
  24. data/lib/lda-ruby/backends/native.rb +158 -0
  25. data/lib/lda-ruby/backends/pure_ruby.rb +675 -0
  26. data/lib/lda-ruby/backends/rust.rb +607 -0
  27. data/lib/lda-ruby/backends.rb +58 -0
  28. data/lib/lda-ruby/corpus/corpus.rb +17 -15
  29. data/lib/lda-ruby/corpus/data_corpus.rb +2 -2
  30. data/lib/lda-ruby/corpus/directory_corpus.rb +2 -2
  31. data/lib/lda-ruby/corpus/text_corpus.rb +2 -2
  32. data/lib/lda-ruby/document/document.rb +6 -6
  33. data/lib/lda-ruby/document/text_document.rb +5 -4
  34. data/lib/lda-ruby/rust_build_policy.rb +21 -0
  35. data/lib/lda-ruby/version.rb +5 -0
  36. data/lib/lda-ruby.rb +293 -48
  37. data/test/backend_compatibility_test.rb +146 -0
  38. data/test/backends_selection_test.rb +100 -0
  39. data/test/benchmark_scripts_test.rb +23 -0
  40. data/test/gemspec_test.rb +27 -0
  41. data/test/lda_ruby_test.rb +49 -11
  42. data/test/packaged_gem_smoke_test.rb +33 -0
  43. data/test/pure_ruby_orchestration_test.rb +109 -0
  44. data/test/release_scripts_test.rb +93 -0
  45. data/test/rust_build_policy_test.rb +23 -0
  46. data/test/rust_orchestration_test.rb +911 -0
  47. data/test/simple_pipeline_test.rb +22 -0
  48. data/test/simple_yaml.rb +1 -7
  49. data/test/test_helper.rb +5 -6
  50. metadata +54 -38
  51. data/Rakefile +0 -61
  52. data/ext/lda-ruby/Makefile +0 -181
  53. data/test/data/.gitignore +0 -2
  54. data/test/simple_test.rb +0 -26
@@ -0,0 +1,675 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Lda
4
+ module Backends
5
+ class PureRuby < Base
6
+ MIN_PROBABILITY = 1e-12
7
+
8
+ def initialize(random_seed: nil)
9
+ super(random_seed: random_seed)
10
+ @beta_probabilities = nil
11
+ @beta_log = nil
12
+ @gamma = nil
13
+ @phi = nil
14
+ @topic_weights_kernel = nil
15
+ @topic_term_accumulator_kernel = nil
16
+ @document_inference_kernel = nil
17
+ @corpus_iteration_kernel = nil
18
+ @topic_term_finalizer_kernel = nil
19
+ @gamma_shift_kernel = nil
20
+ @topic_document_probability_kernel = nil
21
+ @topic_term_seed_kernel = nil
22
+ @trusted_kernel_outputs = false
23
+ end
24
+
25
+ attr_writer :topic_weights_kernel,
26
+ :topic_term_accumulator_kernel,
27
+ :document_inference_kernel,
28
+ :corpus_iteration_kernel,
29
+ :topic_term_finalizer_kernel,
30
+ :gamma_shift_kernel,
31
+ :topic_document_probability_kernel,
32
+ :topic_term_seed_kernel,
33
+ :trusted_kernel_outputs
34
+
35
+ def name
36
+ "pure_ruby"
37
+ end
38
+
39
+ def corpus=(corpus)
40
+ super
41
+ @beta_probabilities = nil
42
+ @beta_log = nil
43
+ @gamma = nil
44
+ @phi = nil
45
+ true
46
+ end
47
+
48
+ def em(start)
49
+ em_input = build_em_input(start)
50
+ return nil if em_input.nil?
51
+
52
+ run_em_iterations(em_input)
53
+ nil
54
+ end
55
+
56
+ # Returns an EM input snapshot that can be reused by Rust orchestration
57
+ # and Ruby fallback paths without re-sampling random initialization.
58
+ def rust_em_input(start)
59
+ build_em_input(start)
60
+ end
61
+
62
+ # Returns only the initial beta matrix for Rust compatibility paths that
63
+ # already hold a cached corpus snapshot.
64
+ def rust_initial_beta_probabilities(start, document_words, document_counts, topics, terms)
65
+ start_mode = start.to_s
66
+
67
+ if start_mode.strip.casecmp("seeded").zero? || start_mode.strip.casecmp("deterministic").zero?
68
+ seeded_topic_term_probabilities(
69
+ Integer(topics),
70
+ Integer(terms),
71
+ document_words,
72
+ document_counts
73
+ )
74
+ else
75
+ initial_topic_term_probabilities(Integer(topics), Integer(terms))
76
+ end
77
+ end
78
+
79
+ def em_from_input(em_input)
80
+ return nil if em_input.nil?
81
+
82
+ run_em_iterations(em_input)
83
+ nil
84
+ end
85
+
86
+ def apply_em_state(beta_probabilities:, beta_log:, gamma:, phi:)
87
+ @beta_probabilities = beta_probabilities
88
+ @beta_log = beta_log
89
+ @gamma = gamma
90
+ @phi = phi
91
+
92
+ nil
93
+ end
94
+
95
+ def beta
96
+ @beta_log || []
97
+ end
98
+
99
+ def gamma
100
+ @gamma || []
101
+ end
102
+
103
+ def compute_phi
104
+ clone_matrix(@phi || [])
105
+ end
106
+
107
+ def model
108
+ [Integer(num_topics), max_term_index + 1, Float(init_alpha)]
109
+ end
110
+
111
+ def topic_document_probability(phi_matrix, document_counts)
112
+ kernel_output = nil
113
+ if @topic_document_probability_kernel
114
+ kernel_output = @topic_document_probability_kernel.call(
115
+ phi_matrix,
116
+ document_counts,
117
+ Integer(num_topics),
118
+ MIN_PROBABILITY
119
+ )
120
+ end
121
+
122
+ if valid_topic_document_probability_output?(kernel_output, document_counts.size, Integer(num_topics))
123
+ if @trusted_kernel_outputs
124
+ kernel_output
125
+ else
126
+ kernel_output.map { |row| row.map(&:to_f) }
127
+ end
128
+ else
129
+ default_topic_document_probability(phi_matrix, document_counts)
130
+ end
131
+ rescue StandardError
132
+ default_topic_document_probability(phi_matrix, document_counts)
133
+ end
134
+
135
+ private
136
+
137
+ def build_em_input(start)
138
+ return nil if @corpus.nil? || @corpus.num_docs.zero?
139
+
140
+ topics = Integer(num_topics)
141
+ raise ArgumentError, "num_topics must be greater than zero" if topics <= 0
142
+
143
+ terms = max_term_index + 1
144
+ raise ArgumentError, "corpus must contain terms" if terms <= 0
145
+
146
+ document_words = @corpus.documents.map { |document| document.words.map(&:to_i) }
147
+ document_counts = @corpus.documents.map { |document| document.counts.map(&:to_f) }
148
+
149
+ {
150
+ topics: topics,
151
+ terms: terms,
152
+ document_words: document_words,
153
+ document_counts: document_counts,
154
+ document_totals: document_counts.map { |counts| counts.sum.to_f },
155
+ document_lengths: document_words.map(&:length),
156
+ initial_beta_probabilities: rust_initial_beta_probabilities(
157
+ start,
158
+ document_words,
159
+ document_counts,
160
+ topics,
161
+ terms
162
+ ),
163
+ min_probability: MIN_PROBABILITY
164
+ }
165
+ end
166
+
167
+ def run_em_iterations(em_input)
168
+ topics = em_input.fetch(:topics)
169
+ terms = em_input.fetch(:terms)
170
+ document_words = em_input.fetch(:document_words)
171
+ document_counts = em_input.fetch(:document_counts)
172
+ document_totals = em_input.fetch(:document_totals)
173
+ document_lengths = em_input.fetch(:document_lengths)
174
+
175
+ @beta_probabilities = em_input.fetch(:initial_beta_probabilities)
176
+ previous_gamma = nil
177
+
178
+ Integer(em_max_iter).times do
179
+ if @trusted_kernel_outputs && @corpus_iteration_kernel
180
+ current_gamma, current_phi, topic_term_counts = infer_corpus_iteration(
181
+ nil,
182
+ document_words,
183
+ document_counts,
184
+ document_totals,
185
+ document_lengths,
186
+ topics,
187
+ terms
188
+ )
189
+ else
190
+ topic_term_counts = Array.new(topics) { Array.new(terms, MIN_PROBABILITY) }
191
+ current_gamma, current_phi, topic_term_counts = infer_corpus_iteration(
192
+ topic_term_counts,
193
+ document_words,
194
+ document_counts,
195
+ document_totals,
196
+ document_lengths,
197
+ topics,
198
+ terms
199
+ )
200
+ end
201
+
202
+ @beta_probabilities, @beta_log = finalize_topic_term_counts(topic_term_counts)
203
+ @gamma = current_gamma
204
+ @phi = current_phi
205
+
206
+ break if previous_gamma && average_gamma_shift(previous_gamma, current_gamma) <= Float(em_convergence)
207
+
208
+ previous_gamma = current_gamma
209
+ end
210
+ end
211
+
212
+ def max_term_index
213
+ return -1 if @corpus.nil? || @corpus.documents.empty?
214
+
215
+ @corpus.documents
216
+ .flat_map(&:words)
217
+ .max || -1
218
+ end
219
+
220
+ def initial_topic_term_probabilities(topics, terms)
221
+ Array.new(topics) do
222
+ weights = Array.new(terms) { @random.rand + MIN_PROBABILITY }
223
+ normalize!(weights)
224
+ end
225
+ end
226
+
227
+ def seeded_topic_term_probabilities(topics, terms, document_words, document_counts)
228
+ kernel_output = nil
229
+ if @topic_term_seed_kernel
230
+ kernel_output = @topic_term_seed_kernel.call(
231
+ document_words,
232
+ document_counts,
233
+ Integer(topics),
234
+ Integer(terms),
235
+ MIN_PROBABILITY
236
+ )
237
+ end
238
+
239
+ if valid_seeded_topic_term_probabilities?(kernel_output, topics, terms)
240
+ if @trusted_kernel_outputs
241
+ kernel_output
242
+ else
243
+ kernel_output.map { |weights| normalize!(weights.map(&:to_f)) }
244
+ end
245
+ else
246
+ default_seeded_topic_term_probabilities(topics, terms, document_words, document_counts)
247
+ end
248
+ rescue StandardError
249
+ default_seeded_topic_term_probabilities(topics, terms, document_words, document_counts)
250
+ end
251
+
252
+ def valid_seeded_topic_term_probabilities?(matrix, expected_topics, expected_terms)
253
+ return false unless matrix.is_a?(Array)
254
+ return false unless matrix.size == expected_topics
255
+
256
+ matrix.each do |row|
257
+ return false unless row.is_a?(Array)
258
+ return false unless row.size == expected_terms
259
+ row.each do |value|
260
+ return false unless value.is_a?(Numeric)
261
+ return false unless value.finite?
262
+ end
263
+ end
264
+
265
+ true
266
+ end
267
+
268
+ def default_seeded_topic_term_probabilities(topics, terms, document_words, document_counts)
269
+ topic_term_counts = Array.new(topics) { Array.new(terms, MIN_PROBABILITY) }
270
+
271
+ document_words.each_with_index do |words, document_index|
272
+ topic_index = document_index % topics
273
+ counts = document_counts[document_index] || []
274
+
275
+ words.each_with_index do |word_index, word_offset|
276
+ next if word_index >= terms
277
+
278
+ topic_term_counts[topic_index][word_index] += counts[word_offset].to_f
279
+ end
280
+ end
281
+
282
+ topic_term_counts.map { |weights| normalize!(weights) }
283
+ end
284
+
285
+ def topic_weights_for_word(word_index, gamma_d)
286
+ kernel_weights = nil
287
+ if @topic_weights_kernel
288
+ kernel_weights = @topic_weights_kernel.call(@beta_probabilities, gamma_d, Integer(word_index), MIN_PROBABILITY)
289
+ end
290
+
291
+ weights =
292
+ if valid_topic_weights?(kernel_weights, gamma_d.length)
293
+ kernel_weights.map(&:to_f)
294
+ else
295
+ default_topic_weights_for_word(word_index, gamma_d)
296
+ end
297
+
298
+ normalize!(weights)
299
+ rescue StandardError
300
+ normalize!(default_topic_weights_for_word(word_index, gamma_d))
301
+ end
302
+
303
+ def valid_topic_weights?(weights, expected_size)
304
+ weights.is_a?(Array) && weights.size == expected_size
305
+ end
306
+
307
+ def default_topic_weights_for_word(word_index, gamma_d)
308
+ topics = gamma_d.length
309
+
310
+ Array.new(topics) do |topic_index|
311
+ @beta_probabilities[topic_index][word_index] * [gamma_d[topic_index], MIN_PROBABILITY].max
312
+ end
313
+ end
314
+
315
+ def infer_document(gamma_initial, phi_initial, words, counts)
316
+ kernel_output = nil
317
+
318
+ if @document_inference_kernel
319
+ kernel_output = @document_inference_kernel.call(
320
+ @beta_probabilities,
321
+ gamma_initial,
322
+ words.map(&:to_i),
323
+ counts.map(&:to_f),
324
+ Integer(max_iter),
325
+ Float(convergence),
326
+ MIN_PROBABILITY,
327
+ Float(init_alpha)
328
+ )
329
+ end
330
+
331
+ if valid_document_inference_output?(kernel_output, gamma_initial.length, phi_initial.length)
332
+ if @trusted_kernel_outputs
333
+ [kernel_output[0], kernel_output[1]]
334
+ else
335
+ gamma_out = kernel_output[0].map(&:to_f)
336
+ phi_out = kernel_output[1].map { |row| normalize!(row.map(&:to_f)) }
337
+ [gamma_out, phi_out]
338
+ end
339
+ else
340
+ default_infer_document(gamma_initial, phi_initial, words, counts)
341
+ end
342
+ rescue StandardError
343
+ default_infer_document(gamma_initial, phi_initial, words, counts)
344
+ end
345
+
346
+ def valid_document_inference_output?(output, expected_topics, expected_length)
347
+ return false unless output.is_a?(Array)
348
+ return false unless output.size == 2
349
+
350
+ gamma_out = output[0]
351
+ phi_out = output[1]
352
+
353
+ return false unless gamma_out.is_a?(Array) && gamma_out.size == expected_topics
354
+ return false unless phi_out.is_a?(Array) && phi_out.size == expected_length
355
+
356
+ phi_out.all? { |row| row.is_a?(Array) && row.size == expected_topics }
357
+ end
358
+
359
+ def default_infer_document(gamma_initial, phi_initial, words, counts)
360
+ topics = gamma_initial.length
361
+ gamma_d = gamma_initial.dup
362
+ phi_d = phi_initial
363
+
364
+ Integer(max_iter).times do
365
+ gamma_next = Array.new(topics, Float(init_alpha))
366
+
367
+ words.each_with_index do |word_index, word_offset|
368
+ topic_weights = topic_weights_for_word(word_index, gamma_d)
369
+ phi_d[word_offset] = topic_weights
370
+
371
+ count = counts[word_offset].to_f
372
+ topics.times do |topic_index|
373
+ gamma_next[topic_index] += count * topic_weights[topic_index]
374
+ end
375
+ end
376
+
377
+ gamma_shift = max_absolute_distance(gamma_d, gamma_next)
378
+ gamma_d = gamma_next
379
+ break if gamma_shift <= Float(convergence)
380
+ end
381
+
382
+ [gamma_d, phi_d]
383
+ end
384
+
385
+ def infer_corpus_iteration(
386
+ topic_term_counts_initial,
387
+ document_words,
388
+ document_counts,
389
+ document_totals,
390
+ document_lengths,
391
+ topics,
392
+ terms
393
+ )
394
+ topic_term_counts_fallback =
395
+ topic_term_counts_initial || Array.new(topics) { Array.new(terms, MIN_PROBABILITY) }
396
+ kernel_output = nil
397
+
398
+ if @corpus_iteration_kernel
399
+ kernel_output = @corpus_iteration_kernel.call(
400
+ @beta_probabilities,
401
+ document_words,
402
+ document_counts,
403
+ Integer(max_iter),
404
+ Float(convergence),
405
+ MIN_PROBABILITY,
406
+ Float(init_alpha)
407
+ )
408
+ end
409
+
410
+ if valid_corpus_iteration_output?(kernel_output, document_words.size, document_lengths, topics, terms)
411
+ if @trusted_kernel_outputs
412
+ [kernel_output[0], kernel_output[1], kernel_output[2]]
413
+ else
414
+ current_gamma = kernel_output[0].map { |row| row.map(&:to_f) }
415
+ current_phi = kernel_output[1].map do |doc_phi|
416
+ doc_phi.map { |row| normalize!(row.map(&:to_f)) }
417
+ end
418
+ topic_term_counts = kernel_output[2].map { |row| row.map(&:to_f) }
419
+
420
+ [current_gamma, current_phi, topic_term_counts]
421
+ end
422
+ else
423
+ default_infer_corpus_iteration(
424
+ topic_term_counts_fallback,
425
+ document_words,
426
+ document_counts,
427
+ document_totals,
428
+ topics
429
+ )
430
+ end
431
+ rescue StandardError
432
+ default_infer_corpus_iteration(
433
+ topic_term_counts_fallback,
434
+ document_words,
435
+ document_counts,
436
+ document_totals,
437
+ topics
438
+ )
439
+ end
440
+
441
+ def valid_corpus_iteration_output?(output, expected_docs, expected_lengths, expected_topics, expected_terms)
442
+ return false unless output.is_a?(Array)
443
+ return false unless output.size == 3
444
+
445
+ gamma_matrix = output[0]
446
+ phi_tensor = output[1]
447
+ topic_term_counts = output[2]
448
+
449
+ return false unless gamma_matrix.is_a?(Array) && gamma_matrix.size == expected_docs
450
+ return false unless phi_tensor.is_a?(Array) && phi_tensor.size == expected_docs
451
+ return false unless topic_term_counts.is_a?(Array) && topic_term_counts.size == expected_topics
452
+
453
+ gamma_matrix.each do |row|
454
+ return false unless row.is_a?(Array) && row.size == expected_topics
455
+ end
456
+
457
+ phi_tensor.each_with_index do |doc_phi, index|
458
+ return false unless doc_phi.is_a?(Array) && doc_phi.size == expected_lengths[index]
459
+ doc_phi.each do |row|
460
+ return false unless row.is_a?(Array) && row.size == expected_topics
461
+ end
462
+ end
463
+
464
+ topic_term_counts.each do |row|
465
+ return false unless row.is_a?(Array) && row.size == expected_terms
466
+ end
467
+
468
+ true
469
+ end
470
+
471
+ def default_infer_corpus_iteration(
472
+ topic_term_counts_initial,
473
+ document_words,
474
+ document_counts,
475
+ document_totals,
476
+ topics
477
+ )
478
+ topic_term_counts = topic_term_counts_initial
479
+ current_gamma = Array.new(document_words.size) { Array.new(topics, Float(init_alpha)) }
480
+ current_phi = Array.new(document_words.size)
481
+
482
+ document_words.each_with_index do |words, document_index|
483
+ counts = document_counts[document_index]
484
+ total = document_totals[document_index].to_f
485
+
486
+ gamma_d = Array.new(topics, Float(init_alpha) + (total / topics))
487
+ phi_d = Array.new(words.length) { Array.new(topics, 1.0 / topics) }
488
+
489
+ gamma_d, phi_d = infer_document(gamma_d, phi_d, words, counts)
490
+
491
+ current_gamma[document_index] = gamma_d
492
+ current_phi[document_index] = phi_d
493
+ topic_term_counts = accumulate_topic_term_counts(topic_term_counts, phi_d, words, counts)
494
+ end
495
+
496
+ [current_gamma, current_phi, topic_term_counts]
497
+ end
498
+
499
+ def accumulate_topic_term_counts(topic_term_counts, phi_d, words, counts)
500
+ kernel_counts = nil
501
+ if @topic_term_accumulator_kernel
502
+ kernel_counts = @topic_term_accumulator_kernel.call(
503
+ topic_term_counts,
504
+ phi_d,
505
+ words.map(&:to_i),
506
+ counts.map(&:to_f)
507
+ )
508
+ end
509
+
510
+ if valid_topic_term_counts?(kernel_counts, topic_term_counts)
511
+ kernel_counts
512
+ else
513
+ default_accumulate_topic_term_counts(topic_term_counts, phi_d, words, counts)
514
+ end
515
+ rescue StandardError
516
+ default_accumulate_topic_term_counts(topic_term_counts, phi_d, words, counts)
517
+ end
518
+
519
+ def valid_topic_term_counts?(candidate, reference)
520
+ return false unless candidate.is_a?(Array)
521
+ return false unless candidate.size == reference.size
522
+
523
+ candidate.each_with_index do |row, index|
524
+ return false unless row.is_a?(Array)
525
+ return false unless row.size == reference[index].size
526
+ end
527
+
528
+ true
529
+ end
530
+
531
+ def default_accumulate_topic_term_counts(topic_term_counts, phi_d, words, counts)
532
+ topics = topic_term_counts.size
533
+
534
+ words.each_with_index do |word_index, word_offset|
535
+ count = counts[word_offset].to_f
536
+ next if count.zero?
537
+
538
+ topics.times do |topic_index|
539
+ topic_term_counts[topic_index][word_index] += count * phi_d[word_offset][topic_index]
540
+ end
541
+ end
542
+
543
+ topic_term_counts
544
+ end
545
+
546
+ def finalize_topic_term_counts(topic_term_counts)
547
+ kernel_output = nil
548
+ if @topic_term_finalizer_kernel
549
+ kernel_output = @topic_term_finalizer_kernel.call(topic_term_counts, MIN_PROBABILITY)
550
+ end
551
+
552
+ if valid_topic_term_finalization_output?(kernel_output, topic_term_counts)
553
+ if @trusted_kernel_outputs
554
+ [kernel_output[0], kernel_output[1]]
555
+ else
556
+ beta_probabilities = kernel_output[0].map { |row| row.map(&:to_f) }
557
+ beta_log = kernel_output[1].map { |row| row.map(&:to_f) }
558
+ [beta_probabilities, beta_log]
559
+ end
560
+ else
561
+ default_finalize_topic_term_counts(topic_term_counts)
562
+ end
563
+ rescue StandardError
564
+ default_finalize_topic_term_counts(topic_term_counts)
565
+ end
566
+
567
+ def valid_topic_term_finalization_output?(output, topic_term_counts)
568
+ return false unless output.is_a?(Array)
569
+ return false unless output.size == 2
570
+
571
+ beta_probabilities = output[0]
572
+ beta_log = output[1]
573
+ return false unless beta_probabilities.is_a?(Array) && beta_log.is_a?(Array)
574
+ return false unless beta_probabilities.size == topic_term_counts.size
575
+ return false unless beta_log.size == topic_term_counts.size
576
+
577
+ beta_probabilities.each_with_index do |row, index|
578
+ return false unless row.is_a?(Array)
579
+ return false unless row.size == topic_term_counts[index].size
580
+ end
581
+
582
+ beta_log.each_with_index do |row, index|
583
+ return false unless row.is_a?(Array)
584
+ return false unless row.size == topic_term_counts[index].size
585
+ end
586
+
587
+ true
588
+ end
589
+
590
+ def default_finalize_topic_term_counts(topic_term_counts)
591
+ beta_probabilities = topic_term_counts.map { |weights| normalize!(weights) }
592
+ beta_log = beta_probabilities.map do |topic_weights|
593
+ topic_weights.map { |probability| Math.log([probability, MIN_PROBABILITY].max) }
594
+ end
595
+
596
+ [beta_probabilities, beta_log]
597
+ end
598
+
599
+ def valid_topic_document_probability_output?(output, expected_docs, expected_topics)
600
+ return false unless output.is_a?(Array)
601
+ return false unless output.size == expected_docs
602
+
603
+ output.each do |row|
604
+ return false unless row.is_a?(Array)
605
+ return false unless row.size == expected_topics
606
+ row.each do |value|
607
+ return false unless value.is_a?(Numeric)
608
+ return false unless value.finite?
609
+ end
610
+ end
611
+
612
+ true
613
+ end
614
+
615
+ def default_topic_document_probability(phi_matrix, document_counts)
616
+ topics = Integer(num_topics)
617
+ output = []
618
+
619
+ document_counts.each_with_index do |counts, doc_index|
620
+ tops = Array.new(topics, 0.0)
621
+ ttl = counts.inject(0.0) { |sum, value| sum + value.to_f }
622
+ doc_phi = phi_matrix[doc_index] || []
623
+
624
+ doc_phi.each_with_index do |word_dist, word_idx|
625
+ count = counts[word_idx].to_f
626
+ next if count.zero?
627
+
628
+ topics.times do |topic_idx|
629
+ top_prob = word_dist[topic_idx].to_f
630
+ tops[topic_idx] += Math.log([top_prob, MIN_PROBABILITY].max) * count
631
+ end
632
+ end
633
+
634
+ tops = tops.map { |value| value / ttl } if ttl.positive?
635
+ output << tops
636
+ end
637
+
638
+ output
639
+ end
640
+
641
+ def max_absolute_distance(left, right)
642
+ left.zip(right).map { |a, b| (a - b).abs }.max.to_f
643
+ end
644
+
645
+ def average_gamma_shift(previous_gamma, current_gamma)
646
+ kernel_shift = nil
647
+ if @gamma_shift_kernel
648
+ kernel_shift = @gamma_shift_kernel.call(previous_gamma, current_gamma)
649
+ end
650
+
651
+ if kernel_shift.is_a?(Numeric) && kernel_shift.finite? && kernel_shift >= 0.0
652
+ kernel_shift.to_f
653
+ else
654
+ default_average_gamma_shift(previous_gamma, current_gamma)
655
+ end
656
+ rescue StandardError
657
+ default_average_gamma_shift(previous_gamma, current_gamma)
658
+ end
659
+
660
+ def default_average_gamma_shift(previous_gamma, current_gamma)
661
+ deltas = []
662
+
663
+ previous_gamma.each_with_index do |previous_row, row_index|
664
+ previous_row.each_with_index do |previous_value, col_index|
665
+ deltas << (previous_value - current_gamma[row_index][col_index]).abs
666
+ end
667
+ end
668
+
669
+ return 0.0 if deltas.empty?
670
+
671
+ deltas.sum / deltas.size.to_f
672
+ end
673
+ end
674
+ end
675
+ end