lda-ruby 0.4.0-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +61 -0
  3. data/Gemfile +9 -0
  4. data/README.md +157 -0
  5. data/VERSION.yml +5 -0
  6. data/docs/modernization-handoff.md +190 -0
  7. data/docs/porting-strategy.md +127 -0
  8. data/docs/precompiled-platform-policy.md +68 -0
  9. data/docs/release-runbook.md +157 -0
  10. data/ext/lda-ruby/cokus.c +145 -0
  11. data/ext/lda-ruby/cokus.h +27 -0
  12. data/ext/lda-ruby/extconf.rb +13 -0
  13. data/ext/lda-ruby/lda-alpha.c +96 -0
  14. data/ext/lda-ruby/lda-alpha.h +21 -0
  15. data/ext/lda-ruby/lda-data.c +67 -0
  16. data/ext/lda-ruby/lda-data.h +14 -0
  17. data/ext/lda-ruby/lda-inference.c +1023 -0
  18. data/ext/lda-ruby/lda-inference.h +63 -0
  19. data/ext/lda-ruby/lda-model.c +345 -0
  20. data/ext/lda-ruby/lda-model.h +31 -0
  21. data/ext/lda-ruby/lda.h +54 -0
  22. data/ext/lda-ruby/utils.c +111 -0
  23. data/ext/lda-ruby/utils.h +18 -0
  24. data/ext/lda-ruby-rust/Cargo.toml +12 -0
  25. data/ext/lda-ruby-rust/README.md +48 -0
  26. data/ext/lda-ruby-rust/extconf.rb +123 -0
  27. data/ext/lda-ruby-rust/src/lib.rs +456 -0
  28. data/lda-ruby.gemspec +78 -0
  29. data/lib/lda-ruby/backends/base.rb +129 -0
  30. data/lib/lda-ruby/backends/native.rb +158 -0
  31. data/lib/lda-ruby/backends/pure_ruby.rb +613 -0
  32. data/lib/lda-ruby/backends/rust.rb +226 -0
  33. data/lib/lda-ruby/backends.rb +58 -0
  34. data/lib/lda-ruby/config/stopwords.yml +571 -0
  35. data/lib/lda-ruby/corpus/corpus.rb +45 -0
  36. data/lib/lda-ruby/corpus/data_corpus.rb +22 -0
  37. data/lib/lda-ruby/corpus/directory_corpus.rb +25 -0
  38. data/lib/lda-ruby/corpus/text_corpus.rb +27 -0
  39. data/lib/lda-ruby/document/data_document.rb +30 -0
  40. data/lib/lda-ruby/document/document.rb +40 -0
  41. data/lib/lda-ruby/document/text_document.rb +39 -0
  42. data/lib/lda-ruby/lda.so +0 -0
  43. data/lib/lda-ruby/rust_build_policy.rb +21 -0
  44. data/lib/lda-ruby/version.rb +5 -0
  45. data/lib/lda-ruby/vocabulary.rb +46 -0
  46. data/lib/lda-ruby.rb +413 -0
  47. data/lib/lda_ruby_rust.so +0 -0
  48. data/license.txt +504 -0
  49. data/test/backend_compatibility_test.rb +146 -0
  50. data/test/backends_selection_test.rb +100 -0
  51. data/test/data/docs.dat +46 -0
  52. data/test/data/sample.rb +20 -0
  53. data/test/data/wiki-test-docs.yml +123 -0
  54. data/test/gemspec_test.rb +27 -0
  55. data/test/lda_ruby_test.rb +319 -0
  56. data/test/packaged_gem_smoke_test.rb +33 -0
  57. data/test/release_scripts_test.rb +54 -0
  58. data/test/rust_build_policy_test.rb +23 -0
  59. data/test/simple_pipeline_test.rb +22 -0
  60. data/test/simple_yaml.rb +17 -0
  61. data/test/test_helper.rb +10 -0
  62. metadata +111 -0
@@ -0,0 +1,129 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Lda
4
+ module Backends
5
# Common configuration state and interface shared by the LDA backends.
#
# Stores the tunable settings as plain accessors and provides settings-file
# parsing. The computational entry points (#em, #beta, #gamma, #compute_phi,
# #model) raise NotImplementedError here and must be supplied by subclasses.
class Base
  # @return [Object, nil] the corpus last assigned through #corpus=
  attr_reader :corpus

  attr_accessor :max_iter,
                :convergence,
                :em_max_iter,
                :em_convergence,
                :num_topics,
                :init_alpha,
                :est_alpha,
                :verbose

  # @param random_seed [Integer, nil] seed for the backend's Random
  #   instance; when nil an unseeded Random is created
  def initialize(random_seed: nil)
    # NOTE(review): @random is not read anywhere in this class; presumably
    # it is consumed by subclasses -- confirm.
    @random = random_seed.nil? ? Random.new : Random.new(random_seed)

    @max_iter = 20
    @convergence = 1e-6
    @em_max_iter = 100
    @em_convergence = 1e-4
    @num_topics = 20
    @init_alpha = 0.3
    @est_alpha = 1
    @verbose = true

    @corpus = nil
  end

  # @return [String] the unqualified class name in lower case
  #   (e.g. "base" for Lda::Backends::Base)
  def name
    self.class.name.split("::").last.downcase
  end

  # Stores the corpus on the backend.
  #
  # @param corpus [Object] corpus to keep
  # @return [true] when invoked via +send+; plain assignment syntax
  #   evaluates to the assigned value, as always in Ruby
  def corpus=(corpus)
    @corpus = corpus
    true
  end

  # Builds an Lda::DataCorpus from +filename+ and assigns it as the corpus.
  #
  # @param filename [String] path handed to Lda::DataCorpus.new
  def fast_load_corpus_from_file(filename)
    self.corpus = Lda::DataCorpus.new(filename)
  end

  # Reads "key value" pairs from a settings file and applies them.
  #
  # Blank lines and lines whose first non-blank character is "#" are
  # skipped. Keys are matched case-insensitively; unrecognised keys and
  # keys without a value are ignored. Numeric values are converted with
  # to_i / to_f, so unparsable text becomes 0 / 0.0.
  #
  # @param settings_file [String] path of the file to read
  # @return [true]
  def load_settings(settings_file)
    File.foreach(settings_file) do |line|
      # Strip before splitting: leading indentation would otherwise produce
      # an empty key, and the trailing newline would turn a value-less key
      # into an empty (non-nil) value that resets the setting to zero.
      entry = line.strip
      next if entry.empty? || entry.start_with?("#")

      key, value = entry.split(/\s+/, 2)
      next if value.nil?

      case key.downcase
      when "max_iter", "var_max_iter"
        self.max_iter = value.to_i
      when "convergence", "var_converged"
        self.convergence = value.to_f
      when "em_max_iter"
        self.em_max_iter = value.to_i
      when "em_convergence", "em_converged"
        self.em_convergence = value.to_f
      when "num_topics", "ntopics"
        self.num_topics = value.to_i
      when "init_alpha", "initial_alpha", "alpha"
        self.init_alpha = value.to_f
      when "est_alpha", "estimate_alpha"
        self.est_alpha = value.to_i
      when "verbose"
        # Any non-zero integer enables verbose mode.
        self.verbose = value.to_i != 0
      end
    end

    true
  end

  # Assigns all seven tunable settings in one call (verbose is untouched).
  #
  # @return [true]
  def set_config(init_alpha, num_topics, max_iter, convergence, em_max_iter, em_convergence, est_alpha)
    self.init_alpha = init_alpha
    self.num_topics = num_topics
    self.max_iter = max_iter
    self.convergence = convergence
    self.em_max_iter = em_max_iter
    self.em_convergence = em_convergence
    self.est_alpha = est_alpha
    true
  end

  # @raise [NotImplementedError] always; subclasses must override
  def em(_start)
    raise NotImplementedError, "#{self.class} must implement #em"
  end

  # @raise [NotImplementedError] always; subclasses must override
  def beta
    raise NotImplementedError, "#{self.class} must implement #beta"
  end

  # @raise [NotImplementedError] always; subclasses must override
  def gamma
    raise NotImplementedError, "#{self.class} must implement #gamma"
  end

  # @raise [NotImplementedError] always; subclasses must override
  def compute_phi
    raise NotImplementedError, "#{self.class} must implement #compute_phi"
  end

  # @raise [NotImplementedError] always; subclasses must override
  def model
    raise NotImplementedError, "#{self.class} must implement #model"
  end

  # Default implementation: ignores both arguments.
  #
  # @return [nil]
  def topic_document_probability(_phi_matrix, _document_counts)
    nil
  end

  private

  # Rescales +weights+ in place so that the entries sum to 1.
  #
  # When the sum is zero or negative, every entry is replaced by
  # 1.0 / weights.size instead.
  #
  # @param weights [Array<Numeric>] mutated in place
  # @return [Array<Float>] the same array
  def normalize!(weights)
    total = weights.sum.to_f

    if total <= 0.0
      uniform = 1.0 / weights.size
      weights.map! { uniform }
      return weights
    end

    weights.map! { |w| w / total }
  end

  # Deep-copies +matrix+ through a Marshal dump/load round trip, so the
  # argument must be marshalable.
  def clone_matrix(matrix)
    Marshal.load(Marshal.dump(matrix))
  end
end
128
+ end
129
+ end
@@ -0,0 +1,158 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Lda
4
+ module Backends
5
# Backend that forwards every operation to "__native_*" methods defined on
# a host object. Hooks are invoked with __send__, so they may be private.
class Native < Base
  # Hooks a host must respond to before this backend can be used.
  REQUIRED_NATIVE_METHODS = [
    :__native_fast_load_corpus_from_file,
    :__native_load_settings,
    :__native_set_config,
    :__native_em,
    :__native_beta,
    :__native_gamma,
    :__native_compute_phi,
    :__native_model,
    :__native_set_corpus,
    :__native_max_iter,
    :__native_set_max_iter,
    :__native_convergence,
    :__native_set_convergence,
    :__native_em_max_iter,
    :__native_set_em_max_iter,
    :__native_em_convergence,
    :__native_set_em_convergence,
    :__native_init_alpha,
    :__native_set_init_alpha,
    :__native_num_topics,
    :__native_set_num_topics,
    :__native_est_alpha,
    :__native_set_est_alpha,
    :__native_verbose,
    :__native_set_verbose
  ].freeze

  # @param host [Object] candidate host object
  # @return [Boolean] true when the host responds to every required hook,
  #   private methods included
  def self.available?(host)
    REQUIRED_NATIVE_METHODS.none? { |hook| !host.respond_to?(hook, true) }
  end

  # @param host [Object] object providing the "__native_*" hooks
  # @param random_seed [Integer, nil] passed through to Base#initialize
  def initialize(host, random_seed: nil)
    super(random_seed: random_seed)
    @host = host
  end

  # @return [String] always "native"
  def name
    "native"
  end

  # Remembers the corpus locally, then hands it to the host.
  def corpus=(corpus)
    @corpus = corpus
    invoke_native(:__native_set_corpus, corpus)
  end

  def fast_load_corpus_from_file(filename)
    invoke_native(:__native_fast_load_corpus_from_file, filename)
  end

  def load_settings(settings_file)
    invoke_native(:__native_load_settings, settings_file)
  end

  def set_config(init_alpha, num_topics, max_iter, convergence, em_max_iter, em_convergence, est_alpha)
    invoke_native(
      :__native_set_config,
      init_alpha, num_topics, max_iter, convergence,
      em_max_iter, em_convergence, est_alpha
    )
  end

  # --- Settings: readers query the host; writers coerce strictly with
  # --- Integer()/Float() (raising on invalid input) before forwarding.

  def max_iter
    invoke_native(:__native_max_iter)
  end

  def max_iter=(value)
    invoke_native(:__native_set_max_iter, Integer(value))
  end

  def convergence
    invoke_native(:__native_convergence)
  end

  def convergence=(value)
    invoke_native(:__native_set_convergence, Float(value))
  end

  def em_max_iter
    invoke_native(:__native_em_max_iter)
  end

  def em_max_iter=(value)
    invoke_native(:__native_set_em_max_iter, Integer(value))
  end

  def em_convergence
    invoke_native(:__native_em_convergence)
  end

  def em_convergence=(value)
    invoke_native(:__native_set_em_convergence, Float(value))
  end

  def init_alpha
    invoke_native(:__native_init_alpha)
  end

  def init_alpha=(value)
    invoke_native(:__native_set_init_alpha, Float(value))
  end

  def num_topics
    invoke_native(:__native_num_topics)
  end

  def num_topics=(value)
    invoke_native(:__native_set_num_topics, Integer(value))
  end

  def est_alpha
    invoke_native(:__native_est_alpha)
  end

  def est_alpha=(value)
    invoke_native(:__native_set_est_alpha, Integer(value))
  end

  def verbose
    invoke_native(:__native_verbose)
  end

  # Any truthy value is forwarded as true, nil/false as false.
  def verbose=(value)
    flag = value ? true : false
    invoke_native(:__native_set_verbose, flag)
  end

  # --- Computation entry points, all delegated to the host.

  def em(start)
    invoke_native(:__native_em, start)
  end

  def beta
    invoke_native(:__native_beta)
  end

  def gamma
    invoke_native(:__native_gamma)
  end

  def compute_phi
    invoke_native(:__native_compute_phi)
  end

  def model
    invoke_native(:__native_model)
  end

  private

  # Calls +hook+ on the host with +args+ and returns the hook's result.
  # __send__ is used so private hooks are reachable.
  def invoke_native(hook, *args)
    @host.__send__(hook, *args)
  end
end
157
+ end
158
+ end