ealdent-lda-ruby 0.2.3 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
@@ -75,8 +75,6 @@ void quiet_lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha) {
75
75
  model->alpha = quiet_opt_alpha(ss->alpha_suffstats,
76
76
  ss->num_docs,
77
77
  model->num_topics);
78
-
79
- printf("new alpha = %5.5f\n", model->alpha);
80
78
  }
81
79
  }
82
80
 
@@ -217,7 +215,7 @@ void corpus_initialize_fixed_ss(lda_suffstats* ss, lda_model* model, corpus* c)
217
215
  int num_topics = MIN(model->num_topics, c->num_docs);
218
216
  int k, n;
219
217
  document* doc;
220
-
218
+
221
219
  for (k = 0; k < num_topics; k++) {
222
220
  doc = &(c->docs[k]);
223
221
  for (n = 0; n < doc->length; n++) {
@@ -253,6 +251,23 @@ lda_model* new_lda_model(int num_terms, int num_topics) {
253
251
  return(model);
254
252
  }
255
253
 
254
+ lda_model* quiet_new_lda_model(int num_terms, int num_topics) {
255
+ int i;
256
+ lda_model* model;
257
+
258
+ model = malloc(sizeof(lda_model));
259
+ model->num_topics = num_topics;
260
+ model->num_terms = num_terms;
261
+ model->alpha = 1.0;
262
+ model->log_prob_w = malloc(sizeof(double*)*num_topics);
263
+ for (i = 0; i < num_topics; i++)
264
+ {
265
+ model->log_prob_w[i] = malloc(sizeof(double)*num_terms);
266
+ memset(model->log_prob_w[i],0,sizeof(double)*num_terms);
267
+ }
268
+ return(model);
269
+ }
270
+
256
271
 
257
272
  /*
258
273
  * deallocate new lda model
File without changes
File without changes
File without changes
File without changes
@@ -0,0 +1,74 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ Gem::Specification.new do |s|
4
+ s.name = %q{lda-ruby}
5
+ s.version = "0.3.0"
6
+
7
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
8
+ s.authors = ["David Blei", "Jason Adams"]
9
+ s.date = %q{2009-07-24}
10
+ s.description = %q{Ruby port of Latent Dirichlet Allocation by David M. Blei. See http://www.cs.princeton.edu/~blei/lda-c/.}
11
+ s.email = %q{jasonmadams@gmail.com}
12
+ s.extensions = ["ext/lda-ruby/extconf.rb"]
13
+ s.extra_rdoc_files = [
14
+ "README",
15
+ "README.markdown"
16
+ ]
17
+ s.files = [
18
+ ".gitignore",
19
+ "README",
20
+ "README.markdown",
21
+ "Rakefile",
22
+ "VERSION.yml",
23
+ "ext/lda-ruby/Makefile",
24
+ "ext/lda-ruby/cokus.c",
25
+ "ext/lda-ruby/cokus.h",
26
+ "ext/lda-ruby/extconf.rb",
27
+ "ext/lda-ruby/lda-alpha.c",
28
+ "ext/lda-ruby/lda-alpha.h",
29
+ "ext/lda-ruby/lda-data.c",
30
+ "ext/lda-ruby/lda-data.h",
31
+ "ext/lda-ruby/lda-inference.c",
32
+ "ext/lda-ruby/lda-inference.h",
33
+ "ext/lda-ruby/lda-model.c",
34
+ "ext/lda-ruby/lda-model.h",
35
+ "ext/lda-ruby/lda.h",
36
+ "ext/lda-ruby/utils.c",
37
+ "ext/lda-ruby/utils.h",
38
+ "lda-ruby.gemspec",
39
+ "lib/lda-ruby.rb",
40
+ "lib/lda-ruby/corpus/corpus.rb",
41
+ "lib/lda-ruby/corpus/data_corpus.rb",
42
+ "lib/lda-ruby/corpus/directory_corpus.rb",
43
+ "lib/lda-ruby/corpus/text_corpus.rb",
44
+ "lib/lda-ruby/document/data_document.rb",
45
+ "lib/lda-ruby/document/document.rb",
46
+ "lib/lda-ruby/document/text_document.rb",
47
+ "lib/lda-ruby/vocabulary.rb",
48
+ "license.txt",
49
+ "test/data/.gitignore",
50
+ "test/data/docs.dat",
51
+ "test/data/wiki-test-docs.yml",
52
+ "test/lda_ruby_test.rb",
53
+ "test/test_helper.rb"
54
+ ]
55
+ s.homepage = %q{http://github.com/ealdent/lda-ruby}
56
+ s.rdoc_options = ["--charset=UTF-8"]
57
+ s.require_paths = ["lib", "ext"]
58
+ s.rubygems_version = %q{1.3.4}
59
+ s.summary = %q{Ruby port of Latent Dirichlet Allocation by David M. Blei.}
60
+ s.test_files = [
61
+ "test/lda_ruby_test.rb",
62
+ "test/test_helper.rb"
63
+ ]
64
+
65
+ if s.respond_to? :specification_version then
66
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
67
+ s.specification_version = 3
68
+
69
+ if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
70
+ else
71
+ end
72
+ else
73
+ end
74
+ end
@@ -0,0 +1,157 @@
1
+ $LOAD_PATH.unshift(File.dirname(__FILE__)) unless $LOAD_PATH.include?(File.dirname(__FILE__))
2
+
3
+ require 'lda-ruby/lda'
4
+ require 'lda-ruby/document/document'
5
+ require 'lda-ruby/document/data_document'
6
+ require 'lda-ruby/document/text_document'
7
+ require 'lda-ruby/corpus/corpus'
8
+ require 'lda-ruby/corpus/data_corpus'
9
+ require 'lda-ruby/corpus/text_corpus'
10
+ require 'lda-ruby/corpus/directory_corpus'
11
+ require 'lda-ruby/vocabulary'
12
+
13
module Lda
  #
  # Ruby-side helpers for the LDA model: settings defaults, corpus and
  # vocabulary loading, and convenience views over the fitted model.
  #
  # The settings accessors (+max_iter+, +num_topics+, ...), +corpus=+,
  # +beta+ and +compute_phi+ are not defined in this file; presumably they
  # come from the native extension loaded via 'lda-ruby/lda'.
  #
  class Lda
    attr_reader :vocab, :corpus

    #
    # Apply the default settings, attach +corpus+ to the model and adopt the
    # corpus vocabulary when the corpus has one.
    #
    def initialize(corpus)
      load_default_settings

      @vocab = nil
      self.corpus = corpus
      @vocab = corpus.vocabulary.to_a if corpus.vocabulary

      @phi = nil
    end

    #
    # Reset every tunable setting to its default and return the defaults as
    # an array in the order they were assigned.
    #
    def load_default_settings
      self.max_iter = 20
      self.convergence = 1e-6
      self.em_max_iter = 100
      self.em_convergence = 1e-4
      self.num_topics = 20
      self.init_alpha = 0.3
      self.est_alpha = 1

      [20, 1e-6, 100, 1e-4, 20, 0.3, 1]
    end

    #
    # Load a corpus from the data file +filename+ (one svmlight-style line
    # per document, as read by DataCorpus) and attach it to the model.
    # Returns true.
    #
    def load_corpus(filename)
      # Go through the same setter +initialize+ uses so the corpus is
      # attached exactly as it is for a constructor-supplied corpus.
      self.corpus = DataCorpus.new(filename)
      @phi = nil # any cached phi was computed for the previous corpus

      true
    end

    #
    # Load the vocabulary from an Array of words (deep-copied), a Vocabulary
    # object, or the path of a whitespace-separated word file. Returns true.
    #
    def load_vocabulary(vocab)
      if vocab.is_a?(Array)
        @vocab = Marshal::load(Marshal::dump(vocab))      # deep clone array
      elsif vocab.is_a?(Vocabulary)
        @vocab = vocab.to_a
      else
        @vocab = File.open(vocab, 'r') { |f| f.read.split(/\s+/) }
      end

      true
    end

    #
    # Visualization method for printing out the top +words_per_topic+ words
    # for each topic.
    #
    # See also +top_words+.
    #
    def print_topics(words_per_topic = 10)
      raise 'No vocabulary loaded.' unless @vocab

      self.beta.each_with_index do |topic, topic_num|
        indices = ranked_word_indices(topic, words_per_topic)

        puts "Topic #{topic_num}"
        puts "\t#{indices.map {|i| @vocab[i]}.join("\n\t")}"
        puts ""
      end

      nil
    end

    #
    # Return, for each topic, the vocabulary indices of its
    # +words_per_topic+ highest scoring words (best first):
    #
    #   topic_number => [i1, i2, ..., i_n]
    #
    # See also +top_words+.
    #
    def top_word_indices(words_per_topic = 10)
      raise 'No vocabulary loaded.' unless @vocab

      topics = Hash.new

      self.beta.each_with_index do |topic, topic_num|
        topics[topic_num] = ranked_word_indices(topic, words_per_topic)
      end

      topics
    end

    #
    # After the model has been run and a vocabulary has been loaded, return the
    # +words_per_topic+ top words chosen by the model for each topic.  This is
    # returned as a hash mapping the topic number to an array of top words
    # (in descending order of importance).
    #
    #   topic_number => [w1, w2, ..., w_n]
    #
    # Use +top_word_indices+ to get vocabulary indices instead of words.
    #
    # See also +print_topics+.
    #
    def top_words(words_per_topic = 10)
      words = Hash.new

      top_word_indices(words_per_topic).each_pair do |topic_num, indices|
        words[topic_num] = indices.map { |i| @vocab[i] }
      end

      words
    end

    #
    # Get the phi matrix which can be used to assign probabilities to words
    # belonging to a specific topic in each document.  The return value is a
    # 3D matrix:  num_docs x doc_length x num_topics.  The value is cached
    # after the first call, so if it needs to be recomputed, set the +recompute+
    # value to true.
    #
    def phi(recompute=false)
      if @phi.nil? || recompute
        @phi = self.compute_phi
      end

      @phi
    end

    #
    # Compute the average log probability for each topic for each document in the corpus.
    # This method returns a matrix:  num_docs x num_topics with the average log probability
    # for the topic in the document.
    #
    def compute_topic_document_probability
      outp = Array.new

      @corpus.documents.each_with_index do |doc, idx|
        tops = [0.0] * self.num_topics
        # total word occurrences in the document, used as the averaging weight
        ttl = doc.counts.inject(0.0) {|sum, i| sum + i}
        self.phi[idx].each_with_index do |word_dist, word_idx|
          word_dist.each_with_index do |top_prob, top_idx|
            tops[top_idx] += Math.log(top_prob) * doc.counts[word_idx]
          end
        end
        tops = tops.map {|i| i / ttl}
        outp << tops
      end

      outp
    end

    #
    # String representation displaying current settings.
    #
    def to_s
      outp = ["LDA Settings:"]
      outp << " Initial alpha: %0.6f" % self.init_alpha
      outp << " # of topics: %d" % self.num_topics
      outp << " Max iterations: %d" % self.max_iter
      outp << " Convergence: %0.6f" % self.convergence
      outp << "EM max iterations: %d" % self.em_max_iter
      outp << " EM convergence: %0.6f" % self.em_convergence
      outp << " Estimate alpha: %d" % self.est_alpha

      outp.join("\n")
    end

    private

    #
    # Pair every score in +topic+ with its vocabulary index, order the pairs
    # by score and return the indices of the +limit+ best, highest first.
    #
    def ranked_word_indices(topic, limit)
      scored = topic.zip((0...@vocab.size).to_a)
      ranked = scored.sort_by { |score, _index| score }.reverse
      ranked.map { |_score, index| index }[0...limit]
    end
  end
end
@@ -0,0 +1,34 @@
1
+ require 'set'
2
+
3
module Lda
  #
  # A collection of documents together with running totals: the number of
  # documents, the number of distinct terms seen so far, and a vocabulary
  # that is fed every token of every added document.
  #
  class Corpus
    attr_reader :documents, :num_docs, :num_terms, :vocabulary

    # Start with no documents, no known terms and a fresh Vocabulary.
    def initialize
      @documents = Array.new
      @all_terms = Set.new
      @num_terms = @num_docs = 0
      @vocabulary = Vocabulary.new
    end

    #
    # Append +doc+ to the corpus and update the document count, the distinct
    # term count and the vocabulary. Raises a RuntimeError unless +doc+ is a
    # Document. Returns nil.
    #
    def add_document(doc)
      raise 'Parameter +doc+ must be of type Document' unless doc.kind_of?(Document)

      @documents << doc

      # merge in place; building a new Set each time copies every known term
      @all_terms.merge(doc.words)
      @num_docs += 1
      @num_terms = @all_terms.size

      update_vocabulary(doc)

      nil
    end

    protected

    # Pass every token of +doc+ to the vocabulary's +check_word+.
    def update_vocabulary(doc)
      doc.tokens.each { |w| @vocabulary.check_word(w) }
    end
  end
end
@@ -0,0 +1,22 @@
1
module Lda
  #
  # Corpus read from a data file that holds one document per line; each
  # non-blank line is handed to DataDocument.
  #
  class DataCorpus < Corpus
    attr_reader :filename

    # Remember +filename+ and load every document from it immediately.
    def initialize(filename)
      super()

      @filename = filename
      load_from_file
    end

    protected

    #
    # Read the whole file, split it on runs of CR/LF and add one DataDocument
    # per line. Blank lines are skipped: a file that starts with a newline
    # otherwise yields an empty first line that is not a valid document.
    #
    def load_from_file
      txt = File.open(@filename, 'r') { |f| f.read }
      lines = txt.split(/[\r\n]+/)
      lines.each do |line|
        next if line.strip.empty?

        add_document(DataDocument.new(self, line))
      end
    end
  end
end
@@ -0,0 +1,25 @@
1
module Lda
  #
  # Corpus built from the files of a directory, one text document per file.
  #
  class DirectoryCorpus < Corpus
    attr_reader :path, :extension

    #
    # Load documents from the directory +path+. When +extension+ is given,
    # only files named "*.<extension>" are loaded; otherwise every file is.
    # Both values are stored as frozen copies.
    #
    def initialize(path, extension = nil)
      super()

      @path = path.dup.freeze
      @extension = extension ? extension.dup.freeze : nil

      load_from_directory
    end

    protected

    #
    # Add one TextDocument per matching file. Matches are sorted so document
    # order does not depend on the platform's directory order, and anything
    # that is not a regular file (e.g. a sub-directory matched by "*") is
    # skipped.
    #
    def load_from_directory
      dir_glob = File.join(@path, (@extension ? "*.#{@extension}" : "*"))

      Dir.glob(dir_glob).sort.each do |filename|
        next unless File.file?(filename)

        add_document(TextDocument.build_from_file(self, filename))
      end
    end
  end
end
@@ -0,0 +1,22 @@
1
require 'yaml' # YAML is used below; do not rely on another file having loaded it

module Lda
  #
  # Corpus built from a YAML file; each entry of the loaded YAML collection
  # becomes one TextDocument.
  #
  class TextCorpus < Corpus
    attr_reader :filename

    # Remember +filename+ and load the text documents from that YAML file.
    def initialize(filename)
      super()

      @filename = filename
      load_from_file
    end

    protected

    #
    # Parse the YAML file and add one TextDocument per entry.
    #
    # NOTE(review): YAML.load_file can instantiate arbitrary objects on some
    # Ruby/Psych versions; only point this at files you trust.
    #
    def load_from_file
      docs = YAML.load_file(@filename)
      docs.each do |doc|
        add_document(TextDocument.new(self, doc))
      end
    end
  end
end
@@ -0,0 +1,30 @@
1
+ #
2
+ # Create the Document using the svmlight-style text line:
3
+ #
4
+ # num_words w1:freq1 w2:freq2 ... w_n:freq_n
5
+ #
6
+ # Ex.
7
+ # 5 1:2 3:1 4:2 7:3 12:1
8
+ #
9
+ # The value for the number of words should equal the number of pairs
10
+ # following it, though this isn't at all enforced. Order of word-pair
11
+ # indices is not important.
12
+ #
13
+
14
module Lda
  #
  # Document parsed from one svmlight-style line:
  #
  #   num_words w1:freq1 w2:freq2 ... w_n:freq_n
  #
  class DataDocument < Document
    #
    # Parse +data+ into word ids and counts, then refresh the totals via
    # +recompute+. The leading num_words field is read past but not checked
    # against the number of pairs. An empty or whitespace-only line gives a
    # document with no words.
    #
    def initialize(corpus, data)
      super(corpus)

      # split without arguments ignores leading whitespace, so an indented
      # line does not produce an empty first item that shifts the fields
      items = data.split
      # everything after the count field; [] when the line had no items
      pairs = (items[1..-1] || []).map { |item| item.split(':') }

      pairs.each do |feature_identifier, feature_weight|
        @words << feature_identifier.to_i
        @counts << feature_weight.to_i
      end

      recompute
    end
  end
end
@@ -0,0 +1,36 @@
1
module Lda
  #
  # Base document: parallel lists of words and counts, the tokens produced
  # by +tokenize+, and two derived figures (+length+ and +total+) that are
  # refreshed by +recompute+.
  #
  class Document
    attr_reader :corpus, :words, :counts, :length, :total, :tokens

    # Create an empty document that belongs to +corpus+.
    def initialize(corpus)
      @corpus = corpus

      @words, @counts, @tokens = [], [], []
      @length = @total = 0
    end

    #
    # Refresh +total+ (sum of all counts) and +length+ (number of words).
    #
    def recompute
      running = 0
      @counts.each { |count| running += count }
      @total = running
      @length = @words.size
    end

    # The base document carries no text.
    def has_text?
      false
    end

    # Hook applied to freshly split tokens; the base version returns them as is.
    def handle(tokens)
      tokens
    end

    #
    # Replace every run of characters other than letters, apostrophes and
    # whitespace with a space, collapse whitespace runs to single spaces,
    # split on spaces and store the result of +handle+ in +tokens+.
    #
    def tokenize(text)
      letters_only = text.gsub(/[^A-Za-z'\s]+/, ' ')
      single_spaced = letters_only.gsub(/\s+/, ' ')
      @tokens = handle(single_spaced.split(' '))
    end
  end
end