lda-ruby 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,111 @@
1
+ #include "utils.h"
2
+
3
+ /*
4
+ * given log(a) and log(b), return log(a + b)
5
+ *
6
+ */
7
+
8
+ double log_sum(double log_a, double log_b)
9
+ {
10
+ double v;
11
+
12
+ if (log_a < log_b)
13
+ {
14
+ v = log_b+log(1 + exp(log_a-log_b));
15
+ }
16
+ else
17
+ {
18
+ v = log_a+log(1 + exp(log_b-log_a));
19
+ }
20
+ return(v);
21
+ }
22
+
23
/**
 * Trigamma function (second derivative of log-gamma) for a scalar x.
 *
 * The argument is shifted up by 6, a series in 1/x^2 is evaluated at the
 * shifted point, and then six 1/x^2 terms are added back while x is
 * stepped down to its original value.
 * The original header credits Abramowitz and Stegun, formulas 6.4.11 and
 * 6.4.12 with recurrence formula 6.4.6.
 **/

double trigamma(double x)
{
    double inv_sq;
    double acc;
    int remaining = 6;

    x += 6;
    inv_sq = 1/(x*x);
    acc = (((((0.075757575757576*inv_sq-0.033333333333333)*inv_sq+0.0238095238095238)
             *inv_sq-0.033333333333333)*inv_sq+0.166666666666667)*inv_sq+1)/x+0.5*inv_sq;

    /* undo the shift one unit at a time, adding the recurrence term */
    while (remaining-- > 0)
    {
        x -= 1;
        acc = 1/(x*x)+acc;
    }
    return acc;
}
48
+
49
+
50
+ /*
51
+ * taylor approximation of first derivative of the log gamma function
52
+ *
53
+ */
54
+
55
+ double digamma(double x)
56
+ {
57
+ double p;
58
+ x=x+6;
59
+ p=1/(x*x);
60
+ p=(((0.004166666666667*p-0.003968253986254)*p+
61
+ 0.008333333333333)*p-0.083333333333333)*p;
62
+ p=p+log(x)-0.5/x-1/(x-1)-1/(x-2)-1/(x-3)-1/(x-4)-1/(x-5)-1/(x-6);
63
+ return p;
64
+ }
65
+
66
+
67
+ double log_gamma(double x)
68
+ {
69
+ double z=1/(x*x);
70
+
71
+ x=x+6;
72
+ z=(((-0.000595238095238*z+0.000793650793651)
73
+ *z-0.002777777777778)*z+0.083333333333333)/x;
74
+ z=(x-0.5)*log(x)-x+0.918938533204673+z-log(x-1)-
75
+ log(x-2)-log(x-3)-log(x-4)-log(x-5)-log(x-6);
76
+ return z;
77
+ }
78
+
79
+
80
+
81
/*
 * Create the directory `name` with read, write and execute permission
 * for the owner only.  The result of mkdir() is not inspected.
 */

void make_directory(char* name)
{
    /* S_IRWXU is the owner read|write|execute mask */
    const mode_t owner_only = S_IRWXU;

    (void) mkdir(name, owner_only);
}
90
+
91
+
92
/*
 * Return the index of the largest of the first n entries of x.
 * On ties the lowest index is kept (the comparison is strict).
 * x[0] is read unconditionally, so x must hold at least one element.
 */

int argmax(double* x, int n)
{
    double best_value = x[0];
    int best_index = 0;
    int k;

    for (k = 1; k < n; ++k)
    {
        if (!(x[k] > best_value))
        {
            continue;
        }
        best_value = x[k];
        best_index = k;
    }
    return best_index;
}
@@ -0,0 +1,18 @@
1
+ #ifndef UTILS_H
2
+ #define UTILS_H
3
+
4
+ #include <stdio.h>
5
+ #include <math.h>
6
+ #include <float.h>
7
+ #include <stdlib.h>
8
+ #include <sys/stat.h>
9
+ #include <sys/types.h>
10
+
11
+ double log_sum(double log_a, double log_b);
12
+ double trigamma(double x);
13
+ double digamma(double x);
14
+ double log_gamma(double x);
15
+ void make_directory(char* name);
16
+ int argmax(double* x, int n);
17
+
18
+ #endif
data/lda-ruby.gemspec ADDED
@@ -0,0 +1,78 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run `rake gemspec`
4
+ # -*- encoding: utf-8 -*-
5
+
6
# Gem specification for lda-ruby 0.3.1.
Gem::Specification.new do |s|
  s.name = %q{lda-ruby}
  s.version = "0.3.1"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["David Blei", "Jason Adams"]
  s.date = %q{2009-08-11}
  s.description = %q{Ruby port of Latent Dirichlet Allocation by David M. Blei. See http://www.cs.princeton.edu/~blei/lda-c/.}
  s.email = %q{jasonmadams@gmail.com}
  s.extensions = ["ext/lda-ruby/extconf.rb"]
  s.extra_rdoc_files = [
    "README",
    "README.markdown"
  ]
  s.files = [
    ".gitignore",
    "CHANGELOG",
    "README",
    "README.markdown",
    "Rakefile",
    "VERSION.yml",
    "ext/lda-ruby/Makefile",
    "ext/lda-ruby/cokus.c",
    "ext/lda-ruby/cokus.h",
    "ext/lda-ruby/extconf.rb",
    "ext/lda-ruby/lda-alpha.c",
    "ext/lda-ruby/lda-alpha.h",
    "ext/lda-ruby/lda-data.c",
    "ext/lda-ruby/lda-data.h",
    "ext/lda-ruby/lda-inference.c",
    "ext/lda-ruby/lda-inference.h",
    "ext/lda-ruby/lda-model.c",
    "ext/lda-ruby/lda-model.h",
    "ext/lda-ruby/lda.h",
    "ext/lda-ruby/utils.c",
    "ext/lda-ruby/utils.h",
    "lda-ruby.gemspec",
    "lib/lda-ruby.rb",
    "lib/lda-ruby/corpus/corpus.rb",
    "lib/lda-ruby/corpus/data_corpus.rb",
    "lib/lda-ruby/corpus/directory_corpus.rb",
    "lib/lda-ruby/corpus/text_corpus.rb",
    "lib/lda-ruby/document/data_document.rb",
    "lib/lda-ruby/document/document.rb",
    "lib/lda-ruby/document/text_document.rb",
    "lib/lda-ruby/vocabulary.rb",
    "license.txt",
    "test/data/.gitignore",
    "test/data/docs.dat",
    "test/data/wiki-test-docs.yml",
    "test/lda_ruby_test.rb",
    "test/test_helper.rb"
  ]
  s.homepage = %q{http://github.com/ealdent/lda-ruby}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib", "ext"]
  s.rubygems_version = %q{1.3.4}
  s.summary = %q{Ruby port of Latent Dirichlet Allocation by David M. Blei.}
  s.test_files = [
    "test/lda_ruby_test.rb",
    "test/test_helper.rb"
  ]

  # Only set the spec format version where RubyGems supports the attribute.
  # The former RubyGems-version comparison had two empty branches and read
  # Gem::RubyGemsVersion, so it was dropped rather than ported.
  s.specification_version = 3 if s.respond_to? :specification_version
end
data/lib/lda-ruby.rb ADDED
@@ -0,0 +1,168 @@
1
+ $LOAD_PATH.unshift(File.dirname(__FILE__)) unless $LOAD_PATH.include?(File.dirname(__FILE__))
2
+
3
+ require 'lda-ruby/lda'
4
+ require 'lda-ruby/document/document'
5
+ require 'lda-ruby/document/data_document'
6
+ require 'lda-ruby/document/text_document'
7
+ require 'lda-ruby/corpus/corpus'
8
+ require 'lda-ruby/corpus/data_corpus'
9
+ require 'lda-ruby/corpus/text_corpus'
10
+ require 'lda-ruby/corpus/directory_corpus'
11
+ require 'lda-ruby/vocabulary'
12
+
13
module Lda
  #
  # Ruby-side convenience layer of the LDA model.  The setting accessors
  # (+max_iter+, +num_topics+, ...), +corpus=+, +beta+ and +compute_phi+ are
  # used here but not defined in this file; presumably they come from the
  # native extension loaded via 'lda-ruby/lda' (confirm against the ext).
  #
  class Lda
    attr_reader :vocab, :corpus

    #
    # Apply the default settings, attach +corpus+ and, when the corpus
    # carries a vocabulary, copy it into +vocab+ as an array.
    #
    def initialize(corpus)
      load_default_settings

      @vocab = nil
      self.corpus = corpus
      @vocab = corpus.vocabulary.to_a if corpus.vocabulary

      @phi = nil
    end

    #
    # Reset every tunable setting to its default and return the defaults as
    # an array in the order: max_iter, convergence, em_max_iter,
    # em_convergence, num_topics, init_alpha, est_alpha.
    #
    def load_default_settings
      self.max_iter = 20
      self.convergence = 1e-6
      self.em_max_iter = 100
      self.em_convergence = 1e-4
      self.num_topics = 20
      self.init_alpha = 0.3
      self.est_alpha = 1

      [20, 1e-6, 100, 1e-4, 20, 0.3, 1]
    end

    #
    # Replace the current corpus with the data file +filename+.  Returns true.
    #
    # The file is read through DataCorpus, the corpus class that loads
    # itself from a file, and attached with +corpus=+ exactly as +initialize+
    # does.  The cached phi matrix is discarded since it was computed for the
    # previous corpus.
    #
    def load_corpus(filename)
      self.corpus = DataCorpus.new(filename)
      @phi = nil

      true
    end

    #
    # Load the vocabulary from an Array of words (deep-copied), a Vocabulary
    # object, or otherwise a path to a whitespace-separated word file.
    # Returns true.
    #
    def load_vocabulary(vocab)
      if vocab.is_a?(Array)
        @vocab = Marshal::load(Marshal::dump(vocab))      # deep clone array
      elsif vocab.is_a?(Vocabulary)
        @vocab = vocab.to_a
      else
        @vocab = File.open(vocab, 'r') { |f| f.read.split(/\s+/) }
      end

      true
    end

    #
    # Visualization method for printing out the top +words_per_topic+ words
    # for each topic.  Raises if no vocabulary is loaded.  Returns nil.
    #
    # See also +top_words+.
    #
    def print_topics(words_per_topic = 10)
      raise 'No vocabulary loaded.' unless @vocab

      self.beta.each_with_index do |topic, topic_num|
        indices = ranked_word_indices(topic, words_per_topic)

        puts "Topic #{topic_num}"
        puts "\t#{indices.map { |i| @vocab[i] }.join("\n\t")}"
        puts ""
      end

      nil
    end

    #
    # Return, for each topic, the vocabulary indices of its
    # +words_per_topic+ highest scoring words in descending score order, as
    # a hash:
    #
    #   topic_number => [i1, i2, ..., i_n]
    #
    # Raises if no vocabulary is loaded.  See also +top_words+.
    #
    def top_word_indices(words_per_topic = 10)
      raise 'No vocabulary loaded.' unless @vocab

      topics = Hash.new
      self.beta.each_with_index do |topic, topic_num|
        topics[topic_num] = ranked_word_indices(topic, words_per_topic)
      end

      topics
    end

    #
    # Same as +top_word_indices+, but with every index replaced by the
    # corresponding vocabulary word:
    #
    #   topic_number => [w1, w2, ..., w_n]
    #
    # See also +print_topics+.
    #
    def top_words(words_per_topic = 10)
      output = Hash.new

      top_word_indices(words_per_topic).each_pair do |topic_num, indices|
        output[topic_num] = indices.map { |i| @vocab[i] }
      end

      output
    end

    #
    # Get the phi matrix which can be used to assign probabilities to words
    # belonging to a specific topic in each document.  The value is cached
    # after the first call; pass +recompute+ = true to force recomputation.
    # The iteration in +compute_topic_document_probability+ treats it as
    # num_docs x doc_length x num_topics.
    #
    def phi(recompute=false)
      @phi = self.compute_phi if @phi.nil? || recompute

      @phi
    end

    #
    # Compute the average log probability for each topic for each document in
    # the corpus.  Returns a matrix: num_docs x num_topics.  Each entry is the
    # count-weighted sum of log(phi) over the document's words, divided by
    # the document's total count.
    #
    def compute_topic_document_probability
      outp = Array.new

      @corpus.documents.each_with_index do |doc, idx|
        tops = [0.0] * self.num_topics
        ttl = doc.counts.inject(0.0) { |sum, i| sum + i }

        self.phi[idx].each_with_index do |word_dist, word_idx|
          word_dist.each_with_index do |top_prob, top_idx|
            tops[top_idx] += Math.log(top_prob) * doc.counts[word_idx]
          end
        end

        outp << tops.map { |i| i / ttl }
      end

      outp
    end

    #
    # String representation displaying current settings.
    #
    def to_s
      outp = ["LDA Settings:"]
      outp << "    Initial alpha: %0.6f" % self.init_alpha
      outp << "      # of topics: %d" % self.num_topics
      outp << "   Max iterations: %d" % self.max_iter
      outp << "      Convergence: %0.6f" % self.convergence
      outp << "EM max iterations: %d" % self.em_max_iter
      outp << "   EM convergence: %0.6f" % self.em_convergence
      outp << "   Estimate alpha: %d" % self.est_alpha

      outp.join("\n")
    end

    private

    #
    # Positions of the +limit+ largest scores in +topic+, best score first.
    # Shared by +print_topics+ and +top_word_indices+ so both rank the same way.
    #
    def ranked_word_indices(topic, limit)
      (0...topic.size).sort_by { |i| topic[i] }.reverse[0...limit]
    end
  end
end
@@ -0,0 +1,34 @@
1
+ require 'set'
2
+
3
module Lda
  #
  # A collection of documents together with running totals: the number of
  # documents, the number of distinct terms seen so far, and a vocabulary
  # fed from every added document's tokens.
  #
  class Corpus
    attr_reader :documents, :num_docs, :num_terms, :vocabulary

    # Start with no documents, no terms and a fresh Vocabulary.
    def initialize
      @documents = Array.new
      @all_terms = Set.new
      @num_terms = @num_docs = 0
      @vocabulary = Vocabulary.new
    end

    #
    # Append +doc+ to the corpus and update the counters and the vocabulary.
    # Raises a RuntimeError unless +doc+ is a Document.  Returns nil.
    #
    def add_document(doc)
      raise 'Parameter +doc+ must be of type Document' unless doc.kind_of?(Document)

      @documents << doc

      # merge in place: `+=` would copy the whole term set for every document
      @all_terms.merge(doc.words)
      @num_docs += 1
      @num_terms = @all_terms.size

      update_vocabulary(doc)

      nil
    end

    protected

    # Offer every token of +doc+ to the vocabulary via +check_word+.
    def update_vocabulary(doc)
      doc.tokens.each { |w| @vocabulary.check_word(w) }
    end
  end
end
@@ -0,0 +1,22 @@
1
module Lda
  #
  # Corpus populated from a data file: each line of the file becomes one
  # DataDocument.
  #
  class DataCorpus < Corpus
    attr_reader :filename

    # Remember +filename+ and immediately load its documents.
    def initialize(filename)
      super()

      @filename = filename
      load_from_file
    end

    protected

    # Read the whole file, cut it at runs of CR/LF characters and add one
    # DataDocument per resulting piece.
    def load_from_file
      contents = File.read(@filename)
      contents.split(/[\r\n]+/).each { |row| add_document(DataDocument.new(self, row)) }
    end
  end
end
@@ -0,0 +1,25 @@
1
module Lda
  #
  # Corpus populated from the files of a directory, optionally restricted to
  # one file extension.
  #
  class DirectoryCorpus < Corpus
    attr_reader :path, :extension

    # Keep frozen copies of +path+ and +extension+ (nil when no extension is
    # given) and load every matching file as a document.
    def initialize(path, extension = nil)
      super()

      @path = path.dup.freeze
      @extension = nil
      @extension = extension.dup.freeze if extension

      load_from_directory
    end

    protected

    # Glob "<path>/*.<extension>" (or "<path>/*" without an extension) and
    # add one TextDocument per matched file name.
    def load_from_directory
      pattern = @extension ? "*.#{@extension}" : "*"

      Dir[File.join(@path, pattern)].each do |file|
        add_document(TextDocument.build_from_file(self, file))
      end
    end
  end
end
+ end