lda-ruby 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,111 @@
1
+ #include "utils.h"
2
+
3
+ /*
4
+ * given log(a) and log(b), return log(a + b)
5
+ *
6
+ */
7
+
8
+ double log_sum(double log_a, double log_b)
9
+ {
10
+ double v;
11
+
12
+ if (log_a < log_b)
13
+ {
14
+ v = log_b+log(1 + exp(log_a-log_b));
15
+ }
16
+ else
17
+ {
18
+ v = log_a+log(1 + exp(log_b-log_a));
19
+ }
20
+ return(v);
21
+ }
22
+
23
/**
 * Trigamma function: the second derivative of the log-gamma function,
 * evaluated for a positive scalar x.
 *
 * From Abramowitz and Stegun. The argument is first shifted up by 6 and
 * the series of formulas 6.4.11 / 6.4.12 is evaluated at the shifted point;
 * recurrence formula 6.4.6 is then applied six times, adding 1/x^2 at each
 * step, to walk back down to the original argument.
 **/

double trigamma(double x)
{
    double shifted = x + 6;
    double inv_sq = 1/(shifted*shifted);
    double result;
    int step;

    /* series in powers of 1/x^2, evaluated with Horner's scheme */
    result = (((((0.075757575757576*inv_sq-0.033333333333333)*inv_sq+0.0238095238095238)
               *inv_sq-0.033333333333333)*inv_sq+0.166666666666667)*inv_sq+1)/shifted+0.5*inv_sq;

    /* undo the shift one unit at a time */
    for (step = 6; step > 0; step--)
    {
        shifted = shifted-1;
        result = 1/(shifted*shifted)+result;
    }

    return result;
}
48
+
49
+
50
+ /*
51
+ * taylor approximation of first derivative of the log gamma function
52
+ *
53
+ */
54
+
55
+ double digamma(double x)
56
+ {
57
+ double p;
58
+ x=x+6;
59
+ p=1/(x*x);
60
+ p=(((0.004166666666667*p-0.003968253986254)*p+
61
+ 0.008333333333333)*p-0.083333333333333)*p;
62
+ p=p+log(x)-0.5/x-1/(x-1)-1/(x-2)-1/(x-3)-1/(x-4)-1/(x-5)-1/(x-6);
63
+ return p;
64
+ }
65
+
66
+
67
+ double log_gamma(double x)
68
+ {
69
+ double z=1/(x*x);
70
+
71
+ x=x+6;
72
+ z=(((-0.000595238095238*z+0.000793650793651)
73
+ *z-0.002777777777778)*z+0.083333333333333)/x;
74
+ z=(x-0.5)*log(x)-x+0.918938533204673+z-log(x-1)-
75
+ log(x-2)-log(x-3)-log(x-4)-log(x-5)-log(x-6);
76
+ return z;
77
+ }
78
+
79
+
80
+
81
/*
 * Create the directory `name`, requesting read, write and execute
 * permission for the owner only.
 *
 * The result of mkdir() is not inspected, so a failure (for example when
 * the directory already exists) is silently ignored.
 */

void make_directory(char* name)
{
    const mode_t owner_only = S_IRUSR|S_IWUSR|S_IXUSR;

    mkdir(name, owner_only);
}
90
+
91
+
92
/*
 * Return the index of the largest of the first n entries of x.
 *
 * When several entries share the maximum, the lowest such index wins,
 * because a later entry only replaces the current best if it is strictly
 * greater. Index 0 is returned when n <= 1.
 */

int argmax(double* x, int n)
{
    int best = 0;
    int k;

    for (k = 1; k < n; k++)
    {
        if (x[k] > x[best])
        {
            best = k;
        }
    }

    return best;
}
@@ -0,0 +1,18 @@
1
+ #ifndef UTILS_H
2
+ #define UTILS_H
3
+
4
+ #include <stdio.h>
5
+ #include <math.h>
6
+ #include <float.h>
7
+ #include <stdlib.h>
8
+ #include <sys/stat.h>
9
+ #include <sys/types.h>
10
+
11
+ double log_sum(double log_a, double log_b);
12
+ double trigamma(double x);
13
+ double digamma(double x);
14
+ double log_gamma(double x);
15
+ void make_directory(char* name);
16
+ int argmax(double* x, int n);
17
+
18
+ #endif
data/lda-ruby.gemspec ADDED
@@ -0,0 +1,78 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run `rake gemspec`
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{lda-ruby}
8
+ s.version = "0.3.1"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["David Blei", "Jason Adams"]
12
+ s.date = %q{2009-08-11}
13
+ s.description = %q{Ruby port of Latent Dirichlet Allocation by David M. Blei. See http://www.cs.princeton.edu/~blei/lda-c/.}
14
+ s.email = %q{jasonmadams@gmail.com}
15
+ s.extensions = ["ext/lda-ruby/extconf.rb"]
16
+ s.extra_rdoc_files = [
17
+ "README",
18
+ "README.markdown"
19
+ ]
20
+ s.files = [
21
+ ".gitignore",
22
+ "CHANGELOG",
23
+ "README",
24
+ "README.markdown",
25
+ "Rakefile",
26
+ "VERSION.yml",
27
+ "ext/lda-ruby/Makefile",
28
+ "ext/lda-ruby/cokus.c",
29
+ "ext/lda-ruby/cokus.h",
30
+ "ext/lda-ruby/extconf.rb",
31
+ "ext/lda-ruby/lda-alpha.c",
32
+ "ext/lda-ruby/lda-alpha.h",
33
+ "ext/lda-ruby/lda-data.c",
34
+ "ext/lda-ruby/lda-data.h",
35
+ "ext/lda-ruby/lda-inference.c",
36
+ "ext/lda-ruby/lda-inference.h",
37
+ "ext/lda-ruby/lda-model.c",
38
+ "ext/lda-ruby/lda-model.h",
39
+ "ext/lda-ruby/lda.h",
40
+ "ext/lda-ruby/utils.c",
41
+ "ext/lda-ruby/utils.h",
42
+ "lda-ruby.gemspec",
43
+ "lib/lda-ruby.rb",
44
+ "lib/lda-ruby/corpus/corpus.rb",
45
+ "lib/lda-ruby/corpus/data_corpus.rb",
46
+ "lib/lda-ruby/corpus/directory_corpus.rb",
47
+ "lib/lda-ruby/corpus/text_corpus.rb",
48
+ "lib/lda-ruby/document/data_document.rb",
49
+ "lib/lda-ruby/document/document.rb",
50
+ "lib/lda-ruby/document/text_document.rb",
51
+ "lib/lda-ruby/vocabulary.rb",
52
+ "license.txt",
53
+ "test/data/.gitignore",
54
+ "test/data/docs.dat",
55
+ "test/data/wiki-test-docs.yml",
56
+ "test/lda_ruby_test.rb",
57
+ "test/test_helper.rb"
58
+ ]
59
+ s.homepage = %q{http://github.com/ealdent/lda-ruby}
60
+ s.rdoc_options = ["--charset=UTF-8"]
61
+ s.require_paths = ["lib", "ext"]
62
+ s.rubygems_version = %q{1.3.4}
63
+ s.summary = %q{Ruby port of Latent Dirichlet Allocation by David M. Blei.}
64
+ s.test_files = [
65
+ "test/lda_ruby_test.rb",
66
+ "test/test_helper.rb"
67
+ ]
68
+
69
+ if s.respond_to? :specification_version then
70
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
71
+ s.specification_version = 3
72
+
73
+ if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
74
+ else
75
+ end
76
+ else
77
+ end
78
+ end
data/lib/lda-ruby.rb ADDED
@@ -0,0 +1,168 @@
1
+ $LOAD_PATH.unshift(File.dirname(__FILE__)) unless $LOAD_PATH.include?(File.dirname(__FILE__))
2
+
3
+ require 'lda-ruby/lda'
4
+ require 'lda-ruby/document/document'
5
+ require 'lda-ruby/document/data_document'
6
+ require 'lda-ruby/document/text_document'
7
+ require 'lda-ruby/corpus/corpus'
8
+ require 'lda-ruby/corpus/data_corpus'
9
+ require 'lda-ruby/corpus/text_corpus'
10
+ require 'lda-ruby/corpus/directory_corpus'
11
+ require 'lda-ruby/vocabulary'
12
+
13
module Lda
  #
  # Ruby-side convenience layer around the LDA model: default settings,
  # corpus/vocabulary loading and reporting helpers.
  #
  # The setting accessors (+max_iter+, +convergence+, +em_max_iter+,
  # +em_convergence+, +num_topics+, +init_alpha+, +est_alpha+) as well as
  # +corpus=+, +beta+ and +compute_phi+ are not defined in this file; they are
  # expected to come from 'lda-ruby/lda', which this file requires.
  #
  class Lda
    attr_reader :vocab, :corpus

    #
    # Build a model for +corpus+ using the default settings. If the corpus
    # exposes a vocabulary it is copied into +vocab+ as an array.
    #
    def initialize(corpus)
      load_default_settings

      @vocab = nil
      self.corpus = corpus
      @vocab = corpus.vocabulary.to_a if corpus.vocabulary

      @phi = nil
    end

    #
    # Reset every tunable setting to its default and return the defaults as
    # an array, in the order: max_iter, convergence, em_max_iter,
    # em_convergence, num_topics, init_alpha, est_alpha.
    #
    def load_default_settings
      self.max_iter = 20
      self.convergence = 1e-6
      self.em_max_iter = 100
      self.em_convergence = 1e-4
      self.num_topics = 20
      self.init_alpha = 0.3
      self.est_alpha = 1

      [20, 1e-6, 100, 1e-4, 20, 0.3, 1]
    end

    #
    # Replace the current corpus with one read from the data file +filename+.
    # Returns true.
    #
    # Corpus itself has no file loader; DataCorpus reads the file from its
    # constructor, so that class is used here. The corpus is assigned through
    # +corpus=+, exactly as +initialize+ does, and the cached phi matrix is
    # dropped because it was computed for the previous corpus.
    #
    def load_corpus(filename)
      self.corpus = DataCorpus.new(filename)
      @phi = nil

      true
    end

    #
    # Load the vocabulary from +vocab+, which may be an Array of words (deep
    # copied), a Vocabulary (converted with +to_a+), or the name of a file
    # containing whitespace-separated words. Returns true.
    #
    def load_vocabulary(vocab)
      @vocab =
        case vocab
        when Array
          Marshal.load(Marshal.dump(vocab)) # deep clone array
        when Vocabulary
          vocab.to_a
        else
          File.open(vocab, 'r') { |f| f.read.split(/\s+/) }
        end

      true
    end

    #
    # Visualization method for printing out the top +words_per_topic+ words
    # for each topic. Raises if no vocabulary has been loaded.
    #
    # See also +top_words+.
    #
    def print_topics(words_per_topic = 10)
      raise 'No vocabulary loaded.' unless @vocab

      beta.each_with_index do |topic, topic_num|
        indices = ranked_word_indices(topic, words_per_topic)

        puts "Topic #{topic_num}"
        puts "\t#{indices.map {|i| @vocab[i]}.join("\n\t")}"
        puts ""
      end

      nil
    end

    #
    # After the model has been run and a vocabulary has been loaded, return
    # the indices of the +words_per_topic+ highest scoring words for each
    # topic, as a hash mapping the topic number to an array of vocabulary
    # indices (in descending order of score).
    #
    #   topic_number => [i1, i2, ..., i_n]
    #
    # See also +top_words+, which maps the indices to the words themselves.
    #
    def top_word_indices(words_per_topic = 10)
      raise 'No vocabulary loaded.' unless @vocab

      topics = {}
      beta.each_with_index do |topic, topic_num|
        topics[topic_num] = ranked_word_indices(topic, words_per_topic)
      end

      topics
    end

    #
    # Same as +top_word_indices+, but with every index replaced by the
    # corresponding vocabulary word.
    #
    #   topic_number => [w1, w2, ..., w_n]
    #
    # See also +print_topics+.
    #
    def top_words(words_per_topic = 10)
      output = {}

      top_word_indices(words_per_topic).each_pair do |topic_num, indices|
        output[topic_num] = indices.map { |i| @vocab[i] }
      end

      output
    end

    #
    # Get the phi matrix which can be used to assign probabilities to words
    # belonging to a specific topic in each document. The return value is a
    # 3D matrix: num_docs x doc_length x num_topics. The value is cached
    # after the first call, so if it needs to be recomputed, set the
    # +recompute+ value to true.
    #
    def phi(recompute=false)
      @phi = compute_phi if @phi.nil? || recompute

      @phi
    end

    #
    # Compute the average log probability for each topic for each document in
    # the corpus. Returns a matrix: num_docs x num_topics. Each entry is the
    # count-weighted sum of log(phi) over the document's words, divided by
    # the document's total word count.
    #
    def compute_topic_document_probability
      @corpus.documents.each_with_index.map do |doc, doc_idx|
        totals = Array.new(num_topics, 0.0)
        word_total = doc.counts.inject(0.0) { |sum, count| sum + count }

        phi[doc_idx].each_with_index do |word_dist, word_idx|
          word_dist.each_with_index do |prob, topic_idx|
            totals[topic_idx] += Math.log(prob) * doc.counts[word_idx]
          end
        end

        totals.map { |total| total / word_total }
      end
    end

    #
    # String representation displaying current settings.
    #
    def to_s
      outp = ["LDA Settings:"]
      outp << " Initial alpha: %0.6f" % init_alpha
      outp << " # of topics: %d" % num_topics
      outp << " Max iterations: %d" % max_iter
      outp << " Convergence: %0.6f" % convergence
      outp << "EM max iterations: %d" % em_max_iter
      outp << " EM convergence: %0.6f" % em_convergence
      outp << " Estimate alpha: %d" % est_alpha

      outp.join("\n")
    end

    private

    # Indices of the +limit+ highest scores in +topic+, best first. Equal
    # scores are ordered by ascending index so the result is deterministic.
    def ranked_word_indices(topic, limit)
      ranked = topic.each_with_index.sort_by { |score, index| [-score, index] }
      ranked.map { |_score, index| index }[0...limit]
    end
  end
end
@@ -0,0 +1,34 @@
1
+ require 'set'
2
+
3
module Lda
  #
  # An ordered collection of documents plus running totals: the number of
  # documents, the number of distinct terms seen across them, and a
  # Vocabulary that is fed every token of every added document.
  #
  class Corpus
    attr_reader :documents, :num_docs, :num_terms, :vocabulary

    def initialize
      @documents = Array.new
      @all_terms = Set.new
      @num_terms = @num_docs = 0
      @vocabulary = Vocabulary.new
    end

    #
    # Append +doc+ to the corpus and refresh the counters and vocabulary.
    # Raises a RuntimeError unless +doc+ is a Document. Returns nil.
    #
    def add_document(doc)
      raise 'Parameter +doc+ must be of type Document' unless doc.kind_of?(Document)

      @documents << doc

      # merge mutates the existing set; `+=` rebuilt the whole set of terms
      # for every document added.
      @all_terms.merge(doc.words)
      @num_docs += 1
      @num_terms = @all_terms.size

      update_vocabulary(doc)

      nil
    end

    protected

    # Pass each token of +doc+ to the vocabulary's +check_word+.
    def update_vocabulary(doc)
      doc.tokens.each { |w| @vocabulary.check_word(w) }
    end
  end
end
@@ -0,0 +1,22 @@
1
module Lda
  #
  # A corpus populated from a single data file: every line of the file
  # becomes one DataDocument. The file is read as soon as the corpus is
  # constructed.
  #
  class DataCorpus < Corpus
    attr_reader :filename

    def initialize(filename)
      super()

      @filename = filename
      load_from_file
    end

    protected

    # Read +filename+, break it on runs of CR/LF characters and add one
    # DataDocument per resulting line.
    def load_from_file
      File.read(@filename).split(/[\r\n]+/).each do |line|
        add_document(DataDocument.new(self, line))
      end
    end
  end
end
@@ -0,0 +1,25 @@
1
module Lda
  #
  # A corpus populated from the files of a directory: each matching file
  # becomes one TextDocument. When +extension+ is given only files named
  # "*.<extension>" are loaded; otherwise every file in the directory is.
  #
  class DirectoryCorpus < Corpus
    attr_reader :path, :extension

    # load documents from a directory
    def initialize(path, extension = nil)
      super()

      @path = path.dup.freeze
      @extension = extension ? extension.dup.freeze : nil

      load_from_directory
    end

    protected

    # Add one TextDocument per regular file matching the glob.
    #
    # Matches are sorted so that document order (and therefore document
    # index) does not depend on the platform's directory ordering, and
    # non-file entries such as subdirectories, which the bare "*" pattern
    # also matches, are skipped rather than handed to the document builder.
    def load_from_directory
      dir_glob = File.join(@path, (@extension ? "*.#{@extension}" : "*"))

      Dir.glob(dir_glob).sort.each do |filename|
        next unless File.file?(filename)

        add_document(TextDocument.build_from_file(self, filename))
      end
    end
  end
end