lda-ruby 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +5 -0
- data/CHANGELOG +22 -0
- data/README +21 -0
- data/README.markdown +38 -0
- data/Rakefile +58 -0
- data/VERSION.yml +4 -0
- data/ext/lda-ruby/Makefile +181 -0
- data/ext/lda-ruby/cokus.c +145 -0
- data/ext/lda-ruby/cokus.h +27 -0
- data/ext/lda-ruby/extconf.rb +9 -0
- data/ext/lda-ruby/lda-alpha.c +96 -0
- data/ext/lda-ruby/lda-alpha.h +21 -0
- data/ext/lda-ruby/lda-data.c +67 -0
- data/ext/lda-ruby/lda-data.h +14 -0
- data/ext/lda-ruby/lda-inference.c +1007 -0
- data/ext/lda-ruby/lda-inference.h +63 -0
- data/ext/lda-ruby/lda-model.c +345 -0
- data/ext/lda-ruby/lda-model.h +29 -0
- data/ext/lda-ruby/lda.h +54 -0
- data/ext/lda-ruby/utils.c +111 -0
- data/ext/lda-ruby/utils.h +18 -0
- data/lda-ruby.gemspec +78 -0
- data/lib/lda-ruby.rb +168 -0
- data/lib/lda-ruby/corpus/corpus.rb +34 -0
- data/lib/lda-ruby/corpus/data_corpus.rb +22 -0
- data/lib/lda-ruby/corpus/directory_corpus.rb +25 -0
- data/lib/lda-ruby/corpus/text_corpus.rb +22 -0
- data/lib/lda-ruby/document/data_document.rb +30 -0
- data/lib/lda-ruby/document/document.rb +36 -0
- data/lib/lda-ruby/document/text_document.rb +37 -0
- data/lib/lda-ruby/vocabulary.rb +46 -0
- data/license.txt +504 -0
- data/test/data/.gitignore +2 -0
- data/test/data/docs.dat +46 -0
- data/test/data/wiki-test-docs.yml +123 -0
- data/test/lda_ruby_test.rb +274 -0
- data/test/test_helper.rb +10 -0
- metadata +95 -0
@@ -0,0 +1,111 @@
|
|
1
|
+
#include "utils.h"
|
2
|
+
|
3
|
+
/*
|
4
|
+
* given log(a) and log(b), return log(a + b)
|
5
|
+
*
|
6
|
+
*/
|
7
|
+
|
8
|
+
double log_sum(double log_a, double log_b)
|
9
|
+
{
|
10
|
+
double v;
|
11
|
+
|
12
|
+
if (log_a < log_b)
|
13
|
+
{
|
14
|
+
v = log_b+log(1 + exp(log_a-log_b));
|
15
|
+
}
|
16
|
+
else
|
17
|
+
{
|
18
|
+
v = log_a+log(1 + exp(log_b-log_a));
|
19
|
+
}
|
20
|
+
return(v);
|
21
|
+
}
|
22
|
+
|
23
|
+
/**
 * Trigamma function: the second derivative of the log-gamma function,
 * evaluated at a single scalar x.
 *
 * From Abramowitz and Stegun. The argument is first shifted up by 6, the
 * asymptotic series (formulas 6.4.11 and 6.4.12) is evaluated at the
 * shifted point, and recurrence formula 6.4.6 is then applied six times,
 * adding 1/x^2 at each step, to walk back down to the original argument.
 *
 **/

double trigamma(double x)
{
    double inv_sq;
    double total;
    int step;

    x += 6;
    inv_sq = 1/(x*x);
    total = (((((0.075757575757576*inv_sq-0.033333333333333)*inv_sq+0.0238095238095238)
              *inv_sq-0.033333333333333)*inv_sq+0.166666666666667)*inv_sq+1)/x+0.5*inv_sq;

    /* undo the shift one unit at a time */
    for (step = 6; step > 0; step--)
    {
        x -= 1;
        total = 1/(x*x) + total;
    }
    return(total);
}
|
48
|
+
|
49
|
+
|
50
|
+
/*
|
51
|
+
* taylor approximation of first derivative of the log gamma function
|
52
|
+
*
|
53
|
+
*/
|
54
|
+
|
55
|
+
double digamma(double x)
|
56
|
+
{
|
57
|
+
double p;
|
58
|
+
x=x+6;
|
59
|
+
p=1/(x*x);
|
60
|
+
p=(((0.004166666666667*p-0.003968253986254)*p+
|
61
|
+
0.008333333333333)*p-0.083333333333333)*p;
|
62
|
+
p=p+log(x)-0.5/x-1/(x-1)-1/(x-2)-1/(x-3)-1/(x-4)-1/(x-5)-1/(x-6);
|
63
|
+
return p;
|
64
|
+
}
|
65
|
+
|
66
|
+
|
67
|
+
double log_gamma(double x)
|
68
|
+
{
|
69
|
+
double z=1/(x*x);
|
70
|
+
|
71
|
+
x=x+6;
|
72
|
+
z=(((-0.000595238095238*z+0.000793650793651)
|
73
|
+
*z-0.002777777777778)*z+0.083333333333333)/x;
|
74
|
+
z=(x-0.5)*log(x)-x+0.918938533204673+z-log(x-1)-
|
75
|
+
log(x-2)-log(x-3)-log(x-4)-log(x-5)-log(x-6);
|
76
|
+
return z;
|
77
|
+
}
|
78
|
+
|
79
|
+
|
80
|
+
|
81
|
+
/*
 * Create the directory `name` with read, write and execute permission for
 * the owning user only. The result of mkdir() is not checked, so a failure
 * is not reported to the caller.
 */

void make_directory(char* name)
{
    const mode_t owner_only = S_IRUSR | S_IWUSR | S_IXUSR;

    mkdir(name, owner_only);
}
|
90
|
+
|
91
|
+
|
92
|
+
/*
 * Return the index of the largest value among the first n entries of x.
 * When several entries share the largest value the lowest index wins,
 * because a later entry only replaces the current best when it is
 * strictly greater.
 */

int argmax(double* x, int n)
{
    int best = 0;
    int i;

    for (i = 1; i < n; i++)
    {
        if (x[i] > x[best])
        {
            best = i;
        }
    }
    return(best);
}
|
@@ -0,0 +1,18 @@
|
|
1
|
+
/* Numeric and small filesystem helpers shared by the LDA extension sources. */
#ifndef UTILS_H
#define UTILS_H

#include <stdio.h>
#include <math.h>
#include <float.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/types.h>

/* Given log(a) and log(b), return log(a + b). */
double log_sum(double log_a, double log_b);
/* Trigamma: second derivative of the log-gamma function. */
double trigamma(double x);
/* Digamma: first derivative of the log-gamma function. */
double digamma(double x);
/* Log of the gamma function. */
double log_gamma(double x);
/* Create directory `name`, accessible to the owning user only. */
void make_directory(char* name);
/* Index of the largest of the first n entries of x. */
int argmax(double* x, int n);

#endif
|
data/lda-ruby.gemspec
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run `rake gemspec`
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
# Gem specification for lda-ruby: metadata, the native extension to build,
# and the full list of files shipped in the gem.
Gem::Specification.new do |s|
  s.name = %q{lda-ruby}
  s.version = "0.3.1"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["David Blei", "Jason Adams"]
  s.date = %q{2009-08-11}
  s.description = %q{Ruby port of Latent Dirichlet Allocation by David M. Blei. See http://www.cs.princeton.edu/~blei/lda-c/.}
  s.email = %q{jasonmadams@gmail.com}
  s.extensions = ["ext/lda-ruby/extconf.rb"]
  s.extra_rdoc_files = [
    "README",
    "README.markdown"
  ]
  s.files = [
    ".gitignore",
    "CHANGELOG",
    "README",
    "README.markdown",
    "Rakefile",
    "VERSION.yml",
    "ext/lda-ruby/Makefile",
    "ext/lda-ruby/cokus.c",
    "ext/lda-ruby/cokus.h",
    "ext/lda-ruby/extconf.rb",
    "ext/lda-ruby/lda-alpha.c",
    "ext/lda-ruby/lda-alpha.h",
    "ext/lda-ruby/lda-data.c",
    "ext/lda-ruby/lda-data.h",
    "ext/lda-ruby/lda-inference.c",
    "ext/lda-ruby/lda-inference.h",
    "ext/lda-ruby/lda-model.c",
    "ext/lda-ruby/lda-model.h",
    "ext/lda-ruby/lda.h",
    "ext/lda-ruby/utils.c",
    "ext/lda-ruby/utils.h",
    "lda-ruby.gemspec",
    "lib/lda-ruby.rb",
    "lib/lda-ruby/corpus/corpus.rb",
    "lib/lda-ruby/corpus/data_corpus.rb",
    "lib/lda-ruby/corpus/directory_corpus.rb",
    "lib/lda-ruby/corpus/text_corpus.rb",
    "lib/lda-ruby/document/data_document.rb",
    "lib/lda-ruby/document/document.rb",
    "lib/lda-ruby/document/text_document.rb",
    "lib/lda-ruby/vocabulary.rb",
    "license.txt",
    "test/data/.gitignore",
    "test/data/docs.dat",
    "test/data/wiki-test-docs.yml",
    "test/lda_ruby_test.rb",
    "test/test_helper.rb"
  ]
  s.homepage = %q{http://github.com/ealdent/lda-ruby}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib", "ext"]
  s.rubygems_version = %q{1.3.4}
  s.summary = %q{Ruby port of Latent Dirichlet Allocation by David M. Blei.}
  s.test_files = [
    "test/lda_ruby_test.rb",
    "test/test_helper.rb"
  ]

  # The generated RubyGems-version check that used to follow had empty
  # branches (this gem declares no dependencies) and referenced
  # Gem::RubyGemsVersion, which newer RubyGems releases no longer define.
  # Only the assignment below had any effect.
  s.specification_version = 3 if s.respond_to? :specification_version
end
|
data/lib/lda-ruby.rb
ADDED
@@ -0,0 +1,168 @@
|
|
1
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__)) unless $LOAD_PATH.include?(File.dirname(__FILE__))
|
2
|
+
|
3
|
+
require 'lda-ruby/lda'
|
4
|
+
require 'lda-ruby/document/document'
|
5
|
+
require 'lda-ruby/document/data_document'
|
6
|
+
require 'lda-ruby/document/text_document'
|
7
|
+
require 'lda-ruby/corpus/corpus'
|
8
|
+
require 'lda-ruby/corpus/data_corpus'
|
9
|
+
require 'lda-ruby/corpus/text_corpus'
|
10
|
+
require 'lda-ruby/corpus/directory_corpus'
|
11
|
+
require 'lda-ruby/vocabulary'
|
12
|
+
|
13
|
+
module Lda
  #
  # Ruby-side helpers for the LDA model. The setting accessors (+max_iter+,
  # +num_topics+, ...), +corpus=+, +beta+ and +compute_phi+ used below are not
  # defined in this file; they are presumably supplied by the 'lda-ruby/lda'
  # extension required at the top of the file.
  #
  class Lda
    attr_reader :vocab, :corpus

    #
    # Build a model over +corpus+ using the default settings. If the corpus
    # carries a vocabulary it is copied into +vocab+ as an array.
    #
    def initialize(corpus)
      load_default_settings

      @vocab = nil
      self.corpus = corpus
      @vocab = corpus.vocabulary.to_a if corpus.vocabulary

      @phi = nil
    end

    #
    # Reset every setting to its default and return the defaults as an array
    # in the order: max_iter, convergence, em_max_iter, em_convergence,
    # num_topics, init_alpha, est_alpha.
    #
    def load_default_settings
      self.max_iter = 20
      self.convergence = 1e-6
      self.em_max_iter = 100
      self.em_convergence = 1e-4
      self.num_topics = 20
      self.init_alpha = 0.3
      self.est_alpha = 1

      [20, 1e-6, 100, 1e-4, 20, 0.3, 1]
    end

    #
    # Replace the current corpus with one read from the data file +filename+.
    # Returns true.
    #
    # This used to call Corpus#load_from_file(filename), which does not
    # exist; a DataCorpus is what reads a data file. The corpus is installed
    # through +corpus=+, the same way +initialize+ does it.
    #
    def load_corpus(filename)
      self.corpus = DataCorpus.new(filename)

      true
    end

    #
    # Load the vocabulary from an Array of words (deep-copied), a Vocabulary
    # object, or otherwise a path to a whitespace-separated word file.
    # Returns true.
    #
    def load_vocabulary(vocab)
      if vocab.is_a?(Array)
        @vocab = Marshal::load(Marshal::dump(vocab))      # deep clone array
      elsif vocab.is_a?(Vocabulary)
        @vocab = vocab.to_a
      else
        @vocab = File.open(vocab, 'r') { |f| f.read.split(/\s+/) }
      end

      true
    end

    #
    # Visualization method for printing out the top +words_per_topic+ words
    # for each topic. Raises if no vocabulary is loaded. Returns nil.
    #
    # See also +top_words+.
    #
    def print_topics(words_per_topic = 10)
      raise 'No vocabulary loaded.' unless @vocab

      self.beta.each_with_index do |topic, topic_num|
        indices = ranked_word_indices(topic, words_per_topic)

        puts "Topic #{topic_num}"
        puts "\t#{indices.map {|i| @vocab[i]}.join("\n\t")}"
        puts ""
      end

      nil
    end

    #
    # After the model has been run and a vocabulary has been loaded, return
    # the vocabulary indices of the +words_per_topic+ highest scoring words
    # for each topic, as a hash (best word first):
    #
    #   topic_number => [i1, i2, ..., i_n]
    #
    # Raises if no vocabulary is loaded. See also +top_words+.
    #
    def top_word_indices(words_per_topic = 10)
      raise 'No vocabulary loaded.' unless @vocab

      topics = Hash.new

      self.beta.each_with_index do |topic, topic_num|
        topics[topic_num] = ranked_word_indices(topic, words_per_topic)
      end

      topics
    end

    #
    # Same as +top_word_indices+, but with each index replaced by the word it
    # refers to in the vocabulary:
    #
    #   topic_number => [w1, w2, ..., w_n]
    #
    # See also +print_topics+.
    #
    def top_words(words_per_topic = 10)
      output = Hash.new

      top_word_indices(words_per_topic).each_pair do |topic_num, words|
        output[topic_num] = words.map { |w| @vocab[w] }
      end

      output
    end

    #
    # Get the phi matrix which can be used to assign probabilities to words
    # belonging to a specific topic in each document.  The return value is a
    # 3D matrix:  num_docs x doc_length x num_topics.  The value is cached
    # after the first call, so if it needs to be recomputed, set the
    # +recompute+ value to true.
    #
    def phi(recompute=false)
      if @phi.nil? || recompute
        @phi = self.compute_phi
      end

      @phi
    end

    #
    # Compute the average log probability for each topic for each document in
    # the corpus.  Returns a matrix: num_docs x num_topics.  Each entry is the
    # count-weighted sum of log(phi) over the document's words, divided by the
    # document's total word count.
    #
    def compute_topic_document_probability
      outp = Array.new

      @corpus.documents.each_with_index do |doc, idx|
        tops = [0.0] * self.num_topics
        ttl  = doc.counts.inject(0.0) {|sum, i| sum + i}
        self.phi[idx].each_with_index do |word_dist, word_idx|
          word_dist.each_with_index do |top_prob, top_idx|
            tops[top_idx] += Math.log(top_prob) * doc.counts[word_idx]
          end
        end
        tops = tops.map {|i| i / ttl}
        outp << tops
      end

      outp
    end

    #
    # String representation displaying current settings.
    #
    def to_s
      outp = ["LDA Settings:"]
      outp << " Initial alpha: %0.6f" % self.init_alpha
      outp << " # of topics: %d" % self.num_topics
      outp << " Max iterations: %d" % self.max_iter
      outp << " Convergence: %0.6f" % self.convergence
      outp << "EM max iterations: %d" % self.em_max_iter
      outp << " EM convergence: %0.6f" % self.em_convergence
      outp << " Estimate alpha: %d" % self.est_alpha

      outp.join("\n")
    end

    private

    # Indices of the +limit+ highest scores in +topic+, best first.
    def ranked_word_indices(topic, limit)
      scored = topic.zip((0...@vocab.size).to_a).sort { |a, b| a[0] <=> b[0] }
      scored.map { |pair| pair[1] }.reverse[0...limit]
    end
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
module Lda
  #
  # A collection of documents, tracking the number of documents, the number
  # of distinct terms seen across them, and a shared vocabulary.
  #
  class Corpus
    attr_reader :documents, :num_docs, :num_terms, :vocabulary

    def initialize
      @documents = Array.new
      @all_terms = Set.new
      @num_terms = @num_docs = 0
      @vocabulary = Vocabulary.new
    end

    #
    # Append +doc+ to the corpus and update the document count, the distinct
    # term count and the vocabulary.  Raises unless +doc+ is a Document.
    # Returns nil.
    #
    def add_document(doc)
      raise 'Parameter +doc+ must be of type Document' unless doc.kind_of?(Document)

      @documents << doc

      # merge in place: `+=` would copy the whole term set for every document
      @all_terms.merge(doc.words)
      @num_docs += 1
      @num_terms = @all_terms.size

      update_vocabulary(doc)

      nil
    end

    protected

    # Offer every token of +doc+ to the vocabulary.
    def update_vocabulary(doc)
      doc.tokens.each { |w| @vocabulary.check_word(w) }
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module Lda
  #
  # Corpus read from a single data file: each non-empty run of line breaks
  # separates two records, and every record becomes one DataDocument.
  #
  class DataCorpus < Corpus
    attr_reader :filename

    # Remember +filename+ and load its documents immediately.
    def initialize(filename)
      super()

      @filename = filename
      load_from_file
    end

    protected

    # Read the whole file and add one DataDocument per line.
    def load_from_file
      contents = File.open(@filename, 'r') { |io| io.read }

      contents.split(/[\r\n]+/).each do |record|
        add_document(DataDocument.new(self, record))
      end
    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
module Lda
  #
  # Corpus built from the files of one directory, one TextDocument per file.
  #
  class DirectoryCorpus < Corpus
    attr_reader :path, :extension

    #
    # Load documents from the directory +path+.  When +extension+ is given
    # only files named "*.<extension>" are read; otherwise every file is.
    # Both values are stored as frozen copies.
    #
    def initialize(path, extension = nil)
      super()

      @path = path.dup.freeze
      @extension = extension ? extension.dup.freeze : nil

      load_from_directory
    end

    protected

    #
    # Add one document per matching regular file.
    #
    # Matches are sorted so document order does not depend on the platform's
    # directory listing order, and non-files are skipped because a bare "*"
    # pattern also matches sub-directories, which cannot be read as text.
    #
    def load_from_directory
      dir_glob = File.join(@path, (@extension ? "*.#{@extension}" : "*"))

      Dir.glob(dir_glob).sort.each do |filename|
        next unless File.file?(filename)

        add_document(TextDocument.build_from_file(self, filename))
      end
    end
  end
end
|