lda-ruby 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +5 -0
- data/CHANGELOG +22 -0
- data/README +21 -0
- data/README.markdown +38 -0
- data/Rakefile +58 -0
- data/VERSION.yml +4 -0
- data/ext/lda-ruby/Makefile +181 -0
- data/ext/lda-ruby/cokus.c +145 -0
- data/ext/lda-ruby/cokus.h +27 -0
- data/ext/lda-ruby/extconf.rb +9 -0
- data/ext/lda-ruby/lda-alpha.c +96 -0
- data/ext/lda-ruby/lda-alpha.h +21 -0
- data/ext/lda-ruby/lda-data.c +67 -0
- data/ext/lda-ruby/lda-data.h +14 -0
- data/ext/lda-ruby/lda-inference.c +1007 -0
- data/ext/lda-ruby/lda-inference.h +63 -0
- data/ext/lda-ruby/lda-model.c +345 -0
- data/ext/lda-ruby/lda-model.h +29 -0
- data/ext/lda-ruby/lda.h +54 -0
- data/ext/lda-ruby/utils.c +111 -0
- data/ext/lda-ruby/utils.h +18 -0
- data/lda-ruby.gemspec +78 -0
- data/lib/lda-ruby.rb +168 -0
- data/lib/lda-ruby/corpus/corpus.rb +34 -0
- data/lib/lda-ruby/corpus/data_corpus.rb +22 -0
- data/lib/lda-ruby/corpus/directory_corpus.rb +25 -0
- data/lib/lda-ruby/corpus/text_corpus.rb +22 -0
- data/lib/lda-ruby/document/data_document.rb +30 -0
- data/lib/lda-ruby/document/document.rb +36 -0
- data/lib/lda-ruby/document/text_document.rb +37 -0
- data/lib/lda-ruby/vocabulary.rb +46 -0
- data/license.txt +504 -0
- data/test/data/.gitignore +2 -0
- data/test/data/docs.dat +46 -0
- data/test/data/wiki-test-docs.yml +123 -0
- data/test/lda_ruby_test.rb +274 -0
- data/test/test_helper.rb +10 -0
- metadata +95 -0
@@ -0,0 +1,111 @@
|
|
1
|
+
#include "utils.h"
|
2
|
+
|
3
|
+
/*
|
4
|
+
* given log(a) and log(b), return log(a + b)
|
5
|
+
*
|
6
|
+
*/
|
7
|
+
|
8
|
+
double log_sum(double log_a, double log_b)
|
9
|
+
{
|
10
|
+
double v;
|
11
|
+
|
12
|
+
if (log_a < log_b)
|
13
|
+
{
|
14
|
+
v = log_b+log(1 + exp(log_a-log_b));
|
15
|
+
}
|
16
|
+
else
|
17
|
+
{
|
18
|
+
v = log_a+log(1 + exp(log_b-log_a));
|
19
|
+
}
|
20
|
+
return(v);
|
21
|
+
}
|
22
|
+
|
23
|
+
/**
 * Trigamma function: the second derivative of the log-gamma function,
 * evaluated for a single scalar argument.
 *
 * From Abramowitz and Stegun. The argument is shifted up by 6, the series
 * of formulas 6.4.11 and 6.4.12 is evaluated at the shifted point, and
 * recurrence formula 6.4.6 is then applied six times to step back down to
 * the original argument.
 **/
double trigamma(double x)
{
    double inv_sq;
    double acc;
    int steps_left;

    x = x + 6;
    inv_sq = 1/(x*x);
    acc = (((((0.075757575757576*inv_sq-0.033333333333333)*inv_sq+0.0238095238095238)
           *inv_sq-0.033333333333333)*inv_sq+0.166666666666667)*inv_sq+1)/x+0.5*inv_sq;

    /* Undo the shift: each step lowers x by one and adds 1/x^2. */
    for (steps_left = 6; steps_left > 0; steps_left--)
    {
        x = x - 1;
        acc = 1/(x*x) + acc;
    }
    return acc;
}
|
48
|
+
|
49
|
+
|
50
|
+
/*
|
51
|
+
* taylor approximation of first derivative of the log gamma function
|
52
|
+
*
|
53
|
+
*/
|
54
|
+
|
55
|
+
double digamma(double x)
|
56
|
+
{
|
57
|
+
double p;
|
58
|
+
x=x+6;
|
59
|
+
p=1/(x*x);
|
60
|
+
p=(((0.004166666666667*p-0.003968253986254)*p+
|
61
|
+
0.008333333333333)*p-0.083333333333333)*p;
|
62
|
+
p=p+log(x)-0.5/x-1/(x-1)-1/(x-2)-1/(x-3)-1/(x-4)-1/(x-5)-1/(x-6);
|
63
|
+
return p;
|
64
|
+
}
|
65
|
+
|
66
|
+
|
67
|
+
double log_gamma(double x)
|
68
|
+
{
|
69
|
+
double z=1/(x*x);
|
70
|
+
|
71
|
+
x=x+6;
|
72
|
+
z=(((-0.000595238095238*z+0.000793650793651)
|
73
|
+
*z-0.002777777777778)*z+0.083333333333333)/x;
|
74
|
+
z=(x-0.5)*log(x)-x+0.918938533204673+z-log(x-1)-
|
75
|
+
log(x-2)-log(x-3)-log(x-4)-log(x-5)-log(x-6);
|
76
|
+
return z;
|
77
|
+
}
|
78
|
+
|
79
|
+
|
80
|
+
|
81
|
+
/*
 * Create the directory `name`, requesting read, write and execute
 * permission bits for the owner only.
 *
 * The result of mkdir() is not inspected, so a failure goes unreported.
 */
void make_directory(char* name)
{
    const mode_t owner_bits = S_IRUSR | S_IWUSR | S_IXUSR;

    mkdir(name, owner_bits);
}
|
90
|
+
|
91
|
+
|
92
|
+
/*
 * Return the index of the largest value among the first n entries of x.
 * When the largest value occurs more than once, the lowest such index wins
 * (later entries must be strictly greater to replace the current best).
 *
 * x[0] is read unconditionally, so x must point at one element or more.
 */
int argmax(double* x, int n)
{
    int best_index = 0;
    double best_value = x[0];
    int i = 1;

    while (i < n)
    {
        if (x[i] > best_value)
        {
            best_index = i;
            best_value = x[i];
        }
        i++;
    }
    return best_index;
}
|
@@ -0,0 +1,18 @@
|
|
1
|
+
#ifndef UTILS_H
|
2
|
+
#define UTILS_H
|
3
|
+
|
4
|
+
#include <stdio.h>
|
5
|
+
#include <math.h>
|
6
|
+
#include <float.h>
|
7
|
+
#include <stdlib.h>
|
8
|
+
#include <sys/stat.h>
|
9
|
+
#include <sys/types.h>
|
10
|
+
|
11
|
+
double log_sum(double log_a, double log_b);
|
12
|
+
double trigamma(double x);
|
13
|
+
double digamma(double x);
|
14
|
+
double log_gamma(double x);
|
15
|
+
void make_directory(char* name);
|
16
|
+
int argmax(double* x, int n);
|
17
|
+
|
18
|
+
#endif
|
data/lda-ruby.gemspec
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run `rake gemspec`
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
# Gem specification for lda-ruby 0.3.1: metadata, the native extension
# build script, and the list of packaged files.
Gem::Specification.new do |s|
  s.name = %q{lda-ruby}
  s.version = "0.3.1"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["David Blei", "Jason Adams"]
  s.date = %q{2009-08-11}
  s.description = %q{Ruby port of Latent Dirichlet Allocation by David M. Blei. See http://www.cs.princeton.edu/~blei/lda-c/.}
  s.email = %q{jasonmadams@gmail.com}
  s.extensions = ["ext/lda-ruby/extconf.rb"]
  s.extra_rdoc_files = [
    "README",
    "README.markdown"
  ]
  s.files = [
    ".gitignore",
    "CHANGELOG",
    "README",
    "README.markdown",
    "Rakefile",
    "VERSION.yml",
    "ext/lda-ruby/Makefile",
    "ext/lda-ruby/cokus.c",
    "ext/lda-ruby/cokus.h",
    "ext/lda-ruby/extconf.rb",
    "ext/lda-ruby/lda-alpha.c",
    "ext/lda-ruby/lda-alpha.h",
    "ext/lda-ruby/lda-data.c",
    "ext/lda-ruby/lda-data.h",
    "ext/lda-ruby/lda-inference.c",
    "ext/lda-ruby/lda-inference.h",
    "ext/lda-ruby/lda-model.c",
    "ext/lda-ruby/lda-model.h",
    "ext/lda-ruby/lda.h",
    "ext/lda-ruby/utils.c",
    "ext/lda-ruby/utils.h",
    "lda-ruby.gemspec",
    "lib/lda-ruby.rb",
    "lib/lda-ruby/corpus/corpus.rb",
    "lib/lda-ruby/corpus/data_corpus.rb",
    "lib/lda-ruby/corpus/directory_corpus.rb",
    "lib/lda-ruby/corpus/text_corpus.rb",
    "lib/lda-ruby/document/data_document.rb",
    "lib/lda-ruby/document/document.rb",
    "lib/lda-ruby/document/text_document.rb",
    "lib/lda-ruby/vocabulary.rb",
    "license.txt",
    "test/data/.gitignore",
    "test/data/docs.dat",
    "test/data/wiki-test-docs.yml",
    "test/lda_ruby_test.rb",
    "test/test_helper.rb"
  ]
  s.homepage = %q{http://github.com/ealdent/lda-ruby}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib", "ext"]
  s.rubygems_version = %q{1.3.4}
  s.summary = %q{Ruby port of Latent Dirichlet Allocation by David M. Blei.}
  s.test_files = [
    "test/lda_ruby_test.rb",
    "test/test_helper.rb"
  ]

  # No dependencies are declared, so the former RubyGems-version branch
  # (which selected between dependency-declaration styles) had nothing to
  # do. It also referenced Gem::RubyGemsVersion, a constant that newer
  # RubyGems releases no longer define, so it has been dropped.
  s.specification_version = 3 if s.respond_to? :specification_version
end
|
data/lib/lda-ruby.rb
ADDED
@@ -0,0 +1,168 @@
|
|
1
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__)) unless $LOAD_PATH.include?(File.dirname(__FILE__))
|
2
|
+
|
3
|
+
require 'lda-ruby/lda'
|
4
|
+
require 'lda-ruby/document/document'
|
5
|
+
require 'lda-ruby/document/data_document'
|
6
|
+
require 'lda-ruby/document/text_document'
|
7
|
+
require 'lda-ruby/corpus/corpus'
|
8
|
+
require 'lda-ruby/corpus/data_corpus'
|
9
|
+
require 'lda-ruby/corpus/text_corpus'
|
10
|
+
require 'lda-ruby/corpus/directory_corpus'
|
11
|
+
require 'lda-ruby/vocabulary'
|
12
|
+
|
13
|
+
module Lda
  #
  # Ruby-side helpers for an LDA model. The settings accessors (+max_iter+,
  # +convergence+, +em_max_iter+, +em_convergence+, +num_topics+,
  # +init_alpha+, +est_alpha+), the +corpus=+ writer, +beta+ and
  # +compute_phi+ are not defined in this file; presumably they come from
  # the extension loaded via <tt>require 'lda-ruby/lda'</tt>.
  #
  class Lda
    attr_reader :vocab, :corpus

    #
    # Apply the default settings, attach +corpus+ through the +corpus=+
    # writer and, when the corpus carries a vocabulary, take a copy of it
    # as an array.
    #
    def initialize(corpus)
      load_default_settings

      @vocab = nil
      self.corpus = corpus
      @vocab = corpus.vocabulary.to_a if corpus.vocabulary

      @phi = nil
    end

    #
    # Reset every setting to its default and return the defaults as an
    # array, in the order: max_iter, convergence, em_max_iter,
    # em_convergence, num_topics, init_alpha, est_alpha.
    #
    def load_default_settings
      self.max_iter = 20
      self.convergence = 1e-6
      self.em_max_iter = 100
      self.em_convergence = 1e-4
      self.num_topics = 20
      self.init_alpha = 0.3
      self.est_alpha = 1

      [20, 1e-6, 100, 1e-4, 20, 0.3, 1]
    end

    #
    # Replace the current corpus with a DataCorpus read from +filename+.
    # Returns true.
    #
    # The corpus is attached through the +corpus=+ writer, exactly as
    # +initialize+ does. (This method used to call a +load_from_file+
    # method on a plain Corpus, which Corpus does not provide.)
    #
    def load_corpus(filename)
      self.corpus = DataCorpus.new(filename)

      true
    end

    #
    # Load the vocabulary from an Array of words (deep-copied), from a
    # Vocabulary (via +to_a+), or otherwise from the file named by +vocab+,
    # whose contents are split on whitespace. Returns true.
    #
    def load_vocabulary(vocab)
      if vocab.is_a?(Array)
        @vocab = Marshal::load(Marshal::dump(vocab))      # deep clone array
      elsif vocab.is_a?(Vocabulary)
        @vocab = vocab.to_a
      else
        @vocab = File.open(vocab, 'r') { |f| f.read.split(/\s+/) }
      end

      true
    end

    #
    # Visualization method for printing out the top +words_per_topic+ words
    # for each topic. Raises if no vocabulary is loaded. Returns nil.
    #
    # See also +top_words+.
    #
    def print_topics(words_per_topic = 10)
      raise 'No vocabulary loaded.' unless @vocab

      self.beta.each_with_index do |topic, topic_num|
        indices = ranked_word_indices(topic, words_per_topic)

        puts "Topic #{topic_num}"
        puts "\t#{indices.map {|i| @vocab[i]}.join("\n\t")}"
        puts ""
      end

      nil
    end

    #
    # For each topic, return the vocabulary indices of its
    # +words_per_topic+ highest-scoring words, best first, as a hash:
    #
    #   topic_number => [index_1, index_2, ..., index_n]
    #
    # Raises if no vocabulary is loaded.
    #
    # See also +top_words+ and +print_topics+.
    #
    def top_word_indices(words_per_topic = 10)
      raise 'No vocabulary loaded.' unless @vocab

      topics = Hash.new

      self.beta.each_with_index do |topic, topic_num|
        topics[topic_num] = ranked_word_indices(topic, words_per_topic)
      end

      topics
    end

    #
    # Same as +top_word_indices+, but with every index replaced by the
    # vocabulary word it refers to:
    #
    #   topic_number => [w1, w2, ..., w_n]
    #
    def top_words(words_per_topic = 10)
      output = Hash.new

      topics = top_word_indices(words_per_topic)
      topics.each_pair do |topic_num, words|
        output[topic_num] = words.map { |w| @vocab[w] }
      end

      output
    end

    #
    # Get the phi matrix which can be used to assign probabilities to words
    # belonging to a specific topic in each document.  The return value is a
    # 3D matrix:  num_docs x doc_length x num_topics.  The value is cached
    # after the first call, so if it needs to be recomputed, set the
    # +recompute+ value to true.
    #
    def phi(recompute=false)
      if @phi.nil? || recompute
        @phi = self.compute_phi
      end

      @phi
    end

    #
    # Compute the average log probability for each topic for each document
    # in the corpus.  This method returns a matrix: num_docs x num_topics
    # with the average log probability for the topic in the document.
    #
    # Each word's log probability is weighted by that word's count in the
    # document, and the total is divided by the sum of the document's counts.
    #
    def compute_topic_document_probability
      outp = Array.new

      @corpus.documents.each_with_index do |doc, idx|
        tops = [0.0] * self.num_topics
        ttl  = doc.counts.inject(0.0) {|sum, i| sum + i}
        self.phi[idx].each_with_index do |word_dist, word_idx|
          word_dist.each_with_index do |top_prob, top_idx|
            tops[top_idx] += Math.log(top_prob) * doc.counts[word_idx]
          end
        end
        tops = tops.map {|i| i / ttl}
        outp << tops
      end

      outp
    end

    #
    # String representation displaying current settings.
    #
    def to_s
      outp = ["LDA Settings:"]
      outp << " Initial alpha: %0.6f" % self.init_alpha
      outp << " # of topics: %d" % self.num_topics
      outp << " Max iterations: %d" % self.max_iter
      outp << " Convergence: %0.6f" % self.convergence
      outp << "EM max iterations: %d" % self.em_max_iter
      outp << " EM convergence: %0.6f" % self.em_convergence
      outp << " Estimate alpha: %d" % self.est_alpha

      outp.join("\n")
    end

    private

    #
    # Vocabulary indices of the +limit+ highest-scoring entries of +topic+,
    # best first.  Scores are paired with indices 0...@vocab.size, sorted
    # ascending by score, then reversed.
    #
    def ranked_word_indices(topic, limit)
      scored = topic.zip((0...@vocab.size).to_a)
      ascending = scored.sort { |a, b| a[0] <=> b[0] }

      ascending.map { |_score, index| index }.reverse[0...limit]
    end
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
module Lda
  #
  # A collection of documents together with running totals: the number of
  # documents added, the number of distinct entries seen across all
  # documents' +words+, and a Vocabulary fed from each document's +tokens+.
  #
  class Corpus
    attr_reader :documents, :num_docs, :num_terms, :vocabulary

    # Start with no documents, zero counts and a fresh Vocabulary.
    def initialize
      @documents = Array.new
      @all_terms = Set.new
      @num_terms = @num_docs = 0
      @vocabulary = Vocabulary.new
    end

    #
    # Append +doc+ to the corpus and update the document count, the
    # distinct-term count and the vocabulary.  Returns nil.
    #
    # Raises a RuntimeError unless +doc+ is a Document.
    #
    def add_document(doc)
      raise 'Parameter +doc+ must be of type Document' unless doc.kind_of?(Document)

      @documents << doc

      # Merge in place; `+=` would build a new copy of the whole set on
      # every call, making repeated additions quadratic overall.
      @all_terms.merge(doc.words)
      @num_docs += 1
      @num_terms = @all_terms.size

      update_vocabulary(doc)

      nil
    end

    protected

    # Pass every token of +doc+ to the vocabulary's +check_word+.
    def update_vocabulary(doc)
      doc.tokens.each { |w| @vocabulary.check_word(w) }
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module Lda
  #
  # Corpus populated from a single file: the file is split into lines and
  # each line becomes one DataDocument.
  #
  class DataCorpus < Corpus
    attr_reader :filename

    # Remember +filename+ and immediately load its documents.
    def initialize(filename)
      super()

      @filename = filename
      load_from_file
    end

    protected

    #
    # Read the whole file, break it on runs of CR/LF characters and add one
    # DataDocument per resulting line.
    #
    def load_from_file
      contents = File.open(@filename, 'r') { |io| io.read }

      contents.split(/[\r\n]+/).each do |line|
        add_document(DataDocument.new(self, line))
      end
    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
module Lda
  #
  # Corpus populated from the files of a directory, optionally restricted
  # to a single file extension.
  #
  class DirectoryCorpus < Corpus
    attr_reader :path, :extension

    #
    # Keep frozen copies of +path+ and (when given) +extension+, then load
    # every matching file as a document.
    #
    def initialize(path, extension = nil)
      super()

      @path = path.dup.freeze
      @extension = (extension.dup.freeze if extension)

      load_from_directory
    end

    protected

    #
    # Glob "<path>/*.<extension>" (or "<path>/*" when no extension was
    # given) and add a TextDocument built from each matching file.
    #
    def load_from_directory
      pattern = @extension ? "*.#{@extension}" : "*"

      Dir.glob(File.join(@path, pattern)).each do |filename|
        add_document(TextDocument.build_from_file(self, filename))
      end
    end
  end
end
|