ealdent-lda-ruby 0.2.3 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +5 -0
- data/README +5 -6
- data/README.markdown +8 -9
- data/Rakefile +58 -0
- data/VERSION.yml +2 -2
- data/ext/lda-ruby/Makefile +181 -0
- data/{lib → ext/lda-ruby}/cokus.c +0 -0
- data/{lib → ext/lda-ruby}/cokus.h +0 -0
- data/ext/lda-ruby/extconf.rb +9 -0
- data/{lib → ext/lda-ruby}/lda-alpha.c +0 -0
- data/{lib → ext/lda-ruby}/lda-alpha.h +0 -0
- data/{lib → ext/lda-ruby}/lda-data.c +0 -0
- data/{lib → ext/lda-ruby}/lda-data.h +0 -0
- data/{lib → ext/lda-ruby}/lda-inference.c +43 -44
- data/{lib → ext/lda-ruby}/lda-inference.h +0 -0
- data/{lib → ext/lda-ruby}/lda-model.c +18 -3
- data/{lib → ext/lda-ruby}/lda-model.h +0 -0
- data/{lib → ext/lda-ruby}/lda.h +0 -0
- data/{lib → ext/lda-ruby}/utils.c +0 -0
- data/{lib → ext/lda-ruby}/utils.h +0 -0
- data/lda-ruby.gemspec +74 -0
- data/lib/lda-ruby.rb +157 -0
- data/lib/lda-ruby/corpus/corpus.rb +34 -0
- data/lib/lda-ruby/corpus/data_corpus.rb +22 -0
- data/lib/lda-ruby/corpus/directory_corpus.rb +25 -0
- data/lib/lda-ruby/corpus/text_corpus.rb +22 -0
- data/lib/lda-ruby/document/data_document.rb +30 -0
- data/lib/lda-ruby/document/document.rb +36 -0
- data/lib/lda-ruby/document/text_document.rb +32 -0
- data/lib/lda-ruby/vocabulary.rb +39 -0
- data/test/data/.gitignore +2 -0
- data/test/data/docs.dat +46 -0
- data/test/data/wiki-test-docs.yml +123 -0
- data/test/lda_ruby_test.rb +274 -0
- data/test/test_helper.rb +10 -0
- metadata +47 -36
- data/lib/extconf.rb +0 -7
- data/lib/lda.rb +0 -319
File without changes
|
@@ -75,8 +75,6 @@ void quiet_lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha) {
|
|
75
75
|
model->alpha = quiet_opt_alpha(ss->alpha_suffstats,
|
76
76
|
ss->num_docs,
|
77
77
|
model->num_topics);
|
78
|
-
|
79
|
-
printf("new alpha = %5.5f\n", model->alpha);
|
80
78
|
}
|
81
79
|
}
|
82
80
|
|
@@ -217,7 +215,7 @@ void corpus_initialize_fixed_ss(lda_suffstats* ss, lda_model* model, corpus* c)
|
|
217
215
|
int num_topics = MIN(model->num_topics, c->num_docs);
|
218
216
|
int k, n;
|
219
217
|
document* doc;
|
220
|
-
|
218
|
+
|
221
219
|
for (k = 0; k < num_topics; k++) {
|
222
220
|
doc = &(c->docs[k]);
|
223
221
|
for (n = 0; n < doc->length; n++) {
|
@@ -253,6 +251,23 @@ lda_model* new_lda_model(int num_terms, int num_topics) {
|
|
253
251
|
return(model);
|
254
252
|
}
|
255
253
|
|
254
|
+
lda_model* quiet_new_lda_model(int num_terms, int num_topics) {
|
255
|
+
int i;
|
256
|
+
lda_model* model;
|
257
|
+
|
258
|
+
model = malloc(sizeof(lda_model));
|
259
|
+
model->num_topics = num_topics;
|
260
|
+
model->num_terms = num_terms;
|
261
|
+
model->alpha = 1.0;
|
262
|
+
model->log_prob_w = malloc(sizeof(double*)*num_topics);
|
263
|
+
for (i = 0; i < num_topics; i++)
|
264
|
+
{
|
265
|
+
model->log_prob_w[i] = malloc(sizeof(double)*num_terms);
|
266
|
+
memset(model->log_prob_w[i],0,sizeof(double)*num_terms);
|
267
|
+
}
|
268
|
+
return(model);
|
269
|
+
}
|
270
|
+
|
256
271
|
|
257
272
|
/*
|
258
273
|
* deallocate new lda model
|
File without changes
|
data/{lib → ext/lda-ruby}/lda.h
RENAMED
File without changes
|
File without changes
|
File without changes
|
data/lda-ruby.gemspec
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
Gem::Specification.new do |s|
|
4
|
+
s.name = %q{lda-ruby}
|
5
|
+
s.version = "0.3.0"
|
6
|
+
|
7
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
8
|
+
s.authors = ["David Blei", "Jason Adams"]
|
9
|
+
s.date = %q{2009-07-24}
|
10
|
+
s.description = %q{Ruby port of Latent Dirichlet Allocation by David M. Blei. See http://www.cs.princeton.edu/~blei/lda-c/.}
|
11
|
+
s.email = %q{jasonmadams@gmail.com}
|
12
|
+
s.extensions = ["ext/lda-ruby/extconf.rb"]
|
13
|
+
s.extra_rdoc_files = [
|
14
|
+
"README",
|
15
|
+
"README.markdown"
|
16
|
+
]
|
17
|
+
s.files = [
|
18
|
+
".gitignore",
|
19
|
+
"README",
|
20
|
+
"README.markdown",
|
21
|
+
"Rakefile",
|
22
|
+
"VERSION.yml",
|
23
|
+
"ext/lda-ruby/Makefile",
|
24
|
+
"ext/lda-ruby/cokus.c",
|
25
|
+
"ext/lda-ruby/cokus.h",
|
26
|
+
"ext/lda-ruby/extconf.rb",
|
27
|
+
"ext/lda-ruby/lda-alpha.c",
|
28
|
+
"ext/lda-ruby/lda-alpha.h",
|
29
|
+
"ext/lda-ruby/lda-data.c",
|
30
|
+
"ext/lda-ruby/lda-data.h",
|
31
|
+
"ext/lda-ruby/lda-inference.c",
|
32
|
+
"ext/lda-ruby/lda-inference.h",
|
33
|
+
"ext/lda-ruby/lda-model.c",
|
34
|
+
"ext/lda-ruby/lda-model.h",
|
35
|
+
"ext/lda-ruby/lda.h",
|
36
|
+
"ext/lda-ruby/utils.c",
|
37
|
+
"ext/lda-ruby/utils.h",
|
38
|
+
"lda-ruby.gemspec",
|
39
|
+
"lib/lda-ruby.rb",
|
40
|
+
"lib/lda-ruby/corpus/corpus.rb",
|
41
|
+
"lib/lda-ruby/corpus/data_corpus.rb",
|
42
|
+
"lib/lda-ruby/corpus/directory_corpus.rb",
|
43
|
+
"lib/lda-ruby/corpus/text_corpus.rb",
|
44
|
+
"lib/lda-ruby/document/data_document.rb",
|
45
|
+
"lib/lda-ruby/document/document.rb",
|
46
|
+
"lib/lda-ruby/document/text_document.rb",
|
47
|
+
"lib/lda-ruby/vocabulary.rb",
|
48
|
+
"license.txt",
|
49
|
+
"test/data/.gitignore",
|
50
|
+
"test/data/docs.dat",
|
51
|
+
"test/data/wiki-test-docs.yml",
|
52
|
+
"test/lda_ruby_test.rb",
|
53
|
+
"test/test_helper.rb"
|
54
|
+
]
|
55
|
+
s.homepage = %q{http://github.com/ealdent/lda-ruby}
|
56
|
+
s.rdoc_options = ["--charset=UTF-8"]
|
57
|
+
s.require_paths = ["lib", "ext"]
|
58
|
+
s.rubygems_version = %q{1.3.4}
|
59
|
+
s.summary = %q{Ruby port of Latent Dirichlet Allocation by David M. Blei.}
|
60
|
+
s.test_files = [
|
61
|
+
"test/lda_ruby_test.rb",
|
62
|
+
"test/test_helper.rb"
|
63
|
+
]
|
64
|
+
|
65
|
+
if s.respond_to? :specification_version then
|
66
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
67
|
+
s.specification_version = 3
|
68
|
+
|
69
|
+
if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
|
70
|
+
else
|
71
|
+
end
|
72
|
+
else
|
73
|
+
end
|
74
|
+
end
|
data/lib/lda-ruby.rb
ADDED
@@ -0,0 +1,157 @@
|
|
1
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__)) unless $LOAD_PATH.include?(File.dirname(__FILE__))
|
2
|
+
|
3
|
+
require 'lda-ruby/lda'
|
4
|
+
require 'lda-ruby/document/document'
|
5
|
+
require 'lda-ruby/document/data_document'
|
6
|
+
require 'lda-ruby/document/text_document'
|
7
|
+
require 'lda-ruby/corpus/corpus'
|
8
|
+
require 'lda-ruby/corpus/data_corpus'
|
9
|
+
require 'lda-ruby/corpus/text_corpus'
|
10
|
+
require 'lda-ruby/corpus/directory_corpus'
|
11
|
+
require 'lda-ruby/vocabulary'
|
12
|
+
|
13
|
+
module Lda
  #
  # Ruby-side helpers for an LDA model: default settings, corpus and
  # vocabulary loading, and reporting of the top words per topic.
  #
  # The setting accessors (+max_iter+, +num_topics+, ...), +corpus=+, +beta+
  # and +compute_phi+ are not defined in this file; presumably they come from
  # the compiled extension loaded via 'lda-ruby/lda' (confirm against ext/).
  #
  class Lda
    attr_reader :vocab, :corpus

    #
    # Apply the default settings, attach +corpus+, and adopt the corpus
    # vocabulary (as an array) when the corpus provides one.
    #
    def initialize(corpus)
      load_default_settings

      @vocab = nil
      self.corpus = corpus
      @vocab = corpus.vocabulary.to_a if corpus.vocabulary

      @phi = nil
    end

    #
    # Reset every tunable setting to its default and return the defaults as
    # an array in the order they are assigned.
    #
    def load_default_settings
      self.max_iter = 20
      self.convergence = 1e-6
      self.em_max_iter = 100
      self.em_convergence = 1e-4
      self.num_topics = 20
      self.init_alpha = 0.3
      self.est_alpha = 1

      [20, 1e-6, 100, 1e-4, 20, 0.3, 1]
    end

    #
    # Replace the current corpus with the data file +filename+ (one document
    # per line, as read by DataCorpus). Returns true.
    #
    def load_corpus(filename)
      # The base Corpus has no public file loader, so build the file-backed
      # corpus and attach it through the same setter +initialize+ uses.
      self.corpus = DataCorpus.new(filename)

      true
    end

    #
    # Load the vocabulary from an Array (deep-copied), a Vocabulary, or the
    # path of a whitespace-separated word file. Returns true.
    #
    def load_vocabulary(vocab)
      if vocab.is_a?(Array)
        @vocab = Marshal::load(Marshal::dump(vocab))      # deep clone array
      elsif vocab.is_a?(Vocabulary)
        @vocab = vocab.to_a
      else
        @vocab = File.open(vocab, 'r') { |f| f.read.split(/\s+/) }
      end

      true
    end

    #
    # Visualization method for printing out the top +words_per_topic+ words
    # for each topic. Returns nil.
    #
    # See also +top_words+.
    #
    def print_topics(words_per_topic = 10)
      raise 'No vocabulary loaded.' unless @vocab

      self.beta.each_with_index do |topic, topic_num|
        indices = ranked_word_indices(topic, words_per_topic)

        puts "Topic #{topic_num}"
        puts "\t#{indices.map {|i| @vocab[i]}.join("\n\t")}"
        puts ""
      end

      nil
    end

    #
    # For each topic, the vocabulary indices of the +words_per_topic+ highest
    # scoring words, best first:
    #
    #   topic_number => [i1, i2, ..., i_n]
    #
    # This is what +top_words+ used to return.
    #
    def top_word_indices(words_per_topic = 10)
      raise 'No vocabulary loaded.' unless @vocab

      topics = Hash.new

      self.beta.each_with_index do |topic, topic_num|
        topics[topic_num] = ranked_word_indices(topic, words_per_topic)
      end

      topics
    end

    #
    # After the model has been run and a vocabulary has been loaded, return the
    # +words_per_topic+ top words chosen by the model for each topic.  This is
    # returned as a hash mapping the topic number to an array of top words
    # (in descending order of importance).
    #
    #   topic_number => [w1, w2, ..., w_n]
    #
    # See also +print_topics+ and +top_word_indices+.
    #
    def top_words(words_per_topic = 10)
      words = Hash.new

      top_word_indices(words_per_topic).each_pair do |topic_num, indices|
        words[topic_num] = indices.map { |i| @vocab[i] }
      end

      words
    end

    #
    # Get the phi matrix which can be used to assign probabilities to words
    # belonging to a specific topic in each document.  The return value is a
    # 3D matrix:  num_docs x doc_length x num_topics.  The value is cached
    # after the first call, so if it needs to be recomputed, set the +recompute+
    # value to true.
    #
    def phi(recompute=false)
      @phi = self.compute_phi if @phi.nil? || recompute

      @phi
    end

    #
    # Compute the average log probability for each topic for each document in the corpus.
    # This method returns a matrix:  num_docs x num_topics with the average log probability
    # for the topic in the document.
    #
    def compute_topic_document_probability
      outp = Array.new

      @corpus.documents.each_with_index do |doc, idx|
        tops = [0.0] * self.num_topics
        # total word count of the document, used to average the weighted sum
        ttl = doc.counts.inject(0.0) {|sum, i| sum + i}
        self.phi[idx].each_with_index do |word_dist, word_idx|
          word_dist.each_with_index do |top_prob, top_idx|
            tops[top_idx] += Math.log(top_prob) * doc.counts[word_idx]
          end
        end
        tops = tops.map {|i| i / ttl}
        outp << tops
      end

      outp
    end

    #
    # String representation displaying current settings.
    #
    def to_s
      outp = ["LDA Settings:"]
      outp << " Initial alpha: %0.6f" % self.init_alpha
      outp << " # of topics: %d" % self.num_topics
      outp << " Max iterations: %d" % self.max_iter
      outp << " Convergence: %0.6f" % self.convergence
      outp << "EM max iterations: %d" % self.em_max_iter
      outp << " EM convergence: %0.6f" % self.em_convergence
      outp << " Estimate alpha: %d" % self.est_alpha

      outp.join("\n")
    end

    private

    #
    # Pair each score in +topic+ with its vocabulary index, order the pairs
    # from highest to lowest score, and return the first +limit+ indices.
    #
    def ranked_word_indices(topic, limit)
      scored = topic.zip((0...@vocab.size).to_a)
      best_first = scored.sort { |a, b| a[0] <=> b[0] }.reverse

      best_first[0...limit].map { |pair| pair[1] }
    end
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
module Lda
  #
  # A collection of documents together with running totals: the number of
  # documents added, the number of distinct terms seen across them, and a
  # vocabulary fed with every token of every added document.
  #
  class Corpus
    attr_reader :documents, :num_docs, :num_terms, :vocabulary

    def initialize
      @documents = Array.new
      @all_terms = Set.new
      @num_terms = @num_docs = 0
      @vocabulary = Vocabulary.new
    end

    #
    # Append +doc+ to the corpus and refresh the totals and the vocabulary.
    # Raises a RuntimeError unless +doc+ is a Document. Returns nil.
    #
    def add_document(doc)
      raise 'Parameter +doc+ must be of type Document' unless doc.kind_of?(Document)

      @documents << doc

      # merge grows the existing set in place; "+=" copied the whole set
      # again for every document added
      @all_terms.merge(doc.words)
      @num_docs += 1
      @num_terms = @all_terms.size

      update_vocabulary(doc)

      nil
    end

    protected

    # Offer each token of +doc+ to the vocabulary.
    def update_vocabulary(doc)
      doc.tokens.each { |w| @vocabulary.check_word(w) }
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module Lda
  #
  # Corpus read from a data file holding one document per line; every
  # non-blank line is handed to DataDocument.
  #
  class DataCorpus < Corpus
    attr_reader :filename

    # Remember +filename+ and load its documents immediately.
    def initialize(filename)
      super()

      @filename = filename
      load_from_file
    end

    protected

    #
    # Read the whole file, split it on runs of CR/LF, and add one
    # DataDocument per line.
    #
    def load_from_file
      txt = File.open(@filename, 'r') { |f| f.read }
      lines = txt.split(/[\r\n]+/)
      lines.each do |line|
        # A leading newline yields an empty first entry and a line may hold
        # only spaces; neither describes a document, so skip them.
        next if line.strip.empty?

        add_document(DataDocument.new(self, line))
      end
    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
module Lda
  #
  # Corpus built from the files in a directory, optionally restricted to a
  # single file extension.
  #
  class DirectoryCorpus < Corpus
    attr_reader :path, :extension

    #
    # Load documents from the directory +path+. When +extension+ is given
    # only files named "*.<extension>" are read. Both values are stored as
    # frozen copies.
    #
    def initialize(path, extension = nil)
      super()

      @path = path.dup.freeze
      @extension = extension ? extension.dup.freeze : nil

      load_from_directory
    end

    protected

    #
    # Add one document per matching regular file, in sorted filename order,
    # via TextDocument.build_from_file.
    #
    def load_from_directory
      dir_glob = File.join(@path, (@extension ? "*.#{@extension}" : "*"))

      # Sort so the document order does not depend on how the platform
      # happens to enumerate the directory.
      Dir.glob(dir_glob).sort.each do |filename|
        # the "*" pattern also matches sub-directories, which cannot be read
        # as documents
        next unless File.file?(filename)

        add_document(TextDocument.build_from_file(self, filename))
      end
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'yaml'

module Lda
  #
  # Corpus whose documents come from a YAML file; each element of the
  # loaded collection is handed to TextDocument.
  #
  class TextCorpus < Corpus
    attr_reader :filename

    # Load text documents from the YAML file +filename+ (required).
    def initialize(filename)
      super()

      @filename = filename
      load_from_file
    end

    protected

    #
    # Parse the YAML file and add one TextDocument per element.
    #
    def load_from_file
      # NOTE(review): YAML.load_file on a file from an untrusted source can
      # be unsafe on some YAML library versions -- confirm where these
      # files come from.
      docs = YAML.load_file(@filename)
      docs.each do |doc|
        add_document(TextDocument.new(self, doc))
      end
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
#
|
2
|
+
# Create the Document using the svmlight-style text line:
|
3
|
+
#
|
4
|
+
# num_words w1:freq1 w2:freq2 ... w_n:freq_n
|
5
|
+
#
|
6
|
+
# Ex.
|
7
|
+
# 5 1:2 3:1 4:2 7:3 12:1
|
8
|
+
#
|
9
|
+
# The value for the number of words should equal the number of pairs
|
10
|
+
# following it, though this isn't at all enforced. Order of word-pair
|
11
|
+
# indices is not important.
|
12
|
+
#
|
13
|
+
|
14
|
+
module Lda
  #
  # Document parsed from one svmlight-style line:
  #
  #   num_words w1:freq1 w2:freq2 ... w_n:freq_n
  #
  class DataDocument < Document
    #
    # Parse +data+ and fill the word and count arrays. The first field (the
    # declared number of words) is ignored; every following "id:freq" field
    # contributes one word id and one count, both converted with +to_i+.
    #
    def initialize(corpus, data)
      super(corpus)

      # strip first: leading whitespace would otherwise produce an empty
      # first field and push the word-count field into the pair list
      items = data.strip.split(/\s+/)
      # an empty line has no fields at all, and slicing an empty array from
      # index 1 gives nil rather than an empty array
      pairs = (items[1..-1] || []).map { |item| item.split(':') }

      pairs.each do |feature_identifier, feature_weight|
        @words << feature_identifier.to_i
        @counts << feature_weight.to_i
      end

      recompute
    end
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
module Lda
  #
  # Base document: word ids, their counts, the raw tokens, and two derived
  # values (+length+ and +total+) refreshed by +recompute+.
  #
  class Document
    attr_reader :corpus, :words, :counts, :length, :total, :tokens

    # Start with empty word/count/token lists and zeroed totals.
    def initialize(corpus)
      @corpus = corpus

      @words, @counts, @tokens = [], [], []
      @length = @total = 0
    end

    #
    # Refresh +total+ (sum of the counts) and +length+ (number of words).
    #
    def recompute
      running = 0
      @counts.each { |count| running += count }
      @total = running
      @length = @words.size
    end

    # The base document carries no text.
    def has_text?
      false
    end

    # Hook applied to the token list by +tokenize+; returns it unchanged here.
    def handle(tokens)
      tokens
    end

    #
    # Replace every run of characters other than letters, apostrophes and
    # whitespace with a space, squeeze whitespace runs down to one space,
    # then split on spaces and store the handled tokens.
    #
    def tokenize(text)
      letters_only = text.gsub(/[^A-Za-z'\s]+/, ' ')
      single_spaced = letters_only.gsub(/\s+/, ' ')
      @tokens = handle(single_spaced.split(' '))
    end
  end
end
|