ealdent-lda-ruby 0.2.3 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +5 -0
- data/README +5 -6
- data/README.markdown +8 -9
- data/Rakefile +58 -0
- data/VERSION.yml +2 -2
- data/ext/lda-ruby/Makefile +181 -0
- data/{lib → ext/lda-ruby}/cokus.c +0 -0
- data/{lib → ext/lda-ruby}/cokus.h +0 -0
- data/ext/lda-ruby/extconf.rb +9 -0
- data/{lib → ext/lda-ruby}/lda-alpha.c +0 -0
- data/{lib → ext/lda-ruby}/lda-alpha.h +0 -0
- data/{lib → ext/lda-ruby}/lda-data.c +0 -0
- data/{lib → ext/lda-ruby}/lda-data.h +0 -0
- data/{lib → ext/lda-ruby}/lda-inference.c +43 -44
- data/{lib → ext/lda-ruby}/lda-inference.h +0 -0
- data/{lib → ext/lda-ruby}/lda-model.c +18 -3
- data/{lib → ext/lda-ruby}/lda-model.h +0 -0
- data/{lib → ext/lda-ruby}/lda.h +0 -0
- data/{lib → ext/lda-ruby}/utils.c +0 -0
- data/{lib → ext/lda-ruby}/utils.h +0 -0
- data/lda-ruby.gemspec +74 -0
- data/lib/lda-ruby.rb +157 -0
- data/lib/lda-ruby/corpus/corpus.rb +34 -0
- data/lib/lda-ruby/corpus/data_corpus.rb +22 -0
- data/lib/lda-ruby/corpus/directory_corpus.rb +25 -0
- data/lib/lda-ruby/corpus/text_corpus.rb +22 -0
- data/lib/lda-ruby/document/data_document.rb +30 -0
- data/lib/lda-ruby/document/document.rb +36 -0
- data/lib/lda-ruby/document/text_document.rb +32 -0
- data/lib/lda-ruby/vocabulary.rb +39 -0
- data/test/data/.gitignore +2 -0
- data/test/data/docs.dat +46 -0
- data/test/data/wiki-test-docs.yml +123 -0
- data/test/lda_ruby_test.rb +274 -0
- data/test/test_helper.rb +10 -0
- metadata +47 -36
- data/lib/extconf.rb +0 -7
- data/lib/lda.rb +0 -319
data/README
CHANGED
@@ -7,16 +7,15 @@ The original C code relied on files for the input and output. We felt it was nec
|
|
7
7
|
Example usage:
|
8
8
|
|
9
9
|
require 'lda'
|
10
|
-
lda = Lda::Lda.new # create an Lda object for training
|
11
10
|
corpus = Lda::Corpus.new("data/data_file.dat")
|
12
|
-
lda.corpus
|
13
|
-
lda.em("random")
|
11
|
+
lda = Lda::Lda.new(corpus) # create an Lda object for training
|
12
|
+
lda.em("random") # run EM algorithm using random starting points
|
14
13
|
lda.load_vocabulary("data/vocab.txt")
|
15
|
-
lda.print_topics(20)
|
14
|
+
lda.print_topics(20) # print the topics, 20 words per topic
|
16
15
|
|
17
|
-
|
16
|
+
You can check out the mailing list for this project if you have any questions or mail lda-ruby@groups.google.com [email link]. If you have general questions about Latent Dirichlet Allocation, I urge you to use the topic models mailing list, since the people who monitor that are very knowledgeable.
|
18
17
|
|
19
18
|
|
20
19
|
References
|
21
20
|
|
22
|
-
Blei, David M., Ng, Andrew Y., and Jordan, Michael I. 2003. Latent dirichlet allocation. Journal of Machine Learning Research. 3 (Mar. 2003), 993-1022.
|
21
|
+
Blei, David M., Ng, Andrew Y., and Jordan, Michael I. 2003. Latent dirichlet allocation. Journal of Machine Learning Research. 3 (Mar. 2003), 993-1022.
|
data/README.markdown
CHANGED
@@ -8,19 +8,17 @@ The original C code relied on files for the input and output. We felt it was nec
|
|
8
8
|
|
9
9
|
### Example usage:
|
10
10
|
|
11
|
-
require 'lda'
|
12
|
-
lda = Lda::Lda.new # create an Lda object for training
|
11
|
+
require 'lda-ruby'
|
13
12
|
corpus = Lda::Corpus.new("data/data_file.dat")
|
14
|
-
lda.corpus
|
15
|
-
lda.em("random")
|
13
|
+
lda = Lda::Lda.new(corpus) # create an Lda object for training
|
14
|
+
lda.em("random") # run EM algorithm using random starting points
|
16
15
|
lda.load_vocabulary("data/vocab.txt")
|
17
|
-
lda.print_topics(20)
|
16
|
+
lda.print_topics(20) # print the topics, 20 words per topic
|
18
17
|
|
19
|
-
|
18
|
+
You can check out the mailing list for this project if you have any questions or mail lda-ruby@groups.google.com [email link]. If you have general questions about Latent Dirichlet Allocation, I urge you to use the [topic models mailing list][topic-models], since the people who monitor that are very knowledgeable.
|
20
19
|
|
21
20
|
## Resources
|
22
21
|
|
23
|
-
|
24
22
|
+ [Blog post about LDA-Ruby][lda-ruby]
|
25
23
|
+ [David Blei's lda-c code][blei]
|
26
24
|
+ [Wikipedia article on LDA][wikipedia]
|
@@ -29,11 +27,12 @@ See the rdocs for further information. You can also check out the mailing list f
|
|
29
27
|
|
30
28
|
## References
|
31
29
|
|
32
|
-
Blei, David M., Ng, Andrew Y., and Jordan, Michael I. 2003. Latent dirichlet allocation. Journal of Machine Learning Research. 3 (Mar. 2003), 993-1022 [[pdf][pdf]].
|
30
|
+
Blei, David M., Ng, Andrew Y., and Jordan, Michael I. 2003. Latent dirichlet allocation. Journal of Machine Learning Research. 3 (Mar. 2003), 993-1022 [[pdf][pdf]].
|
33
31
|
|
34
32
|
[svmlight]: http://svmlight.joachims.org
|
35
33
|
[lda-ruby]: http://mendicantbug.com/2008/11/17/lda-in-ruby/
|
36
34
|
[blei]: http://www.cs.princeton.edu/~blei/lda-c/
|
37
35
|
[wikipedia]: http://en.wikipedia.org/wiki/Latent_Dirichlet_allocation
|
38
36
|
[ap-data]: http://www.cs.princeton.edu/~blei/lda-c/ap.tgz
|
39
|
-
[pdf]: http://www.cs.princeton.edu/picasso/mats/BleiNgJordan2003_blei.pdf
|
37
|
+
[pdf]: http://www.cs.princeton.edu/picasso/mats/BleiNgJordan2003_blei.pdf
|
38
|
+
[topic-models]: https://lists.cs.princeton.edu/mailman/listinfo/topic-models
|
data/Rakefile
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rake'
|
3
|
+
|
4
|
+
begin
|
5
|
+
require 'jeweler'
|
6
|
+
Jeweler::Tasks.new do |gem|
|
7
|
+
gem.name = "lda-ruby"
|
8
|
+
gem.summary = %Q{Ruby port of Latent Dirichlet Allocation by David M. Blei.}
|
9
|
+
gem.description = %Q{Ruby port of Latent Dirichlet Allocation by David M. Blei. See http://www.cs.princeton.edu/~blei/lda-c/.}
|
10
|
+
gem.email = "jasonmadams@gmail.com"
|
11
|
+
gem.homepage = "http://github.com/ealdent/lda-ruby"
|
12
|
+
gem.authors = ['David Blei', 'Jason Adams']
|
13
|
+
gem.extensions = ['ext/lda-ruby/extconf.rb']
|
14
|
+
gem.require_paths = ['lib', 'ext']
|
15
|
+
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
16
|
+
end
|
17
|
+
|
18
|
+
rescue LoadError
|
19
|
+
puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
|
20
|
+
end
|
21
|
+
|
22
|
+
require 'rake/testtask'
|
23
|
+
Rake::TestTask.new(:test) do |test|
|
24
|
+
test.libs << 'lib' << 'test'
|
25
|
+
test.pattern = 'test/**/*_test.rb'
|
26
|
+
test.verbose = true
|
27
|
+
end
|
28
|
+
|
29
|
+
begin
|
30
|
+
require 'rcov/rcovtask'
|
31
|
+
Rcov::RcovTask.new do |test|
|
32
|
+
test.libs << 'test'
|
33
|
+
test.pattern = 'test/**/*_test.rb'
|
34
|
+
test.verbose = true
|
35
|
+
end
|
36
|
+
rescue LoadError
|
37
|
+
task :rcov do
|
38
|
+
abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
task :default => :test
|
43
|
+
|
44
|
+
require 'rake/rdoctask'
|
45
|
+
Rake::RDocTask.new do |rdoc|
|
46
|
+
if File.exist?('VERSION.yml')
|
47
|
+
config = YAML.load(File.read('VERSION.yml'))
|
48
|
+
version = "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
|
49
|
+
else
|
50
|
+
version = ""
|
51
|
+
end
|
52
|
+
|
53
|
+
rdoc.rdoc_dir = 'rdoc'
|
54
|
+
rdoc.title = "lda-ruby #{version}"
|
55
|
+
rdoc.rdoc_files.include('README*')
|
56
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
57
|
+
end
|
58
|
+
|
data/VERSION.yml
CHANGED
data/ext/lda-ruby/Makefile
ADDED
@@ -0,0 +1,181 @@
|
|
1
|
+
|
2
|
+
SHELL = /bin/sh
|
3
|
+
|
4
|
+
#### Start of system configuration section. ####
|
5
|
+
|
6
|
+
srcdir = .
|
7
|
+
topdir = /home/taf2/.local/include/ruby-1.9.1
|
8
|
+
hdrdir = /home/taf2/.local/include/ruby-1.9.1
|
9
|
+
arch_hdrdir = /home/taf2/.local/include/ruby-1.9.1/$(arch)
|
10
|
+
VPATH = $(srcdir):$(arch_hdrdir)/ruby:$(hdrdir)/ruby
|
11
|
+
prefix = $(DESTDIR)/home/taf2/.local
|
12
|
+
exec_prefix = $(prefix)
|
13
|
+
vendorhdrdir = $(rubyhdrdir)/vendor_ruby
|
14
|
+
sitehdrdir = $(rubyhdrdir)/site_ruby
|
15
|
+
rubyhdrdir = $(includedir)/$(RUBY_INSTALL_NAME)-$(ruby_version)
|
16
|
+
vendordir = $(libdir)/$(RUBY_INSTALL_NAME)/vendor_ruby
|
17
|
+
sitedir = $(libdir)/$(RUBY_INSTALL_NAME)/site_ruby
|
18
|
+
mandir = $(datarootdir)/man
|
19
|
+
localedir = $(datarootdir)/locale
|
20
|
+
libdir = $(exec_prefix)/lib
|
21
|
+
psdir = $(docdir)
|
22
|
+
pdfdir = $(docdir)
|
23
|
+
dvidir = $(docdir)
|
24
|
+
htmldir = $(docdir)
|
25
|
+
infodir = $(datarootdir)/info
|
26
|
+
docdir = $(datarootdir)/doc/$(PACKAGE)
|
27
|
+
oldincludedir = $(DESTDIR)/usr/include
|
28
|
+
includedir = $(prefix)/include
|
29
|
+
localstatedir = $(prefix)/var
|
30
|
+
sharedstatedir = $(prefix)/com
|
31
|
+
sysconfdir = $(prefix)/etc
|
32
|
+
datadir = $(datarootdir)
|
33
|
+
datarootdir = $(prefix)/share
|
34
|
+
libexecdir = $(exec_prefix)/libexec
|
35
|
+
sbindir = $(exec_prefix)/sbin
|
36
|
+
bindir = $(exec_prefix)/bin
|
37
|
+
rubylibdir = $(libdir)/$(ruby_install_name)/$(ruby_version)
|
38
|
+
archdir = $(rubylibdir)/$(arch)
|
39
|
+
sitelibdir = $(sitedir)/$(ruby_version)
|
40
|
+
sitearchdir = $(sitelibdir)/$(sitearch)
|
41
|
+
vendorlibdir = $(vendordir)/$(ruby_version)
|
42
|
+
vendorarchdir = $(vendorlibdir)/$(sitearch)
|
43
|
+
|
44
|
+
CC = gcc
|
45
|
+
CXX = g++
|
46
|
+
LIBRUBY = $(LIBRUBY_SO)
|
47
|
+
LIBRUBY_A = lib$(RUBY_SO_NAME)-static.a
|
48
|
+
LIBRUBYARG_SHARED = -Wl,-R -Wl,$(libdir) -L$(libdir) -l$(RUBY_SO_NAME)
|
49
|
+
LIBRUBYARG_STATIC = -Wl,-R -Wl,$(libdir) -L$(libdir) -l$(RUBY_SO_NAME)-static
|
50
|
+
OUTFLAG = -o
|
51
|
+
COUTFLAG = -o
|
52
|
+
|
53
|
+
RUBY_EXTCONF_H =
|
54
|
+
cflags = $(optflags) $(debugflags) $(warnflags)
|
55
|
+
optflags = -O0
|
56
|
+
debugflags = -g3 -ggdb
|
57
|
+
warnflags = -Wall -Wno-parentheses
|
58
|
+
CFLAGS = -fPIC $(cflags) -fPIC -Wall -ggdb -O0
|
59
|
+
INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir)
|
60
|
+
DEFS =
|
61
|
+
CPPFLAGS = -D USE_RUBY $(DEFS) $(cppflags)
|
62
|
+
CXXFLAGS = $(CFLAGS) $(cxxflags)
|
63
|
+
ldflags = -L. -rdynamic -Wl,-export-dynamic
|
64
|
+
dldflags =
|
65
|
+
archflag =
|
66
|
+
DLDFLAGS = $(ldflags) $(dldflags) $(archflag)
|
67
|
+
LDSHARED = $(CC) -shared
|
68
|
+
LDSHAREDXX = $(CXX) -shared
|
69
|
+
AR = ar
|
70
|
+
EXEEXT =
|
71
|
+
|
72
|
+
RUBY_INSTALL_NAME = ruby
|
73
|
+
RUBY_SO_NAME = ruby
|
74
|
+
arch = x86_64-linux
|
75
|
+
sitearch = x86_64-linux
|
76
|
+
ruby_version = 1.9.1
|
77
|
+
ruby = /home/taf2/.local/bin/ruby
|
78
|
+
RUBY = $(ruby)
|
79
|
+
RM = rm -f
|
80
|
+
RM_RF = $(RUBY) -run -e rm -- -rf
|
81
|
+
RMDIRS = $(RUBY) -run -e rmdir -- -p
|
82
|
+
MAKEDIRS = mkdir -p
|
83
|
+
INSTALL = /usr/bin/install -c
|
84
|
+
INSTALL_PROG = $(INSTALL) -m 0755
|
85
|
+
INSTALL_DATA = $(INSTALL) -m 644
|
86
|
+
COPY = cp
|
87
|
+
|
88
|
+
#### End of system configuration section. ####
|
89
|
+
|
90
|
+
preload =
|
91
|
+
|
92
|
+
libpath = . $(libdir)
|
93
|
+
LIBPATH = -L. -L$(libdir) -Wl,-R$(libdir)
|
94
|
+
DEFFILE =
|
95
|
+
|
96
|
+
CLEANFILES = mkmf.log
|
97
|
+
DISTCLEANFILES =
|
98
|
+
DISTCLEANDIRS =
|
99
|
+
|
100
|
+
extout =
|
101
|
+
extout_prefix =
|
102
|
+
target_prefix =
|
103
|
+
LOCAL_LIBS =
|
104
|
+
LIBS = $(LIBRUBYARG_SHARED) -lpthread -lrt -ldl -lcrypt -lm -lc
|
105
|
+
SRCS = lda-model.c lda-data.c utils.c lda-alpha.c cokus.c lda-inference.c
|
106
|
+
OBJS = lda-model.o lda-data.o utils.o lda-alpha.o cokus.o lda-inference.o
|
107
|
+
TARGET = lda_ext
|
108
|
+
DLLIB = $(TARGET).so
|
109
|
+
EXTSTATIC =
|
110
|
+
STATIC_LIB =
|
111
|
+
|
112
|
+
BINDIR = $(bindir)
|
113
|
+
RUBYCOMMONDIR = $(sitedir)$(target_prefix)
|
114
|
+
RUBYLIBDIR = $(sitelibdir)$(target_prefix)
|
115
|
+
RUBYARCHDIR = $(sitearchdir)$(target_prefix)
|
116
|
+
HDRDIR = $(rubyhdrdir)/ruby$(target_prefix)
|
117
|
+
ARCHHDRDIR = $(rubyhdrdir)/$(arch)/ruby$(target_prefix)
|
118
|
+
|
119
|
+
TARGET_SO = $(DLLIB)
|
120
|
+
CLEANLIBS = $(TARGET).so
|
121
|
+
CLEANOBJS = *.o *.bak
|
122
|
+
|
123
|
+
all: $(DLLIB)
|
124
|
+
static: $(STATIC_LIB)
|
125
|
+
|
126
|
+
clean-rb-default::
|
127
|
+
clean-rb::
|
128
|
+
clean-so::
|
129
|
+
clean: clean-so clean-rb-default clean-rb
|
130
|
+
@-$(RM) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES)
|
131
|
+
|
132
|
+
distclean-rb-default::
|
133
|
+
distclean-rb::
|
134
|
+
distclean-so::
|
135
|
+
distclean: clean distclean-so distclean-rb-default distclean-rb
|
136
|
+
@-$(RM) Makefile $(RUBY_EXTCONF_H) conftest.* mkmf.log
|
137
|
+
@-$(RM) core ruby$(EXEEXT) *~ $(DISTCLEANFILES)
|
138
|
+
@-$(RMDIRS) $(DISTCLEANDIRS)
|
139
|
+
|
140
|
+
realclean: distclean
|
141
|
+
install: install-so install-rb
|
142
|
+
|
143
|
+
install-so: $(RUBYARCHDIR)
|
144
|
+
install-so: $(RUBYARCHDIR)/$(DLLIB)
|
145
|
+
$(RUBYARCHDIR)/$(DLLIB): $(DLLIB)
|
146
|
+
$(INSTALL_PROG) $(DLLIB) $(RUBYARCHDIR)
|
147
|
+
install-rb: pre-install-rb install-rb-default
|
148
|
+
install-rb-default: pre-install-rb-default
|
149
|
+
pre-install-rb: Makefile
|
150
|
+
pre-install-rb-default: Makefile
|
151
|
+
$(RUBYARCHDIR):
|
152
|
+
$(MAKEDIRS) $@
|
153
|
+
|
154
|
+
site-install: site-install-so site-install-rb
|
155
|
+
site-install-so: install-so
|
156
|
+
site-install-rb: install-rb
|
157
|
+
|
158
|
+
.SUFFIXES: .c .m .cc .cxx .cpp .C .o
|
159
|
+
|
160
|
+
.cc.o:
|
161
|
+
$(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
|
162
|
+
|
163
|
+
.cxx.o:
|
164
|
+
$(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
|
165
|
+
|
166
|
+
.cpp.o:
|
167
|
+
$(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
|
168
|
+
|
169
|
+
.C.o:
|
170
|
+
$(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
|
171
|
+
|
172
|
+
.c.o:
|
173
|
+
$(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $<
|
174
|
+
|
175
|
+
$(DLLIB): $(OBJS) Makefile
|
176
|
+
@-$(RM) $(@)
|
177
|
+
$(LDSHARED) -o $@ $(OBJS) $(LIBPATH) $(DLDFLAGS) $(LOCAL_LIBS) $(LIBS)
|
178
|
+
|
179
|
+
|
180
|
+
|
181
|
+
$(OBJS): $(hdrdir)/ruby.h $(hdrdir)/ruby/defines.h $(arch_hdrdir)/ruby/config.h
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
data/{lib → ext/lda-ruby}/lda-inference.c
CHANGED
@@ -34,7 +34,7 @@
|
|
34
34
|
#ifdef USE_RUBY
|
35
35
|
#include "ruby.h"
|
36
36
|
|
37
|
-
VALUE
|
37
|
+
VALUE rb_cLdaModule;
|
38
38
|
VALUE rb_cLda;
|
39
39
|
VALUE rb_cLdaCorpus;
|
40
40
|
VALUE rb_cLdaDocument;
|
@@ -83,7 +83,7 @@ double lda_inference(document* doc, lda_model* model, double* var_gamma, double*
|
|
83
83
|
printf("phi for term: %d of %d\n", index, model->num_terms);
|
84
84
|
phi[n][k] = 0.0;
|
85
85
|
}
|
86
|
-
else {
|
86
|
+
else {
|
87
87
|
phi[n][k] =
|
88
88
|
digamma_gam[k] +
|
89
89
|
model->log_prob_w[k][index];
|
@@ -142,7 +142,7 @@ double compute_likelihood(document* doc, lda_model* model, double** phi, double*
|
|
142
142
|
for (k = 0; k < model->num_topics; k++)
|
143
143
|
{
|
144
144
|
likelihood += (model->alpha - 1)*(dig[k] - digsum) + lgamma(var_gamma[k]) - (var_gamma[k] - 1)*(dig[k] - digsum);
|
145
|
-
|
145
|
+
|
146
146
|
for (n = 0; n < doc->length; n++)
|
147
147
|
{
|
148
148
|
if (phi[n][k] > 0)
|
@@ -261,7 +261,7 @@ void run_em(char* start, char* directory, corpus* corpus) {
|
|
261
261
|
} else {
|
262
262
|
quiet_lda_mle(model, ss, 0);
|
263
263
|
}
|
264
|
-
|
264
|
+
|
265
265
|
model->alpha = INITIAL_ALPHA;
|
266
266
|
} else if (strcmp(start, "random")==0) {
|
267
267
|
model = new_lda_model(corpus->num_terms, NTOPICS);
|
@@ -499,7 +499,7 @@ void run_quiet_em(char* start, corpus* corpus) {
|
|
499
499
|
|
500
500
|
lda_suffstats* ss = NULL;
|
501
501
|
if (strncmp(start, "seeded",6)==0) {
|
502
|
-
model =
|
502
|
+
model = quiet_new_lda_model(corpus->num_terms, NTOPICS);
|
503
503
|
model->alpha = INITIAL_ALPHA;
|
504
504
|
ss = new_lda_suffstats(model);
|
505
505
|
if (VERBOSE) {
|
@@ -513,7 +513,7 @@ void run_quiet_em(char* start, corpus* corpus) {
|
|
513
513
|
quiet_lda_mle(model, ss, 0);
|
514
514
|
}
|
515
515
|
} else if (strncmp(start, "fixed",5)==0) {
|
516
|
-
model =
|
516
|
+
model = quiet_new_lda_model(corpus->num_terms, NTOPICS);
|
517
517
|
model->alpha = INITIAL_ALPHA;
|
518
518
|
ss = new_lda_suffstats(model);
|
519
519
|
corpus_initialize_fixed_ss(ss, model, corpus);
|
@@ -523,7 +523,7 @@ void run_quiet_em(char* start, corpus* corpus) {
|
|
523
523
|
quiet_lda_mle(model, ss, 0);
|
524
524
|
}
|
525
525
|
} else if (strncmp(start, "random",6)==0) {
|
526
|
-
model =
|
526
|
+
model = quiet_new_lda_model(corpus->num_terms, NTOPICS);
|
527
527
|
model->alpha = INITIAL_ALPHA;
|
528
528
|
ss = new_lda_suffstats(model);
|
529
529
|
random_initialize_ss(ss, model);
|
@@ -605,7 +605,7 @@ void run_quiet_em(char* start, corpus* corpus) {
|
|
605
605
|
|
606
606
|
/*
|
607
607
|
* Set all of the settings in one command:
|
608
|
-
*
|
608
|
+
*
|
609
609
|
* * init_alpha
|
610
610
|
* * num_topics
|
611
611
|
* * max_iter
|
@@ -710,7 +710,7 @@ static VALUE wrap_get_num_topics(VALUE self) {
|
|
710
710
|
*/
|
711
711
|
static VALUE wrap_set_initial_alpha(VALUE self, VALUE initial_alpha) {
|
712
712
|
INITIAL_ALPHA = (float)NUM2DBL(initial_alpha);
|
713
|
-
|
713
|
+
|
714
714
|
return initial_alpha;
|
715
715
|
}
|
716
716
|
|
@@ -719,7 +719,7 @@ static VALUE wrap_set_initial_alpha(VALUE self, VALUE initial_alpha) {
|
|
719
719
|
*/
|
720
720
|
static VALUE wrap_set_num_topics(VALUE self, VALUE ntopics) {
|
721
721
|
NTOPICS = NUM2INT(ntopics);
|
722
|
-
|
722
|
+
|
723
723
|
return ntopics;
|
724
724
|
}
|
725
725
|
|
@@ -735,7 +735,7 @@ static VALUE wrap_get_estimate_alpha(VALUE self) {
|
|
735
735
|
*/
|
736
736
|
static VALUE wrap_set_estimate_alpha(VALUE self, VALUE est_alpha) {
|
737
737
|
ESTIMATE_ALPHA = NUM2INT(est_alpha);
|
738
|
-
|
738
|
+
|
739
739
|
return est_alpha;
|
740
740
|
}
|
741
741
|
|
@@ -760,7 +760,7 @@ static VALUE wrap_set_verbosity(VALUE self, VALUE verbosity) {
|
|
760
760
|
} else {
|
761
761
|
VERBOSE = FALSE;
|
762
762
|
}
|
763
|
-
|
763
|
+
|
764
764
|
return verbosity;
|
765
765
|
}
|
766
766
|
|
@@ -777,7 +777,7 @@ static VALUE wrap_set_verbosity(VALUE self, VALUE verbosity) {
|
|
777
777
|
static VALUE wrap_em(VALUE self, VALUE start) {
|
778
778
|
if (!corpus_loaded)
|
779
779
|
return Qnil;
|
780
|
-
|
780
|
+
|
781
781
|
run_quiet_em(STR2CSTR(start), last_corpus);
|
782
782
|
|
783
783
|
return Qnil;
|
@@ -815,7 +815,7 @@ static VALUE wrap_ruby_corpus(VALUE self, VALUE rcorpus) {
|
|
815
815
|
corpus* c;
|
816
816
|
int i = 0;
|
817
817
|
int j = 0;
|
818
|
-
|
818
|
+
|
819
819
|
c = malloc(sizeof(corpus));
|
820
820
|
c->num_terms = NUM2INT(rb_iv_get(rcorpus, "@num_terms"));
|
821
821
|
c->num_docs = NUM2INT(rb_iv_get(rcorpus, "@num_docs"));
|
@@ -825,7 +825,7 @@ static VALUE wrap_ruby_corpus(VALUE self, VALUE rcorpus) {
|
|
825
825
|
VALUE one_doc = rb_ary_entry(doc_ary, i);
|
826
826
|
VALUE words = rb_iv_get(one_doc, "@words");
|
827
827
|
VALUE counts = rb_iv_get(one_doc, "@counts");
|
828
|
-
|
828
|
+
|
829
829
|
c->docs[i].length = NUM2INT(rb_iv_get(one_doc, "@length"));
|
830
830
|
c->docs[i].total = NUM2INT(rb_iv_get(one_doc, "@total"));
|
831
831
|
c->docs[i].words = malloc(sizeof(int) * c->docs[i].length);
|
@@ -834,18 +834,18 @@ static VALUE wrap_ruby_corpus(VALUE self, VALUE rcorpus) {
|
|
834
834
|
int one_word = NUM2INT(rb_ary_entry(words, j));
|
835
835
|
int one_count = NUM2INT(rb_ary_entry(counts, j));
|
836
836
|
if( one_word > c->num_terms ) {
|
837
|
-
rb_raise(rb_eRuntimeError, "error term count(%d) less
|
837
|
+
rb_raise(rb_eRuntimeError, "error term count(%d) less than word index(%d)", c->num_terms, one_word);
|
838
838
|
}
|
839
839
|
c->docs[i].words[j] = one_word;
|
840
840
|
c->docs[i].counts[j] = one_count;
|
841
841
|
}
|
842
842
|
}
|
843
|
-
|
843
|
+
|
844
844
|
last_corpus = c;
|
845
845
|
corpus_loaded = TRUE;
|
846
|
-
|
846
|
+
|
847
847
|
rb_iv_set(self, "@corpus", rcorpus);
|
848
|
-
|
848
|
+
|
849
849
|
return Qtrue;
|
850
850
|
}
|
851
851
|
|
@@ -856,11 +856,11 @@ static VALUE wrap_ruby_corpus(VALUE self, VALUE rcorpus) {
|
|
856
856
|
static VALUE wrap_get_gamma(VALUE self) {
|
857
857
|
if (!model_loaded)
|
858
858
|
return Qnil;
|
859
|
-
|
859
|
+
|
860
860
|
// last_gamma is a double[num_docs][num_topics]
|
861
861
|
VALUE arr;
|
862
862
|
int i = 0, j = 0;
|
863
|
-
|
863
|
+
|
864
864
|
arr = rb_ary_new2(last_corpus->num_docs);
|
865
865
|
for (i = 0; i < last_corpus->num_docs; i++) {
|
866
866
|
VALUE arr2 = rb_ary_new2(last_model->num_topics);
|
@@ -869,7 +869,7 @@ static VALUE wrap_get_gamma(VALUE self) {
|
|
869
869
|
}
|
870
870
|
rb_ary_store(arr, i, arr2);
|
871
871
|
}
|
872
|
-
|
872
|
+
|
873
873
|
return arr;
|
874
874
|
}
|
875
875
|
|
@@ -882,31 +882,31 @@ static VALUE wrap_get_gamma(VALUE self) {
|
|
882
882
|
static VALUE wrap_get_phi(VALUE self) {
|
883
883
|
if (!model_loaded)
|
884
884
|
return Qnil;
|
885
|
-
|
885
|
+
|
886
886
|
VALUE arr = rb_ary_new2(last_corpus->num_docs);
|
887
887
|
int i = 0, j = 0, k = 0;
|
888
|
-
|
888
|
+
|
889
889
|
//int max_length = max_corpus_length(last_corpus);
|
890
890
|
short error = 0;
|
891
|
-
|
891
|
+
|
892
892
|
for (i = 0; i < last_corpus->num_docs; i++) {
|
893
893
|
VALUE arr1 = rb_ary_new2(last_corpus->docs[i].length);
|
894
|
-
|
894
|
+
|
895
895
|
lda_inference(&(last_corpus->docs[i]), last_model, last_gamma[i], last_phi, &error);
|
896
|
-
|
896
|
+
|
897
897
|
for (j = 0; j < last_corpus->docs[i].length; j++) {
|
898
898
|
VALUE arr2 = rb_ary_new2(last_model->num_topics);
|
899
|
-
|
899
|
+
|
900
900
|
for (k = 0; k < last_model->num_topics; k++) {
|
901
901
|
rb_ary_store(arr2, k, rb_float_new(last_phi[j][k]));
|
902
902
|
}
|
903
|
-
|
903
|
+
|
904
904
|
rb_ary_store(arr1, j, arr2);
|
905
905
|
}
|
906
|
-
|
906
|
+
|
907
907
|
rb_ary_store(arr, i, arr1);
|
908
908
|
}
|
909
|
-
|
909
|
+
|
910
910
|
return arr;
|
911
911
|
}
|
912
912
|
|
@@ -918,11 +918,11 @@ static VALUE wrap_get_phi(VALUE self) {
|
|
918
918
|
static VALUE wrap_get_model_beta(VALUE self) {
|
919
919
|
if (!model_loaded)
|
920
920
|
return Qnil;
|
921
|
-
|
921
|
+
|
922
922
|
// beta is a double[num_topics][num_terms]
|
923
923
|
VALUE arr;
|
924
924
|
int i = 0, j = 0;
|
925
|
-
|
925
|
+
|
926
926
|
arr = rb_ary_new2(last_model->num_topics);
|
927
927
|
for (i = 0; i < last_model->num_topics; i++) {
|
928
928
|
VALUE arr2 = rb_ary_new2(last_model->num_terms);
|
@@ -931,7 +931,7 @@ static VALUE wrap_get_model_beta(VALUE self) {
|
|
931
931
|
}
|
932
932
|
rb_ary_store(arr, i, arr2);
|
933
933
|
}
|
934
|
-
|
934
|
+
|
935
935
|
return arr;
|
936
936
|
}
|
937
937
|
|
@@ -944,28 +944,27 @@ static VALUE wrap_get_model_settings(VALUE self) {
|
|
944
944
|
return Qnil;
|
945
945
|
|
946
946
|
VALUE arr;
|
947
|
-
|
947
|
+
|
948
948
|
arr = rb_ary_new();
|
949
949
|
rb_ary_push(arr, rb_int_new(last_model->num_topics));
|
950
950
|
rb_ary_push(arr, rb_int_new(last_model->num_terms));
|
951
951
|
rb_ary_push(arr, rb_float_new(last_model->alpha));
|
952
|
-
|
952
|
+
|
953
953
|
return arr; // [num_topics, num_terms, alpha]
|
954
954
|
}
|
955
955
|
|
956
956
|
|
957
|
-
void
|
957
|
+
void Init_lda() {
|
958
958
|
corpus_loaded = FALSE;
|
959
959
|
model_loaded = FALSE;
|
960
960
|
VERBOSE = TRUE;
|
961
961
|
|
962
|
-
rb_require("lda");
|
963
|
-
|
964
|
-
rb_mLda = rb_define_module("Lda");
|
965
|
-
rb_cLda = rb_define_class_under(rb_mLda, "Lda", rb_cObject);
|
966
|
-
rb_cLdaCorpus = rb_define_class_under(rb_mLda, "Corpus", rb_cObject);
|
967
|
-
rb_cLdaDocument = rb_define_class_under(rb_mLda, "Document", rb_cObject);
|
962
|
+
rb_require("lda-ruby");
|
968
963
|
|
964
|
+
rb_cLdaModule = rb_define_module("Lda");
|
965
|
+
rb_cLda = rb_define_class_under(rb_cLdaModule, "Lda", rb_cObject);
|
966
|
+
rb_cLdaCorpus = rb_define_class_under(rb_cLdaModule, "Corpus", rb_cObject);
|
967
|
+
rb_cLdaDocument = rb_define_class_under(rb_cLdaModule, "Document", rb_cObject);
|
969
968
|
|
970
969
|
// method to load the corpus
|
971
970
|
rb_define_method(rb_cLda, "fast_load_corpus_from_file", wrap_load_corpus, 1);
|
@@ -988,7 +987,7 @@ void Init_lda_ext() {
|
|
988
987
|
rb_define_method(rb_cLda, "em_max_iter", wrap_get_em_max_iter, 0);
|
989
988
|
rb_define_method(rb_cLda, "em_max_iter=", wrap_set_em_max_iter, 1);
|
990
989
|
rb_define_method(rb_cLda, "em_convergence", wrap_get_em_converged, 0);
|
991
|
-
rb_define_method(rb_cLda, "em_convergence=", wrap_set_em_converged, 1);
|
990
|
+
rb_define_method(rb_cLda, "em_convergence=", wrap_set_em_converged, 1);
|
992
991
|
rb_define_method(rb_cLda, "init_alpha=", wrap_set_initial_alpha, 1);
|
993
992
|
rb_define_method(rb_cLda, "init_alpha", wrap_get_initial_alpha, 0);
|
994
993
|
rb_define_method(rb_cLda, "est_alpha=", wrap_set_estimate_alpha, 1);
|