lda-ruby 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +5 -0
- data/CHANGELOG +22 -0
- data/README +21 -0
- data/README.markdown +38 -0
- data/Rakefile +58 -0
- data/VERSION.yml +4 -0
- data/ext/lda-ruby/Makefile +181 -0
- data/ext/lda-ruby/cokus.c +145 -0
- data/ext/lda-ruby/cokus.h +27 -0
- data/ext/lda-ruby/extconf.rb +9 -0
- data/ext/lda-ruby/lda-alpha.c +96 -0
- data/ext/lda-ruby/lda-alpha.h +21 -0
- data/ext/lda-ruby/lda-data.c +67 -0
- data/ext/lda-ruby/lda-data.h +14 -0
- data/ext/lda-ruby/lda-inference.c +1007 -0
- data/ext/lda-ruby/lda-inference.h +63 -0
- data/ext/lda-ruby/lda-model.c +345 -0
- data/ext/lda-ruby/lda-model.h +29 -0
- data/ext/lda-ruby/lda.h +54 -0
- data/ext/lda-ruby/utils.c +111 -0
- data/ext/lda-ruby/utils.h +18 -0
- data/lda-ruby.gemspec +78 -0
- data/lib/lda-ruby.rb +168 -0
- data/lib/lda-ruby/corpus/corpus.rb +34 -0
- data/lib/lda-ruby/corpus/data_corpus.rb +22 -0
- data/lib/lda-ruby/corpus/directory_corpus.rb +25 -0
- data/lib/lda-ruby/corpus/text_corpus.rb +22 -0
- data/lib/lda-ruby/document/data_document.rb +30 -0
- data/lib/lda-ruby/document/document.rb +36 -0
- data/lib/lda-ruby/document/text_document.rb +37 -0
- data/lib/lda-ruby/vocabulary.rb +46 -0
- data/license.txt +504 -0
- data/test/data/.gitignore +2 -0
- data/test/data/docs.dat +46 -0
- data/test/data/wiki-test-docs.yml +123 -0
- data/test/lda_ruby_test.rb +274 -0
- data/test/test_helper.rb +10 -0
- metadata +95 -0
@@ -0,0 +1,27 @@
#ifndef COKUS_H
#define COKUS_H

#include <stdio.h>
#include <stdlib.h>

//
// uint32 must be an unsigned integer type capable of holding at least 32
// bits; exactly 32 should be fastest, but 64 is better on an Alpha with
// GCC at -O3 optimization so try your options and see what's best for you
//
typedef unsigned long uint32;

#define N (624)                /* length of state vector */
#define M (397)                /* a period parameter */
#define K (0x9908B0DFU)        /* a magic constant */

/* bit helpers used by the Mersenne Twister recurrence */
#define hiBit(u)      ((u) & 0x80000000U)   /* mask all but highest bit of u */
#define loBit(u)      ((u) & 0x00000001U)   /* mask all but lowest bit of u */
#define loBits(u)     ((u) & 0x7FFFFFFFU)   /* mask the highest bit of u */
#define mixBits(u, v) (hiBit(u)|loBits(v))  /* move hi bit of u to hi bit of v */

void   seedMT(uint32 seed);
uint32 reloadMT(void);
uint32 randomMT(void);

#endif
@@ -0,0 +1,96 @@
|
|
1
|
+
// (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu)
|
2
|
+
|
3
|
+
// This file is part of LDA-C.
|
4
|
+
|
5
|
+
// LDA-C is free software; you can redistribute it and/or modify it under
|
6
|
+
// the terms of the GNU General Public License as published by the Free
|
7
|
+
// Software Foundation; either version 2 of the License, or (at your
|
8
|
+
// option) any later version.
|
9
|
+
|
10
|
+
// LDA-C is distributed in the hope that it will be useful, but WITHOUT
|
11
|
+
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
12
|
+
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
13
|
+
// for more details.
|
14
|
+
|
15
|
+
// You should have received a copy of the GNU General Public License
|
16
|
+
// along with this program; if not, write to the Free Software
|
17
|
+
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
|
18
|
+
// USA
|
19
|
+
|
20
|
+
#include "lda-alpha.h"
|
21
|
+
|
22
|
+
/*
|
23
|
+
* objective function and its derivatives
|
24
|
+
*
|
25
|
+
*/
|
26
|
+
|
27
|
+
double alhood(double a, double ss, int D, int K)
|
28
|
+
{ return(D * (lgamma(K * a) - K * lgamma(a)) + (a - 1) * ss); }
|
29
|
+
|
30
|
+
/*
 * First derivative of alhood with respect to `a`.
 */
double d_alhood(double a, double ss, int D, int K)
{
    double per_doc = K * digamma(K * a) - K * digamma(a);
    return D * per_doc + ss;
}
|
32
|
+
|
33
|
+
/*
 * Second derivative of alhood with respect to `a`.
 */
double d2_alhood(double a, int D, int K)
{
    double t_ka = trigamma(K * a);
    double t_a = trigamma(a);
    return D * (K * K * t_ka - K * t_a);
}
|
35
|
+
|
36
|
+
|
37
|
+
/*
|
38
|
+
* newtons method
|
39
|
+
*
|
40
|
+
*/
|
41
|
+
|
42
|
+
double opt_alpha(double ss, int D, int K)
|
43
|
+
{
|
44
|
+
double a, log_a, init_a = 100;
|
45
|
+
double f, df, d2f;
|
46
|
+
int iter = 0;
|
47
|
+
|
48
|
+
log_a = log(init_a);
|
49
|
+
do
|
50
|
+
{
|
51
|
+
iter++;
|
52
|
+
a = exp(log_a);
|
53
|
+
if (isnan(a))
|
54
|
+
{
|
55
|
+
init_a = init_a * 10;
|
56
|
+
printf("warning : alpha is nan; new init = %5.5f\n", init_a);
|
57
|
+
a = init_a;
|
58
|
+
log_a = log(a);
|
59
|
+
}
|
60
|
+
f = alhood(a, ss, D, K);
|
61
|
+
df = d_alhood(a, ss, D, K);
|
62
|
+
d2f = d2_alhood(a, D, K);
|
63
|
+
log_a = log_a - df/(d2f * a + df);
|
64
|
+
printf("alpha maximization : %5.5f %5.5f\n", f, df);
|
65
|
+
}
|
66
|
+
while ((fabs(df) > NEWTON_THRESH) && (iter < MAX_ALPHA_ITER));
|
67
|
+
return(exp(log_a));
|
68
|
+
}
|
69
|
+
|
70
|
+
double quiet_opt_alpha(double ss, int D, int K)
|
71
|
+
{
|
72
|
+
double a, log_a, init_a = 100;
|
73
|
+
double f, df, d2f;
|
74
|
+
int iter = 0;
|
75
|
+
|
76
|
+
log_a = log(init_a);
|
77
|
+
do
|
78
|
+
{
|
79
|
+
iter++;
|
80
|
+
a = exp(log_a);
|
81
|
+
if (isnan(a))
|
82
|
+
{
|
83
|
+
init_a = init_a * 10;
|
84
|
+
//printf("warning : alpha is nan; new init = %5.5f\n", init_a);
|
85
|
+
a = init_a;
|
86
|
+
log_a = log(a);
|
87
|
+
}
|
88
|
+
f = alhood(a, ss, D, K);
|
89
|
+
df = d_alhood(a, ss, D, K);
|
90
|
+
d2f = d2_alhood(a, D, K);
|
91
|
+
log_a = log_a - df/(d2f * a + df);
|
92
|
+
//printf("alpha maximization : %5.5f %5.5f\n", f, df);
|
93
|
+
}
|
94
|
+
while ((fabs(df) > NEWTON_THRESH) && (iter < MAX_ALPHA_ITER));
|
95
|
+
return(exp(log_a));
|
96
|
+
}
|
@@ -0,0 +1,21 @@
#ifndef LDA_ALPHA_H
#define LDA_ALPHA_H

#include <stdlib.h>
#include <math.h>
#include <float.h>

#include "lda.h"
#include "utils.h"

/* Newton iteration controls for alpha optimisation */
#define NEWTON_THRESH 1e-5
#define MAX_ALPHA_ITER 1000

/* objective and its first and second derivatives */
double alhood(double a, double ss, int D, int K);
double d_alhood(double a, double ss, int D, int K);
double d2_alhood(double a, int D, int K);

/* noisy / quiet Newton solvers for the optimal alpha */
double opt_alpha(double ss, int D, int K);
double quiet_opt_alpha(double ss, int D, int K);
//void maximize_alpha(double** gamma, lda_model* model, int num_docs);

#endif
|
@@ -0,0 +1,67 @@
|
|
1
|
+
// (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu)
|
2
|
+
|
3
|
+
// This file is part of LDA-C.
|
4
|
+
|
5
|
+
// LDA-C is free software; you can redistribute it and/or modify it under
|
6
|
+
// the terms of the GNU General Public License as published by the Free
|
7
|
+
// Software Foundation; either version 2 of the License, or (at your
|
8
|
+
// option) any later version.
|
9
|
+
|
10
|
+
// LDA-C is distributed in the hope that it will be useful, but WITHOUT
|
11
|
+
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
12
|
+
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
13
|
+
// for more details.
|
14
|
+
|
15
|
+
// You should have received a copy of the GNU General Public License
|
16
|
+
// along with this program; if not, write to the Free Software
|
17
|
+
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
|
18
|
+
// USA
|
19
|
+
|
20
|
+
#include "lda-data.h"
|
21
|
+
|
22
|
+
corpus* read_data(char* data_filename)
|
23
|
+
{
|
24
|
+
FILE *fileptr;
|
25
|
+
int length, count, word, n, nd, nw;
|
26
|
+
corpus* c;
|
27
|
+
|
28
|
+
printf("reading data from %s\n", data_filename);
|
29
|
+
c = malloc(sizeof(corpus));
|
30
|
+
c->docs = 0;
|
31
|
+
c->num_terms = 0;
|
32
|
+
c->num_docs = 0;
|
33
|
+
fileptr = fopen(data_filename, "r");
|
34
|
+
nd = 0; nw = 0;
|
35
|
+
while ((fscanf(fileptr, "%10d", &length) != EOF))
|
36
|
+
{
|
37
|
+
c->docs = (document*) realloc(c->docs, sizeof(document)*(nd+1));
|
38
|
+
c->docs[nd].length = length;
|
39
|
+
c->docs[nd].total = 0;
|
40
|
+
c->docs[nd].words = malloc(sizeof(int)*length);
|
41
|
+
c->docs[nd].counts = malloc(sizeof(int)*length);
|
42
|
+
for (n = 0; n < length; n++)
|
43
|
+
{
|
44
|
+
fscanf(fileptr, "%10d:%10d", &word, &count);
|
45
|
+
word = word - OFFSET;
|
46
|
+
c->docs[nd].words[n] = word;
|
47
|
+
c->docs[nd].counts[n] = count;
|
48
|
+
c->docs[nd].total += count;
|
49
|
+
if (word >= nw) { nw = word + 1; }
|
50
|
+
}
|
51
|
+
nd++;
|
52
|
+
}
|
53
|
+
fclose(fileptr);
|
54
|
+
c->num_docs = nd;
|
55
|
+
c->num_terms = nw;
|
56
|
+
printf("number of docs : %d\n", nd);
|
57
|
+
printf("number of terms : %d\n", nw);
|
58
|
+
return(c);
|
59
|
+
}
|
60
|
+
|
61
|
+
/*
 * Returns the length (word-slot count) of the longest document in `c`,
 * or 0 for an empty corpus.
 */
int max_corpus_length(corpus* c)
{
    int d;
    int longest = 0;
    for (d = 0; d < c->num_docs; d++)
    {
        if (c->docs[d].length > longest)
        {
            longest = c->docs[d].length;
        }
    }
    return longest;
}
|
@@ -0,0 +1,1007 @@
|
|
1
|
+
// (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu)
|
2
|
+
|
3
|
+
// This file is part of LDA-C.
|
4
|
+
|
5
|
+
// LDA-C is free software; you can redistribute it and/or modify it under
|
6
|
+
// the terms of the GNU General Public License as published by the Free
|
7
|
+
// Software Foundation; either version 2 of the License, or (at your
|
8
|
+
// option) any later version.
|
9
|
+
|
10
|
+
// LDA-C is distributed in the hope that it will be useful, but WITHOUT
|
11
|
+
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
12
|
+
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
13
|
+
// for more details.
|
14
|
+
|
15
|
+
// You should have received a copy of the GNU General Public License
|
16
|
+
// along with this program; if not, write to the Free Software
|
17
|
+
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
|
18
|
+
// USA
|
19
|
+
|
20
|
+
#include <stdlib.h>
|
21
|
+
#include <stdio.h>
|
22
|
+
#include <math.h>
|
23
|
+
#include <float.h>
|
24
|
+
#include <string.h>
|
25
|
+
#include <time.h>
|
26
|
+
|
27
|
+
#include "lda.h"
|
28
|
+
#include "lda-data.h"
|
29
|
+
#include "lda-inference.h"
|
30
|
+
#include "lda-model.h"
|
31
|
+
#include "utils.h"
|
32
|
+
#include "cokus.h"
|
33
|
+
|
34
|
+
#ifdef USE_RUBY
|
35
|
+
#include "ruby.h"
|
36
|
+
|
37
|
+
VALUE rb_cLdaModule;
|
38
|
+
VALUE rb_cLda;
|
39
|
+
VALUE rb_cLdaCorpus;
|
40
|
+
VALUE rb_cLdaDocument;
|
41
|
+
#endif
|
42
|
+
|
43
|
+
|
44
|
+
|
45
|
+
/*
|
46
|
+
* variational inference
|
47
|
+
*/
|
48
|
+
|
49
|
+
/*
 * Variational inference for a single document: coordinate ascent on the
 * per-word topic distributions phi (kept in log space during the inner
 * update) and the Dirichlet parameters var_gamma, iterated until the
 * relative change in the likelihood bound drops below VAR_CONVERGED or
 * VAR_MAX_ITER is reached (VAR_MAX_ITER == -1 means unbounded).
 * Sets *errors = 1 if the bound becomes NaN.  Returns the bound.
 */
double lda_inference(document* doc, lda_model* model, double* var_gamma, double** phi, short* errors) {
    double converged = 1;
    double phisum = 0, likelihood = 0;
    double likelihood_old = 0, oldphi[model->num_topics];
    int k = 0, n = 0, var_iter = 0, index = 0;
    double digamma_gam[model->num_topics];

    /* zero'em out */
    memset(digamma_gam, 0, sizeof(digamma_gam));
    memset(oldphi, 0, sizeof(oldphi));

    // compute posterior dirichlet: uniform phi, gamma = alpha + total/K
    for (k = 0; k < model->num_topics; k++)
    {
        var_gamma[k] = model->alpha + (doc->total/((double) model->num_topics));
        digamma_gam[k] = digamma(var_gamma[k]);
        for (n = 0; n < doc->length; n++)
            phi[n][k] = 1.0/model->num_topics;
    }
    var_iter = 0;

    while ((converged > VAR_CONVERGED) &&
           ((var_iter < VAR_MAX_ITER) || (VAR_MAX_ITER == -1)))
    {
        var_iter++;
        for (n = 0; n < doc->length; n++)
        {
            phisum = 0;
            for (k = 0; k < model->num_topics; k++)
            {
                oldphi[k] = phi[n][k];
                index = doc->words[n];
                // fix: valid columns of log_prob_w are 0..num_terms-1; the
                // original used `index > num_terms`, letting index ==
                // num_terms read one element past the row
                if (index < 0 || index >= model->num_terms) {
                    printf("phi for term: %d of %d\n", index, model->num_terms);
                    phi[n][k] = 0.0;
                }
                else {
                    phi[n][k] = digamma_gam[k] + model->log_prob_w[k][index];
                }

                if (k > 0)
                    phisum = log_sum(phisum, phi[n][k]);
                else
                    phisum = phi[n][k]; // note, phi is in log space
            }

            for (k = 0; k < model->num_topics; k++)
            {
                phi[n][k] = exp(phi[n][k] - phisum);   /* normalise out of log space */
                var_gamma[k] = var_gamma[k] + doc->counts[n]*(phi[n][k] - oldphi[k]);
                // !!! a lot of extra digamma's here because of how we're computing it
                // !!! but its more automatically updated too.
                digamma_gam[k] = digamma(var_gamma[k]);
            }
        }

        likelihood = compute_likelihood(doc, model, phi, var_gamma);
        if (isnan(likelihood)) { *errors = 1; }
        converged = (likelihood_old - likelihood) / likelihood_old;
        likelihood_old = likelihood;
    }
    return(likelihood);
}
|
119
|
+
|
120
|
+
|
121
|
+
/*
|
122
|
+
* compute likelihood bound
|
123
|
+
*/
|
124
|
+
|
125
|
+
/*
 * Evaluates the variational likelihood bound for one document given the
 * current phi and gamma.  Words whose phi entry is zero (flagged as
 * out-of-range during inference) are skipped.
 */
double compute_likelihood(document* doc, lda_model* model, double** phi, double* var_gamma) {
    int k, n, w;
    double dig[model->num_topics];
    double gamma_total = 0;
    double digsum;
    double bound;

    memset(dig, 0, sizeof(dig));

    for (k = 0; k < model->num_topics; k++) {
        dig[k] = digamma(var_gamma[k]);
        gamma_total += var_gamma[k];
    }
    digsum = digamma(gamma_total);

    bound = lgamma(model->alpha * model->num_topics)
          - model->num_topics * lgamma(model->alpha)
          - lgamma(gamma_total);

    for (k = 0; k < model->num_topics; k++) {
        bound += (model->alpha - 1)*(dig[k] - digsum)
               + lgamma(var_gamma[k])
               - (var_gamma[k] - 1)*(dig[k] - digsum);

        for (n = 0; n < doc->length; n++) {
            if (phi[n][k] > 0) {
                w = doc->words[n];
                bound += doc->counts[n] *
                    (phi[n][k]*((dig[k] - digsum) - log(phi[n][k])
                                + model->log_prob_w[k][w]));
            }
        }
    }
    return bound;
}
|
159
|
+
|
160
|
+
|
161
|
+
/*
 * E-step for one document: runs variational inference, then folds the
 * resulting gamma and phi into the sufficient statistics `ss`.
 * Returns the document's likelihood bound (0.0 on inference error).
 */
double doc_e_step(document* doc, double* gamma, double** phi, lda_model* model, lda_suffstats* ss) {
    int n, k;
    short error = 0;
    double likelihood;

    // posterior inference for this document
    likelihood = lda_inference(doc, model, gamma, phi, &error);
    if (error) { likelihood = 0.0; }

    // fold this document's gamma into the alpha sufficient statistic
    double gamma_sum = 0;
    for (k = 0; k < model->num_topics; k++) {
        gamma_sum += gamma[k];
        ss->alpha_suffstats += digamma(gamma[k]);
    }
    ss->alpha_suffstats -= model->num_topics * digamma(gamma_sum);

    // accumulate expected word-topic counts
    for (n = 0; n < doc->length; n++) {
        for (k = 0; k < model->num_topics; k++) {
            double weight = doc->counts[n] * phi[n][k];
            ss->class_word[k][doc->words[n]] += weight;
            ss->class_total[k] += weight;
        }
    }

    ss->num_docs += 1;

    return likelihood;
}
|
195
|
+
|
196
|
+
|
197
|
+
/*
|
198
|
+
* writes the word assignments line for a document to a file
|
199
|
+
*/
|
200
|
+
|
201
|
+
/*
 * Writes one line for a document: its length followed by word:topic
 * pairs, where the topic is the argmax of that word's phi row.
 */
void write_word_assignment(FILE* f, document* doc, double** phi, lda_model* model) {
    int i;

    fprintf(f, "%03d", doc->length);
    for (i = 0; i < doc->length; i++) {
        int best_topic = argmax(phi[i], model->num_topics);
        fprintf(f, " %04d:%02d", doc->words[i], best_topic);
    }
    fprintf(f, "\n");
    fflush(f);
}
|
211
|
+
|
212
|
+
|
213
|
+
/*
|
214
|
+
* saves the gamma parameters of the current dataset
|
215
|
+
*/
|
216
|
+
|
217
|
+
/*
 * Writes the variational gamma matrix (num_docs x num_topics) to
 * `filename`, one whitespace-separated row per document.  Logs to
 * stderr and returns without writing if the file cannot be opened.
 */
void save_gamma(char* filename, double** gamma, int num_docs, int num_topics) {
    FILE* fileptr;
    int d, k;

    fileptr = fopen(filename, "w");
    if (fileptr == NULL) {
        // fix: the original wrote through a NULL stream when the open failed
        fprintf(stderr, "save_gamma: cannot open %s for writing\n", filename);
        return;
    }
    for (d = 0; d < num_docs; d++) {
        fprintf(fileptr, "%5.10f", gamma[d][0]);
        for (k = 1; k < num_topics; k++) {
            fprintf(fileptr, " %5.10f", gamma[d][k]);
        }
        fprintf(fileptr, "\n");
    }
    fclose(fileptr);
}
|
231
|
+
|
232
|
+
|
233
|
+
/*
 * Runs variational EM on `corpus`, initialising the model according to
 * `start` ("seeded", "random", or a path to a saved model) and writing
 * checkpoints, likelihoods, and final word assignments under
 * `directory`.
 */
void run_em(char* start, char* directory, corpus* corpus) {
    int d, n;
    lda_model *model = NULL;
    double **var_gamma, **phi;

    // allocate variational parameters
    var_gamma = malloc(sizeof(double*)*(corpus->num_docs));
    for (d = 0; d < corpus->num_docs; d++)
        var_gamma[d] = malloc(sizeof(double) * NTOPICS);

    int max_length = max_corpus_length(corpus);
    phi = malloc(sizeof(double*)*max_length);
    for (n = 0; n < max_length; n++)
        phi[n] = malloc(sizeof(double) * NTOPICS);

    // initialize model
    char filename[100];

    lda_suffstats* ss = NULL;
    if (strcmp(start, "seeded")==0) {
        model = new_lda_model(corpus->num_terms, NTOPICS);
        ss = new_lda_suffstats(model);
        corpus_initialize_ss(ss, model, corpus);
        if (VERBOSE) {
            lda_mle(model, ss, 0);
        } else {
            quiet_lda_mle(model, ss, 0);
        }
        model->alpha = INITIAL_ALPHA;
    } else if (strcmp(start, "random")==0) {
        model = new_lda_model(corpus->num_terms, NTOPICS);
        ss = new_lda_suffstats(model);
        random_initialize_ss(ss, model);
        if (VERBOSE) {
            lda_mle(model, ss, 0);
        } else {
            quiet_lda_mle(model, ss, 0);
        }
        model->alpha = INITIAL_ALPHA;
    } else {
        model = load_lda_model(start);
        ss = new_lda_suffstats(model);
    }

    // fix: bounded formatting (directory comes from the command line)
    snprintf(filename, sizeof(filename), "%s/000", directory);
    save_lda_model(model, filename);

    // run expectation maximization
    int i = 0;
    double likelihood = 0, likelihood_old = 0, converged = 1;
    snprintf(filename, sizeof(filename), "%s/likelihood.dat", directory);
    FILE* likelihood_file = fopen(filename, "w");
    if (likelihood_file == NULL) {
        // fix: the original wrote through a NULL stream when the open failed
        fprintf(stderr, "run_em: cannot open %s\n", filename);
        return;
    }

    while (((converged < 0) || (converged > EM_CONVERGED) || (i <= 2)) && (i <= EM_MAX_ITER)) {
        i++;
        if (VERBOSE)
            printf("**** em iteration %d ****\n", i);
        likelihood = 0;
        zero_initialize_ss(ss, model);

        // e-step
        printf("e-step\n");
        for (d = 0; d < corpus->num_docs; d++) {
            if ((d % 1000) == 0 && VERBOSE) printf("document %d\n",d);
            likelihood += doc_e_step(&(corpus->docs[d]), var_gamma[d], phi, model, ss);
        }
        printf("m-step\n");

        // m-step
        if (VERBOSE) {
            lda_mle(model, ss, ESTIMATE_ALPHA);
        } else {
            quiet_lda_mle(model, ss, ESTIMATE_ALPHA);
        }

        // check for convergence; widen inference effort if the bound regressed
        converged = (likelihood_old - likelihood) / (likelihood_old);
        if (converged < 0) VAR_MAX_ITER = VAR_MAX_ITER * 2;
        likelihood_old = likelihood;

        // output model and likelihood
        fprintf(likelihood_file, "%10.10f\t%5.5e\n", likelihood, converged);
        fflush(likelihood_file);
        if ((i % LAG) == 0)
        {
            snprintf(filename, sizeof(filename), "%s/%03d", directory, i);
            save_lda_model(model, filename);
            snprintf(filename, sizeof(filename), "%s/%03d.gamma", directory, i);
            save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics);
        }
    }

    // output the final model
    snprintf(filename, sizeof(filename), "%s/final", directory);
    save_lda_model(model, filename);
    snprintf(filename, sizeof(filename), "%s/final.gamma", directory);
    save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics);

    // output the word assignments (for visualization)
    snprintf(filename, sizeof(filename), "%s/word-assignments.dat", directory);
    FILE* w_asgn_file = fopen(filename, "w");
    short error = 0;
    double tl = 0.0;
    if (w_asgn_file != NULL) {   // fix: skip the report rather than crash on open failure
        for (d = 0; d < corpus->num_docs; d++)
        {
            if ((d % 100) == 0 && VERBOSE) printf("final e step document %d\n",d);
            error = 0;
            tl = lda_inference(&(corpus->docs[d]), model, var_gamma[d], phi, &error);
            if (error) { continue; }
            likelihood += tl;
            write_word_assignment(w_asgn_file, &(corpus->docs[d]), phi, model);
        }
        fclose(w_asgn_file);
    } else {
        fprintf(stderr, "run_em: cannot open %s\n", filename);
    }
    fclose(likelihood_file);

    // fix: the original leaked both variational parameter arrays
    for (d = 0; d < corpus->num_docs; d++) free(var_gamma[d]);
    free(var_gamma);
    for (n = 0; n < max_length; n++) free(phi[n]);
    free(phi);
}
|
357
|
+
|
358
|
+
|
359
|
+
/*
|
360
|
+
* read settings.
|
361
|
+
*/
|
362
|
+
|
363
|
+
void read_settings(char* filename) {
|
364
|
+
FILE* fileptr;
|
365
|
+
char alpha_action[100];
|
366
|
+
fileptr = fopen(filename, "r");
|
367
|
+
fscanf(fileptr, "var max iter %d\n", &VAR_MAX_ITER);
|
368
|
+
fscanf(fileptr, "var convergence %f\n", &VAR_CONVERGED);
|
369
|
+
fscanf(fileptr, "em max iter %d\n", &EM_MAX_ITER);
|
370
|
+
fscanf(fileptr, "em convergence %f\n", &EM_CONVERGED);
|
371
|
+
fscanf(fileptr, "alpha %s", alpha_action);
|
372
|
+
if (strcmp(alpha_action, "fixed")==0)
|
373
|
+
{
|
374
|
+
ESTIMATE_ALPHA = 0;
|
375
|
+
}
|
376
|
+
else
|
377
|
+
{
|
378
|
+
ESTIMATE_ALPHA = 1;
|
379
|
+
}
|
380
|
+
fclose(fileptr);
|
381
|
+
}
|
382
|
+
|
383
|
+
|
384
|
+
|
385
|
+
|
386
|
+
/*
|
387
|
+
* inference only
|
388
|
+
*
|
389
|
+
*/
|
390
|
+
|
391
|
+
/*
 * Inference-only mode: loads a trained model from `model_root`, runs
 * variational inference on every document of `corpus`, and writes the
 * per-document likelihoods plus the final gamma matrix under the
 * `save` prefix.
 */
void infer(char* model_root, char* save, corpus* corpus) {
    FILE* fileptr;
    char filename[100];
    int i, d, n;
    lda_model *model;
    double **var_gamma, likelihood, **phi;
    document* doc;

    model = load_lda_model(model_root);
    var_gamma = malloc(sizeof(double*)*(corpus->num_docs));
    for (i = 0; i < corpus->num_docs; i++)
        var_gamma[i] = malloc(sizeof(double)*model->num_topics);
    snprintf(filename, sizeof(filename), "%s-lda-lhood.dat", save);
    fileptr = fopen(filename, "w");
    if (fileptr == NULL) {
        // fix: the original wrote through a NULL stream when the open failed
        fprintf(stderr, "infer: cannot open %s\n", filename);
        return;
    }
    for (d = 0; d < corpus->num_docs; d++) {
        if (((d % 100) == 0) && (d>0) && VERBOSE) printf("document %d\n",d);

        doc = &(corpus->docs[d]);
        phi = (double**) malloc(sizeof(double*) * doc->length);
        for (n = 0; n < doc->length; n++)
            phi[n] = (double*) malloc(sizeof(double) * model->num_topics);
        short error = 0;
        likelihood = lda_inference(doc, model, var_gamma[d], phi, &error);

        fprintf(fileptr, "%5.5f\n", likelihood);

        // fix: the original leaked phi for every document
        for (n = 0; n < doc->length; n++) free(phi[n]);
        free(phi);
    }
    fclose(fileptr);
    snprintf(filename, sizeof(filename), "%s-gamma.dat", save);
    save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics);

    // fix: release the gamma matrix (leaked in the original)
    for (i = 0; i < corpus->num_docs; i++) free(var_gamma[i]);
    free(var_gamma);
}
|
421
|
+
|
422
|
+
|
423
|
+
/*
|
424
|
+
* update sufficient statistics
|
425
|
+
*
|
426
|
+
*/
|
427
|
+
|
428
|
+
|
429
|
+
|
430
|
+
/*
|
431
|
+
* main
|
432
|
+
*
|
433
|
+
*/
|
434
|
+
|
435
|
+
int main(int argc, char* argv[]) {
|
436
|
+
corpus* corpus;
|
437
|
+
|
438
|
+
long t1;
|
439
|
+
(void) time(&t1);
|
440
|
+
seedMT(t1);
|
441
|
+
// seedMT(4357U);
|
442
|
+
|
443
|
+
if (argc > 1)
|
444
|
+
{
|
445
|
+
if (strcmp(argv[1], "est")==0)
|
446
|
+
{
|
447
|
+
INITIAL_ALPHA = atof(argv[2]);
|
448
|
+
NTOPICS = atoi(argv[3]);
|
449
|
+
read_settings(argv[4]);
|
450
|
+
corpus = read_data(argv[5]);
|
451
|
+
make_directory(argv[7]);
|
452
|
+
run_em(argv[6], argv[7], corpus);
|
453
|
+
}
|
454
|
+
if (strcmp(argv[1], "inf")==0)
|
455
|
+
{
|
456
|
+
read_settings(argv[2]);
|
457
|
+
corpus = read_data(argv[4]);
|
458
|
+
infer(argv[3], argv[5], corpus);
|
459
|
+
}
|
460
|
+
}
|
461
|
+
else
|
462
|
+
{
|
463
|
+
printf("usage : lda est [initial alpha] [k] [settings] [data] [random/seeded/*] [directory]\n");
|
464
|
+
printf(" lda inf [settings] [model] [data] [name]\n");
|
465
|
+
}
|
466
|
+
return(0);
|
467
|
+
}
|
468
|
+
|
469
|
+
#ifdef USE_RUBY
|
470
|
+
|
471
|
+
/* */
|
472
|
+
/*
 * EM training entry point used by the Ruby extension: same algorithm as
 * run_em but with no filesystem output; the trained model and the final
 * variational parameters are left in the last_model / last_gamma /
 * last_phi globals.  `start` selects initialisation: "seeded", "fixed",
 * "random", or a path to a saved model.
 */
void run_quiet_em(char* start, corpus* corpus) {
    int d = 0, n = 0;
    lda_model *model = NULL;
    double **var_gamma = NULL, **phi = NULL;
    // last_gamma is a double[num_docs][num_topics]

    // allocate variational parameters
    var_gamma = (double**)malloc(sizeof(double*)*(corpus->num_docs));
    // fix: the original memset covered only num_docs BYTES, leaving most
    // of the pointer array uninitialised
    memset(var_gamma, 0, sizeof(double*) * corpus->num_docs);

    for (d = 0; d < corpus->num_docs; ++d) {
        var_gamma[d] = (double*)malloc(sizeof(double) * NTOPICS);
        memset(var_gamma[d], 0, sizeof(double)*NTOPICS);
    }

    int max_length = max_corpus_length(corpus);

    phi = (double**)malloc(sizeof(double*)*max_length);
    // fix: same undersized memset as above (max_length bytes vs pointers)
    memset(phi, 0, sizeof(double*) * max_length);
    for (n = 0; n < max_length; ++n) {
        phi[n] = (double*)malloc(sizeof(double) * NTOPICS);
        memset(phi[n], 0, sizeof(double)*NTOPICS);
    }

    // initialize model
    lda_suffstats* ss = NULL;
    if (strncmp(start, "seeded",6)==0) {
        model = quiet_new_lda_model(corpus->num_terms, NTOPICS);
        model->alpha = INITIAL_ALPHA;
        ss = new_lda_suffstats(model);
        if (VERBOSE) {
            corpus_initialize_ss(ss, model, corpus);
        } else {
            quiet_corpus_initialize_ss(ss, model, corpus);
        }
        if (VERBOSE) {
            lda_mle(model, ss, 0);
        } else {
            quiet_lda_mle(model, ss, 0);
        }
    } else if (strncmp(start, "fixed",5)==0) {
        model = quiet_new_lda_model(corpus->num_terms, NTOPICS);
        model->alpha = INITIAL_ALPHA;
        ss = new_lda_suffstats(model);
        corpus_initialize_fixed_ss(ss, model, corpus);
        if (VERBOSE) {
            lda_mle(model, ss, 0);
        } else {
            quiet_lda_mle(model, ss, 0);
        }
    } else if (strncmp(start, "random",6)==0) {
        model = quiet_new_lda_model(corpus->num_terms, NTOPICS);
        model->alpha = INITIAL_ALPHA;
        ss = new_lda_suffstats(model);
        random_initialize_ss(ss, model);
        if (VERBOSE) {
            lda_mle(model, ss, 0);
        } else {
            quiet_lda_mle(model, ss, 0);
        }
    } else {
        model = load_lda_model(start);
        ss = new_lda_suffstats(model);
    }

    // save the model in the last_model global
    last_model = model;
    model_loaded = TRUE;

    // run expectation maximization
    int i = 0;
    double likelihood = 0.0, likelihood_old = 0, converged = 1;

    while (((converged < 0) || (converged > EM_CONVERGED) || (i <= 2)) && (i <= EM_MAX_ITER)) {
        i++;
        if (VERBOSE) printf("**** em iteration %d ****\n", i);
        likelihood = 0;
        zero_initialize_ss(ss, model);

        // e-step
        for (d = 0; d < corpus->num_docs; d++) {
            if ((d % 1000) == 0 && VERBOSE) printf("document %d\n",d);
            likelihood += doc_e_step(&(corpus->docs[d]), var_gamma[d], phi, model, ss);
        }

        // m-step
        if (VERBOSE) {
            lda_mle(model, ss, ESTIMATE_ALPHA);
        } else {
            quiet_lda_mle(model, ss, ESTIMATE_ALPHA);
        }

        // check for convergence; widen inference effort if the bound regressed
        converged = (likelihood_old - likelihood) / (likelihood_old);
        if (converged < 0) VAR_MAX_ITER = VAR_MAX_ITER * 2;
        likelihood_old = likelihood;

        // store model and likelihood
        last_model = model;
        last_gamma = var_gamma;
        last_phi = phi;
    }

    // expose the final state through the globals
    last_model = model;
    last_gamma = var_gamma;
    last_phi = phi;

    free_lda_suffstats(model,ss);
}
|
604
|
+
|
605
|
+
|
606
|
+
/*
|
607
|
+
* Set all of the settings in one command:
|
608
|
+
*
|
609
|
+
* * init_alpha
|
610
|
+
* * num_topics
|
611
|
+
* * max_iter
|
612
|
+
* * convergence
|
613
|
+
* * em_max_iter
|
614
|
+
* * em_convergence
|
615
|
+
* * est_alpha
|
616
|
+
*/
|
617
|
+
/*
 * Set all of the settings in one command:
 *
 * * init_alpha
 * * num_topics
 * * max_iter
 * * convergence
 * * em_max_iter
 * * em_convergence
 * * est_alpha
 */
static VALUE wrap_set_config(VALUE self, VALUE init_alpha, VALUE num_topics, VALUE max_iter, VALUE convergence, VALUE em_max_iter, VALUE em_convergence, VALUE est_alpha) {
    INITIAL_ALPHA = NUM2DBL(init_alpha);
    NTOPICS = NUM2INT(num_topics);
    // fix: the original checked `NTOPICS < 0`, letting a zero topic count
    // through even though the message (and the later division by
    // num_topics during inference) requires a positive count
    if( NTOPICS <= 0 ) { rb_raise(rb_eRuntimeError, "NTOPICS must be greater than 0 - %d", NTOPICS); }
    VAR_MAX_ITER = NUM2INT(max_iter);
    VAR_CONVERGED = (float)NUM2DBL(convergence);
    EM_MAX_ITER = NUM2INT(em_max_iter);
    EM_CONVERGED = (float)NUM2DBL(em_convergence);
    ESTIMATE_ALPHA = NUM2INT(est_alpha);

    return Qtrue;
}
|
629
|
+
|
630
|
+
/*
|
631
|
+
* Get the maximum iterations.
|
632
|
+
*/
|
633
|
+
/*
 * Get the maximum number of variational-inference iterations.
 */
static VALUE wrap_get_max_iter(VALUE self) {
    int iters = VAR_MAX_ITER;
    return rb_int_new(iters);
}
|
636
|
+
|
637
|
+
/*
|
638
|
+
* Set the maximum iterations.
|
639
|
+
*/
|
640
|
+
/*
 * Set the maximum number of variational-inference iterations.
 */
static VALUE wrap_set_max_iter(VALUE self, VALUE max_iter) {
    int iters = NUM2INT(max_iter);   /* coerce before storing */
    VAR_MAX_ITER = iters;
    return max_iter;
}
|
645
|
+
|
646
|
+
/*
|
647
|
+
* Get the convergence setting.
|
648
|
+
*/
|
649
|
+
/*
 * Get the variational-inference convergence threshold.
 */
static VALUE wrap_get_converged(VALUE self) {
    double thresh = VAR_CONVERGED;
    return rb_float_new(thresh);
}
|
652
|
+
|
653
|
+
/*
|
654
|
+
* Set the convergence setting.
|
655
|
+
*/
|
656
|
+
/*
 * Set the variational-inference convergence threshold.
 */
static VALUE wrap_set_converged(VALUE self, VALUE converged) {
    float thresh = (float)NUM2DBL(converged);
    VAR_CONVERGED = thresh;
    return converged;
}
|
661
|
+
|
662
|
+
/*
|
663
|
+
* Get the max iterations for the EM algorithm.
|
664
|
+
*/
|
665
|
+
/*
 * Get the maximum number of EM iterations.
 */
static VALUE wrap_get_em_max_iter(VALUE self) {
    int iters = EM_MAX_ITER;
    return rb_int_new(iters);
}
|
668
|
+
|
669
|
+
/*
|
670
|
+
* Set the max iterations for the EM algorithm.
|
671
|
+
*/
|
672
|
+
/*
 * Set the maximum number of EM iterations.
 */
static VALUE wrap_set_em_max_iter(VALUE self, VALUE em_max_iter) {
    int iters = NUM2INT(em_max_iter);   /* coerce before storing */
    EM_MAX_ITER = iters;
    return em_max_iter;
}
|
677
|
+
|
678
|
+
/*
|
679
|
+
* Get the convergence value for EM.
|
680
|
+
*/
|
681
|
+
/*
 * Get the EM convergence threshold.
 */
static VALUE wrap_get_em_converged(VALUE self) {
    double thresh = EM_CONVERGED;
    return rb_float_new(thresh);
}
|
684
|
+
|
685
|
+
/*
|
686
|
+
* Set the convergence value for EM.
|
687
|
+
*/
|
688
|
+
/*
 * Set the EM convergence threshold.
 */
static VALUE wrap_set_em_converged(VALUE self, VALUE em_converged) {
    float thresh = (float)NUM2DBL(em_converged);
    EM_CONVERGED = thresh;
    return em_converged;
}
|
693
|
+
|
694
|
+
/*
|
695
|
+
* Get the initial alpha value.
|
696
|
+
*/
|
697
|
+
static VALUE wrap_get_initial_alpha(VALUE self) {
|
698
|
+
return rb_float_new(INITIAL_ALPHA);
|
699
|
+
}
|
700
|
+
|
701
|
+
/*
|
702
|
+
* Get the number of topics being clustered.
|
703
|
+
*/
|
704
|
+
static VALUE wrap_get_num_topics(VALUE self) {
|
705
|
+
return rb_int_new(NTOPICS);
|
706
|
+
}
|
707
|
+
|
708
|
+
/*
|
709
|
+
* Set the initial value of alpha.
|
710
|
+
*/
|
711
|
+
static VALUE wrap_set_initial_alpha(VALUE self, VALUE initial_alpha) {
|
712
|
+
INITIAL_ALPHA = (float)NUM2DBL(initial_alpha);
|
713
|
+
|
714
|
+
return initial_alpha;
|
715
|
+
}
|
716
|
+
|
717
|
+
/*
|
718
|
+
* Set the number of topics to be clustered.
|
719
|
+
*/
|
720
|
+
static VALUE wrap_set_num_topics(VALUE self, VALUE ntopics) {
|
721
|
+
NTOPICS = NUM2INT(ntopics);
|
722
|
+
|
723
|
+
return ntopics;
|
724
|
+
}
|
725
|
+
|
726
|
+
/*
|
727
|
+
* Get the estimate alpha value (fixed = 0).
|
728
|
+
*/
|
729
|
+
static VALUE wrap_get_estimate_alpha(VALUE self) {
|
730
|
+
return rb_int_new(ESTIMATE_ALPHA);
|
731
|
+
}
|
732
|
+
|
733
|
+
/*
|
734
|
+
* Set the estimate alpha value (fixed = 0).
|
735
|
+
*/
|
736
|
+
static VALUE wrap_set_estimate_alpha(VALUE self, VALUE est_alpha) {
|
737
|
+
ESTIMATE_ALPHA = NUM2INT(est_alpha);
|
738
|
+
|
739
|
+
return est_alpha;
|
740
|
+
}
|
741
|
+
|
742
|
+
/*
|
743
|
+
* Get the verbosity setting.
|
744
|
+
*/
|
745
|
+
static VALUE wrap_get_verbosity(VALUE self) {
|
746
|
+
if (VERBOSE) {
|
747
|
+
return Qtrue;
|
748
|
+
} else {
|
749
|
+
return Qfalse;
|
750
|
+
}
|
751
|
+
}
|
752
|
+
|
753
|
+
|
754
|
+
/*
|
755
|
+
* Set the verbosity level (true, false).
|
756
|
+
*/
|
757
|
+
static VALUE wrap_set_verbosity(VALUE self, VALUE verbosity) {
|
758
|
+
if (verbosity == Qtrue) {
|
759
|
+
VERBOSE = TRUE;
|
760
|
+
} else {
|
761
|
+
VERBOSE = FALSE;
|
762
|
+
}
|
763
|
+
|
764
|
+
return verbosity;
|
765
|
+
}
|
766
|
+
|
767
|
+
|
768
|
+
|
769
|
+
/*
|
770
|
+
* Run the EM algorithm with the loaded corpus and using the current
|
771
|
+
* configuration settings. The +start+ parameter can take the following
|
772
|
+
* values:
|
773
|
+
* * random - starting alpha are randomized
|
774
|
+
* * seeded - loaded based on the corpus values
|
775
|
+
* * <filename> - path to the file containing the model
|
776
|
+
*/
|
777
|
+
static VALUE wrap_em(VALUE self, VALUE start) {
|
778
|
+
if (!corpus_loaded)
|
779
|
+
return Qnil;
|
780
|
+
|
781
|
+
run_quiet_em(STR2CSTR(start), last_corpus);
|
782
|
+
|
783
|
+
return Qnil;
|
784
|
+
}
|
785
|
+
|
786
|
+
|
787
|
+
/*
|
788
|
+
* Load settings from the given file.
|
789
|
+
*/
|
790
|
+
static VALUE wrap_load_settings(VALUE self, VALUE settings_file) {
|
791
|
+
read_settings(STR2CSTR(settings_file));
|
792
|
+
|
793
|
+
return Qtrue;
|
794
|
+
}
|
795
|
+
|
796
|
+
/*
|
797
|
+
* Load the corpus from the given file. This will not create
|
798
|
+
* a +Corpus+ object that is accessible, but it will load the corpus
|
799
|
+
* much faster.
|
800
|
+
*/
|
801
|
+
static VALUE wrap_load_corpus(VALUE self, VALUE filename) {
|
802
|
+
if (!corpus_loaded) {
|
803
|
+
last_corpus = read_data(STR2CSTR(filename));
|
804
|
+
corpus_loaded = TRUE;
|
805
|
+
return Qtrue;
|
806
|
+
} else {
|
807
|
+
return Qtrue;
|
808
|
+
}
|
809
|
+
}
|
810
|
+
|
811
|
+
/*
|
812
|
+
* Set the corpus.
|
813
|
+
*/
|
814
|
+
static VALUE wrap_ruby_corpus(VALUE self, VALUE rcorpus) {
|
815
|
+
corpus* c;
|
816
|
+
int i = 0;
|
817
|
+
int j = 0;
|
818
|
+
|
819
|
+
c = malloc(sizeof(corpus));
|
820
|
+
c->num_terms = NUM2INT(rb_iv_get(rcorpus, "@num_terms"));
|
821
|
+
c->num_docs = NUM2INT(rb_iv_get(rcorpus, "@num_docs"));
|
822
|
+
c->docs = (document*) malloc(sizeof(document) * c->num_docs);
|
823
|
+
VALUE doc_ary = rb_iv_get(rcorpus, "@documents");
|
824
|
+
for (i = 0; i < c->num_docs; i++) {
|
825
|
+
VALUE one_doc = rb_ary_entry(doc_ary, i);
|
826
|
+
VALUE words = rb_iv_get(one_doc, "@words");
|
827
|
+
VALUE counts = rb_iv_get(one_doc, "@counts");
|
828
|
+
|
829
|
+
c->docs[i].length = NUM2INT(rb_iv_get(one_doc, "@length"));
|
830
|
+
c->docs[i].total = NUM2INT(rb_iv_get(one_doc, "@total"));
|
831
|
+
c->docs[i].words = malloc(sizeof(int) * c->docs[i].length);
|
832
|
+
c->docs[i].counts = malloc(sizeof(int) * c->docs[i].length);
|
833
|
+
for (j = 0; j < c->docs[i].length; j++) {
|
834
|
+
int one_word = NUM2INT(rb_ary_entry(words, j));
|
835
|
+
int one_count = NUM2INT(rb_ary_entry(counts, j));
|
836
|
+
if( one_word > c->num_terms ) {
|
837
|
+
rb_raise(rb_eRuntimeError, "error term count(%d) less than word index(%d)", c->num_terms, one_word);
|
838
|
+
}
|
839
|
+
c->docs[i].words[j] = one_word;
|
840
|
+
c->docs[i].counts[j] = one_count;
|
841
|
+
}
|
842
|
+
}
|
843
|
+
|
844
|
+
last_corpus = c;
|
845
|
+
corpus_loaded = TRUE;
|
846
|
+
|
847
|
+
rb_iv_set(self, "@corpus", rcorpus);
|
848
|
+
|
849
|
+
return Qtrue;
|
850
|
+
}
|
851
|
+
|
852
|
+
|
853
|
+
/*
|
854
|
+
* Get the gamma values after the model has been run.
|
855
|
+
*/
|
856
|
+
static VALUE wrap_get_gamma(VALUE self) {
|
857
|
+
if (!model_loaded)
|
858
|
+
return Qnil;
|
859
|
+
|
860
|
+
// last_gamma is a double[num_docs][num_topics]
|
861
|
+
VALUE arr;
|
862
|
+
int i = 0, j = 0;
|
863
|
+
|
864
|
+
arr = rb_ary_new2(last_corpus->num_docs);
|
865
|
+
for (i = 0; i < last_corpus->num_docs; i++) {
|
866
|
+
VALUE arr2 = rb_ary_new2(last_model->num_topics);
|
867
|
+
for (j = 0; j < last_model->num_topics; j++) {
|
868
|
+
rb_ary_store(arr2, j, rb_float_new(last_gamma[i][j]));
|
869
|
+
}
|
870
|
+
rb_ary_store(arr, i, arr2);
|
871
|
+
}
|
872
|
+
|
873
|
+
return arr;
|
874
|
+
}
|
875
|
+
|
876
|
+
|
877
|
+
/*
|
878
|
+
* Compute the phi values by running inference after the initial EM run has been completed.
|
879
|
+
*
|
880
|
+
* Returns a 3D matrix: <tt>num_docs x length x num_topics</tt>.
|
881
|
+
*/
|
882
|
+
static VALUE wrap_get_phi(VALUE self) {
|
883
|
+
if (!model_loaded)
|
884
|
+
return Qnil;
|
885
|
+
|
886
|
+
VALUE arr = rb_ary_new2(last_corpus->num_docs);
|
887
|
+
int i = 0, j = 0, k = 0;
|
888
|
+
|
889
|
+
//int max_length = max_corpus_length(last_corpus);
|
890
|
+
short error = 0;
|
891
|
+
|
892
|
+
for (i = 0; i < last_corpus->num_docs; i++) {
|
893
|
+
VALUE arr1 = rb_ary_new2(last_corpus->docs[i].length);
|
894
|
+
|
895
|
+
lda_inference(&(last_corpus->docs[i]), last_model, last_gamma[i], last_phi, &error);
|
896
|
+
|
897
|
+
for (j = 0; j < last_corpus->docs[i].length; j++) {
|
898
|
+
VALUE arr2 = rb_ary_new2(last_model->num_topics);
|
899
|
+
|
900
|
+
for (k = 0; k < last_model->num_topics; k++) {
|
901
|
+
rb_ary_store(arr2, k, rb_float_new(last_phi[j][k]));
|
902
|
+
}
|
903
|
+
|
904
|
+
rb_ary_store(arr1, j, arr2);
|
905
|
+
}
|
906
|
+
|
907
|
+
rb_ary_store(arr, i, arr1);
|
908
|
+
}
|
909
|
+
|
910
|
+
return arr;
|
911
|
+
}
|
912
|
+
|
913
|
+
|
914
|
+
|
915
|
+
/*
|
916
|
+
* Get the beta matrix after the model has been run.
|
917
|
+
*/
|
918
|
+
static VALUE wrap_get_model_beta(VALUE self) {
|
919
|
+
if (!model_loaded)
|
920
|
+
return Qnil;
|
921
|
+
|
922
|
+
// beta is a double[num_topics][num_terms]
|
923
|
+
VALUE arr;
|
924
|
+
int i = 0, j = 0;
|
925
|
+
|
926
|
+
arr = rb_ary_new2(last_model->num_topics);
|
927
|
+
for (i = 0; i < last_model->num_topics; i++) {
|
928
|
+
VALUE arr2 = rb_ary_new2(last_model->num_terms);
|
929
|
+
for (j = 0; j < last_model->num_terms; j++) {
|
930
|
+
rb_ary_store(arr2, j, rb_float_new(last_model->log_prob_w[i][j]));
|
931
|
+
}
|
932
|
+
rb_ary_store(arr, i, arr2);
|
933
|
+
}
|
934
|
+
|
935
|
+
return arr;
|
936
|
+
}
|
937
|
+
|
938
|
+
|
939
|
+
/*
|
940
|
+
* Get the settings used for the model.
|
941
|
+
*/
|
942
|
+
static VALUE wrap_get_model_settings(VALUE self) {
|
943
|
+
if (!model_loaded)
|
944
|
+
return Qnil;
|
945
|
+
|
946
|
+
VALUE arr;
|
947
|
+
|
948
|
+
arr = rb_ary_new();
|
949
|
+
rb_ary_push(arr, rb_int_new(last_model->num_topics));
|
950
|
+
rb_ary_push(arr, rb_int_new(last_model->num_terms));
|
951
|
+
rb_ary_push(arr, rb_float_new(last_model->alpha));
|
952
|
+
|
953
|
+
return arr; // [num_topics, num_terms, alpha]
|
954
|
+
}
|
955
|
+
|
956
|
+
|
957
|
+
/*
 * Extension entry point, invoked by the Ruby runtime when the compiled
 * library is required. Resets the module-level state flags, loads the
 * pure-Ruby half of the gem, and registers the Lda module, its classes,
 * and every wrapper method defined above on Lda::Lda.
 */
void Init_lda() {
  // fresh state: nothing loaded yet, verbose output on by default
  corpus_loaded = FALSE;
  model_loaded = FALSE;
  VERBOSE = TRUE;

  // pull in the Ruby-side classes (Corpus, Document, ...) first
  rb_require("lda-ruby");

  rb_cLdaModule = rb_define_module("Lda");
  rb_cLda = rb_define_class_under(rb_cLdaModule, "Lda", rb_cObject);
  rb_cLdaCorpus = rb_define_class_under(rb_cLdaModule, "Corpus", rb_cObject);
  rb_cLdaDocument = rb_define_class_under(rb_cLdaModule, "Document", rb_cObject);

  // method to load the corpus
  rb_define_method(rb_cLda, "fast_load_corpus_from_file", wrap_load_corpus, 1);
  rb_define_method(rb_cLda, "corpus=", wrap_ruby_corpus, 1);

  // method to run em
  rb_define_method(rb_cLda, "em", wrap_em, 1);

  // method to load settings from file
  rb_define_method(rb_cLda, "load_settings", wrap_load_settings, 1);

  // method to set all the config options at once
  rb_define_method(rb_cLda, "set_config", wrap_set_config, 5);

  // accessor stuff for main settings
  rb_define_method(rb_cLda, "max_iter", wrap_get_max_iter, 0);
  rb_define_method(rb_cLda, "max_iter=", wrap_set_max_iter, 1);
  rb_define_method(rb_cLda, "convergence", wrap_get_converged, 0);
  rb_define_method(rb_cLda, "convergence=", wrap_set_converged, 1);
  rb_define_method(rb_cLda, "em_max_iter", wrap_get_em_max_iter, 0);
  rb_define_method(rb_cLda, "em_max_iter=", wrap_set_em_max_iter, 1);
  rb_define_method(rb_cLda, "em_convergence", wrap_get_em_converged, 0);
  rb_define_method(rb_cLda, "em_convergence=", wrap_set_em_converged, 1);
  rb_define_method(rb_cLda, "init_alpha=", wrap_set_initial_alpha, 1);
  rb_define_method(rb_cLda, "init_alpha", wrap_get_initial_alpha, 0);
  rb_define_method(rb_cLda, "est_alpha=", wrap_set_estimate_alpha, 1);
  rb_define_method(rb_cLda, "est_alpha", wrap_get_estimate_alpha, 0);
  rb_define_method(rb_cLda, "num_topics", wrap_get_num_topics, 0);
  rb_define_method(rb_cLda, "num_topics=", wrap_set_num_topics, 1);
  rb_define_method(rb_cLda, "verbose", wrap_get_verbosity, 0);
  rb_define_method(rb_cLda, "verbose=", wrap_set_verbosity, 1);

  // retrieve model and gamma
  rb_define_method(rb_cLda, "beta", wrap_get_model_beta, 0);
  rb_define_method(rb_cLda, "gamma", wrap_get_gamma, 0);
  rb_define_method(rb_cLda, "compute_phi", wrap_get_phi, 0);
  rb_define_method(rb_cLda, "model", wrap_get_model_settings, 0);
}
|
1006
|
+
|
1007
|
+
#endif
|