RubyGems - lda-ruby - Versions diffs - 0.3.1 - Mend

lda-ruby 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

data/.gitignore +5 -0
data/CHANGELOG +22 -0
data/README +21 -0
data/README.markdown +38 -0
data/Rakefile +58 -0
data/VERSION.yml +4 -0
data/ext/lda-ruby/Makefile +181 -0
data/ext/lda-ruby/cokus.c +145 -0
data/ext/lda-ruby/cokus.h +27 -0
data/ext/lda-ruby/extconf.rb +9 -0
data/ext/lda-ruby/lda-alpha.c +96 -0
data/ext/lda-ruby/lda-alpha.h +21 -0
data/ext/lda-ruby/lda-data.c +67 -0
data/ext/lda-ruby/lda-data.h +14 -0
data/ext/lda-ruby/lda-inference.c +1007 -0
data/ext/lda-ruby/lda-inference.h +63 -0
data/ext/lda-ruby/lda-model.c +345 -0
data/ext/lda-ruby/lda-model.h +29 -0
data/ext/lda-ruby/lda.h +54 -0
data/ext/lda-ruby/utils.c +111 -0
data/ext/lda-ruby/utils.h +18 -0
data/lda-ruby.gemspec +78 -0
data/lib/lda-ruby.rb +168 -0
data/lib/lda-ruby/corpus/corpus.rb +34 -0
data/lib/lda-ruby/corpus/data_corpus.rb +22 -0
data/lib/lda-ruby/corpus/directory_corpus.rb +25 -0
data/lib/lda-ruby/corpus/text_corpus.rb +22 -0
data/lib/lda-ruby/document/data_document.rb +30 -0
data/lib/lda-ruby/document/document.rb +36 -0
data/lib/lda-ruby/document/text_document.rb +37 -0
data/lib/lda-ruby/vocabulary.rb +46 -0
data/license.txt +504 -0
data/test/data/.gitignore +2 -0
data/test/data/docs.dat +46 -0
data/test/data/wiki-test-docs.yml +123 -0
data/test/lda_ruby_test.rb +274 -0
data/test/test_helper.rb +10 -0
metadata +95 -0

data/ext/lda-ruby/lda-inference.h ADDED Viewed

@@ -0,0 +1,63 @@
+/*
+ * Shared settings, state and prototypes for the LDA inference code.
+ *
+ * NOTE(review): the variables below are *defined* in this header rather
+ * than declared extern.  LAG carries an initializer, so including this
+ * header from more than one translation unit produces duplicate
+ * definitions of it; the remaining objects are tentative definitions,
+ * which only link when the toolchain merges common symbols.  Consider
+ * extern declarations here plus a single definition in one .c file --
+ * confirm against the files that include this header first.
+ */
+#ifndef LDA_INFERENCE_H
+#define LDA_INFERENCE_H
+#include <stdlib.h>
+#include <math.h>
+#include <float.h>
+#include "lda.h"
+#include "utils.h"
+/*
+ * Run-time settings.  The names suggest EM / variational convergence
+ * thresholds and iteration caps; presumably populated by read_settings()
+ * or by the Ruby binding -- TODO confirm, the writers are not visible here.
+ */
+int LAG = 5;
+float EM_CONVERGED;
+int EM_MAX_ITER;
+int ESTIMATE_ALPHA;
+double INITIAL_ALPHA;
+int NTOPICS;
+float VAR_CONVERGED;
+int VAR_MAX_ITER;
+/*
+ * State that only exists when the extension is built with USE_RUBY
+ * defined.  The last line defines enum BOOL (FALSE = 0, TRUE = 1) and
+ * three variables of that type.
+ */
+#ifdef USE_RUBY
+corpus *last_corpus;
+lda_model *last_model;
+double **last_gamma;
+double **last_phi;
+enum BOOL { FALSE, TRUE } corpus_loaded, model_loaded, VERBOSE;
+#endif
+/*
+ * Prototypes for routines implemented elsewhere (presumably
+ * lda-inference.c, which is not part of this view).
+ */
+double lda_inference(document*, lda_model*, double*, double**, short*);
+double compute_likelihood(document*, lda_model*, double**, double*);
+double doc_e_step(document* doc,
+                  double* gamma,
+                  double** phi,
+                  lda_model* model,
+                  lda_suffstats* ss);
+void save_gamma(char* filename,
+                double** gamma,
+                int num_docs,
+                int num_topics);
+void run_em(char* start,
+            char* directory,
+            corpus* corpus);
+/* Variant of run_em without the directory argument; Ruby builds only. */
+#ifdef USE_RUBY
+void run_quiet_em(char* start, corpus* corpus);
+#endif
+void read_settings(char* filename);
+void infer(char* model_root,
+           char* save,
+           corpus* corpus);
+#endif

data/ext/lda-ruby/lda-model.c ADDED Viewed

@@ -0,0 +1,345 @@
+// (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu)
+// This file is part of LDA-C.
+// LDA-C is free software; you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 2 of the License, or (at your
+// option) any later version.
+// LDA-C is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+// for more details.
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+#include "lda-model.h"
+#include <string.h>
+/*
+ * lda_mle: compute the maximum-likelihood LDA model from sufficient
+ * statistics.
+ *
+ * Every topic/term pair with a positive count gets
+ * log(class_word) - log(class_total); every other pair gets the floor
+ * value -100.  When estimate_alpha == 1, alpha is re-estimated through
+ * opt_alpha() and the new value is printed to stdout.
+ */
+void lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha) {
+	int topic;
+	for (topic = 0; topic < model->num_topics; topic++) {
+		int term;
+		for (term = 0; term < model->num_terms; term++) {
+			double count = ss->class_word[topic][term];
+			if (!(count > 0)) {
+				model->log_prob_w[topic][term] = -100;
+				continue;
+			}
+			model->log_prob_w[topic][term] =
+				log(count) - log(ss->class_total[topic]);
+		}
+	}
+	if (estimate_alpha != 1)
+		return;
+	model->alpha = opt_alpha(ss->alpha_suffstats,
+		ss->num_docs,
+		model->num_topics);
+	printf("new alpha = %5.5f\n", model->alpha);
+}
+/*
+ * quiet_lda_mle: same computation as lda_mle, but alpha is re-estimated
+ * through quiet_opt_alpha() and nothing is printed.
+ */
+void quiet_lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha) {
+	int t, v;
+	for (t = 0; t < model->num_topics; t++) {
+		for (v = 0; v < model->num_terms; v++) {
+			/* Positive counts become log relative frequencies;
+			 * everything else is clamped to -100. */
+			model->log_prob_w[t][v] = (ss->class_word[t][v] > 0)
+				? log(ss->class_word[t][v]) - log(ss->class_total[t])
+				: -100;
+		}
+	}
+	if (estimate_alpha == 1) {
+		model->alpha = quiet_opt_alpha(ss->alpha_suffstats,
+			ss->num_docs,
+			model->num_topics);
+	}
+}
+/*
+ * new_lda_suffstats: allocate zero-filled sufficient statistics sized for
+ * `model` (num_topics totals and a num_topics x num_terms count table).
+ *
+ * Returns the new object, or NULL when any allocation fails; in that case
+ * everything allocated so far is released.  Release a successful result
+ * with free_lda_suffstats().
+ */
+lda_suffstats* new_lda_suffstats(lda_model* model) {
+	size_t topics = model->num_topics > 0 ? (size_t)model->num_topics : 0;
+	size_t terms = model->num_terms > 0 ? (size_t)model->num_terms : 0;
+	size_t i;
+	/* calloc zeroes num_docs and alpha_suffstats as the old memset did. */
+	lda_suffstats* ss = calloc(1, sizeof *ss);
+	if (ss == NULL)
+		return NULL;
+	/* Ask for at least one element so a NULL result always means
+	 * out-of-memory, never "zero-sized request". */
+	ss->class_total = calloc(topics ? topics : 1, sizeof *ss->class_total);
+	ss->class_word = calloc(topics ? topics : 1, sizeof *ss->class_word);
+	if (ss->class_total == NULL || ss->class_word == NULL)
+		goto fail;
+	for (i = 0; i < topics; ++i) {
+		ss->class_word[i] = calloc(terms ? terms : 1, sizeof **ss->class_word);
+		if (ss->class_word[i] == NULL)
+			goto fail;
+	}
+	return ss;
+fail:
+	if (ss->class_word != NULL) {
+		/* Rows past the failure point are still NULL; free(NULL) is a no-op. */
+		for (i = 0; i < topics; ++i)
+			free(ss->class_word[i]);
+	}
+	free(ss->class_word);
+	free(ss->class_total);
+	free(ss);
+	return NULL;
+}
+/*
+ * free_lda_suffstats: release the totals array, each per-topic count row,
+ * the row table and finally the suffstats struct itself.  The number of
+ * rows is taken from model->num_topics.
+ */
+void free_lda_suffstats(lda_model* model, lda_suffstats* ss) {
+	const int topic_count = model->num_topics;
+	int row = 0;
+	free(ss->class_total);
+	while (row < topic_count) {
+		free(ss->class_word[row]);
+		row++;
+	}
+	free(ss->class_word);
+	free(ss);
+}
+/*
+ * various initializations for the sufficient statistics
+ *
+ * zero_initialize_ss: reset every per-topic total and per-topic/term
+ * count to 0, and clear num_docs and alpha_suffstats.
+ */
+void zero_initialize_ss(lda_suffstats* ss, lda_model* model) {
+	int topic;
+	for (topic = 0; topic < model->num_topics; topic++) {
+		double* row = ss->class_word[topic];
+		int term;
+		ss->class_total[topic] = 0;
+		for (term = 0; term < model->num_terms; term++)
+			row[term] = 0;
+	}
+	ss->num_docs = 0;
+	ss->alpha_suffstats = 0;
+}
+/*
+ * random_initialize_ss: add 1/num_terms plus a myrand() draw to every
+ * topic/term count and accumulate the updated counts into the per-topic
+ * totals.  Existing values are added to, not overwritten.
+ */
+void random_initialize_ss(lda_suffstats* ss, lda_model* model) {
+	const int topic_count = model->num_topics;
+	const int term_count = model->num_terms;
+	int topic;
+	for (topic = 0; topic < topic_count; topic++) {
+		double* row = ss->class_word[topic];
+		int term;
+		for (term = 0; term < term_count; term++) {
+			row[term] += 1.0/term_count + myrand();
+			ss->class_total[topic] += row[term];
+		}
+	}
+}
+/*
+ * corpus_initialize_ss: seed each topic from NUM_INIT randomly chosen
+ * documents of `c`.  The chosen document's word counts are added to the
+ * topic's row, then every term count is incremented by 1.0 and summed
+ * into the topic total.  The index of each chosen document is printed to
+ * stdout.
+ */
+void corpus_initialize_ss(lda_suffstats* ss, lda_model* model, corpus* c)
+{
+	const int topic_count = model->num_topics;
+	int topic;
+	for (topic = 0; topic < topic_count; topic++) {
+		double* word_counts = ss->class_word[topic];
+		int seed, pos, term;
+		for (seed = 0; seed < NUM_INIT; seed++) {
+			int picked = floor(myrand() * c->num_docs);
+			document* doc;
+			printf("initialized with document %d\n", picked);
+			doc = &c->docs[picked];
+			for (pos = 0; pos < doc->length; pos++)
+				word_counts[doc->words[pos]] += doc->counts[pos];
+		}
+		for (term = 0; term < model->num_terms; term++) {
+			word_counts[term] += 1.0;
+			ss->class_total[topic] += word_counts[term];
+		}
+	}
+}
+/*
+ * quiet_corpus_initialize_ss: same seeding as corpus_initialize_ss, but
+ * without printing which documents were chosen.
+ */
+void quiet_corpus_initialize_ss(lda_suffstats* ss, lda_model* model, corpus* c)
+{
+	int t;
+	for (t = 0; t < model->num_topics; t++) {
+		double* row = ss->class_word[t];
+		int draw = 0;
+		int w;
+		while (draw < NUM_INIT) {
+			int doc_index = floor(myrand() * c->num_docs);
+			document* seed_doc = &(c->docs[doc_index]);
+			int p;
+			for (p = 0; p < seed_doc->length; p++)
+				row[seed_doc->words[p]] += seed_doc->counts[p];
+			draw++;
+		}
+		/* Add 1.0 to every term so no count stays at zero, then total. */
+		for (w = 0; w < model->num_terms; w++) {
+			row[w] += 1.0;
+			ss->class_total[t] += row[w];
+		}
+	}
+}
+/*
+ * corpus_initialize_fixed_ss: seed topic k from document k of the corpus,
+ * for k below min(num_topics, num_docs).  Topics beyond the number of
+ * documents are left untouched, so results may be poor when
+ * num_topics > num_docs.
+ */
+void corpus_initialize_fixed_ss(lda_suffstats* ss, lda_model* model, corpus* c) {
+    int seeded = model->num_topics;
+    int topic;
+    if (seeded > c->num_docs)
+        seeded = c->num_docs;
+    for (topic = 0; topic < seeded; topic++) {
+        document* doc = &c->docs[topic];
+        double* row = ss->class_word[topic];
+        int pos, term;
+        for (pos = 0; pos < doc->length; pos++)
+            row[doc->words[pos]] += doc->counts[pos];
+        for (term = 0; term < model->num_terms; term++) {
+            row[term] += 1.0;
+            ss->class_total[topic] += row[term];
+        }
+    }
+}
+/*
+ * new_lda_model: allocate an LDA model with alpha = 1.0 and a zero-filled
+ * num_topics x num_terms log_prob_w table, and print its dimensions to
+ * stdout.
+ *
+ * Returns the new model, or NULL when any allocation fails; in that case
+ * everything allocated so far is released.
+ */
+lda_model* new_lda_model(int num_terms, int num_topics) {
+	size_t topics = num_topics > 0 ? (size_t)num_topics : 0;
+	size_t terms = num_terms > 0 ? (size_t)num_terms : 0;
+	size_t i;
+	lda_model* model = malloc(sizeof *model);
+	if (model == NULL)
+		return NULL;
+	model->num_topics = num_topics;
+	model->num_terms = num_terms;
+	model->alpha = 1.0;
+	/* Ask for at least one element so NULL always means out-of-memory. */
+	model->log_prob_w = calloc(topics ? topics : 1, sizeof *model->log_prob_w);
+	if (model->log_prob_w == NULL) {
+		free(model);
+		return NULL;
+	}
+	printf("new model with: %d topics and %d terms\n", num_topics, num_terms);
+	for (i = 0; i < topics; i++) {
+		model->log_prob_w[i] = calloc(terms ? terms : 1, sizeof **model->log_prob_w);
+		if (model->log_prob_w[i] == NULL) {
+			/* Unwind the rows that were already allocated. */
+			while (i > 0)
+				free(model->log_prob_w[--i]);
+			free(model->log_prob_w);
+			free(model);
+			return NULL;
+		}
+	}
+	return model;
+}
+/*
+ * quiet_new_lda_model: allocate an LDA model with alpha = 1.0 and a
+ * zero-filled num_topics x num_terms log_prob_w table, without printing
+ * anything.
+ *
+ * Returns the new model, or NULL when any allocation fails; in that case
+ * everything allocated so far is released.
+ */
+lda_model* quiet_new_lda_model(int num_terms, int num_topics) {
+	size_t topics = num_topics > 0 ? (size_t)num_topics : 0;
+	size_t terms = num_terms > 0 ? (size_t)num_terms : 0;
+	size_t i;
+	lda_model* model = malloc(sizeof *model);
+	if (model == NULL)
+		return NULL;
+	model->num_topics = num_topics;
+	model->num_terms = num_terms;
+	model->alpha = 1.0;
+	/* Ask for at least one element so NULL always means out-of-memory. */
+	model->log_prob_w = calloc(topics ? topics : 1, sizeof *model->log_prob_w);
+	if (model->log_prob_w == NULL) {
+		free(model);
+		return NULL;
+	}
+	for (i = 0; i < topics; i++) {
+		model->log_prob_w[i] = calloc(terms ? terms : 1, sizeof **model->log_prob_w);
+		if (model->log_prob_w[i] == NULL) {
+			/* Unwind the rows that were already allocated. */
+			while (i > 0)
+				free(model->log_prob_w[--i]);
+			free(model->log_prob_w);
+			free(model);
+			return NULL;
+		}
+	}
+	return model;
+}
+/*
+ * free_lda_model: release every log_prob_w row and then the row table.
+ *
+ * NOTE(review): the lda_model struct itself is not freed here, although
+ * new_lda_model() heap-allocates it.  Whether callers free it themselves
+ * cannot be seen from this file -- confirm before adding free(model).
+ */
+void free_lda_model(lda_model* model) {
+	double** rows = model->log_prob_w;
+	int topic = 0;
+	while (topic < model->num_topics) {
+		free(rows[topic]);
+		topic++;
+	}
+	free(rows);
+}
+/*
+ * save_lda_model: write a model to two text files.
+ *
+ *   <model_root>.beta   one line per topic, log_prob_w values as " %5.10f"
+ *   <model_root>.other  num_topics, num_terms and alpha, one per line
+ *
+ * The file name buffer is sized from model_root, so long paths no longer
+ * overflow a fixed 100-byte array.  If a file cannot be opened (or memory
+ * for the name cannot be obtained) a message is written to stderr and the
+ * function returns without writing that file.
+ */
+void save_lda_model(lda_model* model, char* model_root) {
+	/* ".other" is the longer suffix; sizeof includes the terminating NUL. */
+	size_t name_size = strlen(model_root) + sizeof(".other");
+	char* filename = malloc(name_size);
+	FILE* fileptr;
+	int i, j;
+	if (filename == NULL) {
+		fprintf(stderr, "save_lda_model: out of memory\n");
+		return;
+	}
+	snprintf(filename, name_size, "%s.beta", model_root);
+	fileptr = fopen(filename, "w");
+	if (fileptr == NULL) {
+		perror(filename);
+		goto done;
+	}
+	for (i = 0; i < model->num_topics; i++) {
+		for (j = 0; j < model->num_terms; j++) {
+			fprintf(fileptr, " %5.10f", model->log_prob_w[i][j]);
+		}
+		fprintf(fileptr, "\n");
+	}
+	fclose(fileptr);
+	snprintf(filename, name_size, "%s.other", model_root);
+	fileptr = fopen(filename, "w");
+	if (fileptr == NULL) {
+		perror(filename);
+		goto done;
+	}
+	fprintf(fileptr, "num_topics %d\n", model->num_topics);
+	fprintf(fileptr, "num_terms %d\n", model->num_terms);
+	fprintf(fileptr, "alpha %5.10f\n", model->alpha);
+	fclose(fileptr);
+done:
+	free(filename);
+}
+/*
+ * load_lda_model: read a model previously written by save_lda_model from
+ * <model_root>.other (dimensions and alpha) and <model_root>.beta
+ * (log_prob_w values).  The name of each file is printed to stdout before
+ * it is opened.
+ *
+ * Values are parsed as double so the ten decimals written by
+ * save_lda_model are not rounded through float.  The file name buffer is
+ * sized from model_root instead of using a fixed 100-byte array.
+ *
+ * Returns the loaded model, or NULL (after a message on stderr) when a
+ * file cannot be opened, is malformed or truncated, or memory runs out.
+ */
+lda_model* load_lda_model(char* model_root) {
+	/* ".other" is the longer suffix; sizeof includes the terminating NUL. */
+	size_t name_size = strlen(model_root) + sizeof(".other");
+	char* filename = malloc(name_size);
+	FILE* fileptr;
+	lda_model* model = NULL;
+	int i, j, num_terms, num_topics;
+	double x, alpha;
+	int ok = 0;
+	if (filename == NULL) {
+		fprintf(stderr, "load_lda_model: out of memory\n");
+		return NULL;
+	}
+	snprintf(filename, name_size, "%s.other", model_root);
+	printf("loading %s\n", filename);
+	fileptr = fopen(filename, "r");
+	if (fileptr == NULL) {
+		perror(filename);
+		goto done;
+	}
+	ok = fscanf(fileptr, "num_topics %d\n", &num_topics) == 1
+		&& fscanf(fileptr, "num_terms %d\n", &num_terms) == 1
+		&& fscanf(fileptr, "alpha %lf\n", &alpha) == 1
+		&& num_topics >= 0 && num_terms >= 0;
+	fclose(fileptr);
+	if (!ok) {
+		fprintf(stderr, "load_lda_model: malformed header in %s\n", filename);
+		goto done;
+	}
+	ok = 0;
+	model = new_lda_model(num_terms, num_topics);
+	if (model == NULL)
+		goto done;
+	model->alpha = alpha;
+	snprintf(filename, name_size, "%s.beta", model_root);
+	printf("loading %s\n", filename);
+	fileptr = fopen(filename, "r");
+	if (fileptr == NULL) {
+		perror(filename);
+		goto done;
+	}
+	ok = 1;
+	for (i = 0; ok && i < num_topics; i++)
+	{
+		for (j = 0; ok && j < num_terms; j++)
+		{
+			if (fscanf(fileptr, "%lf", &x) == 1)
+				model->log_prob_w[i][j] = x;
+			else
+				ok = 0;
+		}
+	}
+	fclose(fileptr);
+	if (!ok)
+		fprintf(stderr, "load_lda_model: truncated or malformed %s\n", filename);
+done:
+	if (!ok && model != NULL) {
+		/* Release the half-built model here, struct included. */
+		for (i = 0; i < model->num_topics; i++)
+			free(model->log_prob_w[i]);
+		free(model->log_prob_w);
+		free(model);
+		model = NULL;
+	}
+	free(filename);
+	return model;
+}

data/ext/lda-ruby/lda-model.h ADDED Viewed

@@ -0,0 +1,29 @@
+/*
+ * lda-model.h: allocation, initialisation, estimation and (de)serialisation
+ * routines for lda_model and lda_suffstats (implemented in lda-model.c).
+ */
+#ifndef LDA_MODEL_H
+/* The guard must define the same macro it tests, otherwise it never guards. */
+#define LDA_MODEL_H
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include "lda.h"
+#include "lda-alpha.h"
+#include "cokus.h"
+/* randomMT() divided by 2^32; presumably uniform in [0, 1) -- depends on
+ * randomMT() returning a 32-bit value (see cokus.h). */
+#define myrand() ((double) (((unsigned long) randomMT()) / 4294967296.))
+/* Number of documents drawn per topic by the corpus_initialize_ss variants. */
+#define NUM_INIT 1
+/* Smaller of A and B, converted to int.  Arguments are evaluated twice. */
+#define MIN(A,B) ((int)(((A) > (B)) ? (B) : (A)))
+void free_lda_model(lda_model*);
+void save_lda_model(lda_model*, char*);
+lda_model* new_lda_model(int, int);
+lda_model* quiet_new_lda_model(int, int);
+lda_suffstats* new_lda_suffstats(lda_model* model);
+void free_lda_suffstats(lda_model* model, lda_suffstats* ss);
+void corpus_initialize_ss(lda_suffstats* ss, lda_model* model, corpus* c);
+void quiet_corpus_initialize_ss(lda_suffstats* ss, lda_model* model, corpus* c);
+void corpus_initialize_fixed_ss(lda_suffstats* ss, lda_model* model, corpus* c);
+void random_initialize_ss(lda_suffstats* ss, lda_model* model);
+void zero_initialize_ss(lda_suffstats* ss, lda_model* model);
+void lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha);
+void quiet_lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha);
+lda_model* load_lda_model(char* model_root);
+#endif

data/ext/lda-ruby/lda.h ADDED Viewed

@@ -0,0 +1,54 @@
+// (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu)
+// This file is part of LDA-C.
+// LDA-C is free software; you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 2 of the License, or (at your
+// option) any later version.
+// LDA-C is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+// for more details.
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+/* Core data types shared by the LDA sources. */
+#ifndef LDA_H
+#define LDA_H
+/* One document stored as parallel arrays of `length` entries. */
+typedef struct {
+	int* words;   /* term ids; used as column indices into class_word rows */
+	int* counts;  /* counts[n] is the weight added for words[n] */
+	int length;   /* number of entries in words / counts */
+	int total;    /* presumably the sum of counts -- TODO confirm */
+} document;
+/* A collection of documents. */
+typedef struct {
+	document* docs;  /* array indexed 0 .. num_docs-1 */
+	int num_terms;   /* presumably the vocabulary size -- TODO confirm */
+	int num_docs;    /* number of entries in docs */
+} corpus;
+/* Model parameters. */
+typedef struct {
+	double alpha;         /* set to 1.0 on creation; re-estimated by lda_mle */
+	double** log_prob_w;  /* num_topics rows of num_terms log probabilities */
+	int num_topics;
+	int num_terms;
+} lda_model;
+/* Sufficient statistics accumulated for a model. */
+typedef struct {
+	double** class_word;     /* num_topics rows of num_terms counts */
+	double* class_total;     /* per-topic sum over the class_word row */
+	double alpha_suffstats;  /* passed to opt_alpha() when alpha is estimated */
+	int num_docs;            /* passed to opt_alpha() as the document count */
+} lda_suffstats;
+#endif