ealdent-lda-ruby 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,57 @@
#ifndef LDA_INFERENCE_H
#define LDA_INFERENCE_H

#include <math.h>
#include <float.h>
#include <assert.h>
#include "lda.h"
#include "utils.h"

// NOTE(review): the identifiers below are variable *definitions*, not
// declarations.  If this header is included by more than one translation
// unit the linker will see duplicate symbols; the conventional fix is
// `extern` declarations here plus exactly one definition in a .c file.
int LAG = 5;

// Global run parameters — presumably populated by read_settings(); verify
// against the settings-file parser (not visible in this file).
float EM_CONVERGED;    // EM loop convergence threshold
int EM_MAX_ITER;       // maximum number of EM iterations
int ESTIMATE_ALPHA;    // nonzero => re-estimate alpha during EM
double INITIAL_ALPHA;  // starting value of the Dirichlet parameter
int NTOPICS;           // number of topics K
float VAR_CONVERGED;   // variational inference convergence threshold
int VAR_MAX_ITER;      // maximum variational iterations per document

#ifdef USE_RUBY
// State shared with the Ruby extension: last corpus/model handled and the
// per-document variational gamma matrix produced for it.
corpus *last_corpus;
lda_model *last_model;
double **last_gamma;

enum BOOL { FALSE, TRUE } corpus_loaded, model_loaded;
#endif

// Variational inference for one document; returns a likelihood value.
double lda_inference(document*, lda_model*, double*, double**);
double compute_likelihood(document*, lda_model*, double**, double*);


// E-step for a single document: updates gamma/phi and accumulates the
// sufficient statistics in ss; returns the document likelihood.
double doc_e_step(document* doc,
                  double* gamma,
                  double** phi,
                  lda_model* model,
                  lda_suffstats* ss);

// Write the (num_docs x num_topics) gamma matrix to filename.
void save_gamma(char* filename,
                double** gamma,
                int num_docs,
                int num_topics);

// Full EM estimation; "start" selects the initialization scheme and
// results are written under "directory".
void run_em(char* start,
            char* directory,
            corpus* corpus);

#ifdef USE_RUBY
// EM variant used by the Ruby bindings (no directory output).
void run_quiet_em(char* start, corpus* corpus);
#endif

// Read the global EM_*/VAR_* parameters from a settings file.
void read_settings(char* filename);

// Load the model at model_root and run inference over corpus, saving
// results under the "save" prefix.
void infer(char* model_root,
           char* save,
           corpus* corpus);

#endif
@@ -0,0 +1,238 @@
1
+ // (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu)
2
+
3
+ // This file is part of LDA-C.
4
+
5
+ // LDA-C is free software; you can redistribute it and/or modify it under
6
+ // the terms of the GNU General Public License as published by the Free
7
+ // Software Foundation; either version 2 of the License, or (at your
8
+ // option) any later version.
9
+
10
+ // LDA-C is distributed in the hope that it will be useful, but WITHOUT
11
+ // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12
+ // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13
+ // for more details.
14
+
15
+ // You should have received a copy of the GNU General Public License
16
+ // along with this program; if not, write to the Free Software
17
+ // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
18
+ // USA
19
+
20
+ #include "lda-model.h"
21
+
22
+ /*
23
+ * compute MLE lda model from sufficient statistics
24
+ *
25
+ */
26
+
27
+ void lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha) {
28
+ int k; int w;
29
+
30
+ for (k = 0; k < model->num_topics; k++)
31
+ {
32
+ for (w = 0; w < model->num_terms; w++)
33
+ {
34
+ if (ss->class_word[k][w] > 0)
35
+ {
36
+ model->log_prob_w[k][w] =
37
+ log(ss->class_word[k][w]) -
38
+ log(ss->class_total[k]);
39
+ }
40
+ else
41
+ model->log_prob_w[k][w] = -100;
42
+ }
43
+ }
44
+ if (estimate_alpha == 1)
45
+ {
46
+ model->alpha = opt_alpha(ss->alpha_suffstats,
47
+ ss->num_docs,
48
+ model->num_topics);
49
+
50
+ printf("new alpha = %5.5f\n", model->alpha);
51
+ }
52
+ }
53
+
54
+ /*
55
+ * allocate sufficient statistics
56
+ *
57
+ */
58
+
59
+ lda_suffstats* new_lda_suffstats(lda_model* model) {
60
+ int num_topics = model->num_topics;
61
+ int num_terms = model->num_terms;
62
+ int i,j;
63
+
64
+ lda_suffstats* ss = malloc(sizeof(lda_suffstats));
65
+ ss->class_total = malloc(sizeof(double)*num_topics);
66
+ ss->class_word = malloc(sizeof(double*)*num_topics);
67
+ for (i = 0; i < num_topics; i++)
68
+ {
69
+ ss->class_total[i] = 0;
70
+ ss->class_word[i] = malloc(sizeof(double)*num_terms);
71
+ for (j = 0; j < num_terms; j++)
72
+ {
73
+ ss->class_word[i][j] = 0;
74
+ }
75
+ }
76
+ return(ss);
77
+ }
78
+
79
+
80
+ /*
81
+ * various intializations for the sufficient statistics
82
+ *
83
+ */
84
+
85
+ void zero_initialize_ss(lda_suffstats* ss, lda_model* model) {
86
+ int k, w;
87
+ for (k = 0; k < model->num_topics; k++)
88
+ {
89
+ ss->class_total[k] = 0;
90
+ for (w = 0; w < model->num_terms; w++)
91
+ {
92
+ ss->class_word[k][w] = 0;
93
+ }
94
+ }
95
+ ss->num_docs = 0;
96
+ ss->alpha_suffstats = 0;
97
+ }
98
+
99
+
100
+ void random_initialize_ss(lda_suffstats* ss, lda_model* model) {
101
+ int num_topics = model->num_topics;
102
+ int num_terms = model->num_terms;
103
+ int k, n;
104
+ for (k = 0; k < num_topics; k++)
105
+ {
106
+ for (n = 0; n < num_terms; n++)
107
+ {
108
+ ss->class_word[k][n] += 1.0/num_terms + myrand();
109
+ ss->class_total[k] += ss->class_word[k][n];
110
+ }
111
+ }
112
+ }
113
+
114
+
115
+ void corpus_initialize_ss(lda_suffstats* ss, lda_model* model, corpus* c)
116
+ {
117
+ int num_topics = model->num_topics;
118
+ int i, k, d, n;
119
+ document* doc;
120
+
121
+ for (k = 0; k < num_topics; k++)
122
+ {
123
+ for (i = 0; i < NUM_INIT; i++)
124
+ {
125
+ d = floor(myrand() * c->num_docs);
126
+ printf("initialized with document %d\n", d);
127
+ doc = &(c->docs[d]);
128
+ for (n = 0; n < doc->length; n++)
129
+ {
130
+ ss->class_word[k][doc->words[n]] += doc->counts[n];
131
+ }
132
+ }
133
+ for (n = 0; n < model->num_terms; n++)
134
+ {
135
+ ss->class_word[k][n] += 1.0;
136
+ ss->class_total[k] = ss->class_total[k] + ss->class_word[k][n];
137
+ }
138
+ }
139
+ }
140
+
141
+ /*
142
+ * allocate new lda model
143
+ *
144
+ */
145
+
146
+ lda_model* new_lda_model(int num_terms, int num_topics) {
147
+ int i,j;
148
+ lda_model* model;
149
+
150
+ model = malloc(sizeof(lda_model));
151
+ model->num_topics = num_topics;
152
+ model->num_terms = num_terms;
153
+ model->alpha = 1.0;
154
+ model->log_prob_w = malloc(sizeof(double*)*num_topics);
155
+ for (i = 0; i < num_topics; i++)
156
+ {
157
+ model->log_prob_w[i] = malloc(sizeof(double)*num_terms);
158
+ for (j = 0; j < num_terms; j++)
159
+ model->log_prob_w[i][j] = 0;
160
+ }
161
+ return(model);
162
+ }
163
+
164
+
165
+ /*
166
+ * deallocate new lda model
167
+ *
168
+ */
169
+ void free_lda_model(lda_model* model) {
170
+ int i;
171
+
172
+ for (i = 0; i < model->num_topics; i++)
173
+ {
174
+ free(model->log_prob_w[i]);
175
+ }
176
+ free(model->log_prob_w);
177
+ }
178
+
179
+
180
+ /*
181
+ * save an lda model
182
+ *
183
+ */
184
+ void save_lda_model(lda_model* model, char* model_root) {
185
+ char filename[100];
186
+ FILE* fileptr;
187
+ int i, j;
188
+
189
+ sprintf(filename, "%s.beta", model_root);
190
+ fileptr = fopen(filename, "w");
191
+ for (i = 0; i < model->num_topics; i++) {
192
+ for (j = 0; j < model->num_terms; j++) {
193
+ fprintf(fileptr, " %5.10f", model->log_prob_w[i][j]);
194
+ }
195
+ fprintf(fileptr, "\n");
196
+ }
197
+ fclose(fileptr);
198
+
199
+ sprintf(filename, "%s.other", model_root);
200
+ fileptr = fopen(filename, "w");
201
+ fprintf(fileptr, "num_topics %d\n", model->num_topics);
202
+ fprintf(fileptr, "num_terms %d\n", model->num_terms);
203
+ fprintf(fileptr, "alpha %5.10f\n", model->alpha);
204
+ fclose(fileptr);
205
+ }
206
+
207
+
208
+ lda_model* load_lda_model(char* model_root) {
209
+ char filename[100];
210
+ FILE* fileptr;
211
+ int i, j, num_terms, num_topics;
212
+ float x, alpha;
213
+
214
+ sprintf(filename, "%s.other", model_root);
215
+ printf("loading %s\n", filename);
216
+ fileptr = fopen(filename, "r");
217
+ fscanf(fileptr, "num_topics %d\n", &num_topics);
218
+ fscanf(fileptr, "num_terms %d\n", &num_terms);
219
+ fscanf(fileptr, "alpha %f\n", &alpha);
220
+ fclose(fileptr);
221
+
222
+ lda_model* model = new_lda_model(num_terms, num_topics);
223
+ model->alpha = alpha;
224
+
225
+ sprintf(filename, "%s.beta", model_root);
226
+ printf("loading %s\n", filename);
227
+ fileptr = fopen(filename, "r");
228
+ for (i = 0; i < num_topics; i++)
229
+ {
230
+ for (j = 0; j < num_terms; j++)
231
+ {
232
+ fscanf(fileptr, "%f", &x);
233
+ model->log_prob_w[i][j] = x;
234
+ }
235
+ }
236
+ fclose(fileptr);
237
+ return(model);
238
+ }
@@ -0,0 +1,24 @@
1
+ #ifndef LDA_MODEL_H
2
+ #define LDA_MODEL
3
+
4
+ #include <stdlib.h>
5
+ #include <stdio.h>
6
+ #include <math.h>
7
+ #include "lda.h"
8
+ #include "lda-alpha.h"
9
+ #include "cokus.h"
10
+
11
+ #define myrand() (double) (((unsigned long) randomMT()) / 4294967296.)
12
+ #define NUM_INIT 1
13
+
14
+ void free_lda_model(lda_model*);
15
+ void save_lda_model(lda_model*, char*);
16
+ lda_model* new_lda_model(int, int);
17
+ lda_suffstats* new_lda_suffstats(lda_model* model);
18
+ void corpus_initialize_ss(lda_suffstats* ss, lda_model* model, corpus* c);
19
+ void random_initialize_ss(lda_suffstats* ss, lda_model* model);
20
+ void zero_initialize_ss(lda_suffstats* ss, lda_model* model);
21
+ void lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha);
22
+ lda_model* load_lda_model(char* model_root);
23
+
24
+ #endif
data/lda.h ADDED
@@ -0,0 +1,54 @@
// (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu)

// This file is part of LDA-C.

// LDA-C is free software; you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 2 of the License, or (at your
// option) any later version.

// LDA-C is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.

// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
// USA

#ifndef LDA_H
#define LDA_H


// A document in sparse bag-of-words form (mirrors Lda::Document in lda.rb).
typedef struct {
    int* words;   // vocabulary indices of the distinct terms
    int* counts;  // counts[i] = occurrences of words[i]
    int length;   // number of distinct terms (size of words/counts)
    int total;    // total occurrences (sum of counts)
} document;


// A collection of documents over a shared vocabulary.
typedef struct {
    document* docs;
    int num_terms;  // vocabulary size
    int num_docs;   // number of entries in docs
} corpus;


// An estimated LDA model.
typedef struct {
    double alpha;         // Dirichlet prior parameter
    double** log_prob_w;  // [num_topics][num_terms]: log p(word | topic), see lda_mle
    int num_topics;
    int num_terms;
} lda_model;


// Sufficient statistics accumulated during the EM E-step.
typedef struct {
    double** class_word;    // [num_topics][num_terms] expected word counts
    double* class_total;    // per-topic totals (row sums of class_word)
    double alpha_suffstats; // statistic consumed by opt_alpha in lda_mle
    int num_docs;           // documents accumulated so far
} lda_suffstats;

#endif
data/lda.rb ADDED
@@ -0,0 +1,252 @@
1
+ require 'set'
2
+
3
+ module Lda
4
+
5
+ #
6
+ # Corpus class handles the data passed to the LDA algorithm.
7
+ #
8
+ class Corpus
9
+ attr_reader :documents, :num_docs, :num_terms
10
+
11
+ def initialize
12
+ @documents = Array.new
13
+ @all_terms = Set.new
14
+ @num_terms = 0
15
+ @num_docs = 0
16
+ end
17
+
18
+ # Add a new document to the corpus. This can either be
19
+ # an svmlight-style formatted line with the first element
20
+ # being the number of words, or it can be a Document object.
21
+ def add_document(doc)
22
+ if doc.is_a?(Document)
23
+ @documents << doc
24
+ @all_terms = @all_terms + doc.words
25
+ elsif doc.is_a?(String)
26
+ d = Document.new(doc)
27
+ @all_terms = @all_terms + d.words
28
+ @documents << d
29
+ end
30
+ @num_docs += 1
31
+ @num_terms = @all_terms.size
32
+ true
33
+ end
34
+
35
+ # Populate this corpus from the data in the file.
36
+ def load_from_file(filename)
37
+ File.open(filename, 'r') do |f|
38
+ f.each do |line|
39
+ self.add_document(line)
40
+ end
41
+ end
42
+ true
43
+ end
44
+ end
45
+
46
+ # A single document.
47
+ class Document
48
+ attr_accessor :words, :counts, :length, :total
49
+
50
+ # Create the Document using the svmlight-style text line:
51
+ #
52
+ # num_words w1:freq1 w2:freq2 ... w_n:freq_n
53
+ #
54
+ # Ex.
55
+ # 5 1:2 3:1 4:2 7:3 12:1
56
+ #
57
+ # The value for the number of words should equal the number of pairs
58
+ # following it, though this isn't strictly enforced. Order of word-pair
59
+ # indices is not important.
60
+ #
61
+ def initialize(doc_line=nil)
62
+ if doc_line.is_a?(String)
63
+ tmp = doc_line.split
64
+ @words = Array.new
65
+ @counts = Array.new
66
+ @total = 0
67
+ tmp.slice(1,tmp.size).each do |pair|
68
+ tmp2 = pair.split(":")
69
+ @words << tmp2[0].to_i
70
+ @counts << tmp2[1].to_i
71
+ end
72
+ @length = @words.size
73
+ @total = @counts.inject(0) {|sum, i| sum + i}
74
+ else # doc_line == nil
75
+ @words = Array.new
76
+ @counts = Array.new
77
+ @total = 0
78
+ @length = 0
79
+ end
80
+ end
81
+
82
+ def recompute
83
+ @total = @counts.inject(0) {|sum, i| sum + i}
84
+ @length = @words.size
85
+ end
86
+ end
87
+
88
+ class Lda
89
+ attr_reader :vocab, :corpus
90
+
91
+ #
92
+ # Create a new LDA instance with the default settings.
93
+ #
94
+ def initialize
95
+ self.load_default_settings
96
+ @corpus = nil
97
+ @vocab = nil
98
+ end
99
+
100
+ #
101
+ # Load the default settings.
102
+ # * max_iter = 20
103
+ # * convergence = 1e-6
104
+ # * em_max_iter = 100
105
+ # * em_convergence = 1e-4
106
+ # * num_topics = 20
107
+ # * init_alpha = 0.3
108
+ # * est_alpha = 1
109
+ #
110
+ def load_default_settings
111
+ self.max_iter = 20
112
+ self.convergence = 1e-6
113
+ self.em_max_iter = 100
114
+ self.em_convergence = 1e-4
115
+ self.num_topics = 20
116
+ self.init_alpha = 0.3
117
+ self.est_alpha = 1
118
+ nil
119
+ end
120
+
121
+
122
+ #
123
+ # Load the corpus from file. The corpus is in svmlight-style where the
124
+ # first element of each line is the number of words in the document and
125
+ # then each element is the pair word_idx:weight.
126
+ #
127
+ # num_words word1:wgt1 word2:wgt2 ... word_n:wgt_n
128
+ #
129
+ # The value for the number of words should equal the number of pairs
130
+ # following it, though this isn't strictly enforced in this method.
131
+ #
132
+ def load_corpus(filename)
133
+ c = Corpus.new
134
+ c.load_from_file(filename)
135
+ self.corpus = c
136
+ @corpus = c
137
+
138
+ true
139
+ end
140
+
141
+
142
+ #
143
+ # Load the vocabulary file which is a list of words, one per line
144
+ # where the line number corresponds the word list index. This allows
145
+ # the words to be extracted for topics later.
146
+ #
147
+ # +vocab+ can either be the filename of the vocabulary file or the
148
+ # array itself.
149
+ #
150
+ def load_vocabulary(vocab)
151
+ @vocab = Array.new
152
+
153
+ File.open(filename, 'r') do |f|
154
+ f.each do |line|
155
+ @vocab << line.strip
156
+ end
157
+ end
158
+
159
+ true
160
+ end
161
+
162
+
163
+ #
164
+ # Visualization method for printing out the top +words_per_topic+ words
165
+ # for each topic.
166
+ #
167
+ # See also +top_words+.
168
+ #
169
+ def print_topics(words_per_topic=10)
170
+ unless @vocab
171
+ puts "No vocabulary loaded."
172
+ return nil
173
+ end
174
+
175
+ beta = self.beta
176
+ indices = (0..(@vocab.size - 1)).to_a
177
+ topic_num = 0
178
+ beta.each do |topic|
179
+ indices.sort! {|x, y| -(topic[x] <=> topic[y])}
180
+ outp = []
181
+ puts "Topic #{topic_num}"
182
+ words_per_topic.times do |i|
183
+ outp << @vocab[indices[i]]
184
+ end
185
+ puts "\t" + outp.join("\n\t")
186
+ puts ""
187
+ topic_num += 1
188
+ end
189
+
190
+ nil
191
+ end
192
+
193
+ #
194
+ # After the model has been run and a vocabulary has been loaded, return the
195
+ # +words_per_topic+ top words chosen by the model for each topic. This is
196
+ # returned as a hash mapping the topic number to an array of top words
197
+ # (in descending order of importance).
198
+ #
199
+ # topic_number => [w1, w2, ..., w_n]
200
+ #
201
+ # See also +print_topics+.
202
+ #
203
+ def top_words(words_per_topic=10)
204
+ unless @vocab
205
+ puts "No vocabulary loaded."
206
+ return nil
207
+ end
208
+
209
+ # Load the model
210
+ beta = self.beta
211
+ unless beta
212
+ puts "Model has not been run."
213
+ return nil
214
+ end
215
+
216
+ # find the highest scoring words per topic
217
+ topics = Hash.new
218
+ indices = (0..(@vocab.size - 1)).to_a
219
+ topic_num = 0
220
+ beta.each do |topic|
221
+ topics[topic_num] = Array.new
222
+ indices.sort! {|x, y| -(topic[x] <=> topic[y])}
223
+ words_per_topic.times do |i|
224
+ topics[topic_num] << @vocab[indices[i]]
225
+ end
226
+ topic_num += 1
227
+ end
228
+
229
+ topics
230
+ end
231
+
232
+ #
233
+ # String representation displaying current settings.
234
+ #
235
+ def to_s
236
+ outp = []
237
+ outp << "LDA Settings:"
238
+ outp << " Initial alpha: %0.6f" % self.init_alpha
239
+ outp << " # of topics: %d" % self.num_topics
240
+ outp << " Max iterations: %d" % self.max_iter
241
+ outp << " Convergence: %0.6f" % self.convergence
242
+ outp << "EM max iterations: %d" % self.em_max_iter
243
+ outp << " EM convergence: %0.6f" % self.em_convergence
244
+ outp << " Estimate alpha: %d" % self.est_alpha
245
+
246
+ return outp.join("\n")
247
+ end
248
+ end
249
+ end
250
+
251
+ # load the c-side stuff
252
+ require 'lda_ext'