RubyGems - lda-ruby - Versions diffs - 0.3.1 - Mend

lda-ruby 0.3.1

Files changed (38) hide show

data/.gitignore +5 -0
data/CHANGELOG +22 -0
data/README +21 -0
data/README.markdown +38 -0
data/Rakefile +58 -0
data/VERSION.yml +4 -0
data/ext/lda-ruby/Makefile +181 -0
data/ext/lda-ruby/cokus.c +145 -0
data/ext/lda-ruby/cokus.h +27 -0
data/ext/lda-ruby/extconf.rb +9 -0
data/ext/lda-ruby/lda-alpha.c +96 -0
data/ext/lda-ruby/lda-alpha.h +21 -0
data/ext/lda-ruby/lda-data.c +67 -0
data/ext/lda-ruby/lda-data.h +14 -0
data/ext/lda-ruby/lda-inference.c +1007 -0
data/ext/lda-ruby/lda-inference.h +63 -0
data/ext/lda-ruby/lda-model.c +345 -0
data/ext/lda-ruby/lda-model.h +29 -0
data/ext/lda-ruby/lda.h +54 -0
data/ext/lda-ruby/utils.c +111 -0
data/ext/lda-ruby/utils.h +18 -0
data/lda-ruby.gemspec +78 -0
data/lib/lda-ruby.rb +168 -0
data/lib/lda-ruby/corpus/corpus.rb +34 -0
data/lib/lda-ruby/corpus/data_corpus.rb +22 -0
data/lib/lda-ruby/corpus/directory_corpus.rb +25 -0
data/lib/lda-ruby/corpus/text_corpus.rb +22 -0
data/lib/lda-ruby/document/data_document.rb +30 -0
data/lib/lda-ruby/document/document.rb +36 -0
data/lib/lda-ruby/document/text_document.rb +37 -0
data/lib/lda-ruby/vocabulary.rb +46 -0
data/license.txt +504 -0
data/test/data/.gitignore +2 -0
data/test/data/docs.dat +46 -0
data/test/data/wiki-test-docs.yml +123 -0
data/test/lda_ruby_test.rb +274 -0
data/test/test_helper.rb +10 -0
metadata +95 -0

data/ext/lda-ruby/utils.c ADDED Viewed

@@ -0,0 +1,111 @@
+#include "utils.h"
+/*
+ * given log(a) and log(b), return log(a + b)
+ *
+ * The larger of the two logs is factored out, so exp() is only ever
+ * called on a non-positive difference:
+ *   log(a + b) = log_max + log(1 + exp(log_min - log_max))
+ *
+ * NOTE(review): if both arguments are -infinity the difference is NaN
+ * and NaN is returned -- confirm callers never pass log(0) twice.
+ */
+double log_sum(double log_a, double log_b)
+{
+  double v;
+  if (log_a < log_b)
+  {
+      /* log_b is the larger term: factor it out */
+      v = log_b+log(1 + exp(log_a-log_b));
+  }
+  else
+  {
+      /* log_a is the larger (or equal) term: factor it out */
+      v = log_a+log(1 + exp(log_b-log_a));
+  }
+  return(v);
+}
+ /**
+   * Calculate the value of the trigamma function, the second
+   * derivative of the loggamma function, for a single double x.
+   * From Abramowitz and Stegun.  Uses formulas 6.4.11 and 6.4.12 with
+   * recurrence formula 6.4.6.
+   *
+   * The argument is shifted up by 6, a truncated series in 1/x^2 is
+   * evaluated at the shifted point, and the loop then walks x back
+   * down one unit at a time, adding 1/x^2 at each step.
+   *
+   * NOTE(review): an earlier version of this comment mentioned
+   * "positive matrices" and a workspace requirement; this function
+   * takes a scalar and allocates nothing.  Presumably x must be > 0
+   * (the last loop step divides by x*x) -- confirm against callers.
+   **/
+double trigamma(double x)
+{
+    double p;
+    int i;
+    x=x+6;
+    p=1/(x*x);
+    /* series in p = 1/x^2, evaluated with Horner's scheme at x+6 */
+    p=(((((0.075757575757576*p-0.033333333333333)*p+0.0238095238095238)
+         *p-0.033333333333333)*p+0.166666666666667)*p+1)/x+0.5*p;
+    /* step back down to the original x, adding 1/x^2 per step */
+    for (i=0; i<6 ;i++)
+    {
+        x=x-1;
+        p=1/(x*x)+p;
+    }
+    return(p);
+}
+/*
+ * approximation of the first derivative of the log gamma function
+ *
+ * The argument is shifted up by 6, a truncated series in 1/x^2 plus
+ * log(x) - 0.5/x is evaluated there, and the six terms 1/(x-1) ..
+ * 1/(x-6) are subtracted to step back to the original argument
+ * (x-6 is the caller's x, so presumably x must be > 0).
+ *
+ * NOTE(review): 0.003968253986254 looks like 1/252 = 0.003968253968254
+ * with two digits transposed; the effect is far below double precision
+ * at x+6 >= 6, but confirm against the reference before changing it.
+ */
+double digamma(double x)
+{
+    double p;
+    x=x+6;
+    p=1/(x*x);
+    /* series in p = 1/x^2, evaluated with Horner's scheme at x+6 */
+    p=(((0.004166666666667*p-0.003968253986254)*p+
+	0.008333333333333)*p-0.083333333333333)*p;
+    /* add the leading terms, then undo the shift by 6 */
+    p=p+log(x)-0.5/x-1/(x-1)-1/(x-2)-1/(x-3)-1/(x-4)-1/(x-5)-1/(x-6);
+    return p;
+}
+/*
+ * log of the gamma function for x > 0
+ *
+ * Evaluates Stirling's series at x+6 and then subtracts
+ * log(x) + log(x+1) + ... + log(x+5) to step back to the original
+ * argument.
+ *
+ * The series variable z = 1/x^2 must be formed from the SHIFTED
+ * argument.  It used to be computed before the shift, which fed the
+ * correction term the wrong value (a small error near x = 1, a very
+ * large one for small x).
+ */
+double log_gamma(double x)
+{
+    double z;
+    x=x+6;
+    /* z is the series variable of the shifted argument */
+    z=1/(x*x);
+    /* Stirling correction: 1/(12x) - 1/(360x^3) + 1/(1260x^5) - 1/(1680x^7) */
+    z=(((-0.000595238095238*z+0.000793650793651)
+	*z-0.002777777777778)*z+0.083333333333333)/x;
+    /* 0.918938533204673 = 0.5*log(2*pi); the log terms undo the shift */
+    z=(x-0.5)*log(x)-x+0.918938533204673+z-log(x-1)-
+	log(x-2)-log(x-3)-log(x-4)-log(x-5)-log(x-6);
+    return z;
+}
+/*
+ * Create the directory `name` with read, write and execute permission
+ * for the owner only.  The result of mkdir() is not reported to the
+ * caller.
+ */
+void make_directory(char* name)
+{
+    /* S_IRWXU is the POSIX shorthand for S_IRUSR|S_IWUSR|S_IXUSR */
+    const mode_t owner_only = S_IRWXU;
+    mkdir(name, owner_only);
+}
+/*
+ * Return the index of the largest of the first n entries of x.
+ * When several entries share the maximum, the lowest index wins.
+ */
+int argmax(double* x, int n)
+{
+    int best = 0;
+    int k = 1;
+    while (k < n)
+    {
+        /* strict comparison keeps the earliest maximum */
+        if (x[k] > x[best])
+            best = k;
+        k++;
+    }
+    return best;
+}

data/ext/lda-ruby/utils.h ADDED Viewed

@@ -0,0 +1,18 @@
+#ifndef UTILS_H
+#define UTILS_H
+#include <stdio.h>
+#include <math.h>
+#include <float.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+/* given log(a) and log(b), return log(a + b) */
+double log_sum(double log_a, double log_b);
+/* second derivative of the log gamma function */
+double trigamma(double x);
+/* first derivative of the log gamma function */
+double digamma(double x);
+/* log of the gamma function */
+double log_gamma(double x);
+/* create directory `name` with owner-only rwx permissions */
+void make_directory(char* name);
+/* index of the largest of the first n entries of x */
+int argmax(double* x, int n);
+#endif

data/lda-ruby.gemspec ADDED Viewed

@@ -0,0 +1,78 @@
+# Generated by jeweler
+# DO NOT EDIT THIS FILE
+# Instead, edit Jeweler::Tasks in Rakefile, and run `rake gemspec`
+# -*- encoding: utf-8 -*-
+# Gem specification for lda-ruby 0.3.1: metadata, the native extension
+# build script, and the explicit list of packaged files.
+Gem::Specification.new do |s|
+  s.name = %q{lda-ruby}
+  s.version = "0.3.1"
+  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
+  s.authors = ["David Blei", "Jason Adams"]
+  s.date = %q{2009-08-11}
+  s.description = %q{Ruby port of Latent Dirichlet Allocation by David M. Blei. See http://www.cs.princeton.edu/~blei/lda-c/.}
+  s.email = %q{jasonmadams@gmail.com}
+  # extconf.rb is listed so the C extension is built at install time
+  s.extensions = ["ext/lda-ruby/extconf.rb"]
+  s.extra_rdoc_files = [
+    "README",
+     "README.markdown"
+  ]
+  s.files = [
+    ".gitignore",
+     "CHANGELOG",
+     "README",
+     "README.markdown",
+     "Rakefile",
+     "VERSION.yml",
+     "ext/lda-ruby/Makefile",
+     "ext/lda-ruby/cokus.c",
+     "ext/lda-ruby/cokus.h",
+     "ext/lda-ruby/extconf.rb",
+     "ext/lda-ruby/lda-alpha.c",
+     "ext/lda-ruby/lda-alpha.h",
+     "ext/lda-ruby/lda-data.c",
+     "ext/lda-ruby/lda-data.h",
+     "ext/lda-ruby/lda-inference.c",
+     "ext/lda-ruby/lda-inference.h",
+     "ext/lda-ruby/lda-model.c",
+     "ext/lda-ruby/lda-model.h",
+     "ext/lda-ruby/lda.h",
+     "ext/lda-ruby/utils.c",
+     "ext/lda-ruby/utils.h",
+     "lda-ruby.gemspec",
+     "lib/lda-ruby.rb",
+     "lib/lda-ruby/corpus/corpus.rb",
+     "lib/lda-ruby/corpus/data_corpus.rb",
+     "lib/lda-ruby/corpus/directory_corpus.rb",
+     "lib/lda-ruby/corpus/text_corpus.rb",
+     "lib/lda-ruby/document/data_document.rb",
+     "lib/lda-ruby/document/document.rb",
+     "lib/lda-ruby/document/text_document.rb",
+     "lib/lda-ruby/vocabulary.rb",
+     "license.txt",
+     "test/data/.gitignore",
+     "test/data/docs.dat",
+     "test/data/wiki-test-docs.yml",
+     "test/lda_ruby_test.rb",
+     "test/test_helper.rb"
+  ]
+  s.homepage = %q{http://github.com/ealdent/lda-ruby}
+  s.rdoc_options = ["--charset=UTF-8"]
+  # both lib/ and ext/ are put on the load path
+  s.require_paths = ["lib", "ext"]
+  s.rubygems_version = %q{1.3.4}
+  s.summary = %q{Ruby port of Latent Dirichlet Allocation by David M. Blei.}
+  s.test_files = [
+    "test/lda_ruby_test.rb",
+     "test/test_helper.rb"
+  ]
+  # NOTE(review): the block below only sets specification_version.
+  # current_version is assigned but never read, and both branches of the
+  # RubyGems version check are empty (no dependencies are declared).
+  # Gem::RubyGemsVersion may not exist on newer RubyGems releases --
+  # confirm, and regenerate via `rake gemspec` rather than editing here.
+  if s.respond_to? :specification_version then
+    current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
+    s.specification_version = 3
+    if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
+    else
+    end
+  else
+  end
+end

data/lib/lda-ruby.rb ADDED Viewed

@@ -0,0 +1,168 @@
+$LOAD_PATH.unshift(File.dirname(__FILE__)) unless $LOAD_PATH.include?(File.dirname(__FILE__))
+require 'lda-ruby/lda'
+require 'lda-ruby/document/document'
+require 'lda-ruby/document/data_document'
+require 'lda-ruby/document/text_document'
+require 'lda-ruby/corpus/corpus'
+require 'lda-ruby/corpus/data_corpus'
+require 'lda-ruby/corpus/text_corpus'
+require 'lda-ruby/corpus/directory_corpus'
+require 'lda-ruby/vocabulary'
+module Lda
+  class Lda
+    attr_reader :vocab, :corpus
+    def initialize(corpus)
+      load_default_settings
+      @vocab = nil
+      self.corpus = corpus
+      @vocab = corpus.vocabulary.to_a if corpus.vocabulary
+      @phi = nil
+    end
+    def load_default_settings
+      self.max_iter = 20
+      self.convergence = 1e-6
+      self.em_max_iter = 100
+      self.em_convergence = 1e-4
+      self.num_topics = 20
+      self.init_alpha = 0.3
+      self.est_alpha = 1
+      [20, 1e-6, 100, 1e-4, 20, 0.3, 1]
+    end
+    def load_corpus(filename)
+      @corpus = Corpus.new
+      @corpus.load_from_file(filename)
+      true
+    end
+    def load_vocabulary(vocab)
+      if vocab.is_a?(Array)
+        @vocab = Marshal::load(Marshal::dump(vocab))      # deep clone array
+      elsif vocab.is_a?(Vocabulary)
+        @vocab = vocab.to_a
+      else
+        @vocab = File.open(vocab, 'r') { |f| f.read.split(/\s+/) }
+      end
+      true
+    end
+    #
+    # Visualization method for printing out the top +words_per_topic+ words
+    # for each topic.
+    #
+    # See also +top_words+.
+    #
+    def print_topics(words_per_topic = 10)
+      raise 'No vocabulary loaded.' unless @vocab
+      self.beta.each_with_index do |topic, topic_num|
+        # Sort the topic array and return the sorted indices of the best scores
+        indices = (topic.zip((0...@vocab.size).to_a).sort { |i, j| i[0] <=> j[0] }.map { |i, j| j }.reverse)[0...words_per_topic]
+        puts "Topic #{topic_num}"
+        puts "\t#{indices.map {|i| @vocab[i]}.join("\n\t")}"
+        puts ""
+      end
+      nil
+    end
+    #
+    # After the model has been run and a vocabulary has been loaded, return the
+    # +words_per_topic+ top words chosen by the model for each topic.  This is
+    # returned as a hash mapping the topic number to an array of top words
+    # (in descending order of importance).
+    #
+    #   topic_number => [w1, w2, ..., w_n]
+    #
+    # See also +print_topics+.
+    #
+    def top_word_indices(words_per_topic = 10)
+      raise 'No vocabulary loaded.' unless @vocab
+      # find the highest scoring words per topic
+      topics = Hash.new
+      indices = (0...@vocab.size).to_a
+      self.beta.each_with_index do |topic, topic_num|
+        topics[topic_num] = (topic.zip((0...@vocab.size).to_a).sort { |i, j| i[0] <=> j[0] }.map { |i, j| j }.reverse)[0...words_per_topic]
+      end
+      topics
+    end
+    def top_words(words_per_topic = 10)
+      output = Hash.new
+      topics = top_word_indices(words_per_topic)
+      topics.each_pair do |topic_num, words|
+        output[topic_num] = words.map { |w| @vocab[w] }
+      end
+      output
+    end
+    #
+    # Get the phi matrix which can be used to assign probabilities to words
+    # belonging to a specific topic in each document.  The return value is a
+    # 3D matrix:  num_docs x doc_length x num_topics.  The value is cached
+    # after the first call, so if it needs to be recomputed, set the +recompute+
+    # value to true.
+    #
+    def phi(recompute=false)
+      if @phi.nil? || recompute
+        @phi = self.compute_phi
+      end
+      @phi
+    end
+    #
+    # Compute the average log probability for each topic for each document in the corpus.
+    # This method returns a matrix:  num_docs x num_topics with the average log probability
+    # for the topic in the document.
+    #
+    def compute_topic_document_probability
+      outp = Array.new
+      @corpus.documents.each_with_index do |doc, idx|
+        tops = [0.0] * self.num_topics
+        ttl  = doc.counts.inject(0.0) {|sum, i| sum + i}
+        self.phi[idx].each_with_index do |word_dist, word_idx|
+          word_dist.each_with_index do |top_prob, top_idx|
+            tops[top_idx] += Math.log(top_prob) * doc.counts[word_idx]
+          end
+        end
+        tops = tops.map {|i| i / ttl}
+        outp << tops
+      end
+      outp
+    end
+    #
+    # String representation displaying current settings.
+    #
+    def to_s
+      outp = ["LDA Settings:"]
+      outp << "    Initial alpha: %0.6f" % self.init_alpha
+      outp << "      # of topics: %d" % self.num_topics
+      outp << "   Max iterations: %d" % self.max_iter
+      outp << "      Convergence: %0.6f" % self.convergence
+      outp << "EM max iterations: %d" % self.em_max_iter
+      outp << "   EM convergence: %0.6f" % self.em_convergence
+      outp << "   Estimate alpha: %d" % self.est_alpha
+      outp.join("\n")
+    end
+  end
+end

data/lib/lda-ruby/corpus/corpus.rb ADDED Viewed

@@ -0,0 +1,34 @@
+require 'set'
+module Lda
+  class Corpus
+    attr_reader :documents, :num_docs, :num_terms, :vocabulary
+    def initialize
+      @documents = Array.new
+      @all_terms = Set.new
+      @num_terms = @num_docs = 0
+      @vocabulary = Vocabulary.new
+    end
+    def add_document(doc)
+      raise 'Parameter +doc+ must be of type Document' unless doc.kind_of?(Document)
+      @documents << doc
+      @all_terms += doc.words
+      @num_docs += 1
+      @num_terms = @all_terms.size
+      update_vocabulary(doc)
+      nil
+    end
+    protected
+    def update_vocabulary(doc)
+      doc.tokens.each { |w| @vocabulary.check_word(w) }
+    end
+  end
+end

data/lib/lda-ruby/corpus/data_corpus.rb ADDED Viewed

@@ -0,0 +1,22 @@
+module Lda
+  class DataCorpus < Corpus
+    attr_reader :filename
+    def initialize(filename)
+      super()
+      @filename = filename
+      load_from_file
+    end
+    protected
+    def load_from_file
+      txt = File.open(@filename, 'r') { |f| f.read }
+      lines = txt.split(/[\r\n]+/)
+      lines.each do |line|
+        add_document(DataDocument.new(self, line))
+      end
+    end
+  end
+end

data/lib/lda-ruby/corpus/directory_corpus.rb ADDED Viewed

@@ -0,0 +1,25 @@
+module Lda
+  class DirectoryCorpus < Corpus
+    attr_reader :path, :extension
+    # load documents from a directory
+    def initialize(path, extension = nil)
+      super()
+      @path = path.dup.freeze
+      @extension = extension ? extension.dup.freeze : nil
+      load_from_directory
+    end
+    protected
+    def load_from_directory
+      dir_glob = File.join(@path, (@extension ? "*.#{@extension}" : "*"))
+      Dir.glob(dir_glob).each do |filename|
+        add_document(TextDocument.build_from_file(self, filename))
+      end
+    end
+  end
+end