RubyGems - ealdent-lda-ruby - Versions diffs - 0.2.3 → 0.3.0 - Mend

ealdent-lda-ruby 0.2.3 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38)

data/.gitignore +5 -0
data/README +5 -6
data/README.markdown +8 -9
data/Rakefile +58 -0
data/VERSION.yml +2 -2
data/ext/lda-ruby/Makefile +181 -0
data/{lib → ext/lda-ruby}/cokus.c +0 -0
data/{lib → ext/lda-ruby}/cokus.h +0 -0
data/ext/lda-ruby/extconf.rb +9 -0
data/{lib → ext/lda-ruby}/lda-alpha.c +0 -0
data/{lib → ext/lda-ruby}/lda-alpha.h +0 -0
data/{lib → ext/lda-ruby}/lda-data.c +0 -0
data/{lib → ext/lda-ruby}/lda-data.h +0 -0
data/{lib → ext/lda-ruby}/lda-inference.c +43 -44
data/{lib → ext/lda-ruby}/lda-inference.h +0 -0
data/{lib → ext/lda-ruby}/lda-model.c +18 -3
data/{lib → ext/lda-ruby}/lda-model.h +0 -0
data/{lib → ext/lda-ruby}/lda.h +0 -0
data/{lib → ext/lda-ruby}/utils.c +0 -0
data/{lib → ext/lda-ruby}/utils.h +0 -0
data/lda-ruby.gemspec +74 -0
data/lib/lda-ruby.rb +157 -0
data/lib/lda-ruby/corpus/corpus.rb +34 -0
data/lib/lda-ruby/corpus/data_corpus.rb +22 -0
data/lib/lda-ruby/corpus/directory_corpus.rb +25 -0
data/lib/lda-ruby/corpus/text_corpus.rb +22 -0
data/lib/lda-ruby/document/data_document.rb +30 -0
data/lib/lda-ruby/document/document.rb +36 -0
data/lib/lda-ruby/document/text_document.rb +32 -0
data/lib/lda-ruby/vocabulary.rb +39 -0
data/test/data/.gitignore +2 -0
data/test/data/docs.dat +46 -0
data/test/data/wiki-test-docs.yml +123 -0
data/test/lda_ruby_test.rb +274 -0
data/test/test_helper.rb +10 -0
metadata +47 -36
data/lib/extconf.rb +0 -7
data/lib/lda.rb +0 -319

data/{lib → ext/lda-ruby}/lda-inference.h RENAMED

File without changes

data/{lib → ext/lda-ruby}/lda-model.c RENAMED

@@ -75,8 +75,6 @@ void quiet_lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha) {
 		model->alpha = quiet_opt_alpha(ss->alpha_suffstats,
 			ss->num_docs,
 			model->num_topics);
-		printf("new alpha = %5.5f\n", model->alpha);
 	}
 }
@@ -217,7 +215,7 @@ void corpus_initialize_fixed_ss(lda_suffstats* ss, lda_model* model, corpus* c)
     int num_topics = MIN(model->num_topics, c->num_docs);
     int k, n;
     document* doc;
     for (k = 0; k < num_topics; k++) {
         doc = &(c->docs[k]);
         for (n = 0; n < doc->length; n++) {
@@ -253,6 +251,23 @@ lda_model* new_lda_model(int num_terms, int num_topics) {
 	return(model);
 }
+lda_model* quiet_new_lda_model(int num_terms, int num_topics) {
+	int i;
+	lda_model* model;
+	model = malloc(sizeof(lda_model));
+	model->num_topics = num_topics;
+	model->num_terms = num_terms;
+	model->alpha = 1.0;
+	model->log_prob_w = malloc(sizeof(double*)*num_topics);
+	for (i = 0; i < num_topics; i++)
+	{
+		model->log_prob_w[i] = malloc(sizeof(double)*num_terms);
+    memset(model->log_prob_w[i],0,sizeof(double)*num_terms);
+	}
+	return(model);
+}
 /*
  * deallocate new lda model

data/{lib → ext/lda-ruby}/lda-model.h RENAMED

File without changes

data/{lib → ext/lda-ruby}/lda.h RENAMED

File without changes

data/{lib → ext/lda-ruby}/utils.c RENAMED

File without changes

data/{lib → ext/lda-ruby}/utils.h RENAMED

File without changes

data/lda-ruby.gemspec ADDED

@@ -0,0 +1,74 @@
+# -*- encoding: utf-8 -*-
+Gem::Specification.new do |s|
+  s.name = %q{lda-ruby}
+  s.version = "0.3.0"
+  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
+  s.authors = ["David Blei", "Jason Adams"]
+  s.date = %q{2009-07-24}
+  s.description = %q{Ruby port of Latent Dirichlet Allocation by David M. Blei. See http://www.cs.princeton.edu/~blei/lda-c/.}
+  s.email = %q{jasonmadams@gmail.com}
+  s.extensions = ["ext/lda-ruby/extconf.rb"]
+  s.extra_rdoc_files = [
+    "README",
+     "README.markdown"
+  ]
+  s.files = [
+    ".gitignore",
+     "README",
+     "README.markdown",
+     "Rakefile",
+     "VERSION.yml",
+     "ext/lda-ruby/Makefile",
+     "ext/lda-ruby/cokus.c",
+     "ext/lda-ruby/cokus.h",
+     "ext/lda-ruby/extconf.rb",
+     "ext/lda-ruby/lda-alpha.c",
+     "ext/lda-ruby/lda-alpha.h",
+     "ext/lda-ruby/lda-data.c",
+     "ext/lda-ruby/lda-data.h",
+     "ext/lda-ruby/lda-inference.c",
+     "ext/lda-ruby/lda-inference.h",
+     "ext/lda-ruby/lda-model.c",
+     "ext/lda-ruby/lda-model.h",
+     "ext/lda-ruby/lda.h",
+     "ext/lda-ruby/utils.c",
+     "ext/lda-ruby/utils.h",
+     "lda-ruby.gemspec",
+     "lib/lda-ruby.rb",
+     "lib/lda-ruby/corpus/corpus.rb",
+     "lib/lda-ruby/corpus/data_corpus.rb",
+     "lib/lda-ruby/corpus/directory_corpus.rb",
+     "lib/lda-ruby/corpus/text_corpus.rb",
+     "lib/lda-ruby/document/data_document.rb",
+     "lib/lda-ruby/document/document.rb",
+     "lib/lda-ruby/document/text_document.rb",
+     "lib/lda-ruby/vocabulary.rb",
+     "license.txt",
+     "test/data/.gitignore",
+     "test/data/docs.dat",
+     "test/data/wiki-test-docs.yml",
+     "test/lda_ruby_test.rb",
+     "test/test_helper.rb"
+  ]
+  s.homepage = %q{http://github.com/ealdent/lda-ruby}
+  s.rdoc_options = ["--charset=UTF-8"]
+  s.require_paths = ["lib", "ext"]
+  s.rubygems_version = %q{1.3.4}
+  s.summary = %q{Ruby port of Latent Dirichlet Allocation by David M. Blei.}
+  s.test_files = [
+    "test/lda_ruby_test.rb",
+     "test/test_helper.rb"
+  ]
+  if s.respond_to? :specification_version then
+    current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
+    s.specification_version = 3
+    if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
+    else
+    end
+  else
+  end
+end

data/lib/lda-ruby.rb ADDED

@@ -0,0 +1,157 @@
+$LOAD_PATH.unshift(File.dirname(__FILE__)) unless $LOAD_PATH.include?(File.dirname(__FILE__))
+require 'lda-ruby/lda'
+require 'lda-ruby/document/document'
+require 'lda-ruby/document/data_document'
+require 'lda-ruby/document/text_document'
+require 'lda-ruby/corpus/corpus'
+require 'lda-ruby/corpus/data_corpus'
+require 'lda-ruby/corpus/text_corpus'
+require 'lda-ruby/corpus/directory_corpus'
+require 'lda-ruby/vocabulary'
+module Lda
+  class Lda
+    attr_reader :vocab, :corpus
+    def initialize(corpus)
+      load_default_settings
+      @vocab = nil
+      self.corpus = corpus
+      @vocab = corpus.vocabulary.to_a if corpus.vocabulary
+      @phi = nil
+    end
+    def load_default_settings
+      self.max_iter = 20
+      self.convergence = 1e-6
+      self.em_max_iter = 100
+      self.em_convergence = 1e-4
+      self.num_topics = 20
+      self.init_alpha = 0.3
+      self.est_alpha = 1
+      [20, 1e-6, 100, 1e-4, 20, 0.3, 1]
+    end
+    def load_corpus(filename)
+      @corpus = Corpus.new
+      @corpus.load_from_file(filename)
+      true
+    end
+    def load_vocabulary(vocab)
+      if vocab.is_a?(Array)
+        @vocab = Marshal::load(Marshal::dump(vocab))      # deep clone array
+      elsif vocab.is_a?(Vocabulary)
+        @vocab = vocab.to_a
+      else
+        @vocab = File.open(vocab, 'r') { |f| f.read.split(/\s+/) }
+      end
+      true
+    end
+    #
+    # Visualization method for printing out the top +words_per_topic+ words
+    # for each topic.
+    #
+    # See also +top_words+.
+    #
+    def print_topics(words_per_topic = 10)
+      raise 'No vocabulary loaded.' unless @vocab
+      self.beta.each_with_index do |topic, topic_num|
+        # Sort the topic array and return the sorted indices of the best scores
+        indices = (topic.zip((0...@vocab.size).to_a).sort { |i, j| i[0] <=> j[0] }.map { |i, j| j }.reverse)[0...words_per_topic]
+        puts "Topic #{topic_num}"
+        puts "\t#{indices.map {|i| @vocab[i]}.join("\n\t")}"
+        puts ""
+      end
+      nil
+    end
+    #
+    # After the model has been run and a vocabulary has been loaded, return the
+    # +words_per_topic+ highest-scoring words for each topic.  The result is a
+    # hash mapping the topic number to an array of vocabulary indices (positions
+    # in the loaded vocabulary, not the word strings themselves), in descending
+    # order of score.
+    #
+    #   topic_number => [index_1, index_2, ..., index_n]
+    #
+    # See also +print_topics+.
+    #
+    def top_words(words_per_topic = 10)
+      raise 'No vocabulary loaded.' unless @vocab
+      # find the highest scoring words per topic
+      topics = Hash.new
+      indices = (0...@vocab.size).to_a
+      self.beta.each_with_index do |topic, topic_num|
+        topics[topic_num] = (topic.zip((0...@vocab.size).to_a).sort { |i, j| i[0] <=> j[0] }.map { |i, j| j }.reverse)[0...words_per_topic]
+      end
+      topics
+    end
+    #
+    # Get the phi matrix which can be used to assign probabilities to words
+    # belonging to a specific topic in each document.  The return value is a
+    # 3D matrix:  num_docs x doc_length x num_topics.  The value is cached
+    # after the first call, so if it needs to be recomputed, set the +recompute+
+    # value to true.
+    #
+    def phi(recompute=false)
+      if @phi.nil? || recompute
+        @phi = self.compute_phi
+      end
+      @phi
+    end
+    #
+    # Compute the average log probability of each topic for each document in the corpus.
+    # Returns a num_docs x num_topics matrix; each entry is the count-weighted mean of
+    # log(phi) over the document's words for that topic.
+    #
+    def compute_topic_document_probability
+      outp = Array.new
+      @corpus.documents.each_with_index do |doc, idx|
+        tops = [0.0] * self.num_topics
+        ttl  = doc.counts.inject(0.0) {|sum, i| sum + i}
+        self.phi[idx].each_with_index do |word_dist, word_idx|
+          word_dist.each_with_index do |top_prob, top_idx|
+            tops[top_idx] += Math.log(top_prob) * doc.counts[word_idx]
+          end
+        end
+        tops = tops.map {|i| i / ttl}
+        outp << tops
+      end
+      outp
+    end
+    #
+    # String representation displaying current settings.
+    #
+    def to_s
+      outp = ["LDA Settings:"]
+      outp << "    Initial alpha: %0.6f" % self.init_alpha
+      outp << "      # of topics: %d" % self.num_topics
+      outp << "   Max iterations: %d" % self.max_iter
+      outp << "      Convergence: %0.6f" % self.convergence
+      outp << "EM max iterations: %d" % self.em_max_iter
+      outp << "   EM convergence: %0.6f" % self.em_convergence
+      outp << "   Estimate alpha: %d" % self.est_alpha
+      outp.join("\n")
+    end
+  end
+end

data/lib/lda-ruby/corpus/corpus.rb ADDED

@@ -0,0 +1,34 @@
+require 'set'
+module Lda
+  class Corpus
+    attr_reader :documents, :num_docs, :num_terms, :vocabulary
+    def initialize
+      @documents = Array.new
+      @all_terms = Set.new
+      @num_terms = @num_docs = 0
+      @vocabulary = Vocabulary.new
+    end
+    def add_document(doc)
+      raise 'Parameter +doc+ must be of type Document' unless doc.kind_of?(Document)
+      @documents << doc
+      @all_terms += doc.words
+      @num_docs += 1
+      @num_terms = @all_terms.size
+      update_vocabulary(doc)
+      nil
+    end
+    protected
+    def update_vocabulary(doc)
+      doc.tokens.each { |w| @vocabulary.check_word(w) }
+    end
+  end
+end

data/lib/lda-ruby/corpus/data_corpus.rb ADDED

@@ -0,0 +1,22 @@
+module Lda
+  class DataCorpus < Corpus
+    attr_reader :filename
+    def initialize(filename)
+      super()
+      @filename = filename
+      load_from_file
+    end
+    protected
+    def load_from_file
+      txt = File.open(@filename, 'r') { |f| f.read }
+      lines = txt.split(/[\r\n]+/)
+      lines.each do |line|
+        add_document(DataDocument.new(self, line))
+      end
+    end
+  end
+end

data/lib/lda-ruby/corpus/directory_corpus.rb ADDED

@@ -0,0 +1,25 @@
+module Lda
+  class DirectoryCorpus < Corpus
+    attr_reader :path, :extension
+    # Load each file in the directory +path+ as a text document; when +extension+ is given, only "*.<extension>" files are loaded.
+    def initialize(path, extension = nil)
+      super()
+      @path = path.dup.freeze
+      @extension = extension ? extension.dup.freeze : nil
+      load_from_directory
+    end
+    protected
+    def load_from_directory
+      dir_glob = File.join(@path, (@extension ? "*.#{@extension}" : "*"))
+      Dir.glob(dir_glob).each do |filename|
+        add_document(TextDocument.build_from_file(self, filename))
+      end
+    end
+  end
+end

data/lib/lda-ruby/corpus/text_corpus.rb ADDED

@@ -0,0 +1,22 @@
+module Lda
+  class TextCorpus < Corpus
+    attr_reader :filename
+    # Load text documents from the YAML file named by +filename+.
+    def initialize(filename)
+      super()
+      @filename = filename
+      load_from_file
+    end
+    protected
+    def load_from_file
+      docs = YAML.load_file(@filename)
+      docs.each do |doc|
+        add_document(TextDocument.new(self, doc))
+      end
+    end
+  end
+end

data/lib/lda-ruby/document/data_document.rb ADDED

@@ -0,0 +1,30 @@
+#
+# Create the Document using the svmlight-style text line:
+#
+#   num_words w1:freq1 w2:freq2 ... w_n:freq_n
+#
+# Ex.
+#   5 1:2 3:1 4:2 7:3 12:1
+#
+# The leading number of words should equal the number of pairs following
+# it, but this is not enforced: the first field is skipped during parsing.
+# The order of the word:frequency pairs is not important.
+#
+module Lda
+  class DataDocument < Document
+    def initialize(corpus, data)
+      super(corpus)
+      items = data.split(/\s+/)
+      pairs = items[1..items.size].map { |item| item.split(':') }
+      pairs.each do |feature_identifier, feature_weight|
+        @words << feature_identifier.to_i
+        @counts << feature_weight.to_i
+      end
+      recompute
+    end
+  end
+end

data/lib/lda-ruby/document/document.rb ADDED

@@ -0,0 +1,36 @@
+module Lda
+  class Document
+    attr_reader :corpus, :words, :counts, :length, :total, :tokens
+    def initialize(corpus)
+      @corpus = corpus
+      @words  = Array.new
+      @counts = Array.new
+      @tokens = Array.new
+      @length = 0
+      @total  = 0
+    end
+    #
+    # Recompute the total and length values.
+    #
+    def recompute
+      @total = @counts.inject(0) { |sum, i| sum + i }
+      @length = @words.size
+    end
+    def has_text?
+      false
+    end
+    def handle(tokens)
+      tokens
+    end
+    def tokenize(text)
+      clean_text = text.gsub(/[^A-Za-z'\s]+/, ' ').gsub(/\s+/, ' ')        # remove everything but letters and ' and leave only single spaces
+      @tokens = handle(clean_text.split(' '))
+    end
+  end
+end