RubyGems - lda-ruby - Versions diffs - 0.4.0-x86_64-linux - Mend

lda-ruby 0.4.0-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (62) hide show

checksums.yaml +7 -0
data/CHANGELOG.md +61 -0
data/Gemfile +9 -0
data/README.md +157 -0
data/VERSION.yml +5 -0
data/docs/modernization-handoff.md +190 -0
data/docs/porting-strategy.md +127 -0
data/docs/precompiled-platform-policy.md +68 -0
data/docs/release-runbook.md +157 -0
data/ext/lda-ruby/cokus.c +145 -0
data/ext/lda-ruby/cokus.h +27 -0
data/ext/lda-ruby/extconf.rb +13 -0
data/ext/lda-ruby/lda-alpha.c +96 -0
data/ext/lda-ruby/lda-alpha.h +21 -0
data/ext/lda-ruby/lda-data.c +67 -0
data/ext/lda-ruby/lda-data.h +14 -0
data/ext/lda-ruby/lda-inference.c +1023 -0
data/ext/lda-ruby/lda-inference.h +63 -0
data/ext/lda-ruby/lda-model.c +345 -0
data/ext/lda-ruby/lda-model.h +31 -0
data/ext/lda-ruby/lda.h +54 -0
data/ext/lda-ruby/utils.c +111 -0
data/ext/lda-ruby/utils.h +18 -0
data/ext/lda-ruby-rust/Cargo.toml +12 -0
data/ext/lda-ruby-rust/README.md +48 -0
data/ext/lda-ruby-rust/extconf.rb +123 -0
data/ext/lda-ruby-rust/src/lib.rs +456 -0
data/lda-ruby.gemspec +78 -0
data/lib/lda-ruby/backends/base.rb +129 -0
data/lib/lda-ruby/backends/native.rb +158 -0
data/lib/lda-ruby/backends/pure_ruby.rb +613 -0
data/lib/lda-ruby/backends/rust.rb +226 -0
data/lib/lda-ruby/backends.rb +58 -0
data/lib/lda-ruby/config/stopwords.yml +571 -0
data/lib/lda-ruby/corpus/corpus.rb +45 -0
data/lib/lda-ruby/corpus/data_corpus.rb +22 -0
data/lib/lda-ruby/corpus/directory_corpus.rb +25 -0
data/lib/lda-ruby/corpus/text_corpus.rb +27 -0
data/lib/lda-ruby/document/data_document.rb +30 -0
data/lib/lda-ruby/document/document.rb +40 -0
data/lib/lda-ruby/document/text_document.rb +39 -0
data/lib/lda-ruby/lda.so +0 -0
data/lib/lda-ruby/rust_build_policy.rb +21 -0
data/lib/lda-ruby/version.rb +5 -0
data/lib/lda-ruby/vocabulary.rb +46 -0
data/lib/lda-ruby.rb +413 -0
data/lib/lda_ruby_rust.so +0 -0
data/license.txt +504 -0
data/test/backend_compatibility_test.rb +146 -0
data/test/backends_selection_test.rb +100 -0
data/test/data/docs.dat +46 -0
data/test/data/sample.rb +20 -0
data/test/data/wiki-test-docs.yml +123 -0
data/test/gemspec_test.rb +27 -0
data/test/lda_ruby_test.rb +319 -0
data/test/packaged_gem_smoke_test.rb +33 -0
data/test/release_scripts_test.rb +54 -0
data/test/rust_build_policy_test.rb +23 -0
data/test/simple_pipeline_test.rb +22 -0
data/test/simple_yaml.rb +17 -0
data/test/test_helper.rb +10 -0
metadata +111 -0

data/ext/lda-ruby/lda-inference.h ADDED Viewed

@@ -0,0 +1,63 @@
+#ifndef LDA_INFERENCE_H
+#define LDA_INFERENCE_H
+#include <stdlib.h>
+#include <math.h>
+#include <float.h>
+#include "lda.h"
+#include "utils.h"
+int LAG = 5; /* NOTE(review): this and the objects below are definitions, not extern declarations; including this header from more than one .c file defines them more than once -- confirm it is included by a single translation unit */
+float EM_CONVERGED; /* presumably the EM convergence threshold -- TODO confirm against lda-inference.c */
+int EM_MAX_ITER; /* presumably the EM iteration cap -- TODO confirm */
+int ESTIMATE_ALPHA; /* presumably forwarded as the estimate_alpha flag of lda_mle() -- TODO confirm */
+double INITIAL_ALPHA; /* presumably the starting value for model alpha -- TODO confirm */
+int NTOPICS; /* presumably the number of topics to fit -- TODO confirm */
+float VAR_CONVERGED; /* presumably the variational-inference convergence threshold -- TODO confirm */
+int VAR_MAX_ITER; /* presumably the variational-inference iteration cap -- TODO confirm */
+#ifdef USE_RUBY /* the state below exists only when USE_RUBY is defined */
+corpus *last_corpus;
+lda_model *last_model;
+double **last_gamma;
+double **last_phi;
+enum BOOL { FALSE, TRUE } corpus_loaded, model_loaded, VERBOSE; /* declares enum BOOL and three variables of that type */
+#endif
+double lda_inference(document*, lda_model*, double*, double**, short*);
+double compute_likelihood(document*, lda_model*, double**, double*);
+double doc_e_step(document* doc,
+                  double* gamma,
+                  double** phi,
+                  lda_model* model,
+                  lda_suffstats* ss);
+void save_gamma(char* filename,
+                double** gamma,
+                int num_docs,
+                int num_topics);
+void run_em(char* start,
+            char* directory,
+            corpus* corpus);
+#ifdef USE_RUBY /* declared only when USE_RUBY is defined */
+void run_quiet_em(char* start, corpus* corpus);
+#endif
+void read_settings(char* filename);
+void infer(char* model_root,
+           char* save,
+           corpus* corpus);
+#endif

data/ext/lda-ruby/lda-model.c ADDED Viewed

@@ -0,0 +1,345 @@
+// (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu)
+// This file is part of LDA-C.
+// LDA-C is free software; you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 2 of the License, or (at your
+// option) any later version.
+// LDA-C is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+// for more details.
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+#include "lda-model.h"
+#include <string.h>
+/*
+ * compute MLE lda model from sufficient statistics
+ *
+ * lda_mle() and quiet_lda_mle() apply the same update to model->log_prob_w.
+ * They differ only in the alpha step taken when estimate_alpha == 1:
+ * lda_mle() calls opt_alpha() and prints the new alpha, quiet_lda_mle()
+ * calls quiet_opt_alpha() and prints nothing.
+ */
+/*
+ * Shared update: for every topic k and term w, store
+ * log(class_word[k][w]) - log(class_total[k]) when the count is positive,
+ * and the floor value -100 otherwise (avoids log(0)).
+ */
+static void lda_mle_log_prob_w(lda_model* model, lda_suffstats* ss) {
+	int k, w;
+	for (k = 0; k < model->num_topics; k++)
+	{
+		for (w = 0; w < model->num_terms; w++)
+		{
+			if (ss->class_word[k][w] > 0)
+			{
+				model->log_prob_w[k][w] =
+					log(ss->class_word[k][w]) -
+					log(ss->class_total[k]);
+			}
+			else
+				model->log_prob_w[k][w] = -100;
+		}
+	}
+}
+/* Verbose variant: updates log_prob_w, optionally re-estimates and prints alpha. */
+void lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha) {
+	lda_mle_log_prob_w(model, ss);
+	if (estimate_alpha == 1)
+	{
+		model->alpha = opt_alpha(ss->alpha_suffstats,
+			ss->num_docs,
+			model->num_topics);
+		printf("new alpha = %5.5f\n", model->alpha);
+	}
+}
+/* Quiet variant: same update, alpha re-estimated through quiet_opt_alpha(), no output. */
+void quiet_lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha) {
+	lda_mle_log_prob_w(model, ss);
+	if (estimate_alpha == 1)
+	{
+		model->alpha = quiet_opt_alpha(ss->alpha_suffstats,
+			ss->num_docs,
+			model->num_topics);
+	}
+}
+/*
+ * allocate sufficient statistics
+ *
+ * Returns a zero-filled lda_suffstats with model->num_topics rows of
+ * model->num_terms doubles each, or NULL when any allocation fails (nothing
+ * is leaked in that case).  Release the result with free_lda_suffstats().
+ */
+lda_suffstats* new_lda_suffstats(lda_model* model) {
+	int i = 0;
+	int j;
+	int num_topics = model->num_topics;
+	int num_terms = model->num_terms;
+	/* calloc(0, ...) may legitimately return NULL, so always request >= 1 element */
+	size_t topic_slots = num_topics > 0 ? (size_t)num_topics : 1;
+	size_t term_slots = num_terms > 0 ? (size_t)num_terms : 1;
+	lda_suffstats* ss = calloc(1, sizeof *ss);
+	if (ss == NULL)
+		return NULL;
+	ss->class_total = calloc(topic_slots, sizeof *ss->class_total);
+	ss->class_word = calloc(topic_slots, sizeof *ss->class_word);
+	if (ss->class_total == NULL || ss->class_word == NULL)
+		goto fail;
+	for (i = 0; i < num_topics; ++i) {
+		ss->class_word[i] = calloc(term_slots, sizeof **ss->class_word);
+		if (ss->class_word[i] == NULL)
+			goto fail;
+	}
+	return(ss);
+fail:
+	/* rows [0, i) were allocated successfully; i is still 0 if the arrays themselves failed */
+	if (ss->class_word != NULL) {
+		for (j = 0; j < i; ++j)
+			free(ss->class_word[j]);
+	}
+	free(ss->class_word);
+	free(ss->class_total);
+	free(ss);
+	return NULL;
+}
+/*
+ * Release sufficient statistics created by new_lda_suffstats():
+ * the class_total array, each of the model->num_topics rows of class_word,
+ * the row table, and finally the struct itself.
+ */
+void free_lda_suffstats(lda_model* model, lda_suffstats* ss) {
+	int remaining = model->num_topics;
+	double** rows = ss->class_word;
+	free(ss->class_total);
+	while (remaining-- > 0)
+		free(rows[remaining]);
+	free(rows);
+	free(ss);
+}
+/*
+ * various initializations for the sufficient statistics
+ */
+/* Reset every count, every per-topic total, num_docs and alpha_suffstats to zero. */
+void zero_initialize_ss(lda_suffstats* ss, lda_model* model) {
+	int topic;
+	for (topic = 0; topic < model->num_topics; topic++) {
+		double* row = ss->class_word[topic];
+		int term;
+		ss->class_total[topic] = 0;
+		for (term = 0; term < model->num_terms; term++)
+			row[term] = 0;
+	}
+	ss->num_docs = 0;
+	ss->alpha_suffstats = 0;
+}
+/*
+ * Add 1/num_terms plus a myrand() draw to every topic/term count and
+ * accumulate the resulting counts into the per-topic totals.  The values are
+ * added to whatever ss already holds.
+ */
+void random_initialize_ss(lda_suffstats* ss, lda_model* model) {
+	int topic_count = model->num_topics;
+	int num_terms = model->num_terms;
+	int topic;
+	for (topic = 0; topic < topic_count; topic++) {
+		double* row = ss->class_word[topic];
+		int term;
+		for (term = 0; term < num_terms; term++) {
+			row[term] += 1.0/num_terms + myrand();
+			ss->class_total[topic] += row[term];
+		}
+	}
+}
+/*
+ * Shared implementation of corpus_initialize_ss() and
+ * quiet_corpus_initialize_ss().
+ *
+ * For each topic, NUM_INIT documents are drawn at random from the corpus and
+ * their word counts are added to that topic's row; every term then receives
+ * +1.0 and the row is accumulated into class_total.  When `verbose` is
+ * non-zero the index of each chosen document is printed.
+ *
+ * An empty corpus (num_docs <= 0) has no document to draw, so the seeding
+ * step is skipped instead of reading docs[0] out of bounds.
+ */
+static void corpus_initialize_ss_impl(lda_suffstats* ss, lda_model* model, corpus* c, int verbose)
+{
+	int num_topics = model->num_topics;
+	int i, k, d, n;
+	document* doc;
+	for (k = 0; k < num_topics; k++)
+	{
+		for (i = 0; i < NUM_INIT && c->num_docs > 0; i++)
+		{
+			d = (int)floor(myrand() * c->num_docs);
+			if (verbose)
+				printf("initialized with document %d\n", d);
+			doc = &(c->docs[d]);
+			for (n = 0; n < doc->length; n++)
+			{
+				ss->class_word[k][doc->words[n]] += doc->counts[n];
+			}
+		}
+		for (n = 0; n < model->num_terms; n++)
+		{
+			ss->class_word[k][n] += 1.0;
+			ss->class_total[k] = ss->class_total[k] + ss->class_word[k][n];
+		}
+	}
+}
+/* Seed the sufficient statistics from random documents, printing each chosen index. */
+void corpus_initialize_ss(lda_suffstats* ss, lda_model* model, corpus* c)
+{
+	corpus_initialize_ss_impl(ss, model, c, 1);
+}
+/* Same seeding as corpus_initialize_ss() but without any output. */
+void quiet_corpus_initialize_ss(lda_suffstats* ss, lda_model* model, corpus* c)
+{
+	corpus_initialize_ss_impl(ss, model, c, 0);
+}
+/*
+ * Seed topic k from document k: the first MIN(num_topics, num_docs) documents
+ * of the corpus are used, one per topic.  Each seeded topic row then gets
+ * +1.0 on every term and is summed into class_total.  Topics beyond the
+ * number of documents are left untouched, so results with
+ * num_topics > num_docs might be hairy.
+ */
+void corpus_initialize_fixed_ss(lda_suffstats* ss, lda_model* model, corpus* c) {
+    int seeded_topics = MIN(model->num_topics, c->num_docs);
+    int topic;
+    for (topic = 0; topic < seeded_topics; topic++) {
+        document* seed = &(c->docs[topic]);
+        double* row = ss->class_word[topic];
+        int pos;
+        int term;
+        for (pos = 0; pos < seed->length; pos++)
+            row[seed->words[pos]] += seed->counts[pos];
+        for (term = 0; term < model->num_terms; term++) {
+            row[term] += 1.0;
+            ss->class_total[topic] += row[term];
+        }
+    }
+}
+/*
+ * allocate new lda model
+ *
+ * Both constructors return a model with alpha = 1.0 and a zero-filled
+ * num_topics x num_terms log_prob_w table, or NULL when an allocation fails
+ * (nothing is leaked in that case).  new_lda_model() additionally prints the
+ * model dimensions; quiet_new_lda_model() prints nothing.
+ */
+lda_model* quiet_new_lda_model(int num_terms, int num_topics) {
+	int i;
+	/* calloc(0, ...) may legitimately return NULL, so always request >= 1 element */
+	size_t topic_slots = num_topics > 0 ? (size_t)num_topics : 1;
+	size_t term_slots = num_terms > 0 ? (size_t)num_terms : 1;
+	lda_model* model = malloc(sizeof *model);
+	if (model == NULL)
+		return NULL;
+	model->num_topics = num_topics;
+	model->num_terms = num_terms;
+	model->alpha = 1.0;
+	model->log_prob_w = calloc(topic_slots, sizeof *model->log_prob_w);
+	if (model->log_prob_w == NULL) {
+		free(model);
+		return NULL;
+	}
+	for (i = 0; i < num_topics; i++)
+	{
+		model->log_prob_w[i] = calloc(term_slots, sizeof **model->log_prob_w);
+		if (model->log_prob_w[i] == NULL) {
+			/* undo the rows allocated so far, then the table and the struct */
+			while (i-- > 0)
+				free(model->log_prob_w[i]);
+			free(model->log_prob_w);
+			free(model);
+			return NULL;
+		}
+	}
+	return(model);
+}
+lda_model* new_lda_model(int num_terms, int num_topics) {
+	printf("new model with: %d topics and %d terms\n", num_topics, num_terms);
+	return quiet_new_lda_model(num_terms, num_topics);
+}
+/*
+ * Release the log_prob_w table of a model: every one of the
+ * model->num_topics rows, then the row table.
+ *
+ * The lda_model struct itself is not freed here.
+ * NOTE(review): confirm that callers free(model) afterwards; otherwise the
+ * struct allocated by new_lda_model() leaks.
+ */
+void free_lda_model(lda_model* model) {
+	double** rows = model->log_prob_w;
+	int topic = 0;
+	while (topic < model->num_topics) {
+		free(rows[topic]);
+		topic++;
+	}
+	free(rows);
+}
+/*
+ * save an lda model
+ *
+ * Writes two text files next to `model_root`:
+ *   <model_root>.beta   one line per topic with the log_prob_w row
+ *   <model_root>.other  num_topics, num_terms and alpha
+ *
+ * The file name buffer is sized from model_root, so paths of any length are
+ * handled.  If memory runs out or a file cannot be opened, a message is
+ * written to stderr and the function returns without writing further files.
+ */
+void save_lda_model(lda_model* model, char* model_root) {
+	/* ".other" is the longer suffix, so one buffer fits both names (sizeof counts the NUL) */
+	size_t cap = strlen(model_root) + sizeof(".other");
+	char* filename = malloc(cap);
+	FILE* fileptr;
+	int i, j;
+	if (filename == NULL) {
+		fprintf(stderr, "save_lda_model: out of memory\n");
+		return;
+	}
+	snprintf(filename, cap, "%s.beta", model_root);
+	fileptr = fopen(filename, "w");
+	if (fileptr == NULL) {
+		fprintf(stderr, "save_lda_model: cannot open %s for writing\n", filename);
+		free(filename);
+		return;
+	}
+	for (i = 0; i < model->num_topics; i++) {
+		for (j = 0; j < model->num_terms; j++) {
+			fprintf(fileptr, " %5.10f", model->log_prob_w[i][j]);
+		}
+		fprintf(fileptr, "\n");
+	}
+	fclose(fileptr);
+	snprintf(filename, cap, "%s.other", model_root);
+	fileptr = fopen(filename, "w");
+	if (fileptr == NULL) {
+		fprintf(stderr, "save_lda_model: cannot open %s for writing\n", filename);
+		free(filename);
+		return;
+	}
+	fprintf(fileptr, "num_topics %d\n", model->num_topics);
+	fprintf(fileptr, "num_terms %d\n", model->num_terms);
+	fprintf(fileptr, "alpha %5.10f\n", model->alpha);
+	fclose(fileptr);
+	free(filename);
+}
+/*
+ * Load a model previously written by save_lda_model().
+ *
+ * Reads <model_root>.other (num_topics, num_terms, alpha) and then
+ * <model_root>.beta (the log_prob_w table).  Values are parsed as double so
+ * the 10 decimals written by save_lda_model() are not truncated to float.
+ *
+ * Returns the new model, or NULL when memory runs out, a file cannot be
+ * opened, or its contents do not parse; partially built state is released.
+ * The model is created with new_lda_model(), so its dimensions are printed.
+ */
+lda_model* load_lda_model(char* model_root) {
+	/* ".other" is the longer suffix, so one buffer fits both names (sizeof counts the NUL) */
+	size_t cap = strlen(model_root) + sizeof(".other");
+	char* filename = malloc(cap);
+	FILE* fileptr = NULL;
+	lda_model* model = NULL;
+	int i, j, num_terms = 0, num_topics = 0;
+	double alpha = 0.0;
+	if (filename == NULL)
+		return NULL;
+	snprintf(filename, cap, "%s.other", model_root);
+	printf("loading %s\n", filename);
+	fileptr = fopen(filename, "r");
+	if (fileptr == NULL)
+		goto fail;
+	if (fscanf(fileptr, "num_topics %d\n", &num_topics) != 1 ||
+	    fscanf(fileptr, "num_terms %d\n", &num_terms) != 1 ||
+	    fscanf(fileptr, "alpha %lf\n", &alpha) != 1 ||
+	    num_topics < 0 || num_terms < 0)
+		goto fail;
+	fclose(fileptr);
+	fileptr = NULL;
+	model = new_lda_model(num_terms, num_topics);
+	if (model == NULL)
+		goto fail;
+	model->alpha = alpha;
+	snprintf(filename, cap, "%s.beta", model_root);
+	printf("loading %s\n", filename);
+	fileptr = fopen(filename, "r");
+	if (fileptr == NULL)
+		goto fail;
+	for (i = 0; i < num_topics; i++)
+	{
+		for (j = 0; j < num_terms; j++)
+		{
+			if (fscanf(fileptr, "%lf", &model->log_prob_w[i][j]) != 1)
+				goto fail;
+		}
+	}
+	fclose(fileptr);
+	free(filename);
+	return(model);
+fail:
+	if (fileptr != NULL)
+		fclose(fileptr);
+	if (model != NULL) {
+		/* free_lda_model() releases only the table, so the struct is freed here */
+		free_lda_model(model);
+		free(model);
+	}
+	free(filename);
+	return NULL;
+}

data/ext/lda-ruby/lda-model.h ADDED Viewed

@@ -0,0 +1,31 @@
+/* Public interface of lda-model.c: model and sufficient-statistics lifecycle, initialisation and MLE update. */
+#ifndef LDA_MODEL_H
+#define LDA_MODEL_H /* must match the #ifndef above, otherwise the guard never takes effect */
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include "lda.h"
+#include "lda-alpha.h"
+#include "cokus.h"
+/* randomMT() scaled by 2^32; fully parenthesized so it is safe inside larger expressions */
+#define myrand() ((double) (((unsigned long) randomMT()) / 4294967296.))
+/* number of documents drawn per topic by corpus_initialize_ss() */
+#define NUM_INIT 1
+/* smaller of A and B as an int; arguments are evaluated more than once, so pass side-effect-free expressions */
+#define MIN(A,B) ((int)(((A) > (B)) ? (B) : (A)))
+void free_lda_model(lda_model*);
+void save_lda_model(lda_model*, char*);
+lda_model* quiet_new_lda_model(int num_terms, int num_topics);
+lda_model* new_lda_model(int num_terms, int num_topics);
+lda_suffstats* new_lda_suffstats(lda_model* model);
+void free_lda_suffstats(lda_model* model, lda_suffstats* ss);
+void corpus_initialize_ss(lda_suffstats* ss, lda_model* model, corpus* c);
+void quiet_corpus_initialize_ss(lda_suffstats* ss, lda_model* model, corpus* c);
+void corpus_initialize_fixed_ss(lda_suffstats* ss, lda_model* model, corpus* c);
+void random_initialize_ss(lda_suffstats* ss, lda_model* model);
+void zero_initialize_ss(lda_suffstats* ss, lda_model* model);
+void lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha);
+void quiet_lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha);
+lda_model* load_lda_model(char* model_root);
+#endif

data/ext/lda-ruby/lda.h ADDED Viewed

@@ -0,0 +1,54 @@
+// (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu)
+// This file is part of LDA-C.
+// LDA-C is free software; you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 2 of the License, or (at your
+// option) any later version.
+// LDA-C is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+// for more details.
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+#ifndef LDA_H
+#define LDA_H
+typedef struct { /* one document as parallel words/counts arrays */
+	int* words; /* words[n] is used as a term index into class_word rows */
+	int* counts; /* counts[n] is the amount added for words[n] */
+	int length; /* number of valid entries in words and counts */
+	int total; /* presumably the sum of counts -- TODO confirm; not read in lda-model.c */
+} document;
+typedef struct { /* a collection of documents */
+	document* docs; /* indexed 0 .. num_docs-1 */
+	int num_terms; /* presumably the vocabulary size -- TODO confirm */
+	int num_docs; /* number of entries in docs */
+} corpus;
+typedef struct { /* model parameters */
+	double alpha; /* set to 1.0 on creation; replaced by opt_alpha() results in lda_mle() */
+	double** log_prob_w; /* num_topics rows of num_terms values, filled by lda_mle() */
+	int num_topics; /* row count of log_prob_w */
+	int num_terms; /* column count of log_prob_w */
+} lda_model;
+typedef struct { /* sufficient statistics consumed by lda_mle() */
+	double** class_word; /* class_word[k][w]: accumulated count for topic k, term w */
+	double* class_total; /* class_total[k]: accumulated count for topic k */
+	double alpha_suffstats; /* passed to opt_alpha() when alpha is re-estimated */
+	int num_docs; /* passed to opt_alpha() alongside alpha_suffstats */
+} lda_suffstats;
+#endif

data/ext/lda-ruby/utils.c ADDED Viewed

@@ -0,0 +1,111 @@
+#include "utils.h"
+/*
+ * Log-space addition: given log(a) and log(b), return log(a + b).
+ * The larger argument is factored out so the exponential is taken of a
+ * non-positive difference.
+ */
+double log_sum(double log_a, double log_b)
+{
+  double larger = log_a;
+  double smaller = log_b;
+  if (log_a < log_b)
+  {
+      larger = log_b;
+      smaller = log_a;
+  }
+  return larger + log(1 + exp(smaller - larger));
+}
+ /**
+   * Trigamma: the second derivative of the log-gamma function, for a
+   * scalar x.  From Abramowitz and Stegun: the asymptotic series
+   * (formulas 6.4.11 and 6.4.12) is evaluated at x + 6 and the result is
+   * walked back to x with recurrence formula 6.4.6.
+   *
+   **/
+double trigamma(double x)
+{
+    double shifted = x + 6;
+    double acc = 1/(shifted*shifted);
+    int steps_left;
+    /* asymptotic series in powers of 1/shifted^2, evaluated by Horner's rule */
+    acc=(((((0.075757575757576*acc-0.033333333333333)*acc+0.0238095238095238)
+         *acc-0.033333333333333)*acc+0.166666666666667)*acc+1)/shifted+0.5*acc;
+    /* undo the +6 shift one unit at a time */
+    for (steps_left = 6; steps_left > 0; steps_left--)
+    {
+        shifted = shifted - 1;
+        acc = 1/(shifted*shifted) + acc;
+    }
+    return(acc);
+}
+/*
+ * Digamma: series approximation of the first derivative of the log-gamma
+ * function.  The series is evaluated at x + 6 and the six reciprocal terms
+ * subtracted at the end bring the value back to x.
+ */
+double digamma(double x)
+{
+    double shifted = x + 6;
+    double series = 1/(shifted*shifted);
+    series=(((0.004166666666667*series-0.003968253986254)*series+
+	0.008333333333333)*series-0.083333333333333)*series;
+    series=series+log(shifted)-0.5/shifted-1/(shifted-1)-1/(shifted-2)
+	-1/(shifted-3)-1/(shifted-4)-1/(shifted-5)-1/(shifted-6);
+    return series;
+}
+/*
+ * Logarithm of the gamma function for x > 0.
+ *
+ * Stirling's series is evaluated at x + 6, where it is accurate, and the six
+ * log terms subtracted at the end bring the value back to x.
+ * The series variable z = 1/x^2 must be formed from the shifted argument:
+ * computing it before the shift (as the code previously did) gives a result
+ * that is slightly off near x = 1 and badly wrong for x < 1.
+ */
+double log_gamma(double x)
+{
+    double z;
+    x=x+6;
+    z=1/(x*x);
+    z=(((-0.000595238095238*z+0.000793650793651)
+	*z-0.002777777777778)*z+0.083333333333333)/x;
+    /* 0.918938533204673 is 0.5*log(2*pi) */
+    z=(x-0.5)*log(x)-x+0.918938533204673+z-log(x-1)-
+	log(x-2)-log(x-3)-log(x-4)-log(x-5)-log(x-6);
+    return z;
+}
+/*
+ * Create directory `name` with read, write and execute permission for the
+ * owner only.  The result of mkdir() is not inspected.
+ */
+void make_directory(char* name)
+{
+    const mode_t owner_rwx = S_IRUSR|S_IWUSR|S_IXUSR;
+    mkdir(name, owner_rwx);
+}
+/*
+ * argmax
+ *
+ * Return the index of the largest of the first n entries of x; on ties the
+ * lowest index wins.  Returns 0 without touching x when x is NULL or n <= 0,
+ * so an empty input is never read.
+ */
+int argmax(double* x, int n)
+{
+    int i;
+    int best = 0;
+    double max;
+    if (x == NULL || n <= 0)
+        return 0;
+    max = x[0];
+    for (i = 1; i < n; i++)
+    {
+        if (x[i] > max)
+        {
+            max = x[i];
+            best = i;
+        }
+    }
+    return(best);
+}

data/ext/lda-ruby/utils.h ADDED Viewed

@@ -0,0 +1,18 @@
+#ifndef UTILS_H
+#define UTILS_H
+#include <stdio.h>
+#include <math.h>
+#include <float.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+double log_sum(double log_a, double log_b); /* given log(a) and log(b), returns log(a + b) */
+double trigamma(double x); /* second derivative of the log-gamma function */
+double digamma(double x); /* series approximation of the first derivative of the log-gamma function */
+double log_gamma(double x); /* series approximation of the log-gamma function */
+void make_directory(char* name); /* mkdir(name) with owner read/write/execute permission */
+int argmax(double* x, int n); /* index of the first largest value among x[0..n-1] */
+#endif

data/ext/lda-ruby-rust/Cargo.toml ADDED Viewed

@@ -0,0 +1,12 @@
+[package]
+name = "lda_ruby_rust"
+version = "0.1.0"
+edition = "2021"
+rust-version = "1.74"
+[lib]
+name = "lda_ruby_rust"
+crate-type = ["cdylib"]
+[dependencies]
+magnus = "0.7"

data/ext/lda-ruby-rust/README.md ADDED Viewed

@@ -0,0 +1,48 @@
+# Experimental Rust Extension Scaffold
+This directory contains an experimental Rust extension scaffold built with `magnus`.
+Current scope:
+- Defines `Lda::RustBackend` module in Ruby.
+- Exposes capability hooks:
+  - `Lda::RustBackend.available?`
+  - `Lda::RustBackend.abi_version`
+  - `Lda::RustBackend.before_em(start, num_docs, num_terms)`
+  - `Lda::RustBackend.topic_weights_for_word(beta, gamma, word_index, min_probability)`
+  - `Lda::RustBackend.accumulate_topic_term_counts(topic_term_counts, phi_d, words, counts)`
+  - `Lda::RustBackend.infer_document(beta, gamma_initial, words, counts, max_iter, convergence, min_probability, init_alpha)`
+  - `Lda::RustBackend.infer_corpus_iteration(beta, document_words, document_counts, max_iter, convergence, min_probability, init_alpha)`
+  - `Lda::RustBackend.normalize_topic_term_counts(topic_term_counts, min_probability)`
+  - `Lda::RustBackend.average_gamma_shift(previous_gamma, current_gamma)`
+  - `Lda::RustBackend.topic_document_probability(phi_tensor, document_counts, num_topics, min_probability)`
+  - `Lda::RustBackend.seeded_topic_term_probabilities(document_words, document_counts, topics, terms, min_probability)`
+Hot-path kernels currently executed in Rust when `backend: :rust` is active:
+- topic weights for a word across topics
+- topic-term count accumulation from per-document `phi`
+- full per-document inference loop (batched inner EM updates)
+- full per-iteration corpus inference (batched document processing)
+- topic-term normalization and log-probability finalization for EM beta updates
+- gamma convergence shift reduction between EM iterations
+- topic-document average log-probability computation
+- seeded topic-term initialization
+Remaining numeric LDA kernels are still provided by the pure Ruby backend and will move incrementally.
+## Local build (optional)
+```bash
+cd ext/lda-ruby-rust
+cargo build --release
+```
+Then run Ruby with the built library on the load path so that `require "lda_ruby_rust"` succeeds.
+## Install-time policy
+During source gem installs, `ext/lda-ruby-rust/extconf.rb` can optionally build this extension.
+- `LDA_RUBY_RUST_BUILD=auto` (default): build when `cargo` is available.
+- `LDA_RUBY_RUST_BUILD=always`: require a successful Rust build or fail installation.
+- `LDA_RUBY_RUST_BUILD=never`: always skip Rust build.