RubyGems - lda-ruby - Versions diffs - 0.4.0-x86_64-linux - Mend

lda-ruby 0.4.0-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (62) hide show

checksums.yaml +7 -0
data/CHANGELOG.md +61 -0
data/Gemfile +9 -0
data/README.md +157 -0
data/VERSION.yml +5 -0
data/docs/modernization-handoff.md +190 -0
data/docs/porting-strategy.md +127 -0
data/docs/precompiled-platform-policy.md +68 -0
data/docs/release-runbook.md +157 -0
data/ext/lda-ruby/cokus.c +145 -0
data/ext/lda-ruby/cokus.h +27 -0
data/ext/lda-ruby/extconf.rb +13 -0
data/ext/lda-ruby/lda-alpha.c +96 -0
data/ext/lda-ruby/lda-alpha.h +21 -0
data/ext/lda-ruby/lda-data.c +67 -0
data/ext/lda-ruby/lda-data.h +14 -0
data/ext/lda-ruby/lda-inference.c +1023 -0
data/ext/lda-ruby/lda-inference.h +63 -0
data/ext/lda-ruby/lda-model.c +345 -0
data/ext/lda-ruby/lda-model.h +31 -0
data/ext/lda-ruby/lda.h +54 -0
data/ext/lda-ruby/utils.c +111 -0
data/ext/lda-ruby/utils.h +18 -0
data/ext/lda-ruby-rust/Cargo.toml +12 -0
data/ext/lda-ruby-rust/README.md +48 -0
data/ext/lda-ruby-rust/extconf.rb +123 -0
data/ext/lda-ruby-rust/src/lib.rs +456 -0
data/lda-ruby.gemspec +78 -0
data/lib/lda-ruby/backends/base.rb +129 -0
data/lib/lda-ruby/backends/native.rb +158 -0
data/lib/lda-ruby/backends/pure_ruby.rb +613 -0
data/lib/lda-ruby/backends/rust.rb +226 -0
data/lib/lda-ruby/backends.rb +58 -0
data/lib/lda-ruby/config/stopwords.yml +571 -0
data/lib/lda-ruby/corpus/corpus.rb +45 -0
data/lib/lda-ruby/corpus/data_corpus.rb +22 -0
data/lib/lda-ruby/corpus/directory_corpus.rb +25 -0
data/lib/lda-ruby/corpus/text_corpus.rb +27 -0
data/lib/lda-ruby/document/data_document.rb +30 -0
data/lib/lda-ruby/document/document.rb +40 -0
data/lib/lda-ruby/document/text_document.rb +39 -0
data/lib/lda-ruby/lda.so +0 -0
data/lib/lda-ruby/rust_build_policy.rb +21 -0
data/lib/lda-ruby/version.rb +5 -0
data/lib/lda-ruby/vocabulary.rb +46 -0
data/lib/lda-ruby.rb +413 -0
data/lib/lda_ruby_rust.so +0 -0
data/license.txt +504 -0
data/test/backend_compatibility_test.rb +146 -0
data/test/backends_selection_test.rb +100 -0
data/test/data/docs.dat +46 -0
data/test/data/sample.rb +20 -0
data/test/data/wiki-test-docs.yml +123 -0
data/test/gemspec_test.rb +27 -0
data/test/lda_ruby_test.rb +319 -0
data/test/packaged_gem_smoke_test.rb +33 -0
data/test/release_scripts_test.rb +54 -0
data/test/rust_build_policy_test.rb +23 -0
data/test/simple_pipeline_test.rb +22 -0
data/test/simple_yaml.rb +17 -0
data/test/test_helper.rb +10 -0
metadata +111 -0

data/docs/release-runbook.md ADDED Viewed

@@ -0,0 +1,157 @@
+# Release Runbook (Phase 5A + 5B)
+This runbook defines the maintainer workflow for shipping `lda-ruby` source and precompiled platform gem releases.
+Authoritative platform/support policy is maintained in `docs/precompiled-platform-policy.md`.
+## Scope
+- Release artifact types:
+  - source gem: `pkg/lda-ruby-<version>.gem`
+  - precompiled gems (current targets are defined in `docs/precompiled-platform-policy.md`)
+- Release trigger: git tag (`vX.Y.Z`) with matching version files
+- Publish targets:
+  - RubyGems (`gem push`)
+  - GitHub Releases (gem + checksum attachment)
+## Prerequisites
+1. Access:
+   - push/tag rights on `master`
+   - access to GitHub Actions environments for release approvals
+   - RubyGems owner access for `lda-ruby`
+2. Local tooling:
+   - Ruby 3.2+ with Bundler
+   - Rust toolchain (`cargo`) for local precompiled-gem build checks
+   - `libclang` available to Rust bindgen
+   - Docker (recommended for reproducible checks)
+3. Repository state:
+   - release commit merged to `master`
+   - clean working tree
+   - version files in sync
+## Required Secrets and Environments
+GitHub repository secret:
+- `RUBYGEMS_API_KEY`: API key with push rights for `lda-ruby`.
+GitHub Actions environment:
+- `release`: protect this environment with required reviewer approval.
+- Both publish jobs in `.github/workflows/release.yml` are bound to `release`.
+## Release Preparation
+1. Prepare and update release files:
+   ```bash
+   ./bin/release-prepare 0.4.0
+   ```
+2. Review changes:
+   - `VERSION.yml`
+   - `lib/lda-ruby/version.rb`
+   - `CHANGELOG.md`
+3. Validate full release checks locally:
+   ```bash
+   SKIP_DOCKER=1 ./bin/release-preflight
+   ./bin/test-packaged-gem-manifest
+   ```
+4. Validate local precompiled gem flow for your current host platform:
+   ```bash
+   ./bin/release-precompiled-artifacts --tag v0.4.0 --skip-preflight
+   ```
+   Note: `release-precompiled-artifacts` only supports building for the current host platform (no cross-compilation).
+5. Commit and merge to `master`.
+## Dry-Run Path (No Publish)
+Use `workflow_dispatch` with `publish=false`.
+Behavior:
+- runs release validation and artifact build
+- uploads source + precompiled `pkg/lda-ruby-*.gem` and checksum files as workflow artifacts
+- does not push to RubyGems
+- does not create a GitHub release
+Latest verified dry-run reference:
+- date: 2026-02-25
+- workflow run: `https://github.com/ealdent/lda-ruby/actions/runs/22382692416`
+- dispatch parameters: `release_tag=v0.4.0`, `publish=false`
+- result: success across `validate`, `build_artifacts`, and full `build_precompiled_artifacts` matrix
+Optional local dry-run equivalent:
+```bash
+./bin/release-artifacts --tag v0.4.0
+./bin/release-precompiled-artifacts --tag v0.4.0 --skip-preflight
+```
+## Publish Path (Tag-Driven)
+1. Ensure the release commit is on `master`.
+2. Create and push the release tag:
+   ```bash
+   git checkout master
+   git pull --ff-only
+   git tag -a v0.4.0 -m "Release v0.4.0"
+   git push origin v0.4.0
+   ```
+3. Monitor `.github/workflows/release.yml`:
+   - `validate`
+   - `build_artifacts`
+   - `build_precompiled_artifacts` (linux + macOS matrix)
+   - environment-gated `publish_rubygems`
+   - environment-gated `publish_github_release`
+4. Approve the protected `release` environment when prompted.
+5. Confirm published outputs:
+   - RubyGems shows `lda-ruby` `0.4.0` source gem and platform gems
+   - GitHub release `v0.4.0` exists with all gem and `.sha256` attachments
+## Rollback and Recovery
+If publish fails before RubyGems push:
+1. Fix issue on `master`.
+2. Delete and recreate the tag only if the broken tag did not produce public artifacts:
+   - `git tag -d vX.Y.Z`
+   - `git push origin :refs/tags/vX.Y.Z`
+3. Re-tag and re-run release.
+If RubyGems push succeeds but GitHub release fails:
+1. Re-run only the GitHub release path by re-running the workflow job after fix.
+2. Do not re-push gem for the same version.
+If an incorrect gem is published:
+1. Yank from RubyGems:
+   ```bash
+   gem yank lda-ruby -v X.Y.Z
+   ```
+2. Publish a corrective version (for example `X.Y.(Z+1)`); do not re-use yanked version numbers.
+3. Update `CHANGELOG.md` and release notes to document the correction.
+## Troubleshooting
+- `Could not find 'bundler'`: install the Bundler version pinned in `Gemfile.lock`.
+- `cargo not found` in rust-enabled checks: ensure Rust toolchain is installed or run in Docker.
+- `libclang` not found while building precompiled gems: install LLVM/libclang and set `LIBCLANG_PATH` if needed.
+- Linux `Install Rust bindgen dependencies` can take several minutes on fresh runners due to apt package index updates and package installs.
+- macOS Rust link errors (`symbol(s) not found` for Ruby APIs): ensure build path preserves `-C link-arg=-Wl,-undefined,dynamic_lookup` in `RUSTFLAGS`.
+- Tag/version mismatch: run `./bin/check-version-sync --tag vX.Y.Z`.
+- Artifact mismatch during release: rebuild with `./bin/release-artifacts --tag vX.Y.Z`.
+- Precompiled artifact mismatch: rebuild with `./bin/release-precompiled-artifacts --tag vX.Y.Z --skip-preflight`.

data/ext/lda-ruby/cokus.c ADDED Viewed

@@ -0,0 +1,145 @@
+// This is the ``Mersenne Twister'' random number generator MT19937, which
+// generates pseudorandom integers uniformly distributed in 0..(2^32 - 1)
+// starting from any odd seed in 0..(2^32 - 1).  This version is a recode
+// by Shawn Cokus (Cokus@math.washington.edu) on March 8, 1998 of a version by
+// Takuji Nishimura (who had suggestions from Topher Cooper and Marc Rieffel in
+// July-August 1997).
+//
+// Effectiveness of the recoding (on Goedel2.math.washington.edu, a DEC Alpha
+// running OSF/1) using GCC -O3 as a compiler: before recoding: 51.6 sec. to
+// generate 300 million random numbers; after recoding: 24.0 sec. for the same
+// (i.e., 46.5% of original time), so speed is now about 12.5 million random
+// number generations per second on this machine.
+//
+// According to the URL <http://www.math.keio.ac.jp/~matumoto/emt.html>
+// (and paraphrasing a bit in places), the Mersenne Twister is ``designed
+// with consideration of the flaws of various existing generators,'' has
+// a period of 2^19937 - 1, gives a sequence that is 623-dimensionally
+// equidistributed, and ``has passed many stringent tests, including the
+// die-hard test of G. Marsaglia and the load test of P. Hellekalek and
+// S. Wegenkittl.''  It is efficient in memory usage (typically using 2506
+// to 5012 bytes of static data, depending on data type sizes, and the code
+// is quite short as well).  It generates random numbers in batches of 624
+// at a time, so the caching and pipelining of modern systems is exploited.
+// It is also divide- and mod-free.
+//
+// This library is free software; you can redistribute it and/or modify it
+// under the terms of the GNU Library General Public License as published by
+// the Free Software Foundation (either version 2 of the License or, at your
+// option, any later version).  This library is distributed in the hope that
+// it will be useful, but WITHOUT ANY WARRANTY, without even the implied
+// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+// the GNU Library General Public License for more details.  You should have
+// received a copy of the GNU Library General Public License along with this
+// library; if not, write to the Free Software Foundation, Inc., 59 Temple
+// Place, Suite 330, Boston, MA 02111-1307, USA.
+//
+// The code as Shawn received it included the following notice:
+//
+//   Copyright (C) 1997 Makoto Matsumoto and Takuji Nishimura.  When
+//   you use this, send an e-mail to <matumoto@math.keio.ac.jp> with
+//   an appropriate reference to your work.
+//
+// It would be nice to CC: <Cokus@math.washington.edu> when you write.
+//
+#include "cokus.h"
+static uint32   state[N+1];     // state vector + 1 extra to not violate ANSI C: reloadMT()'s look-ahead `*p2++` reads state[N] once and discards it
+static uint32   *next;          // next random value is computed from here
+static int      left = -1;      // can *next++ this many times before reloading; starts at -1 so that, after randomMT()'s pre-decrement, reloadMT() sees a value < -1 and knows seedMT() was never called
+void seedMT(uint32 seed)
+ {
+    //
+    // Fills state[0..(N-1)] from `seed` and sets left to 0, so the very next
+    // randomMT() call goes through reloadMT() before handing out a value.
+    //
+    // We initialize state[0..(N-1)] via the generator
+    //
+    //   x_new = (69069 * x_old) mod 2^32
+    //
+    // from Line 15 of Table 1, p. 106, Sec. 3.3.4 of Knuth's
+    // _The Art of Computer Programming_, Volume 2, 3rd ed.
+    //
+    // Notes (SJC): I do not know what the initial state requirements
+    // of the Mersenne Twister are, but it seems this seeding generator
+    // could be better.  It achieves the maximum period for its modulus
+    // (2^30) iff x_initial is odd (p. 20-21, Sec. 3.2.1.2, Knuth); if
+    // x_initial can be even, you have sequences like 0, 0, 0, ...;
+    // 2^31, 2^31, 2^31, ...; 2^30, 2^30, 2^30, ...; 2^29, 2^29 + 2^31,
+    // 2^29, 2^29 + 2^31, ..., etc. so I force seed to be odd below.
+    //
+    // Even if x_initial is odd, if x_initial is 1 mod 4 then
+    //
+    //   the          lowest bit of x is always 1,
+    //   the  next-to-lowest bit of x is always 0,
+    //   the 2nd-from-lowest bit of x alternates      ... 0 1 0 1 0 1 0 1 ... ,
+    //   the 3rd-from-lowest bit of x 4-cycles        ... 0 1 1 0 0 1 1 0 ... ,
+    //   the 4th-from-lowest bit of x has the 8-cycle ... 0 0 0 1 1 1 1 0 ... ,
+    //    ...
+    //
+    // and if x_initial is 3 mod 4 then
+    //
+    //   the          lowest bit of x is always 1,
+    //   the  next-to-lowest bit of x is always 1,
+    //   the 2nd-from-lowest bit of x alternates      ... 0 1 0 1 0 1 0 1 ... ,
+    //   the 3rd-from-lowest bit of x 4-cycles        ... 0 0 1 1 0 0 1 1 ... ,
+    //   the 4th-from-lowest bit of x has the 8-cycle ... 0 0 1 1 1 1 0 0 ... ,
+    //    ...
+    //
+    // The generator's potency (min. s>=0 with (69069-1)^s = 0 mod 2^32) is
+    // 16, which seems to be alright by p. 25, Sec. 3.2.1.3 of Knuth.  It
+    // also does well in the dimension 2..5 spectral tests, but it could be
+    // better in dimension 6 (Line 15, Table 1, p. 106, Sec. 3.3.4, Knuth).
+    //
+    // Note that the random number user does not see the values generated
+    // here directly since reloadMT() will always munge them first, so maybe
+    // none of all of this matters.  In fact, the seed values made here could
+    // even be extra-special desirable if the Mersenne Twister theory says
+    // so-- that's why the only change I made is to restrict to odd seeds.
+    //
+    uint32 *slot = state;
+    uint32 x = (seed | 1U) & 0xFFFFFFFFU;   // force the seed odd, keep the low 32 bits
+    int    remaining;
+    left = 0;
+    *slot++ = x;                            // state[0] is the (odd) seed itself
+    for(remaining = N - 1; remaining > 0; remaining--)
+     {
+        x *= 69069U;                        // x itself is not masked; only the stored copy is
+        *slot++ = x & 0xFFFFFFFFU;
+     }
+ }
+uint32 reloadMT(void)  // regenerates state[0..N-1] in place and returns the tempered value of the new state[0]
+{
+    register uint32 *p0=state, *p2=state+2, *pM=state+M, s0, s1;  // p0: word being rewritten; pM: word M places ahead of it; p2: where the next s1 is fetched from
+    register int    j;
+    if(left < -1)  // left starts at -1 and randomMT() pre-decrements it, so a value below -1 means seedMT() has never run
+        seedMT(4357U);  // fall back to a fixed default seed
+    left=N-1, next=state+1;  // this call hands out state[0]; randomMT() can then take N-1 more words starting at state[1]
+    for(s0=state[0], s1=state[1], j=N-M+1; --j; s0=s1, s1=*p2++)  // N-M iterations: pM walks state[M..N-1]
+        *p0++ = *pM++ ^ (mixBits(s0, s1) >> 1) ^ (loBit(s1) ? K : 0U);
+    for(pM=state, j=M; --j; s0=s1, s1=*p2++)  // M-1 iterations: pM restarts at the front of state, which now holds freshly written words
+        *p0++ = *pM++ ^ (mixBits(s0, s1) >> 1) ^ (loBit(s1) ? K : 0U);
+    s1=state[0], *p0 = *pM ^ (mixBits(s0, s1) >> 1) ^ (loBit(s1) ? K : 0U);  // final word state[N-1] is mixed with the already-rewritten state[0]
+    s1 ^= (s1 >> 11);  // from here on: the same shift/mask tempering steps randomMT() applies, here to the new state[0]
+    s1 ^= (s1 <<  7) & 0x9D2C5680U;
+    s1 ^= (s1 << 15) & 0xEFC60000U;
+    return(s1 ^ (s1 >> 18));
+ }
+uint32 randomMT(void)
+ {
+    // Returns the next tempered value.  When no buffered state words remain
+    // (or none were ever generated) the call is delegated to reloadMT(),
+    // which rebuilds the state and returns a value itself.
+    uint32 word;
+    left -= 1;
+    if(left < 0)
+        return reloadMT();
+    word = *next;
+    next += 1;
+    // tempering: same shifts and masks as at the end of reloadMT()
+    word ^= word >> 11;
+    word ^= (word <<  7) & 0x9D2C5680U;
+    word ^= (word << 15) & 0xEFC60000U;
+    return word ^ (word >> 18);
+ }

data/ext/lda-ruby/cokus.h ADDED Viewed

@@ -0,0 +1,27 @@
+#ifndef COKUS_H
+#define COKUS_H
+#include <stdio.h>
+#include <stdlib.h>
+//
+// uint32 must be an unsigned integer type capable of holding at least 32
+// bits; exactly 32 should be fastest, but 64 is better on an Alpha with
+// GCC at -O3 optimization so try your options and see what's best for you
+//
+typedef unsigned long uint32;
+#define N              (624)                 // length of state vector
+#define M              (397)                 // a period parameter
+#define K              (0x9908B0DFU)         // a magic constant
+#define hiBit(u)       ((u) & 0x80000000U)   // mask all but highest   bit of u
+#define loBit(u)       ((u) & 0x00000001U)   // mask all but lowest    bit of u
+#define loBits(u)      ((u) & 0x7FFFFFFFU)   // mask     the highest   bit of u
+#define mixBits(u, v)  (hiBit(u)|loBits(v))  // move hi bit of u to hi bit of v
+void seedMT(uint32 seed);   // fill the state vector from `seed`; the implementation forces the seed odd
+uint32 reloadMT(void);      // regenerate the state vector and return one value; seeds with 4357 if seedMT() was never called
+uint32 randomMT(void);      // return the next pseudorandom value, calling reloadMT() when the buffered words run out
+#endif  // COKUS_H

data/ext/lda-ruby/extconf.rb ADDED Viewed

@@ -0,0 +1,13 @@
+# frozen_string_literal: true
+# mkmf build script: generates the Makefile for the "lda-ruby/lda" extension.
+require "mkmf"
+target = "lda-ruby/lda"
+dir_config(target)
+# Adds the USE_RUBY preprocessor define to the compile flags.
+$defs << "-DUSE_RUBY"
+# Each warning flag is probed and appended individually, in this order.
+%w[-Wall -Wextra -Wno-unused-parameter].each { |flag| append_cflags(flag) }
+create_makefile(target)

data/ext/lda-ruby/lda-alpha.c ADDED Viewed

@@ -0,0 +1,96 @@
+// (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu)
+// This file is part of LDA-C.
+// LDA-C is free software; you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 2 of the License, or (at your
+// option) any later version.
+// LDA-C is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+// for more details.
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+#include "lda-alpha.h"
+/*
+ * objective function and its derivatives
+ *
+ */
+double alhood(double a, double ss, int D, int K)
+{
+    // objective: D * (lgamma(K*a) - K*lgamma(a)) + (a - 1) * ss
+    double log_gamma_gap = lgamma(K * a) - K * lgamma(a);
+    return D * log_gamma_gap + (a - 1) * ss;
+}
+double d_alhood(double a, double ss, int D, int K)
+{
+    // first derivative of the objective with respect to a
+    double digamma_gap = K * digamma(K * a) - K * digamma(a);
+    return D * digamma_gap + ss;
+}
+double d2_alhood(double a, int D, int K)
+{
+    // second derivative of the objective with respect to a (ss drops out)
+    double trigamma_gap = K * K * trigamma(K * a) - K * trigamma(a);
+    return D * trigamma_gap;
+}
+/*
+ * newtons method
+ *
+ */
+double opt_alpha(double ss, int D, int K)
+{
+    // Newton iteration carried out on log(alpha), starting from alpha = 100.
+    // Prints one progress line per step (and a warning whenever alpha turns
+    // into NaN) and returns exp() of the final log(alpha).
+    double start = 100;
+    double log_alpha = log(start);
+    double alpha, objective, grad, curvature;
+    int step = 0;
+    for (;;)
+    {
+        step++;
+        alpha = exp(log_alpha);
+        if (isnan(alpha))
+        {
+            // restart from a starting point ten times larger than the last one
+            start = start * 10;
+            printf("warning : alpha is nan; new init = %5.5f\n", start);
+            alpha = start;
+            log_alpha = log(alpha);
+        }
+        objective = alhood(alpha, ss, D, K);
+        grad = d_alhood(alpha, ss, D, K);
+        curvature = d2_alhood(alpha, D, K);
+        log_alpha = log_alpha - grad/(curvature * alpha + grad);
+        printf("alpha maximization : %5.5f   %5.5f\n", objective, grad);
+        // Stop when the gradient is no longer above the threshold (this also
+        // covers a NaN gradient, hence the negated '>') or the step budget is spent.
+        if (!(fabs(grad) > NEWTON_THRESH) || step >= MAX_ALPHA_ITER)
+            break;
+    }
+    return exp(log_alpha);
+}
+/*
+ * Silent variant of opt_alpha(): the same Newton iteration on log(alpha),
+ * starting from alpha = 100, but without any progress output.
+ *
+ * ss : statistic passed through to d_alhood()
+ * D, K : integer factors passed through to d_alhood() / d2_alhood()
+ *
+ * Returns exp() of the final log(alpha).  The objective value itself is not
+ * evaluated here because nothing reads it when no progress line is printed.
+ */
+double quiet_opt_alpha(double ss, int D, int K)
+{
+    double a, log_a, init_a = 100;
+    double df, d2f;
+    int iter = 0;
+    log_a = log(init_a);
+    do
+    {
+        iter++;
+        a = exp(log_a);
+        if (isnan(a))
+        {
+            /* alpha became NaN: restart from a 10x larger initial value */
+            init_a = init_a * 10;
+            a = init_a;
+            log_a = log(a);
+        }
+        df = d_alhood(a, ss, D, K);
+        d2f = d2_alhood(a, D, K);
+        /* Newton step expressed in log(alpha) */
+        log_a = log_a - df/(d2f * a + df);
+    }
+    /* a NaN df compares false and therefore also ends the loop */
+    while ((fabs(df) > NEWTON_THRESH) && (iter < MAX_ALPHA_ITER));
+    return(exp(log_a));
+}

data/ext/lda-ruby/lda-alpha.h ADDED Viewed

@@ -0,0 +1,21 @@
+#ifndef LDA_ALPHA_H
+#define LDA_ALPHA_H
+#include <stdlib.h>
+#include <math.h>
+#include <float.h>
+#include "lda.h"
+#include "utils.h"
+#define NEWTON_THRESH 1e-5   // Newton iteration stops once |first derivative| is no longer above this
+#define MAX_ALPHA_ITER 1000  // upper bound on the number of Newton iterations
+double alhood(double a, double ss, int D, int K);    // objective function evaluated at a
+double d_alhood(double a, double ss, int D, int K);  // first derivative of the objective with respect to a
+double d2_alhood(double a, int D, int K);            // second derivative of the objective with respect to a
+double opt_alpha(double ss, int D, int K);           // Newton's method on log(a); prints progress to stdout
+double quiet_opt_alpha(double ss, int D, int K);     // same iteration as opt_alpha() without the printing
+//void maximize_alpha(double** gamma, lda_model* model, int num_docs);
+#endif  // LDA_ALPHA_H

data/ext/lda-ruby/lda-data.c ADDED Viewed

@@ -0,0 +1,67 @@
+// (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu)
+// This file is part of LDA-C.
+// LDA-C is free software; you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 2 of the License, or (at your
+// option) any later version.
+// LDA-C is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+// for more details.
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+#include "lda-data.h"
+// Releases a corpus whose first `num_docs` documents have had their
+// words/counts pointers assigned (possibly to NULL).
+static void free_partial_corpus(corpus* c, int num_docs)
+{
+	int d;
+	for (d = 0; d < num_docs; d++)
+	{
+		free(c->docs[d].words);
+		free(c->docs[d].counts);
+	}
+	free(c->docs);
+	free(c);
+}
+// Reads a corpus from a text file in which every document is written as
+//
+//     <length> <word>:<count> <word>:<count> ...
+//
+// with <length> giving the number of word:count pairs that follow.
+//
+// data_filename : path of the file to read.
+//
+// Returns a newly allocated corpus owned by the caller.  num_docs is the
+// number of documents read and num_terms is the largest word id + 1.  An
+// empty file yields a corpus with no documents.
+//
+// Returns NULL, after printing a message to stderr and releasing everything
+// allocated so far, when the file cannot be opened, memory runs out, or the
+// contents are malformed (non-numeric token, negative length or word id,
+// file ending in the middle of a document).
+corpus* read_data(char* data_filename)
+{
+	FILE *fileptr;
+	int length, count, word, n, nd, nw, matched;
+	corpus* c;
+	document* doc;
+	document* grown;
+	printf("reading data from %s\n", data_filename);
+	c = malloc(sizeof(corpus));
+	if (c == NULL)
+	{
+		fprintf(stderr, "error : out of memory while reading %s\n", data_filename);
+		return(NULL);
+	}
+	c->docs = 0;
+	c->num_terms = 0;
+	c->num_docs = 0;
+	fileptr = fopen(data_filename, "r");
+	if (fileptr == NULL)
+	{
+		fprintf(stderr, "error : cannot open %s\n", data_filename);
+		free(c);
+		return(NULL);
+	}
+	nd = 0; nw = 0;
+	while ((matched = fscanf(fileptr, "%10d", &length)) != EOF)
+	{
+		// matched == 0 means a non-numeric token: fscanf would never get
+		// past it, so looping on "!= EOF" alone would spin forever.
+		if (matched != 1 || length < 0)
+		{
+			fprintf(stderr, "error : bad document length in %s (document %d)\n", data_filename, nd);
+			goto fail;
+		}
+		// grow through a temporary so the old array is still reachable
+		// (and can be freed) if realloc fails
+		grown = realloc(c->docs, sizeof(document)*(nd+1));
+		if (grown == NULL)
+		{
+			fprintf(stderr, "error : out of memory while reading %s\n", data_filename);
+			goto fail;
+		}
+		c->docs = grown;
+		doc = &c->docs[nd];
+		doc->length = length;
+		doc->total = 0;
+		doc->words = malloc(sizeof(*doc->words)*length);
+		doc->counts = malloc(sizeof(*doc->counts)*length);
+		// count the document now so the cleanup path frees its arrays too
+		nd++;
+		// malloc(0) may legitimately return NULL, so only a non-empty
+		// document treats NULL as an allocation failure
+		if (length > 0 && (doc->words == NULL || doc->counts == NULL))
+		{
+			fprintf(stderr, "error : out of memory while reading %s\n", data_filename);
+			goto fail;
+		}
+		for (n = 0; n < length; n++)
+		{
+			if (fscanf(fileptr, "%10d:%10d", &word, &count) != 2)
+			{
+				fprintf(stderr, "error : bad word:count pair in %s (document %d, entry %d)\n", data_filename, nd - 1, n);
+				goto fail;
+			}
+			word = word - OFFSET;
+			// num_terms is derived as the largest id + 1, so a negative id
+			// can never be a valid term index
+			if (word < 0)
+			{
+				fprintf(stderr, "error : negative word id in %s (document %d, entry %d)\n", data_filename, nd - 1, n);
+				goto fail;
+			}
+			doc->words[n] = word;
+			doc->counts[n] = count;
+			doc->total += count;
+			if (word >= nw) { nw = word + 1; }
+		}
+	}
+	fclose(fileptr);
+	c->num_docs = nd;
+	c->num_terms = nw;
+	printf("number of docs    : %d\n", nd);
+	printf("number of terms   : %d\n", nw);
+	return(c);
+fail:
+	fclose(fileptr);
+	free_partial_corpus(c, nd);
+	return(NULL);
+}
+int max_corpus_length(corpus* c)
+{
+	// Largest `length` over c->docs[0 .. num_docs-1]; the result never drops
+	// below 0, which is also what an empty corpus yields.
+	int longest = 0;
+	int d = 0;
+	while (d < c->num_docs)
+	{
+		if (longest < c->docs[d].length)
+		{
+			longest = c->docs[d].length;
+		}
+		d++;
+	}
+	return longest;
+}

data/ext/lda-ruby/lda-data.h ADDED Viewed

@@ -0,0 +1,14 @@
+#ifndef LDA_DATA_H
+#define LDA_DATA_H
+#include <stdio.h>
+#include <stdlib.h>
+#include "lda.h"
+// Offset subtracted from every word id read from a data file.  The macro
+// carries no trailing semicolon so it can be used inside any expression,
+// not only as the last token of a statement.
+#define OFFSET 0
+corpus* read_data(char* data_filename);  // parse a "<length> <word>:<count> ..." file into a newly allocated corpus
+int max_corpus_length(corpus* c);        // largest document length (number of word:count entries) in the corpus
+#endif  // LDA_DATA_H