RubyGems - wapiti - Versions diffs - 0.0.1 - Mend

wapiti 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (63) hide show

data/.autotest +13 -0
data/.gitignore +5 -0
data/.rspec +3 -0
data/Gemfile +6 -0
data/LICENSE +30 -0
data/README.md +153 -0
data/Rakefile +33 -0
data/ext/wapiti/bcd.c +392 -0
data/ext/wapiti/decoder.c +535 -0
data/ext/wapiti/decoder.h +46 -0
data/ext/wapiti/extconf.rb +8 -0
data/ext/wapiti/gradient.c +818 -0
data/ext/wapiti/gradient.h +81 -0
data/ext/wapiti/lbfgs.c +294 -0
data/ext/wapiti/model.c +296 -0
data/ext/wapiti/model.h +100 -0
data/ext/wapiti/native.c +1238 -0
data/ext/wapiti/native.h +15 -0
data/ext/wapiti/options.c +278 -0
data/ext/wapiti/options.h +91 -0
data/ext/wapiti/pattern.c +395 -0
data/ext/wapiti/pattern.h +56 -0
data/ext/wapiti/progress.c +167 -0
data/ext/wapiti/progress.h +43 -0
data/ext/wapiti/quark.c +272 -0
data/ext/wapiti/quark.h +46 -0
data/ext/wapiti/reader.c +553 -0
data/ext/wapiti/reader.h +73 -0
data/ext/wapiti/rprop.c +191 -0
data/ext/wapiti/sequence.h +148 -0
data/ext/wapiti/sgdl1.c +218 -0
data/ext/wapiti/thread.c +171 -0
data/ext/wapiti/thread.h +42 -0
data/ext/wapiti/tools.c +202 -0
data/ext/wapiti/tools.h +54 -0
data/ext/wapiti/trainers.h +39 -0
data/ext/wapiti/vmath.c +372 -0
data/ext/wapiti/vmath.h +51 -0
data/ext/wapiti/wapiti.c +288 -0
data/ext/wapiti/wapiti.h +45 -0
data/lib/wapiti.rb +30 -0
data/lib/wapiti/errors.rb +17 -0
data/lib/wapiti/model.rb +49 -0
data/lib/wapiti/options.rb +113 -0
data/lib/wapiti/utility.rb +15 -0
data/lib/wapiti/version.rb +3 -0
data/spec/fixtures/ch.mod +18550 -0
data/spec/fixtures/chpattern.txt +52 -0
data/spec/fixtures/chtest.txt +1973 -0
data/spec/fixtures/chtrain.txt +19995 -0
data/spec/fixtures/nppattern.txt +52 -0
data/spec/fixtures/nptest.txt +1973 -0
data/spec/fixtures/nptrain.txt +19995 -0
data/spec/fixtures/pattern.txt +14 -0
data/spec/fixtures/test.txt +60000 -0
data/spec/fixtures/train.txt +1200 -0
data/spec/spec_helper.rb +21 -0
data/spec/wapiti/model_spec.rb +173 -0
data/spec/wapiti/native_spec.rb +12 -0
data/spec/wapiti/options_spec.rb +175 -0
data/spec/wapiti/utility_spec.rb +22 -0
data/wapiti.gemspec +35 -0
metadata +178 -0

data/ext/wapiti/wapiti.c ADDED

@@ -0,0 +1,288 @@
+/*
+ *      Wapiti - A linear-chain CRF tool
+ *
+ * Copyright (c) 2009-2011  CNRS
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include "decoder.h"
+#include "model.h"
+#include "options.h"
+#include "progress.h"
+#include "quark.h"
+#include "reader.h"
+#include "sequence.h"
+#include "tools.h"
+#include "trainers.h"
+#include "wapiti.h"
+/*******************************************************************************
+ * Training
+ ******************************************************************************/
+/* trn_auto:
+ *   Two-stage trainer. A first pass of trn_sgdl1 is run with the iteration
+ *   limit forced to 3, then the user's own limit is put back and trn_lbfgs
+ *   continues from the resulting model.
+ */
+static void trn_auto(mdl_t *mdl) {
+	// Remember the limit requested by the user; it is overwritten below.
+	const int user_maxiter = mdl->opt->maxiter;
+	// Stage one: short SGD-L1 run.
+	mdl->opt->maxiter = 3;
+	trn_sgdl1(mdl);
+	// Stage two: L-BFGS with the original limit restored.
+	mdl->opt->maxiter = user_maxiter;
+	trn_lbfgs(mdl);
+}
+/* trn_lst:
+ *   Table of the available training algorithms. Each entry pairs the name
+ *   matched against opt->algo in dotrain with the function that runs the
+ *   training. The three "rprop" spellings all dispatch to trn_rprop
+ *   (presumably the trainer picks the variant from the algorithm name itself;
+ *   to be confirmed in rprop.c).
+ */
+static const struct {
+	char *name;
+	void (* train)(mdl_t *mdl);
+} trn_lst[] = {
+	{"l-bfgs", trn_lbfgs},
+	{"sgd-l1", trn_sgdl1},
+	{"bcd",    trn_bcd  },
+	{"rprop",  trn_rprop},
+	{"rprop+", trn_rprop},
+	{"rprop-", trn_rprop},
+	{"auto",   trn_auto }
+};
+// Number of entries in trn_lst.
+static const int trn_cnt = sizeof(trn_lst) / sizeof(trn_lst[0]);
+/* dotrain:
+ *   Training mode entry point. Selects the trainer named by opt->algo (or
+ *   prints the list of trainers and exits when the name is "list"), optionally
+ *   reloads a previous model and a pattern file, reads the training and
+ *   development data, synchronizes the model, runs the trainer, optionally
+ *   compacts the result and finally saves the model to opt->output or stdout.
+ *   Errors are reported through fatal/pfatal, which are expected not to
+ *   return.
+ */
+void dotrain(mdl_t *mdl) {
+	// Check if the user requested the trainer list. If this is not the
+	// case, search the trainer.
+	if (!strcmp(mdl->opt->algo, "list")) {
+		info("Available training algorithms:\n");
+		for (int i = 0; i < trn_cnt; i++)
+			info("\t%s\n", trn_lst[i].name);
+		exit(EXIT_SUCCESS);
+	}
+	int trn;
+	for (trn = 0; trn < trn_cnt; trn++)
+		if (!strcmp(mdl->opt->algo, trn_lst[trn].name))
+			break;
+	if (trn == trn_cnt)
+		fatal("unknown algorithm '%s'", mdl->opt->algo);
+	// Load a previous model to train again if specified by the user.
+	if (mdl->opt->model != NULL) {
+		info("* Load previous model\n");
+		FILE *file = fopen(mdl->opt->model, "r");
+		if (file == NULL)
+			pfatal("cannot open input model file");
+		mdl_load(mdl, file);
+		// The stream is no longer needed once the model is loaded (dodump
+		// closes its input the same way); release it instead of leaking it.
+		fclose(file);
+	}
+	// Load the pattern file. This will unlock the database if previously
+	// locked by loading a model.
+	if (mdl->opt->pattern != NULL) {
+		info("* Load patterns\n");
+		FILE *file = fopen(mdl->opt->pattern, "r");
+		if (file == NULL)
+			pfatal("cannot open pattern file");
+		rdr_loadpat(mdl->reader, file);
+		fclose(file);
+		qrk_lock(mdl->reader->obs, false);
+	}
+	// Load the training data. When this is done we lock the quarks as we
+	// don't want to put in the model information present only in the
+	// development set.
+	info("* Load training data\n");
+	FILE *file = stdin;
+	if (mdl->opt->input != NULL) {
+		file = fopen(mdl->opt->input, "r");
+		if (file == NULL)
+			pfatal("cannot open input data file");
+	}
+	mdl->train = rdr_readdat(mdl->reader, file, true);
+	// Only close what we opened ourselves; stdin is left untouched.
+	if (mdl->opt->input != NULL)
+		fclose(file);
+	qrk_lock(mdl->reader->lbl, true);
+	qrk_lock(mdl->reader->obs, true);
+	if (mdl->train == NULL || mdl->train->nseq == 0)
+		fatal("no train data loaded");
+	// If present, load the development set in the model. If not specified,
+	// the training dataset will be used instead.
+	if (mdl->opt->devel != NULL) {
+		info("* Load development data\n");
+		FILE *file = fopen(mdl->opt->devel, "r");
+		if (file == NULL)
+			pfatal("cannot open development file");
+		mdl->devel = rdr_readdat(mdl->reader, file, true);
+		fclose(file);
+	}
+	// Initialize the model. If a previous model was loaded, this will be
+	// just a resync, else the model structure will be created.
+	if (mdl->theta == NULL)
+		info("* Initialize the model\n");
+	else
+		info("* Resync the model\n");
+	mdl_sync(mdl);
+	// Display some statistics as we all love this.
+	info("* Summary\n");
+	info("    nb train:    %d\n", mdl->train->nseq);
+	if (mdl->devel != NULL)
+		info("    nb devel:    %d\n", mdl->devel->nseq);
+	info("    nb labels:   %zu\n", mdl->nlbl);
+	info("    nb blocks:   %zu\n", mdl->nobs);
+	info("    nb features: %zu\n", mdl->nftr);
+	// And train the model...
+	info("* Train the model with %s\n", mdl->opt->algo);
+	uit_setup(mdl);
+	trn_lst[trn].train(mdl);
+	uit_cleanup(mdl);
+	// If requested compact the model. The sizes are sampled before the
+	// compaction so the number of removed entries can be reported.
+	if (mdl->opt->compact) {
+		const size_t O = mdl->nobs;
+		const size_t F = mdl->nftr;
+		info("* Compacting the model\n");
+		mdl_compact(mdl);
+		info("    %8zu observations removed\n", O - mdl->nobs);
+		info("    %8zu features removed\n", F - mdl->nftr);
+	}
+	// And save the trained model, to stdout unless an output file is given.
+	info("* Save the model\n");
+	file = stdout;
+	if (mdl->opt->output != NULL) {
+		file = fopen(mdl->opt->output, "w");
+		if (file == NULL)
+			pfatal("cannot open output model");
+	}
+	mdl_save(mdl, file);
+	if (mdl->opt->output != NULL)
+		fclose(file);
+	info("* Done\n");
+}
+/*******************************************************************************
+ * Labeling
+ ******************************************************************************/
+/* dolabel:
+ *   Labeling mode entry point. Loads the model named by opt->model (which is
+ *   mandatory), then labels the sequences read from opt->input (stdin when
+ *   unset) and writes the result to opt->output (stdout when unset) through
+ *   tag_label. Errors are reported through fatal/pfatal, which are expected
+ *   not to return.
+ */
+void dolabel(mdl_t *mdl) {
+	// First, load the model provided by the user. This is mandatory to
+	// label new data ;-)
+	if (mdl->opt->model == NULL)
+		fatal("you must specify a model");
+	info("* Load model\n");
+	FILE *file = fopen(mdl->opt->model, "r");
+	if (file == NULL)
+		pfatal("cannot open input model file");
+	mdl_load(mdl, file);
+	// The stream is no longer needed once the model is loaded (dodump
+	// closes its input the same way); release it instead of leaking it.
+	fclose(file);
+	// Open input and output files
+	FILE *fin = stdin, *fout = stdout;
+	if (mdl->opt->input != NULL) {
+		fin = fopen(mdl->opt->input, "r");
+		if (fin == NULL)
+			pfatal("cannot open input data file");
+	}
+	if (mdl->opt->output != NULL) {
+		fout = fopen(mdl->opt->output, "w");
+		if (fout == NULL)
+			pfatal("cannot open output data file");
+	}
+	// Do the labelling
+	info("* Label sequences\n");
+	tag_label(mdl, fin, fout);
+	info("* Done\n");
+	// And close files, but only the ones opened here: the standard streams
+	// are left untouched.
+	if (mdl->opt->input != NULL)
+		fclose(fin);
+	if (mdl->opt->output != NULL)
+		fclose(fout);
+}
+/*******************************************************************************
+ * Dumping
+ ******************************************************************************/
+/* dodump:
+ *   Dump mode entry point. Loads a model from opt->input (stdin when unset)
+ *   and writes every non-zero weight as a tab-separated text line to
+ *   opt->output (stdout when unset). For each observation, the weights found
+ *   at uoff are printed as "obs, #, label, weight" and the ones found at boff
+ *   as "obs, previous label, label, weight"; a blank line closes the group of
+ *   any observation that produced output.
+ */
+void dodump(mdl_t *mdl) {
+	// Read the model from the requested file, or from stdin by default.
+	info("* Load model\n");
+	FILE *src = stdin;
+	if (mdl->opt->input != NULL) {
+		src = fopen(mdl->opt->input, "r");
+		if (src == NULL)
+			pfatal("cannot open input data file");
+	}
+	mdl_load(mdl, src);
+	if (mdl->opt->input != NULL)
+		fclose(src);
+	// Select the destination of the dump, stdout by default.
+	FILE *dst = stdout;
+	if (mdl->opt->output != NULL) {
+		dst = fopen(mdl->opt->output, "w");
+		if (dst == NULL)
+			pfatal("cannot open output data file");
+	}
+	// Walk over all observations and print their non-zero weights.
+	info("* Dump model\n");
+	const size_t nlbl = mdl->nlbl;
+	const size_t nobs = mdl->nobs;
+	const qrk_t *lbls = mdl->reader->lbl;
+	const qrk_t *obss = mdl->reader->obs;
+	for (size_t o = 0; o < nobs; o++) {
+		const char *name = qrk_id2str(obss, o);
+		size_t written = 0;
+		// Bit 0 of kind: one weight per label, stored at uoff
+		// (presumably the unigram features).
+		if (mdl->kind[o] & 1) {
+			const double *uni = &mdl->theta[mdl->uoff[o]];
+			for (size_t y = 0; y < nlbl; y++) {
+				if (uni[y] != 0.0) {
+					const char *ly = qrk_id2str(lbls, y);
+					fprintf(dst, "%s\t#\t%s\t%f\n", name, ly,
+					       uni[y]);
+					written++;
+				}
+			}
+		}
+		// Bit 1 of kind: one weight per pair of labels, stored at boff
+		// as a row-major table indexed by (previous label, label)
+		// (presumably the bigram features).
+		if (mdl->kind[o] & 2) {
+			const double *big = &mdl->theta[mdl->boff[o]];
+			for (size_t yp = 0; yp < nlbl; yp++) {
+				for (size_t y = 0; y < nlbl; y++) {
+					const double wgt = big[yp * nlbl + y];
+					if (wgt != 0.0) {
+						const char *ly  = qrk_id2str(lbls, y);
+						const char *lyp = qrk_id2str(lbls, yp);
+						fprintf(dst, "%s\t%s\t%s\t%f\n", name,
+						       lyp, ly, wgt);
+						written++;
+					}
+				}
+			}
+		}
+		// Separate the groups of two observations by a blank line.
+		if (written != 0)
+			fprintf(dst, "\n");
+	}
+	if (mdl->opt->output != NULL)
+		fclose(dst);
+}
+/*******************************************************************************
+ * Entry point
+ ******************************************************************************/
+/* wapiti_main:
+ *   Program entry point: parses the command line into an option set, builds a
+ *   fresh model bound to these options, runs the handler matching opt.mode
+ *   (0 trains, 1 labels, 2 dumps; any other value runs nothing), frees the
+ *   model and returns EXIT_SUCCESS.
+ */
+int wapiti_main(int argc, char *argv[argc]) {
+	// Start from the defaults and let the command line override them.
+	opt_t opt = opt_defaults;
+	opt_parse(argc, argv, &opt);
+	// Build the model and attach the options to it.
+	mdl_t *mdl = mdl_new(rdr_new(opt.maxent));
+	mdl->opt = &opt;
+	// Dispatch on the requested mode.
+	if (opt.mode == 0)
+		dotrain(mdl);
+	else if (opt.mode == 1)
+		dolabel(mdl);
+	else if (opt.mode == 2)
+		dodump(mdl);
+	// Release the model before leaving.
+	mdl_free(mdl);
+	return EXIT_SUCCESS;
+}

data/ext/wapiti/wapiti.h ADDED

@@ -0,0 +1,45 @@
+/*
+ *      Wapiti - A linear-chain CRF tool
+ *
+ * Copyright (c) 2009-2011  CNRS
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef wapiti_h
+#define wapiti_h
+#define VERSION "1.2.0"
+/* XVM_ANSI:
+ *   By uncommenting the following define, you can force wapiti to not use SSE2
+ *   even if available.
+ */
+//#define XVM_ANSI
+/* MTH_ANSI:
+ *  By uncommenting the following define, you can disable the use of POSIX
+ *  threads in the multi-threading part of Wapiti, for non-POSIX systems.
+ */
+//#define MTH_ANSI
+#endif

data/lib/wapiti.rb ADDED

@@ -0,0 +1,30 @@
+require 'logger'
+require 'tempfile'
+require 'wapiti/version'
+module Wapiti
+	Logger = ::Logger.new(STDOUT)
+	Logger.level = ::Logger::WARN
+	class << self
+		def log
+			Logger
+		end
+		def debug!
+			log.level == ::Logger::DEBUG
+		end
+	end
+end
+require 'wapiti/errors'
+require 'wapiti/native'
+require 'wapiti/options'
+require 'wapiti/model'
+require 'wapiti/utility'

data/lib/wapiti/errors.rb ADDED

@@ -0,0 +1,17 @@
+module Wapiti
+	class Error < StandardError
+		attr_accessor :original
+		def initialize(message = '', original = $!)
+			super(message)
+			@original = original
+		end
+	end
+	class NativeError < Error; end
+	class ConfigurationError < Error; end
+end

data/lib/wapiti/model.rb ADDED

@@ -0,0 +1,49 @@
+module Wapiti
+	class Model
+		class << self
+			def train(data, options, &block)
+				config = Options.new(options, &block)
+				# check configuration
+				if config.pattern.empty?
+					raise ConfigurationError, 'invalid options: no pattern specified'
+				end
+				unless config.valid?
+					raise ConfigurationError, "invalid options: #{ config.validate.join('; ') }"
+				end
+				new(config).train(data)
+			end
+			def load(filename)
+				m = new
+				m.path = filename
+				m.load
+				m
+			end
+		end
+		attr_accessor :path
+		def pattern
+			options.pattern
+		end
+		def pattern=(filename)
+			options.pattern = filename
+		end
+		private
+		def tokenize(input)
+			input
+		end
+	end
+end

data/lib/wapiti/options.rb ADDED

@@ -0,0 +1,113 @@
+module Wapiti
+	class Options
+		include Comparable
+		class << self
+			# Returns a sorted list of available option attributes.
+			def attribute_names
+				@attribute_names ||= %w{ stop_window convergence_window posterior
+					max_iterations jobsize threads rho1 rho2 stop_epsilon score check
+					algorithm pattern development_data maxent compact sparse label
+					}.sort.map(&:to_sym).freeze
+			end
+			# Returns the default options.
+			def defaults
+				@defaults ||= new.attributes
+			end
+			# Returns the list of supported algorithm options.
+			def algorithms
+				@algorithms ||= %w{ l-bfgs sgd-l1 bcd rprop rprop+ rprop- auto }.freeze
+			end
+		end
+		# Returns the value of the attribute identified by +name+ or nil
+		# if there is no such attribute.
+		def [](name)
+			has_attribute?(name) ? send(name) : nil
+		end
+		# Updates the value of the attribute identified by +name+ with the
+		# passed-in +value+.
+		def []=(name, value)
+			raise ArgumentError, "bad attribute name: #{name}" unless has_attribute?(name)
+			send("#{name}=", value)
+		end
+		# Updates all the attributes from the passed-in hash.
+		def update(attributes = {})
+			attributes.each_pair do |k,v|
+				mid = "#{k}="
+				send(mid, v) if respond_to?(mid)
+			end
+			self
+		end
+		alias update_attributes update
+		def lbfgs
+			{ :clip => clip, :histsz => histsz, :maxls => maxls }
+		end
+		def sgdl1
+			{ :eta0 => eta0, :alpha => alpha }
+		end
+		def bcd
+			{ :kappa => kappa }
+		end
+		def rprop
+			{
+				:stpmin => stpmin, :stpmax => stpmax, :stpinc => stpinc,
+				:stpdec => stpdec, :cutoff => cutoff
+			}
+		end
+		# Returns a hash of all the attributes with their names and values.
+		def attributes
+			Hash[*Options.attribute_names.map { |a| [a, send(a)] }.flatten]
+		end
+		alias to_hash attributes
+		def has_attribute?(attribute)
+			Options.attribute_names.include?(attribute)
+		end
+		def valid_algorithm?
+			self.class.algorithms.include?(algorithm)
+		end
+		def valid?
+			validate.empty?
+		end
+		def validate
+			e = []
+			%w{ threads jobsize alpha histsz maxls eta0 alpha nbest }.each do |name|
+				e << "invalid value for #{name}: #{send(name)}" unless send(name) > 0
+			end
+			%w{ rho1 rho2 }.each do |name|
+				e << "invalid value for #{name}: #{send(name)}" unless send(name) >= 0.0
+			end
+			e << "unknown algorithm: #{algorithm}" unless valid_algorithm?
+			e << "BCD not supported for training maxent models" if maxent && algorithm == 'bcd'
+			e
+		end
+		def <=>(other)
+			other.respond_to?(:attributes) ? attributes <=> other.attributes : nil
+		end
+	end
+end