wapiti 0.0.5 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. checksums.yaml +7 -0
  2. data/.simplecov +3 -0
  3. data/Gemfile +25 -2
  4. data/HISTORY.md +5 -1
  5. data/LICENSE +14 -13
  6. data/README.md +9 -16
  7. data/Rakefile +38 -8
  8. data/ext/wapiti/bcd.c +126 -124
  9. data/ext/wapiti/decoder.c +203 -124
  10. data/ext/wapiti/decoder.h +6 -4
  11. data/ext/wapiti/extconf.rb +2 -2
  12. data/ext/wapiti/gradient.c +491 -320
  13. data/ext/wapiti/gradient.h +52 -34
  14. data/ext/wapiti/lbfgs.c +74 -33
  15. data/ext/wapiti/model.c +47 -37
  16. data/ext/wapiti/model.h +22 -20
  17. data/ext/wapiti/native.c +850 -839
  18. data/ext/wapiti/native.h +1 -1
  19. data/ext/wapiti/options.c +52 -20
  20. data/ext/wapiti/options.h +37 -30
  21. data/ext/wapiti/pattern.c +35 -33
  22. data/ext/wapiti/pattern.h +12 -11
  23. data/ext/wapiti/progress.c +14 -13
  24. data/ext/wapiti/progress.h +3 -2
  25. data/ext/wapiti/quark.c +14 -16
  26. data/ext/wapiti/quark.h +6 -5
  27. data/ext/wapiti/reader.c +83 -69
  28. data/ext/wapiti/reader.h +11 -9
  29. data/ext/wapiti/rprop.c +84 -43
  30. data/ext/wapiti/sequence.h +18 -16
  31. data/ext/wapiti/sgdl1.c +45 -43
  32. data/ext/wapiti/thread.c +19 -17
  33. data/ext/wapiti/thread.h +5 -4
  34. data/ext/wapiti/tools.c +7 -7
  35. data/ext/wapiti/tools.h +3 -4
  36. data/ext/wapiti/trainers.h +1 -1
  37. data/ext/wapiti/vmath.c +40 -38
  38. data/ext/wapiti/vmath.h +12 -11
  39. data/ext/wapiti/wapiti.c +159 -37
  40. data/ext/wapiti/wapiti.h +18 -4
  41. data/lib/wapiti.rb +15 -15
  42. data/lib/wapiti/errors.rb +15 -15
  43. data/lib/wapiti/model.rb +92 -84
  44. data/lib/wapiti/options.rb +123 -124
  45. data/lib/wapiti/utility.rb +14 -14
  46. data/lib/wapiti/version.rb +2 -2
  47. data/spec/spec_helper.rb +29 -9
  48. data/spec/wapiti/model_spec.rb +230 -194
  49. data/spec/wapiti/native_spec.rb +7 -8
  50. data/spec/wapiti/options_spec.rb +184 -174
  51. data/wapiti.gemspec +22 -8
  52. metadata +38 -42
  53. data/.gitignore +0 -5
@@ -1,7 +1,7 @@
1
1
  /*
2
2
  * Wapiti - A linear-chain CRF tool
3
3
  *
4
- * Copyright (c) 2009-2011 CNRS
4
+ * Copyright (c) 2009-2013 CNRS
5
5
  * All rights reserved.
6
6
  *
7
7
  * Redistribution and use in source and binary forms, with or without
@@ -24,19 +24,22 @@
24
24
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25
25
  * POSSIBILITY OF SUCH DAMAGE.
26
26
  */
27
+ #include <ctype.h>
28
+ #include <inttypes.h>
27
29
  #include <stdbool.h>
28
30
  #include <stddef.h>
31
+ #include <stdint.h>
29
32
  #include <stdlib.h>
30
33
  #include <stdio.h>
31
34
  #include <string.h>
32
35
 
33
36
  #include "decoder.h"
34
- #include "model.h"
35
37
  #include "options.h"
36
38
  #include "progress.h"
37
39
  #include "quark.h"
38
40
  #include "reader.h"
39
41
  #include "sequence.h"
42
+ #include "model.h"
40
43
  #include "tools.h"
41
44
  #include "trainers.h"
42
45
  #include "wapiti.h"
@@ -44,16 +47,15 @@
44
47
  /*******************************************************************************
45
48
  * Training
46
49
  ******************************************************************************/
47
- static void trn_auto(mdl_t *mdl) {
48
- const int maxiter = mdl->opt->maxiter;
49
- mdl->opt->maxiter = 3;
50
- trn_sgdl1(mdl);
51
- mdl->opt->maxiter = maxiter;
52
- trn_lbfgs(mdl);
53
- }
50
+ static const char *typ_lst[] = {
51
+ "maxent",
52
+ "memm",
53
+ "crf"
54
+ };
55
+ static const uint32_t typ_cnt = sizeof(typ_lst) / sizeof(typ_lst[0]);
54
56
 
55
57
  static const struct {
56
- char *name;
58
+ const char *name;
57
59
  void (* train)(mdl_t *mdl);
58
60
  } trn_lst[] = {
59
61
  {"l-bfgs", trn_lbfgs},
@@ -62,20 +64,31 @@ static const struct {
62
64
  {"rprop", trn_rprop},
63
65
  {"rprop+", trn_rprop},
64
66
  {"rprop-", trn_rprop},
65
- {"auto", trn_auto }
66
67
  };
67
- static const int trn_cnt = sizeof(trn_lst) / sizeof(trn_lst[0]);
68
+ static const uint32_t trn_cnt = sizeof(trn_lst) / sizeof(trn_lst[0]);
68
69
 
69
70
  void dotrain(mdl_t *mdl) {
70
- // Check if the user requested the trainer list. If this is not the
71
- // case, search the trainer.
71
+ // Check if the user requested the type or trainer list. If this is not
72
+ // the case, search them in the lists.
73
+ if (!strcmp(mdl->opt->type, "list")) {
74
+ info("Available types of models:\n");
75
+ for (uint32_t i = 0; i < typ_cnt; i++)
76
+ info("\t%s\n", typ_lst[i]);
77
+ exit(EXIT_SUCCESS);
78
+ }
72
79
  if (!strcmp(mdl->opt->algo, "list")) {
73
80
  info("Available training algorithms:\n");
74
- for (int i = 0; i < trn_cnt; i++)
81
+ for (uint32_t i = 0; i < trn_cnt; i++)
75
82
  info("\t%s\n", trn_lst[i].name);
76
83
  exit(EXIT_SUCCESS);
77
84
  }
78
- int trn;
85
+ uint32_t typ, trn;
86
+ for (typ = 0; typ < typ_cnt; typ++)
87
+ if (!strcmp(mdl->opt->type, typ_lst[typ]))
88
+ break;
89
+ if (typ == typ_cnt)
90
+ fatal("unknown model type '%s'", mdl->opt->type);
91
+ mdl->type = typ;
79
92
  for (trn = 0; trn < trn_cnt; trn++)
80
93
  if (!strcmp(mdl->opt->algo, trn_lst[trn].name))
81
94
  break;
@@ -136,12 +149,12 @@ void dotrain(mdl_t *mdl) {
136
149
  mdl_sync(mdl);
137
150
  // Display some statistics as we all love this.
138
151
  info("* Summary\n");
139
- info(" nb train: %d\n", mdl->train->nseq);
152
+ info(" nb train: %"PRIu32"\n", mdl->train->nseq);
140
153
  if (mdl->devel != NULL)
141
- info(" nb devel: %d\n", mdl->devel->nseq);
142
- info(" nb labels: %zu\n", mdl->nlbl);
143
- info(" nb blocks: %zu\n", mdl->nobs);
144
- info(" nb features: %zu\n", mdl->nftr);
154
+ info(" nb devel: %"PRIu32"\n", mdl->devel->nseq);
155
+ info(" nb labels: %"PRIu32"\n", mdl->nlbl);
156
+ info(" nb blocks: %"PRIu64"\n", mdl->nobs);
157
+ info(" nb features: %"PRIu64"\n", mdl->nftr);
145
158
  // And train the model...
146
159
  info("* Train the model with %s\n", mdl->opt->algo);
147
160
  uit_setup(mdl);
@@ -149,12 +162,12 @@ void dotrain(mdl_t *mdl) {
149
162
  uit_cleanup(mdl);
150
163
  // If requested compact the model.
151
164
  if (mdl->opt->compact) {
152
- const size_t O = mdl->nobs;
153
- const size_t F = mdl->nftr;
165
+ const uint64_t O = mdl->nobs;
166
+ const uint64_t F = mdl->nftr;
154
167
  info("* Compacting the model\n");
155
168
  mdl_compact(mdl);
156
- info(" %8zu observations removed\n", O - mdl->nobs);
157
- info(" %8zu features removed\n", F - mdl->nftr);
169
+ info(" %8"PRIu64" observations removed\n", O - mdl->nobs);
170
+ info(" %8"PRIu64" features removed\n", F - mdl->nftr);
158
171
  }
159
172
  // And save the trained model
160
173
  info("* Save the model\n");
@@ -209,7 +222,7 @@ void dolabel(mdl_t *mdl) {
209
222
  /*******************************************************************************
210
223
  * Dumping
211
224
  ******************************************************************************/
212
- void dodump(mdl_t *mdl) {
225
+ static void dodump(mdl_t *mdl) {
213
226
  // Load input model file
214
227
  info("* Load model\n");
215
228
  FILE *fin = stdin;
@@ -230,32 +243,35 @@ void dodump(mdl_t *mdl) {
230
243
  }
231
244
  // Dump model
232
245
  info("* Dump model\n");
233
- const size_t Y = mdl->nlbl;
234
- const size_t O = mdl->nobs;
246
+ const uint32_t Y = mdl->nlbl;
247
+ const uint64_t O = mdl->nobs;
235
248
  const qrk_t *Qlbl = mdl->reader->lbl;
236
249
  const qrk_t *Qobs = mdl->reader->obs;
237
- for (size_t o = 0; o < O; o++) {
250
+ char fmt[16];
251
+ sprintf(fmt, "%%.%df\n", mdl->opt->prec);
252
+ for (uint64_t o = 0; o < O; o++) {
238
253
  const char *obs = qrk_id2str(Qobs, o);
239
254
  bool empty = true;
240
255
  if (mdl->kind[o] & 1) {
241
256
  const double *w = mdl->theta + mdl->uoff[o];
242
- for (size_t y = 0; y < Y; y++) {
243
- if (w[y] == 0.0)
257
+ for (uint32_t y = 0; y < Y; y++) {
258
+ if (!mdl->opt->all && w[y] == 0.0)
244
259
  continue;
245
260
  const char *ly = qrk_id2str(Qlbl, y);
246
- fprintf(fout, "%s\t#\t%s\t%f\n", obs, ly, w[y]);
261
+ fprintf(fout, "%s\t#\t%s\t", obs, ly);
262
+ fprintf(fout, fmt, w[y]);
247
263
  empty = false;
248
264
  }
249
265
  }
250
266
  if (mdl->kind[o] & 2) {
251
267
  const double *w = mdl->theta + mdl->boff[o];
252
- for (size_t d = 0; d < Y * Y; d++) {
253
- if (w[d] == 0.0)
268
+ for (uint32_t d = 0; d < Y * Y; d++) {
269
+ if (!mdl->opt->all && w[d] == 0.0)
254
270
  continue;
255
271
  const char *ly = qrk_id2str(Qlbl, d % Y);
256
272
  const char *lyp = qrk_id2str(Qlbl, d / Y);
257
- fprintf(fout, "%s\t%s\t%s\t%f\n", obs, lyp, ly,
258
- w[d]);
273
+ fprintf(fout, "%s\t%s\t%s\t", obs, lyp, ly);
274
+ fprintf(fout, fmt, w[d]);
259
275
  empty = false;
260
276
  }
261
277
  }
@@ -266,6 +282,110 @@ void dodump(mdl_t *mdl) {
266
282
  fclose(fout);
267
283
  }
268
284
 
285
+
286
+ /*******************************************************************************
287
+ * Updating
288
+ ******************************************************************************/
289
+ void doupdt(mdl_t *mdl) {
290
+ // Load input model file
291
+ info("* Load model\n");
292
+ if (mdl->opt->model == NULL)
293
+ fatal("no model file provided");
294
+ FILE *Min = fopen(mdl->opt->model, "r");
295
+ if (Min == NULL)
296
+ pfatal("cannot open model file %s", mdl->opt->model);
297
+ mdl_load(mdl, Min);
298
+ fclose(Min);
299
+ // Open patch file
300
+ info("* Update model\n");
301
+ FILE *fin = stdin;
302
+ if (mdl->opt->input != NULL) {
303
+ fin = fopen(mdl->opt->input, "r");
304
+ if (fin == NULL)
305
+ pfatal("cannot open update file");
306
+ }
307
+ int nline = 0;
308
+ while (!feof(fin)) {
309
+ char *raw = rdr_readline(fin);
310
+ if (raw == NULL)
311
+ break;
312
+ char *line = raw;
313
+ nline++;
314
+ // First we split the line in space separated tokens. We expect
315
+ // four of them and skip empty lines.
316
+ char *toks[4];
317
+ int ntoks = 0;
318
+ while (ntoks < 4) {
319
+ while (isspace(*line))
320
+ line++;
321
+ if (*line == '\0')
322
+ break;
323
+ toks[ntoks++] = line;
324
+ while (*line != '\0' && !isspace(*line))
325
+ line++;
326
+ if (*line == '\0')
327
+ break;
328
+ *line++ = '\0';
329
+ }
330
+ if (ntoks == 0) {
331
+ free(raw);
332
+ continue;
333
+ } else if (ntoks != 4) {
334
+ fatal("invalid line at %d", nline);
335
+ }
336
+ // Parse the tokens, the first three should be string maping to
337
+ // observations and labels and the last should be the weight.
338
+ uint64_t obs = none, yp = none, y = none;
339
+ obs = qrk_str2id(mdl->reader->obs, toks[0]);
340
+ if (obs == none)
341
+ fatal("bad on observation on line %d", nline);
342
+ if (strcmp(toks[1], "#")) {
343
+ yp = qrk_str2id(mdl->reader->lbl, toks[1]);
344
+ if (yp == none)
345
+ fatal("bad label <%s> line %d", toks[1], nline);
346
+ }
347
+ y = qrk_str2id(mdl->reader->lbl, toks[2]);
348
+ if (y == none)
349
+ fatal("bad label <%s> line %d", toks[2], nline);
350
+ double wgh = 0.0;
351
+ if (sscanf(toks[3], "%lf", &wgh) != 1)
352
+ fatal("bad weight on line %d", nline);
353
+
354
+ const uint32_t Y = mdl->nlbl;
355
+ if (yp == none) {
356
+ double *w = mdl->theta + mdl->uoff[obs];
357
+ w[y] = wgh;
358
+ } else {
359
+ double *w = mdl->theta + mdl->boff[obs];
360
+ w[yp * Y + y] = wgh;
361
+ }
362
+ free(raw);
363
+ }
364
+ if (mdl->opt->input != NULL)
365
+ fclose(fin);
366
+ // If requested compact the model.
367
+ if (mdl->opt->compact) {
368
+ const uint64_t O = mdl->nobs;
369
+ const uint64_t F = mdl->nftr;
370
+ info("* Compacting the model\n");
371
+ mdl_compact(mdl);
372
+ info(" %8"PRIu64" observations removed\n", O - mdl->nobs);
373
+ info(" %8"PRIu64" features removed\n", F - mdl->nftr);
374
+ }
375
+ // And save the updated model
376
+ info("* Save the model\n");
377
+ FILE *file = stdout;
378
+ if (mdl->opt->output != NULL) {
379
+ file = fopen(mdl->opt->output, "w");
380
+ if (file == NULL)
381
+ pfatal("cannot open output model");
382
+ }
383
+ mdl_save(mdl, file);
384
+ if (mdl->opt->output != NULL)
385
+ fclose(file);
386
+ info("* Done\n");
387
+ }
388
+
269
389
  /*******************************************************************************
270
390
  * Entry point
271
391
  ******************************************************************************/
@@ -280,9 +400,11 @@ int wapiti_main(int argc, char *argv[argc]) {
280
400
  switch (opt.mode) {
281
401
  case 0: dotrain(mdl); break;
282
402
  case 1: dolabel(mdl); break;
283
- case 2: dodump(mdl); break;
403
+ case 2: dodump(mdl); break;
404
+ case 3: doupdt(mdl); break;
284
405
  }
285
406
  // And cleanup
286
407
  mdl_free(mdl);
287
408
  return EXIT_SUCCESS;
288
409
  }
410
+
@@ -1,7 +1,7 @@
1
1
  /*
2
2
  * Wapiti - A linear-chain CRF tool
3
3
  *
4
- * Copyright (c) 2009-2011 CNRS
4
+ * Copyright (c) 2009-2013 CNRS
5
5
  * All rights reserved.
6
6
  *
7
7
  * Redistribution and use in source and binary forms, with or without
@@ -27,7 +27,7 @@
27
27
  #ifndef wapiti_h
28
28
  #define wapiti_h
29
29
 
30
- #define VERSION "1.2.0"
30
+ #define VERSION "1.5.0"
31
31
 
32
32
  /* XVM_ANSI:
33
33
  * By uncomenting the following define, you can force wapiti to not use SSE2
@@ -36,10 +36,24 @@
36
36
  //#define XVM_ANSI
37
37
 
38
38
  /* MTH_ANSI:
39
- * By uncomenting the following define, you can disable the use of POSIX
40
- * threads in the multi-threading part of Wapiti, for non-POSIX systems.
39
+ * By uncomenting the following define, you can disable the use of POSIX
40
+ * threads in the multi-threading part of Wapiti, for non-POSIX systems.
41
41
  */
42
42
  //#define MTH_ANSI
43
43
 
44
+ /* ATM_ANSI:
45
+ * By uncomenting the following define, you can disable the use of atomic
46
+ * operation to update the gradient. This imply that multi-threaded gradient
47
+ * computation will require more memory but is more portable.
48
+ */
49
+ //#define ATM_ANSI
50
+
51
+ /* Without multi-threading we disable atomic updates as they are not needed and
52
+ * can only decrease performances in this case.
53
+ */
54
+ #ifdef MTH_ANSI
55
+ #define ATM_ANSI
56
+ #endif
57
+
44
58
  #endif
45
59
 
@@ -5,20 +5,20 @@ require 'tempfile'
5
5
  require 'wapiti/version'
6
6
 
7
7
  module Wapiti
8
-
9
- Logger = ::Logger.new(STDOUT)
10
- Logger.level = ::Logger::WARN
11
-
12
- class << self
13
- def log
14
- Logger
15
- end
16
-
17
- def debug!
18
- log.level == ::Logger::DEBUG
19
- end
20
- end
21
-
8
+
9
+ Logger = ::Logger.new(STDOUT)
10
+ Logger.level = ::Logger::WARN
11
+
12
+ class << self
13
+ def log
14
+ Logger
15
+ end
16
+
17
+ def debug!
18
+ log.level == ::Logger::DEBUG
19
+ end
20
+ end
21
+
22
22
  end
23
23
 
24
24
  require 'wapiti/errors'
@@ -27,4 +27,4 @@ require 'wapiti/native'
27
27
  require 'wapiti/options'
28
28
  require 'wapiti/model'
29
29
 
30
- require 'wapiti/utility'
30
+ require 'wapiti/utility'
@@ -1,17 +1,17 @@
1
1
  module Wapiti
2
-
3
- class Error < StandardError
4
-
5
- attr_accessor :original
6
-
7
- def initialize(message = '', original = $!)
8
- super(message)
9
- @original = original
10
- end
11
-
12
- end
13
2
 
14
- class NativeError < Error; end
15
- class ConfigurationError < Error; end
16
-
17
- end
3
+ class Error < StandardError
4
+
5
+ attr_accessor :original
6
+
7
+ def initialize(message = '', original = $!)
8
+ super(message)
9
+ @original = original
10
+ end
11
+
12
+ end
13
+
14
+ class NativeError < Error; end
15
+ class ConfigurationError < Error; end
16
+
17
+ end
@@ -1,85 +1,93 @@
1
1
  module Wapiti
2
-
3
- class Model
4
-
5
- class << self
6
-
7
- def train(data, options, &block)
8
- config = Options.new(options, &block)
9
-
10
- # check configuration
11
- # if config.pattern.empty?
12
- # raise ConfigurationError, 'invalid options: no pattern specified'
13
- # end
14
-
15
- unless config.valid?
16
- raise ConfigurationError, "invalid options: #{ config.validate.join('; ') }"
17
- end
18
-
19
- new(config).train(data)
20
- end
21
-
22
- def load(filename)
23
- m = new
24
- m.path = filename
25
- m.load
26
- m
27
- end
28
-
29
- end
30
-
31
- attr_accessor :path
32
-
33
- attr_reader :token_count, :token_errors, :sequence_count, :sequence_errors
34
-
35
- def pattern
36
- options.pattern
37
- end
38
-
39
- def pattern=(filename)
40
- options.pattern = filename
41
- end
42
-
43
- alias native_label label
44
-
45
- def label(input, opts = nil)
46
- options.update(opts) unless opts.nil?
47
- block_given? ? native_label(input, &Proc.new) : native_label(input)
48
- end
49
-
50
- alias native_train train
51
-
52
- def train(input, opts = nil)
53
- options.update(opts) unless opts.nil?
54
- block_given? ? native_train(input, &Proc.new) : native_train(input)
55
- end
56
-
57
-
58
- def statistics
59
- s = {}
60
- s[:tokens] = {
61
- :total => token_count, :errors => @token_errors,
62
- :rate => token_errors.to_f / token_count.to_f * 100.0
63
- }
64
- s[:sequences] = {
65
- :total => sequence_count, :errors => sequence_errors,
66
- :rate => sequence_errors.to_f / sequence_count.to_f * 100.0
67
- }
68
- s
69
- end
70
-
71
- alias stats statistics
72
-
73
- def clear_counters
74
- @token_count = @token_errors = @sequence_count = @sequence_errors = 0
75
- end
76
-
77
- alias clear clear_counters
78
-
79
- # alias native_save save
80
-
81
- private :native_label, :native_train
82
-
83
- end
84
-
85
- end
2
+
3
+ class Model
4
+
5
+ class << self
6
+
7
+ def train(data, options, &block)
8
+ config = Options.new(options, &block)
9
+
10
+ # check configuration
11
+ # if config.pattern.empty?
12
+ # raise ConfigurationError, 'invalid options: no pattern specified'
13
+ # end
14
+
15
+ unless config.valid?
16
+ raise ConfigurationError, "invalid options: #{ config.validate.join('; ') }"
17
+ end
18
+
19
+ new(config).train(data)
20
+ end
21
+
22
+ def load(filename)
23
+ m = new
24
+ m.path = filename
25
+ m.load
26
+ m
27
+ end
28
+
29
+ end
30
+
31
+ attr_accessor :path
32
+
33
+ attr_reader :token_count, :token_errors, :sequence_count, :sequence_errors
34
+
35
+ def pattern
36
+ options.pattern
37
+ end
38
+
39
+ def pattern=(filename)
40
+ options.pattern = filename
41
+ end
42
+
43
+ alias native_label label
44
+
45
+ def label(input, opts = nil)
46
+ options.update(opts) unless opts.nil?
47
+ block_given? ? native_label(input, &Proc.new) : native_label(input)
48
+ end
49
+
50
+ alias native_train train
51
+
52
+ def train(input, opts = nil)
53
+ options.update(opts) unless opts.nil?
54
+ block_given? ? native_train(input, &Proc.new) : native_train(input)
55
+ end
56
+
57
+
58
+ def statistics
59
+ s = {}
60
+ s[:tokens] = {
61
+ :total => token_count, :errors => token_errors, :rate => token_error_rate
62
+ }
63
+ s[:sequences] = {
64
+ :total => sequence_count, :errors => sequence_errors, :rate => sequence_error_rate
65
+ }
66
+ s
67
+ end
68
+
69
+ alias stats statistics
70
+
71
+ def clear_counters
72
+ @token_count = @token_errors = @sequence_count = @sequence_errors = 0
73
+ end
74
+
75
+ alias clear clear_counters
76
+
77
+ def token_error_rate
78
+ return 0 if token_errors.zero?
79
+ token_errors / token_count.to_f * 100.0
80
+ end
81
+
82
+ def sequence_error_rate
83
+ return 0 if sequence_errors.zero?
84
+ sequence_errors / sequence_count.to_f * 100.0
85
+ end
86
+
87
+ # alias native_save save
88
+
89
+ private :native_label, :native_train
90
+
91
+ end
92
+
93
+ end