RubyGems - wapiti - Versions diffs - 0.0.5 → 0.1.0 - Mend

wapiti 0.0.5 → 0.1.0

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (53)

checksums.yaml +7 -0
data/.simplecov +3 -0
data/Gemfile +25 -2
data/HISTORY.md +5 -1
data/LICENSE +14 -13
data/README.md +9 -16
data/Rakefile +38 -8
data/ext/wapiti/bcd.c +126 -124
data/ext/wapiti/decoder.c +203 -124
data/ext/wapiti/decoder.h +6 -4
data/ext/wapiti/extconf.rb +2 -2
data/ext/wapiti/gradient.c +491 -320
data/ext/wapiti/gradient.h +52 -34
data/ext/wapiti/lbfgs.c +74 -33
data/ext/wapiti/model.c +47 -37
data/ext/wapiti/model.h +22 -20
data/ext/wapiti/native.c +850 -839
data/ext/wapiti/native.h +1 -1
data/ext/wapiti/options.c +52 -20
data/ext/wapiti/options.h +37 -30
data/ext/wapiti/pattern.c +35 -33
data/ext/wapiti/pattern.h +12 -11
data/ext/wapiti/progress.c +14 -13
data/ext/wapiti/progress.h +3 -2
data/ext/wapiti/quark.c +14 -16
data/ext/wapiti/quark.h +6 -5
data/ext/wapiti/reader.c +83 -69
data/ext/wapiti/reader.h +11 -9
data/ext/wapiti/rprop.c +84 -43
data/ext/wapiti/sequence.h +18 -16
data/ext/wapiti/sgdl1.c +45 -43
data/ext/wapiti/thread.c +19 -17
data/ext/wapiti/thread.h +5 -4
data/ext/wapiti/tools.c +7 -7
data/ext/wapiti/tools.h +3 -4
data/ext/wapiti/trainers.h +1 -1
data/ext/wapiti/vmath.c +40 -38
data/ext/wapiti/vmath.h +12 -11
data/ext/wapiti/wapiti.c +159 -37
data/ext/wapiti/wapiti.h +18 -4
data/lib/wapiti.rb +15 -15
data/lib/wapiti/errors.rb +15 -15
data/lib/wapiti/model.rb +92 -84
data/lib/wapiti/options.rb +123 -124
data/lib/wapiti/utility.rb +14 -14
data/lib/wapiti/version.rb +2 -2
data/spec/spec_helper.rb +29 -9
data/spec/wapiti/model_spec.rb +230 -194
data/spec/wapiti/native_spec.rb +7 -8
data/spec/wapiti/options_spec.rb +184 -174
data/wapiti.gemspec +22 -8
metadata +38 -42
data/.gitignore +0 -5

data/ext/wapiti/native.h CHANGED

@@ -16,4 +16,4 @@ extern VALUE cModel;
 extern VALUE cNativeError;
 extern VALUE cLogger;
-#endif
+#endif

data/ext/wapiti/options.c CHANGED

@@ -1,7 +1,7 @@
 /*
  *      Wapiti - A linear-chain CRF tool
  *
- * Copyright (c) 2009-2011  CNRS
+ * Copyright (c) 2009-2013  CNRS
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -25,9 +25,11 @@
  * POSSIBILITY OF SUCH DAMAGE.
  */
+#include <inttypes.h>
 #include <limits.h>
 #include <stdbool.h>
 #include <stddef.h>
+#include <stdint.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
@@ -54,13 +56,16 @@ static void opt_help(const char *pname) {
 		"\t-h | --help      display this help message\n"
 		"\t   | --version   display version information\n"
 		"\n"
-		"Training mode:\n"
+		"Train mode:\n"
 		"    %1$s train [options] [input data] [model file]\n"
 		"\t   | --me               force maxent mode\n"
+		"\t-T | --type     STRING  type of model to train\n"
 		"\t-a | --algo     STRING  training algorithm to use\n"
 		"\t-p | --pattern  FILE    patterns for extracting features\n"
 		"\t-m | --model    FILE    model file to preload\n"
 		"\t-d | --devel    FILE    development dataset\n"
+		"\t   | --rstate   FILE    optimizer state to restore\n"
+		"\t   | --sstate   FILE    optimizer state to save\n"
 		"\t-c | --compact          compact model after training\n"
 		"\t-t | --nthread  INT     number of worker threads\n"
 		"\t-j | --jobsize  INT     job size for worker threads\n"
@@ -83,7 +88,7 @@ static void opt_help(const char *pname) {
 		"\t   | --stpdec   FLOAT   (rprop)  step decrement factor\n"
 		"\t   | --cutoff           (rprop)  alternate projection\n"
 		"\n"
-		"Labelling mode:\n"
+		"Label mode:\n"
 		"    %1$s label [options] [input data] [output data]\n"
 		"\t   | --me               force maxent mode\n"
 		"\t-m | --model    FILE    model file to load\n"
@@ -92,9 +97,18 @@ static void opt_help(const char *pname) {
 		"\t-s | --score            add scores to output\n"
 		"\t-p | --post             label using posteriors\n"
 		"\t-n | --nbest    INT     output n-best list\n"
+		"\t   | --force            use forced decoding\n"
 		"\n"
-		"Dumping mode\n"
-		"    %1$s dump [input model] [output text]\n";
+		"Dump mode\n"
+		"    %1$s dump [options] [input model] [output text]\n"
+		"\t-p | --prec     INT     set weights precision\n"
+		"\t   | --all              also output 0 weights\n"
+		"\n"
+		"Update mode\n"
+		"    %1$s update [options] [patch file] [output model]\n"
+		"\t-m | --model    FILE    model file to load\n"
+		"\t-c | --compact          compact model after training\n"
+	;
 	fprintf(stderr, msg, pname);
 }
@@ -104,8 +118,10 @@ static void opt_help(const char *pname) {
 const opt_t opt_defaults = {
 	.mode    = -1,
 	.input   = NULL,     .output  = NULL,
+	.type    = "crf",
 	.maxent  = false,
 	.algo    = "l-bfgs", .pattern = NULL,  .model   = NULL, .devel   = NULL,
+	.rstate  = NULL,     .sstate  = NULL,
 	.compact = false,    .sparse  = false,
 	.nthread = 1,        .jobsize = 64,    .maxiter = 0,
 	.rho1    = 0.5,      .rho2    = 0.0001,
@@ -116,7 +132,8 @@ const opt_t opt_defaults = {
 	.rprop = {.stpmin = 1e-8, .stpmax = 50.0, .stpinc = 1.2, .stpdec = 0.5,
 	          .cutoff = false},
 	.label   = false,    .check   = false, .outsc = false,
-	.lblpost = false,    .nbest = 1
+	.lblpost = false,    .nbest   = 1,     .force = false,
+	.prec    = 5,        .all     = false,
 };
 /* opt_switch:
@@ -125,29 +142,32 @@ const opt_t opt_defaults = {
  */
 struct {
 	int     mode;
-	char   *dshort;
-	char   *dlong;
+	const char   *dshort;
+	const char   *dlong;
 	char    kind;
 	size_t  offset;
 } opt_switch[] = {
+	{0, "-T", "--type",    'S', offsetof(opt_t, type        )},
 	{0, "##", "--me",      'B', offsetof(opt_t, maxent      )},
 	{0, "-a", "--algo",    'S', offsetof(opt_t, algo        )},
 	{0, "-p", "--pattern", 'S', offsetof(opt_t, pattern     )},
 	{0, "-m", "--model",   'S', offsetof(opt_t, model       )},
 	{0, "-d", "--devel",   'S', offsetof(opt_t, devel       )},
+	{0, "##", "--rstate",  'S', offsetof(opt_t, rstate      )},
+	{0, "##", "--sstate",  'S', offsetof(opt_t, sstate      )},
 	{0, "-c", "--compact", 'B', offsetof(opt_t, compact     )},
 	{0, "-s", "--sparse",  'B', offsetof(opt_t, sparse      )},
-	{0, "-t", "--nthread", 'I', offsetof(opt_t, nthread     )},
-	{0, "-j", "--josize",  'I', offsetof(opt_t, jobsize     )},
-	{0, "-i", "--maxiter", 'I', offsetof(opt_t, maxiter     )},
+	{0, "-t", "--nthread", 'U', offsetof(opt_t, nthread     )},
+	{0, "-j", "--jobsize", 'U', offsetof(opt_t, jobsize     )},
+	{0, "-i", "--maxiter", 'U', offsetof(opt_t, maxiter     )},
 	{0, "-1", "--rho1",    'F', offsetof(opt_t, rho1        )},
 	{0, "-2", "--rho2",    'F', offsetof(opt_t, rho2        )},
-	{0, "-o", "--objsz",   'I', offsetof(opt_t, objwin      )},
-	{0, "-w", "--stopwin", 'I', offsetof(opt_t, stopwin     )},
+	{0, "-o", "--objwin",  'U', offsetof(opt_t, objwin      )},
+	{0, "-w", "--stopwin", 'U', offsetof(opt_t, stopwin     )},
 	{0, "-e", "--stopeps", 'F', offsetof(opt_t, stopeps     )},
 	{0, "##", "--clip",    'B', offsetof(opt_t, lbfgs.clip  )},
-	{0, "##", "--histsz",  'I', offsetof(opt_t, lbfgs.histsz)},
-	{0, "##", "--maxls",   'I', offsetof(opt_t, lbfgs.maxls )},
+	{0, "##", "--histsz",  'U', offsetof(opt_t, lbfgs.histsz)},
+	{0, "##", "--maxls",   'U', offsetof(opt_t, lbfgs.maxls )},
 	{0, "##", "--eta0",    'F', offsetof(opt_t, sgdl1.eta0  )},
 	{0," ##", "--alpha",   'F', offsetof(opt_t, sgdl1.alpha )},
 	{0, "##", "--kappa",   'F', offsetof(opt_t, bcd.kappa   )},
@@ -162,7 +182,12 @@ struct {
 	{1, "-c", "--check",   'B', offsetof(opt_t, check       )},
 	{1, "-s", "--score",   'B', offsetof(opt_t, outsc       )},
 	{1, "-p", "--post",    'B', offsetof(opt_t, lblpost     )},
-	{1, "-n", "--nbest",   'I', offsetof(opt_t, nbest       )},
+	{1, "-n", "--nbest",   'U', offsetof(opt_t, nbest       )},
+	{1, "##", "--force",   'B', offsetof(opt_t, force       )},
+	{2, "-p", "--prec",    'U', offsetof(opt_t, prec        )},
+	{2, "##", "--all",     'B', offsetof(opt_t, all         )},
+	{3, "-m", "--model",   'S', offsetof(opt_t, model       )},
+	{3, "-c", "--compact", 'B', offsetof(opt_t, compact     )},
 	{-1, NULL, NULL, '\0', 0}
 };
@@ -195,6 +220,8 @@ void opt_parse(int argc, char *argv[argc], opt_t *opt) {
 		opt->mode = 1;
 	} else if (!strcmp(argv[0], "d") || !strcmp(argv[0], "dump")) {
 		opt->mode = 2;
+	} else if (!strcmp(argv[0], "u") || !strcmp(argv[0], "update")) {
+		opt->mode = 3;
 	} else {
 		fatal("unknown mode <%s>", argv[0]);
 	}
@@ -204,7 +231,7 @@ void opt_parse(int argc, char *argv[argc], opt_t *opt) {
 	opt->output = NULL;
 	while (argc > 0) {
 		const char *arg = argv[0];
-		int idx;
+		uint32_t idx;
 		// Check if this argument is a filename or an option
 		if (arg[0] != '-') {
 			if (opt->input == NULL)
@@ -237,8 +264,9 @@ void opt_parse(int argc, char *argv[argc], opt_t *opt) {
 				*((char **)ptr) = argv[1];
 				argc -= 2, argv += 2;
 				break;
-			case 'I':
-				if (sscanf(argv[1], "%d", (int *)ptr) != 1)
+			case 'U':
+				if (sscanf(argv[1], "%"SCNu32,
+						(uint32_t *)ptr) != 1)
 					fatal(err_badval, arg);
 				argc -= 2, argv += 2;
 				break;
@@ -272,7 +300,11 @@ void opt_parse(int argc, char *argv[argc], opt_t *opt) {
 	argchecksub("--alpha",   opt->sgdl1.alpha  >  0.0);
 	argchecksub("--nbest",   opt->nbest        >  0  );
 	#undef argchecksub
-	if (opt->maxent && !strcmp(opt->algo, "bcd"))
+	if ((opt->maxent || !strcmp(opt->type, "maxent")) && !strcmp(opt->algo, "bcd"))
 		fatal("BCD not supported for training maxent models");
+	if (!strcmp(opt->type, "memm") && !strcmp(opt->algo, "bcd"))
+		fatal("BCD not supported for training MEMM models");
+	if (opt->check && opt->force)
+		fatal("--check and --force cannot be used together");
 }

data/ext/wapiti/options.h CHANGED

@@ -1,7 +1,7 @@
 /*
  *      Wapiti - A linear-chain CRF tool
  *
- * Copyright (c) 2009-2011  CNRS
+ * Copyright (c) 2009-2013  CNRS
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -27,6 +27,7 @@
 #ifndef options_h
 #define options_h
+#include <stdint.h>
 #include <stdbool.h>
 #include "wapiti.h"
@@ -37,50 +38,56 @@
  */
 typedef struct opt_s opt_t;
 struct opt_s {
-	int    mode;
-	char  *input,  *output;
-	bool   maxent;
+	int       mode;
+	char     *input,  *output;
+	bool      maxent;
 	// Options for training
-	char  *algo,   *pattern;
-	char  *model,  *devel;
-	bool   compact, sparse;
-	int    nthread;
-	int    jobsize;
-	int    maxiter;
-	double rho1,    rho2;
+	const char     *type;
+	const char     *algo,   *pattern;
+	char     *model,  *devel;
+	char     *rstate, *sstate;
+	bool      compact, sparse;
+	uint32_t  nthread;
+	uint32_t  jobsize;
+	uint32_t  maxiter;
+	double    rho1,    rho2;
 	// Window size criterion
-	int    objwin;
-	int    stopwin;
-	double stopeps;
+	uint32_t  objwin;
+	uint32_t  stopwin;
+	double    stopeps;
 	// Options specific to L-BFGS
 	struct {
-		bool   clip;
-		int    histsz;
-		int    maxls;
+		bool     clip;
+		uint32_t histsz;
+		uint32_t maxls;
 	} lbfgs;
 	// Options specific to SGD-L1
 	struct {
-		double eta0;
-		double alpha;
+		double   eta0;
+		double   alpha;
 	} sgdl1;
 	// Options specific to BCD
 	struct {
-		double kappa;
+		double   kappa;
 	} bcd;
 	// Options specific to RPROP
 	struct {
-		double stpmin;
-		double stpmax;
-		double stpinc;
-		double stpdec;
-		bool   cutoff;
+		double   stpmin;
+		double   stpmax;
+		double   stpinc;
+		double   stpdec;
+		bool     cutoff;
 	} rprop;
 	// Options for labelling
-	bool   label;
-	bool   check;
-	bool   outsc;
-	bool   lblpost;
-	int    nbest;
+	bool      label;
+	bool      check;
+	bool      outsc;
+	bool      lblpost;
+	uint32_t  nbest;
+	bool      force;
+	// Options for model dump
+	int       prec;
+	bool      all;
 };
 extern const opt_t opt_defaults;

data/ext/wapiti/pattern.c CHANGED

@@ -1,7 +1,7 @@
 /*
  *      Wapiti - A linear-chain CRF tool
  *
- * Copyright (c) 2009-2011  CNRS
+ * Copyright (c) 2009-2013  CNRS
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -26,8 +26,10 @@
  */
 #include <ctype.h>
+#include <inttypes.h>
 #include <stdbool.h>
 #include <stddef.h>
+#include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -101,7 +103,7 @@ static bool rex_matchit(const char *ch, const char *str) {
  *   is length is returned in len. The mathing is done through tail-recursion
  *   for good performances.
  */
-static bool rex_matchme(const char *re, const char *str, int *len) {
+static bool rex_matchme(const char *re, const char *str, uint32_t *len) {
 	// Special check for end of regexp
 	if (re[0] == '\0')
 		return true;
@@ -120,7 +122,7 @@ static bool rex_matchme(const char *re, const char *str, int *len) {
 	if (nxt[0] == '*') {
 		nxt++;
 		do {
-			const int save = *len;
+			const uint32_t save = *len;
 			if (rex_matchme(nxt, str, len))
 				return true;
 			*len = save + 1;
@@ -150,7 +152,7 @@ static bool rex_matchme(const char *re, const char *str, int *len) {
  *   position of the start of the match is returned and is len is returned in
  *   len, else -1 is returned.
  */
-static int rex_match(const char *re, const char *str, int *len) {
+static int32_t rex_match(const char *re, const char *str, uint32_t *len) {
 	// Special case for anchor at start
 	if (*re == '^') {
 		*len = 0;
@@ -159,7 +161,7 @@ static int rex_match(const char *re, const char *str, int *len) {
 		return -1;
 	}
 	// And general case for any position
-	int pos = 0;
+	int32_t pos = 0;
 	do {
 		*len = 0;
 		if (rex_matchme(re, str + pos, len))
@@ -215,8 +217,8 @@ pat_t *pat_comp(char *p) {
 	// on an over-estimation of the number of required item. As compiled
 	// pattern take a neglectible amount of memory, this waste is not
 	// important.
-	int mitems = 0;
-	for (int pos = 0; p[pos] != '\0'; pos++)
+	uint32_t mitems = 0;
+	for (uint32_t pos = 0; p[pos] != '\0'; pos++)
 		if (p[pos] == '%')
 			mitems++;
 	mitems = mitems * 2 + 1;
@@ -225,9 +227,9 @@ pat_t *pat_comp(char *p) {
 	// Next, we go through the pattern compiling the items as they are
 	// found. Commands are parsed and put in a corresponding item, and
 	// segment of char not in a command are put in a 's' item.
-	int nitems = 0;
-	int ntoks = 0;
-	int pos = 0;
+	uint32_t nitems = 0;
+	uint32_t ntoks = 0;
+	uint32_t pos = 0;
 	while (p[pos] != '\0') {
 		pat_item_t *item = &(pat->items[nitems++]);
 		item->value = NULL;
@@ -243,14 +245,14 @@ pat_t *pat_comp(char *p) {
 			// Next we parse the offset and column and store them in
 			// the item.
 			const char *at = p + pos;
-			int off, col, nch;
+			uint32_t col;
+			int32_t off;
+			int nch;
 			item->absolute = false;
-			if (sscanf(at, "[@%d,%d%n", &off, &col, &nch) == 2)
+			if (sscanf(at, "[@%"SCNi32",%"SCNu32"%n", &off, &col, &nch) == 2)
 				item->absolute = true;
-			else if (sscanf(at, "[%d,%d%n", &off, &col, &nch) != 2)
+			else if (sscanf(at, "[%"SCNi32",%"SCNu32"%n", &off, &col, &nch) != 2)
 				fatal("invalid pattern: %s", p);
-			if (col < 0)
-				fatal("invalid column number: %d", col);
 			item->offset = off;
 			item->column = col;
 			ntoks = max(ntoks, col);
@@ -261,7 +263,7 @@ pat_t *pat_comp(char *p) {
 			if (type == 't' || type == 'm') {
 				if (p[pos] != ',' && p[pos + 1] != '"')
 					fatal("missing arg in pattern: %s", p);
-				const int start = (pos += 2);
+				const int32_t start = (pos += 2);
 				while (p[pos] != '\0') {
 					if (p[pos] == '"')
 						break;
@@ -271,7 +273,7 @@ pat_t *pat_comp(char *p) {
 				}
 				if (p[pos] != '"')
 					fatal("unended argument: %s", p);
-				const int len = pos - start;
+				const int32_t len = pos - start;
 				item->value = wapiti_xmalloc(sizeof(char) * (len + 1));
 				memcpy(item->value, p + start, len);
 				item->value[len] = '\0';
@@ -285,10 +287,10 @@ pat_t *pat_comp(char *p) {
 			// No command here, so build an 's' item with the chars
 			// until end of pattern or next command and put it in
 			// the list.
-			const int start = pos;
+			const int32_t start = pos;
 			while (p[pos] != '\0' && p[pos] != '%')
 				pos++;
-			const int len = pos - start;
+			const int32_t len = pos - start;
 			item->type  = 's';
 			item->caps  = false;
 			item->value = wapiti_xmalloc(sizeof(char) * (len + 1));
@@ -307,18 +309,18 @@ pat_t *pat_comp(char *p) {
  *   newly allocated memory block and the caller is responsible to free it when
  *   not needed anymore.
  */
-char *pat_exec(const pat_t *pat, const tok_t *tok, int at) {
-	static char *bval[] = {"_x-1", "_x-2", "_x-3", "_x-4", "_x-#"};
-	static char *eval[] = {"_x+1", "_x+2", "_x+3", "_x+4", "_x+#"};
-	const int T = tok->len;
+char *pat_exec(const pat_t *pat, const tok_t *tok, uint32_t at) {
+	static const char *bval[] = {"_x-1", "_x-2", "_x-3", "_x-4", "_x-#"};
+	static const char *eval[] = {"_x+1", "_x+2", "_x+3", "_x+4", "_x+#"};
+	const uint32_t T = tok->len;
 	// Prepare the buffer who will hold the result
-	int size = 16, pos = 0;
+	uint32_t size = 16, pos = 0;
 	char *buffer = wapiti_xmalloc(sizeof(char) * size);
 	// And loop over the compiled items
-	for (int it = 0; it < pat->nitems; it++) {
+	for (uint32_t it = 0; it < pat->nitems; it++) {
 		const pat_item_t *item = &(pat->items[it]);
-		char *value = NULL;
-		int len = 0;
+		const char *value = NULL;
+		uint32_t len = 0;
 		// First, if needed, we retrieve the token at the referenced
 		// position in the sequence. We store it in value and let the
 		// command handler do what it need with it.
@@ -332,11 +334,11 @@ char *pat_exec(const pat_t *pat, const tok_t *tok, int at) {
 			} else {
 				pos += at;
 			}
-			int col = item->column;
+			uint32_t col = item->column;
 			if (pos < 0)
 				value = bval[min(-pos - 1, 4)];
-			else if (pos >= T)
-				value = eval[min( pos - T, 4)];
+			else if (pos >= (int32_t)T)
+				value = eval[min( pos - (int32_t)T, 4)];
 			else if (col >= tok->cnts[pos])
 				fatal("missing tokens, cannot apply pattern");
 			else
@@ -356,7 +358,7 @@ char *pat_exec(const pat_t *pat, const tok_t *tok, int at) {
 				value = "true";
 			len = strlen(value);
 		} else if (item->type == 'm') {
-			int pos = rex_match(item->value, value, &len);
+			int32_t pos = rex_match(item->value, value, &len);
 			if (pos == -1)
 				len = 0;
 			value += pos;
@@ -370,7 +372,7 @@ char *pat_exec(const pat_t *pat, const tok_t *tok, int at) {
 		}
 		memcpy(buffer + pos, value, len);
 		if (item->caps)
-			for (int i = pos; i < pos + len; i++)
+			for (uint32_t i = pos; i < pos + len; i++)
 				buffer[i] = tolower(buffer[i]);
 		pos += len;
 	}
@@ -386,7 +388,7 @@ char *pat_exec(const pat_t *pat, const tok_t *tok, int at) {
  *   not use this pointer again.
  */
 void pat_free(pat_t *pat) {
-	for (int it = 0; it < pat->nitems; it++)
+	for (uint32_t it = 0; it < pat->nitems; it++)
 		free(pat->items[it].value);
 	free(pat->src);
 	free(pat);