wapiti 0.1.1 → 1.0.0

Files changed (52)
  1. checksums.yaml +5 -5
  2. data/HISTORY.md +8 -0
  3. data/LICENSE +1 -1
  4. data/README.md +39 -95
  5. data/ext/wapiti/bcd.c +1 -1
  6. data/ext/wapiti/extconf.rb +15 -1
  7. data/ext/wapiti/lbfgs.c +6 -6
  8. data/ext/wapiti/model.c +2 -3
  9. data/ext/wapiti/model.h +0 -7
  10. data/ext/wapiti/native.c +89 -239
  11. data/ext/wapiti/native.h +0 -5
  12. data/ext/wapiti/pattern.c +1 -1
  13. data/ext/wapiti/progress.c +19 -44
  14. data/ext/wapiti/progress.h +1 -4
  15. data/ext/wapiti/rprop.c +3 -4
  16. data/ext/wapiti/sgdl1.c +3 -3
  17. data/ext/wapiti/tools.c +36 -30
  18. data/ext/wapiti/tools.h +9 -4
  19. data/ext/wapiti/trainers.c +55 -0
  20. data/ext/wapiti/trainers.h +4 -1
  21. data/lib/wapiti.rb +4 -24
  22. data/lib/wapiti/dataset.rb +162 -0
  23. data/lib/wapiti/errors.rb +0 -4
  24. data/lib/wapiti/log.rb +29 -0
  25. data/lib/wapiti/model.rb +63 -40
  26. data/lib/wapiti/options.rb +66 -29
  27. data/lib/wapiti/sequence.rb +105 -0
  28. data/lib/wapiti/token.rb +74 -0
  29. data/lib/wapiti/version.rb +1 -1
  30. metadata +20 -80
  31. data/.autotest +0 -13
  32. data/.rspec +0 -3
  33. data/.simplecov +0 -3
  34. data/Gemfile +0 -29
  35. data/Rakefile +0 -63
  36. data/ext/wapiti/wapiti.c +0 -410
  37. data/spec/fixtures/ch.mod +0 -18550
  38. data/spec/fixtures/chpattern.txt +0 -52
  39. data/spec/fixtures/chtest.txt +0 -1973
  40. data/spec/fixtures/chtrain.txt +0 -19995
  41. data/spec/fixtures/nppattern.txt +0 -52
  42. data/spec/fixtures/nptest.txt +0 -1973
  43. data/spec/fixtures/nptrain.txt +0 -19995
  44. data/spec/fixtures/pattern.txt +0 -14
  45. data/spec/fixtures/test.txt +0 -60000
  46. data/spec/fixtures/train.txt +0 -1200
  47. data/spec/spec_helper.rb +0 -41
  48. data/spec/wapiti/model_spec.rb +0 -233
  49. data/spec/wapiti/native_spec.rb +0 -11
  50. data/spec/wapiti/options_spec.rb +0 -185
  51. data/spec/wapiti/utility_spec.rb +0 -22
  52. data/wapiti.gemspec +0 -49
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
- SHA1:
-   metadata.gz: f8e3f42f711858caf64ca6682ea3aea8d9439a02
-   data.tar.gz: 32569d35d3e6d2af1adf91df7a1a8d7ba996d198
+ SHA256:
+   metadata.gz: f0357489368d9bbe57ea34e2bee7be8f6ae7542875971acdf48f24b038aebb32
+   data.tar.gz: 7267f065c30e82f581942cae7789bc4d71999f7a418c1549b5cd41a8ed4a1b80
  SHA512:
-   metadata.gz: 2ed36dbff7978ca5a731bc3381faad7b9fc362dcdd46ce0b7495dceacd6d21458a33bab5250ee71788d4b2e553625d3b39e61b4140ef701498f04742cf10d19c
-   data.tar.gz: d9e10608342bc351c88cbb61d37af504b983c3e5b973ee2a29919951987ec4d3a3919d98f053990300207820cf59ab297c80a6efb4806060999c0090079b9e39
+   metadata.gz: 194656b3d90ed6fedf32a2e1a8dc34cec4eccaeec71098c2499054697c66a3209d9436cfcf66e1b4b43e19e2f2b50f4ad5e2682df9d21f126e812940591de226
+   data.tar.gz: 9d365d09193a7c1657b1583331f20d1f985ad0f8c351947c791dd407a163463f4eb21ee6ac74545bd0f61ca6f88ab0ed669aaeefdaefdd03f2543cc22078a4f5
data/HISTORY.md CHANGED
@@ -1,3 +1,11 @@
+ 1.0.0 / 2017-12-xx
+ ==================
+ * Added support for Windows platform
+ * Open files only if names are untainted
+ * Finalized API
+ * Fixed error reporting
+ * Removed progress logging
+
  0.1.1 / 2014-02-27
  ==================
  * Updated train routine
data/LICENSE CHANGED
@@ -1,5 +1,5 @@
  Wapiti-Ruby
- Copyright 2011-2014 Sylvester Keil. All rights reserved.
+ Copyright 2011-2018 Sylvester Keil. All rights reserved.

  Wapiti - A linear-chain CRF tool
  Copyright 2009-2013 CNRS. All rights reserved.
data/README.md CHANGED
@@ -5,18 +5,14 @@ The Wapiti-Ruby gem provides a wicked fast linear-chain CRF
  API for sequence segmentation and labelling; it is based on the
  codebase of [wapiti](http://wapiti.limsi.fr/).

- [![Build Status](https://secure.travis-ci.org/inukshuk/wapiti-ruby.png)](http://travis-ci.org/inukshuk/wapiti-ruby)
- [![Coverage Status](https://coveralls.io/repos/inukshuk/wapiti-ruby/badge.png?branch=master)](https://coveralls.io/r/inukshuk/wapiti-ruby?branch=master)
+ [![Linux Build Status](https://travis-ci.org/inukshuk/wapiti-ruby.svg?branch=master)](https://travis-ci.org/inukshuk/wapiti-ruby)
+ [![Windows Build Status](https://ci.appveyor.com/api/projects/status/12rtxe2o8p55g1w6/branch/master?svg=true)](https://ci.appveyor.com/project/inukshuk/wapiti-ruby/branch/master)
+ [![Coverage Status](https://coveralls.io/repos/github/inukshuk/wapiti-ruby/badge.svg?branch=master)](https://coveralls.io/github/inukshuk/wapiti-ruby?branch=master)

  Requirements
  ------------
- Wapiti is written in C and Ruby and requires a compiler with C99
- support (e.g., gcc); on GNU/Linux systems it will be fairly easy to install
- all necessary packages through your distribution.
-
- The Wapiti Ruby gem has been confirmed to work with MRI 2.x, 1.9.x, 1.8.7,
- and Rubinius.
-
+ Wapiti is written in C and Ruby and requires a compiler with C99 support;
+ it has been confirmed to work on Linux, macOS, and Windows.

  Quickstart
  ----------
@@ -29,49 +25,35 @@ Quickstart

  Using a pattern and training data stored in a file:

-     model = Wapiti.train('train.txt', :pattern => 'pattern.txt')
-     => #<Wapiti::Model:0x0000010188f868>
+     model = Wapiti.train('train.txt', pattern: 'pattern.txt')
+     #=> #<Wapiti::Model:0x0000010188f868>
      model.labels
-     => ["B-ADJP", "B-ADVP", "B-CONJP" ...]
+     #=> ["B-ADJP", "B-ADVP", "B-CONJP" ...]
      model.save('ch.mod')
-     => # saves the model as 'ch.mod'
+     #=> saves the model as 'ch.mod'

- Alternatively, you can pass in the training data as an array; the array
- should contain one array for each sequence of training data.
+ Alternatively, you can pass in the training data as a `Wapiti::Dataset`;
+ this class supports the default text format used by Wapiti as well as
+ additional formats (such as YAML or XML) and an API to make it easier
+ to manage data sets used for input and training.

-     data = []
-     data << ['Confidence NN B-NP', 'in IN B-PP', 'the DT B-NP', 'pound NN I-NP', '. . O']
-     ...
+     data = Wapiti::Dataset.open('chtrain.xml')
      model = Wapiti.train(data, options)

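A minimal end-to-end sketch of this Dataset-based training flow, pieced together from the calls shown above (the file names and option values are illustrative, not part of this diff):

    require 'wapiti'

    data  = Wapiti::Dataset.open('chtrain.xml')
    model = Wapiti.train(data, pattern: 'pattern.txt', threads: 4)
    model.compact
    model.save('ch.mod')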
- You can consult the `Wapiti::Options` class for a list of supported
- configuration options and algorithms:
-
-     Wapiti::Options.attribute_names
-     => [:algorithm, :check, :compact, :convergence_window, :development_data,
-         :jobsize, :label, :max_iterations, :maxent, :pattern, :posterior, :rho1,
-         :rho2, :score, :sparse, :stop_epsilon, :stop_window, :threads]
-     Wapiti::Options.algorithms
-     => ["l-bfgs", "sgd-l1", "bcd", "rprop", "rprop+", "rprop-", "auto"]
+ You can consult `Wapiti::Options.attribute_names` for a list of
+ supported configuration options and `Wapiti::Options.algorithms` for
+ all supported algorithms:

  Use `#valid?` or `#validate` (which returns error messages) to make sure
  your configuration is supported by Wapiti.

- You can pass options either as an options hash or by adding a block to the
- method invocation:
-
-     model = Wapiti::Model.train(data) do |config|
-       config.pattern = 'pattern.txt'
-       threads = 4
-     end
-
  Before saving your model you can use `compact` to reduce the model's size:

      model.save 'm1.mod'
-     => # m1.mod file size 1.8M
+     #=> m1.mod file size 1.8M
      model.compact
      model.save 'm2.mod'
-     => # m2.mod file size 471K
+     #=> m2.mod file size 471K


  ### Loading existing Models
@@ -80,50 +62,33 @@ Before saving your model you can use `compact` to reduce the model's size:

  ### Labelling

- By calling `#label` on a Model instance you can add labels to your sequence
- data:
+ By calling `#label` on a Model instance you can add labels to a dataset:

      model = Wapiti.load('m2.mod')
-     model.label('test.txt')
-     => [[["Confidence NN B-NP", "B-NP"], ["in IN B-PP", "B-PP"] ... ]
+     input = Wapiti::Dataset.load('chtest.txt')
+     output = model.label(input, tagged: true)

- The result is an array of sequence arrays; each sequence array consists of
- the original token and feature string (when using test data, the final
- feature is usually the expected label) and the label calculated by Wapiti.
+ The result is a new `Wapiti::Dataset` with the predicted labels for each
+ token. If your input data was already tagged, you can compare the input
+ and output datasets to evaluate your results:

- As with training data, you can pass in data either by filename or as
- a Ruby Array:
-
-     model.label [['Confidence NN', 'in IN', 'the DT', 'pound NN', '. .']]
-     => [[["Confidence NN", "B-NP"], ["in IN", "B-PP"], ["the DT", "B-NP"],
-         ["pound NN", "I-NP"], [". .", "O"]]]
+     output - input
+     #=> new dataset of output sequences which are tagged differently than expected

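Putting the labelling and evaluation calls above together, a minimal sketch using only the methods shown in this README (model and data file names are illustrative):

    model  = Wapiti.load('ch.mod')
    input  = Wapiti::Dataset.load('chtest.txt')
    output = model.label(input, tagged: true)
    errors = output - input   # sequences with at least one tag that differs from the input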
  If you pass a block to `#label` Wapiti will yield each token and the
  corresponding label:

-     model.label [['Confidence NN', 'in IN', 'the DT', 'pound NN', '. .']] do |token, label|
+     model.label input do |token, label|
        [token.downcase, label.downcase]
      end
-     => [[["confidence nn", "b-np"], ["in in", "b-pp"], ["the dt", "b-np"],
-         ["pound nn", "i-np"], [". .", "o"]]]

  Note that if you set the *:score* option (either in the Model's `#options` or
  when calling `#label`), the score for each label will be appended to
  each token/label tuple as a floating point number or passed as a third
  argument to the passed-in block.

-     model.label [['Confidence NN']], :score => true
-     => [[["Confidence NN", "B-NP", 4.642034838737357]]]
-
- Similarly, if you set the *:nbest* option to a value greater than one, Wapiti
- will append more label and, optionally, score values to each tuple.
-
-     model.label [['Confidence NN']], :score => true, :nbest => 3, :skip_tokens => true
-     => [[["B-NP", 4.642034838737357, "B-VP", 1.7040256847206927, "B-ADJP", 0.7636429298060177]]]
-
- Note how we also suppressed the output of the token string using the
- *:skip_tokens* option.
-
+     model.label input, score: true
+     #=> Dataset where each token will include a score


  ### Statistics
@@ -131,41 +96,20 @@ By setting the *:check* option you can tell Wapiti to keep statistics during
  the labelling phase (for the statistics to be meaningful you obviously need
  to provide input data that is already labelled). Wapiti does not reset the
  counters during consecutive calls to `#label` to allow you to collect
- accumulative date; however, you can reset the counters at any time, by calling
- `#clear_counters`.
+ cumulative stats; however, you can reset the counters at any time by calling
+ `#reset_counters`.

  After calling `#label` with the *:check* options set and appropriately labelled
  input, you can access the statistics via `#statistics` (the individual values
  are also available through the associated attribute readers).

-     model.label 'test.txt', :check => true
-     => {:tokens=>{:total=>1896, :errors=>137, :rate=>7.225738396624472},
-         :sequences=>{:total=>77, :errors=>50, :rate=>64.93506493506494}}
-
-
- Citing
- ------
- If you're using Wapiti-Ruby for research purposes, please use the following
- citation of the original wapiti package:
-
-     @article{lavergne2010practical,
-       author = {Lavergne, Thomas and Capp\'{e}, Olivier and Yvon, Fran\c{c}ois},
-       title = {Practical Very Large Scale {CRFs}},
-       booktitle = {Proceedings the 48th Annual Meeting of the Association for
-                    Computational Linguistics (ACL)},
-       month = {July},
-       year = {2010},
-       location = {Uppsala, Sweden},
-       publisher = {Association for Computational Linguistics},
-       pages = {504--513},
-       url = {http://www.aclweb.org/anthology/P10-1052}
-     }
-
- If you're profiting from any of the Wapiti-Ruby specific features you are
- welcome to also refer back to the
- [Wapiti-Ruby homepage](http://github.com/inukshuk/wapiti-ruby/).
+     model.label input, check: true
+     model.stats
+     => {:token=>{:count=>1896, :errors=>137, :rate=>7.225738396624472},
+         :sequence=>{:count=>77, :errors=>50, :rate=>64.93506493506494}}

+ For convenience, you can also use the `#check` method, which
+ will reset the counters, check your input, and return the stats.

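A short sketch of the checking workflow described above; the hash access mirrors the structure of the stats output shown (variable names are illustrative):

    model.label input, check: true
    stats = model.stats
    stats[:token][:rate]       #=> per-token error rate in percent
    stats[:sequence][:errors]  #=> number of sequences with at least one error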
  Contributing
  ------------
@@ -183,7 +127,7 @@ example, fix the bug and submit a pull request.
183
127
 
184
128
  License
185
129
  -------
186
- Copyright 2011-2014 Sylvester Keil. All rights reserved.
130
+ Copyright 2011-2018 Sylvester Keil. All rights reserved.
187
131
 
188
132
  Copyright 2009-2013 CNRS. All rights reserved.
189
133
 
data/ext/wapiti/bcd.c CHANGED
@@ -377,7 +377,7 @@ void trn_bcd(mdl_t *mdl) {
          // And update the model
          bcd_update(mdl, bcd, o);
      }
-     if (!uit_progress(mdl, i, -1.0))
+     if (!uit_progress(mdl))
          break;
  }
  // Cleanup memory
data/ext/wapiti/extconf.rb CHANGED
@@ -1,6 +1,20 @@
  require 'mkmf'
+ require 'rbconfig'

- $CFLAGS << %q{ -std=c99 -W -Wall -Wno-declaration-after-statement -O3 }
+ cflags = %w{
+   -std=c99
+   -W
+   -Wall
+   -Wno-declaration-after-statement
+   -O3
+ }
+
+ case RbConfig::CONFIG['host_os']
+ when /^linux/i
+   cflags[0] = '-std=gnu99'
+ end
+
+ $CFLAGS << ' ' << cflags.join(' ')

  have_library('pthread')
  have_library('m')
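The compiler flags are now assembled per host platform via RbConfig; a small sketch of the mechanism (only the Linux branch exists in this change, the darwin branch below is hypothetical):

    require 'rbconfig'

    cflags = %w{-std=c99 -W -Wall -O3}
    case RbConfig::CONFIG['host_os']
    when /^linux/i then cflags[0] = '-std=gnu99'   # as in the change above
    when /darwin/i then cflags << '-Wno-unused'    # hypothetical extra branch
    end
    cflags.join(' ')  #=> "-std=gnu99 -W -Wall -O3" on Linux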
data/ext/wapiti/lbfgs.c CHANGED
@@ -104,12 +104,12 @@ void trn_lbfgs(mdl_t *mdl) {
              uint64_t f;
              if (fscanf(file, "%"PRIu64, &f) != 1)
                  fatal("1 %s", err);
-             if (fscanf(file, "%la %la", &xp[f], &gp[f]) != 2)
+             if (fscanf(file, "%le %le", &xp[f], &gp[f]) != 2)
                  fatal("2 %s", err);
              for (uint32_t m = 0; m < M; m++) {
-                 if (fscanf(file, "%la", &s[m][f]) != 1)
+                 if (fscanf(file, "%le", &s[m][f]) != 1)
                      fatal("3 %s", err);
-                 if (fscanf(file, "%la", &y[m][f]) != 1)
+                 if (fscanf(file, "%le", &y[m][f]) != 1)
                      fatal("4 %s", err);
              }
          }
@@ -271,7 +271,7 @@ void trn_lbfgs(mdl_t *mdl) {
              memcpy(x, xp, sizeof(double) * F);
              break;
          }
-         if (uit_progress(mdl, k + 1, fx) == false)
+         if (uit_progress(mdl) == false)
              break;
          // 3rd step: we update the history used for approximating the
          // inverse of the diagonal of the hessian
@@ -314,9 +314,9 @@ void trn_lbfgs(mdl_t *mdl) {
          fprintf(file, "#state#0#%"PRIu32"#%"PRIu64"\n", M, F);
          for (uint64_t f = 0; f < F; f++) {
              fprintf(file, "%"PRIu64, f);
-             fprintf(file, " %la %la", xp[f], gp[f]);
+             fprintf(file, " %le %le", xp[f], gp[f]);
              for (uint32_t m = 0; m < M; m++)
-                 fprintf(file, " %la %la", s[m][f], y[m][f]);
+                 fprintf(file, " %le %le", s[m][f], y[m][f]);
              fprintf(file, "\n");
          }
          fclose(file);
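The `%la` conversions here and in model.c are replaced with `%le`, so L-BFGS state and model weights are now written as decimal scientific notation rather than hexadecimal floats; the diff does not state the motivation, but `%a` support is not universal across C runtimes, which matters for the new Windows build. The difference between the two notations, illustrated in Ruby:

    format('%a', 0.1)  #=> "0x1.999999999999ap-4"  (hexadecimal float, exact)
    format('%e', 0.1)  #=> "1.000000e-01"          (decimal, rounded)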
data/ext/wapiti/model.c CHANGED
@@ -74,7 +74,6 @@ mdl_t *mdl_new(rdr_t *rdr) {
      mdl->train = mdl->devel = NULL;
      mdl->reader = rdr;
      mdl->werr = NULL;
-     mdl->total = 0.0;
      return mdl;
  }

@@ -272,7 +271,7 @@ void mdl_save(mdl_t *mdl, FILE *file) {
      rdr_save(mdl->reader, file);
      for (uint64_t f = 0; f < mdl->nftr; f++)
          if (mdl->theta[f] != 0.0)
-             fprintf(file, "%"PRIu64"=%la\n", f, mdl->theta[f]);
+             fprintf(file, "%"PRIu64"=%le\n", f, mdl->theta[f]);
  }

  /* mdl_load:
@@ -298,7 +297,7 @@ void mdl_load(mdl_t *mdl, FILE *file) {
      for (uint64_t i = 0; i < nact; i++) {
          uint64_t f;
          double v;
-         if (fscanf(file, "%"SCNu64"=%la\n", &f, &v) != 2)
+         if (fscanf(file, "%"SCNu64"=%le\n", &f, &v) != 2)
              fatal(err);
          mdl->theta[f] = v;
      }
data/ext/wapiti/model.h CHANGED
@@ -30,15 +30,12 @@

  #include <stddef.h>
  #include <stdint.h>
- #include <sys/time.h>

  #include "options.h"
  #include "sequence.h"
  #include "reader.h"
  #include "wapiti.h"

- typedef struct timeval tms_t;
-
  /* mdl_t:
   *   Represent a linear-chain CRF model. The model contain both unigram and
   *   bigram features. It is caracterized by <nlbl> the number of labels, <nobs>
@@ -86,10 +83,6 @@ struct mdl_s {
      double   *werr;   // Window of error rate of last iters
      uint32_t  wcnt;   // Number of iters in the window
      uint32_t  wpos;   // Position for the next iter
-
-     // Timing
-     tms_t     timer;  // start time of last iter
-     double    total;  // total training time
  };

  mdl_t *mdl_new(rdr_t *rdr);
data/ext/wapiti/native.c CHANGED
@@ -10,43 +10,16 @@
  #include "quark.h"
  #include "tools.h"
  #include "wapiti.h"
-
  #include "native.h"

  VALUE mWapiti;
  VALUE mNative;
-
  VALUE cOptions;
  VALUE cModel;
-
+ VALUE cArgumentError;
  VALUE cNativeError;
- VALUE cConfigurationError;
  VALUE cLogger;

-
- /* --- Forward declarations --- */
-
- int wapiti_main(int argc, char *argv[argc]);
-
- void dolabel(mdl_t *mdl);
-
-
- /* --- Utilities --- */
-
- static const struct {
-     const char *name;
-     void (* train)(mdl_t *mdl);
- } trn_lst[] = {
-     {"l-bfgs", trn_lbfgs},
-     {"sgd-l1", trn_sgdl1},
-     {"bcd",    trn_bcd },
-     {"rprop",  trn_rprop},
-     {"rprop+", trn_rprop},
-     {"rprop-", trn_rprop}
- };
- static const uint32_t trn_cnt = sizeof(trn_lst) / sizeof(trn_lst[0]);
-
-
  /* --- Options Class --- */

  // Auxiliary Methods
@@ -68,6 +41,14 @@ static void copy_string(char **dst, VALUE rb_string) {
      memcpy(*dst, StringValuePtr(rb_string), RSTRING_LEN(rb_string) + 1);
  }

+ // Moves a string to the heap. We use this to move default
+ // values to the heap during initialization.
+ static char *to_heap(const char *string) {
+     char* ptr = calloc(strlen(string), sizeof(char));
+     memcpy(ptr, string, strlen(string));
+     return ptr;
+ }
+

  // Constructor / Desctructor

@@ -76,11 +57,11 @@ static void mark_options(opt_t* options __attribute__((__unused__))) {
  }

  static void deallocate_options(opt_t* options) {
-
      // free string options
      if (options->input) { free(options->input); }
      if (options->output) { free(options->output); }
      if (options->algo) { free((void*)options->algo); }
+     if (options->type) { free((void*)options->type); }
      if (options->devel) { free(options->devel); }
      if (options->pattern) { free((void*)options->pattern); }

@@ -101,21 +82,20 @@ static VALUE initialize_options(int argc, VALUE *argv, VALUE self) {
          options->maxiter = INT_MAX;
      }

-     // copy the default algorithm name to the heap so that all options strings
-     // are on the heap
-     char* tmp = calloc(strlen(options->algo), sizeof(char));
-     memcpy(tmp, options->algo, strlen(options->algo));
-     options->algo = tmp;
+     // Copy default algorithm and type name to the heap
+     // so that all options strings are on the heap.
+     options->algo = to_heap(options->algo);
+     options->type = to_heap(options->type);

      if (argc > 1) {
-         rb_raise(rb_const_get(rb_mKernel, rb_intern("ArgumentError")),
+         rb_raise(cArgumentError,
              "wrong number of arguments (%d for 0..1)", argc);
      }

      // set defaults
      if (argc) {
          Check_Type(argv[0], T_HASH);
-         (void)rb_funcall(self, rb_intern("update"), 1, argv[0]);
+         (void)rb_funcall(self, rb_intern("update!"), 1, argv[0]);
      }

      // yield self if block_given?
@@ -431,7 +411,6 @@ static VALUE options_model(VALUE self) {
  static VALUE options_set_model(VALUE self, VALUE rb_string) {
      opt_t *options = get_options(self);
      copy_string(&(options->model), rb_string);
-
      return rb_string;
  }

@@ -443,19 +422,17 @@ static VALUE options_algorithm(VALUE self) {
  static VALUE options_set_algorithm(VALUE self, VALUE rb_string) {
      opt_t *options = get_options(self);
      copy_string((char**)&(options->algo), rb_string);
-
      return rb_string;
  }

- static VALUE options_development_data(VALUE self) {
-     char *development_data = get_options(self)->devel;
-     return rb_str_new2(development_data ? development_data : "");
+ static VALUE options_type(VALUE self) {
+     const char *type = get_options(self)->type;
+     return rb_str_new2(type ? type : "");
  }

- static VALUE options_set_development_data(VALUE self, VALUE rb_string) {
+ static VALUE options_set_type(VALUE self, VALUE rb_string) {
      opt_t *options = get_options(self);
-     copy_string(&(options->devel), rb_string);
-
+     copy_string((char**)&(options->type), rb_string);
      return rb_string;
  }

@@ -565,11 +542,8 @@ void Init_options() {
      rb_define_alias(cOptions, "algo", "algorithm");
      rb_define_alias(cOptions, "algo=", "algorithm=");

-     rb_define_method(cOptions, "development_data", options_development_data, 0);
-     rb_define_method(cOptions, "development_data=", options_set_development_data, 1);
-
-     rb_define_alias(cOptions, "devel", "development_data");
-     rb_define_alias(cOptions, "devel=", "development_data=");
+     rb_define_method(cOptions, "type", options_type, 0);
+     rb_define_method(cOptions, "type=", options_set_type, 1);

      rb_define_method(cOptions, "clip", options_clip, 0);
      rb_define_method(cOptions, "clip=", options_set_clip, 1);
@@ -640,7 +614,7 @@ static VALUE allocate_model(VALUE self) {

  static VALUE model_set_options(VALUE self, VALUE rb_options) {
      if (strncmp("Wapiti::Options", rb_obj_classname(rb_options), 15) != 0) {
-         rb_raise(cNativeError, "argument must be a Wapiti::Options instance");
+         rb_raise(cArgumentError, "argument must be a Wapiti::Options instance");
      }

      mdl_t *model = get_model(self);
@@ -661,22 +635,20 @@ static VALUE initialize_model(int argc, VALUE *argv, VALUE self) {
      VALUE options;

      if (argc > 1) {
-         rb_raise(rb_const_get(rb_mKernel, rb_intern("ArgumentError")),
+         rb_raise(cArgumentError,
              "wrong number of arguments (%d for 0..1)", argc);
      }

      if (argc) {
          if (TYPE(argv[0]) == T_HASH) {
              options = rb_funcall(cOptions, rb_intern("new"), 1, argv[0]);
-         }
-         else {
+         } else {
              if (strncmp("Wapiti::Options", rb_obj_classname(argv[0]), 15) != 0) {
-                 rb_raise(cNativeError, "argument must be a hash or an options instance");
+                 rb_raise(cArgumentError, "argument must be a hash or an options instance");
              }
              options = argv[0];
          }
-     }
-     else {
+     } else {
          options = rb_funcall(cOptions, rb_intern("new"), 0);
      }

@@ -693,7 +665,7 @@ static VALUE initialize_model(int argc, VALUE *argv, VALUE self) {
      }

      // initialize counters
-     rb_funcall(self, rb_intern("clear_counters"), 0);
+     rb_funcall(self, rb_intern("reset_counters"), 0);

      return self;
  }
@@ -713,10 +685,6 @@ static VALUE model_nftr(VALUE self) {
      return INT2FIX(get_model(self)->nftr);
  }

- static VALUE model_total(VALUE self) {
-     return rb_float_new(get_model(self)->total);
- }
-

  // Instance methods

@@ -738,7 +706,7 @@ static VALUE model_compact(VALUE self) {
  // otherwise uses the passed-in argument as the Model's path.
  static VALUE model_save(int argc, VALUE *argv, VALUE self) {
      if (argc > 1) {
-         rb_raise(rb_const_get(rb_mKernel, rb_intern("ArgumentError")),
+         rb_raise(cArgumentError,
              "wrong number of arguments (%d for 0..1)", argc);
      }

@@ -751,17 +719,13 @@
      }

      // open the output file
-     FILE *file = 0;
      VALUE path = rb_ivar_get(self, rb_intern("@path"));

      if (NIL_P(path)) {
-         rb_raise(cNativeError, "failed to save model: no path given");
-     }
-
-     if (!(file = fopen(StringValueCStr(path), "w"))) {
-         rb_raise(cNativeError, "failed to save model: failed to open model file");
+         fatal("failed to save model: no path given");
      }

+     FILE *file = ufopen(path, "w");
      mdl_save(model, file);
      fclose(file);

@@ -770,7 +734,7 @@ static VALUE model_save(int argc, VALUE *argv, VALUE self) {

  static VALUE model_load(int argc, VALUE *argv, VALUE self) {
      if (argc > 1) {
-         rb_raise(rb_const_get(rb_mKernel, rb_intern("ArgumentError")),
+         rb_raise(cArgumentError,
              "wrong number of arguments (%d for 0..1)", argc);
      }

@@ -783,17 +747,13 @@ static VALUE model_load(int argc, VALUE *argv, VALUE self) {
      }

      // open the model file
-     FILE *file = 0;
      VALUE path = rb_ivar_get(self, rb_intern("@path"));

      if (NIL_P(path)) {
-         rb_raise(cNativeError, "failed to load model: no path given");
-     }
-
-     if (!(file = fopen(StringValueCStr(path), "r"))) {
-         rb_raise(cNativeError, "failed to load model: failed to open model file");
+         fatal("failed to load model: no path given");
      }

+     FILE *file = ufopen(path, "r");
      mdl_load(model, file);
      fclose(file);

@@ -849,31 +809,44 @@ static dat_t *to_dat(rdr_t *reader, VALUE data, bool labelled) {
      return dat;
  }

+ static dat_t *ld_dat(rdr_t *reader, VALUE data, bool labelled) {
+     FILE *file;
+     dat_t *dat = (dat_t*)0;

- static VALUE model_train(VALUE self, VALUE data) {
+     switch (TYPE(data)) {
+     case T_STRING:
+         file = ufopen(data, "r");
+         dat = rdr_readdat(reader, file, labelled);
+         fclose(file);
+         break;

-     mdl_t* model = get_model(self);
+     case T_ARRAY:
+         dat = to_dat(reader, data, labelled);
+         break;

-     uint32_t trn;
-     for (trn = 0; trn < trn_cnt; trn++) {
-         if (!strcmp(model->opt->algo, trn_lst[trn].name)) break;
+     default:
+         fatal("invalid data type (expected instance of String or Array)");
      }

-     if (trn == trn_cnt) {
-         rb_raise(cNativeError,
-             "failed to train model: unknown algorithm '%s'", model->opt->algo);
-     }
+     return dat;
+ }
+

+ static VALUE model_train(VALUE self, VALUE train, VALUE devel) {
      FILE *file;
+     mdl_t *model = get_model(self);
+     trn_t trn = trn_get(model->opt->algo);
+     model->type = typ_get(model->opt->type);

      // Load the pattern file. This will unlock the database if previously
      // locked by loading a model.
      if (model->opt->pattern) {
+         info("load patterns");
          file = fopen(model->opt->pattern, "r");

          if (!file) {
-             rb_raise(cNativeError,
-                 "failed to train model: failed to load pattern file '%s'", model->opt->pattern);
+             pfatal("failed to train model: failed to load pattern file '%s'",
+                 model->opt->pattern);
          }

          rdr_loadpat(model->reader, file);
@@ -886,58 +859,45 @@ static VALUE model_train(VALUE self, VALUE data) {
      // Load the training data. When this is done we lock the quarks as we
      // don't want to put in the model, informations present only in the
      // development set.
-
-     switch (TYPE(data)) {
-     case T_STRING:
-         if (!(file = fopen(StringValuePtr(data), "r"))) {
-             rb_raise(cNativeError, "failed to train model: failed to open training data '%s", StringValuePtr(data));
-         }
-
-         model->train = rdr_readdat(model->reader, file, true);
-         fclose(file);
-
-         break;
-     case T_ARRAY:
-         model->train = to_dat(model->reader, data, true);
-
-         break;
-     default:
-         rb_raise(cNativeError, "failed to train model: invalid training data type (expected instance of String or Array)");
-     }
+     model->train = ld_dat(model->reader, train, true);

      qrk_lock(model->reader->lbl, true);
      qrk_lock(model->reader->obs, true);

      if (!model->train || model->train->nseq == 0) {
-         rb_raise(cNativeError, "failed to train model: no training data loaded");
+         fatal("failed to train model: no training data loaded");
      }

      // If present, load the development set in the model. If not specified,
      // the training dataset will be used instead.
-     if (model->opt->devel) {
-         if (!(file = fopen(model->opt->devel, "r"))) {
-             rb_raise(cNativeError,
-                 "failed to train model: cannot open development file '%s'", model->opt->devel);
-         }
-
-         model->devel = rdr_readdat(model->reader, file, true);
-         fclose(file);
+     if (TYPE(devel) != T_NIL) {
+         model->devel = ld_dat(model->reader, devel, true);
      }

-     // Initialize the model. If a previous model was loaded, this will be
-     // just a resync, else the model structure will be created.
-     // rb_funcall(self, rb_intern("sync"), 0);
+     // Initialize the model. If a previous model was loaded, this will be
+     // just a resync, else the model structure will be created.
+     info((model->theta == NULL) ? "initialize model" : "re-sync model");
      mdl_sync(model);

-     // Train the model.
+     info("nb train: %"PRIu32"", model->train->nseq);
+     if (model->devel != NULL)
+         info("nb devel: %"PRIu32"", model->devel->nseq);
+     info("nb labels: %"PRIu32"", model->nlbl);
+     info("nb blocks: %"PRIu64"", model->nobs);
+     info("nb features: %"PRIu64"", model->nftr);
+
+     info("training model with %s", model->opt->algo);
      uit_setup(model);
-     trn_lst[trn].train(model);
+     trn(model);
      uit_cleanup(model);

-     // If requested compact the model.
      if (model->opt->compact) {
-         // rb_funcall(self, rb_intern("compact"), 0);
+         const uint64_t O = model->nobs;
+         const uint64_t F = model->nftr;
+         info("compacting model");
          mdl_compact(model);
+         info("%8"PRIu64" observations removed", O - model->nobs);
+         info("%8"PRIu64" features removed", F - model->nftr);
      }

      return self;
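With this change the native `train` method takes the training data and a development set as two separate arguments instead of reading them from the options; a hedged sketch of a direct call (file names are illustrative, and whether the high-level `Wapiti.train` wrapper forwards a development set this way is not shown in this diff):

    model = Wapiti::Model.new(pattern: 'pattern.txt', algorithm: 'l-bfgs')
    model.train('chtrain.txt', 'chdevel.txt')
    # pass nil as the second argument to train without a development set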
@@ -980,8 +940,7 @@ static VALUE decode_sequence(VALUE self, mdl_t *model, raw_t *raw) {

      if (N == 1) {
          tag_viterbi(model, seq, out, scs, psc);
-     }
-     else {
+     } else {
          tag_nbviterbi(model, seq, N, (void*)out, scs, (void*)psc);
      }

@@ -993,16 +952,13 @@
          if (!model->opt->label) {
              VALUE token = rb_str_new2(raw->lines[t]);

- #ifdef HAVE_RUBY_ENCODING_H
              int enc = rb_enc_find_index("UTF-8");
              rb_enc_associate_index(token, enc);
- #endif

              rb_ary_push(tokens, token);
          }

          for (n = 0; n < N; ++n) {
-
              uint64_t lbl = out[t * N + n];
              rb_ary_push(tokens, rb_str_new2(qrk_id2str(lbls, lbl)));

@@ -1010,7 +966,6 @@ static VALUE decode_sequence(VALUE self, mdl_t *model, raw_t *raw) {
              if (model->opt->outsc) {
                  rb_ary_push(tokens, rb_float_new(psc[t * N + n]));
              }
-
          }

          // yield token/label pair to block if given
@@ -1020,9 +975,7 @@ static VALUE decode_sequence(VALUE self, mdl_t *model, raw_t *raw) {

          rb_ary_push(sequence, tokens);

-
          // TODO output sequence score: scs[n] (float)
-
      }

      // Statistics
@@ -1036,8 +989,7 @@
              if (seq->pos[t].lbl != out[t * N]) {
                  terr++;
                  err = 1;
-             }
-             else {
+             } else {
                  stat[2][out[t * N]]++;
              }
          }
@@ -1053,10 +1005,8 @@ static VALUE decode_sequence(VALUE self, mdl_t *model, raw_t *raw) {

          serr = FIX2INT(rb_ivar_get(self, rb_intern("@sequence_errors")));
          rb_ivar_set(self, rb_intern("@sequence_errors"), INT2FIX(serr + err));
-
      }

-
      // Cleanup memory used for this sequence
      xfree(scs);
      xfree(psc);
@@ -1090,7 +1040,6 @@ static VALUE decode_sequence_array(VALUE self, VALUE array) {
          for (j = 0; j < k; ++j) {
              VALUE line = rb_ary_entry(sequence, j);
              Check_Type(line, T_STRING);
-
              raw->lines[j] = StringValueCStr(line);
          }

@@ -1103,13 +1052,7 @@
  }

  static VALUE decode_sequence_file(VALUE self, VALUE path) {
-     Check_Type(path, T_STRING);
-     FILE *file;
-
-     if (!(file = fopen(StringValueCStr(path), "r"))) {
-         rb_raise(cNativeError, "failed to label data: could not open file '%s'", StringValueCStr(path));
-     }
-
+     FILE *file = ufopen(path, "r");
      mdl_t *model = get_model(self);
      raw_t *raw;

@@ -1119,7 +1062,6 @@ static VALUE decode_sequence_file(VALUE self, VALUE path) {
      // to take care of not discarding the raw input as we want to send it
      // back to the output with the additional predicted labels.
      while (!feof(file)) {
-
          // So, first read an input sequence keeping the raw_t object
          // available, and label it with Viterbi.
          if ((raw = rdr_readraw(model->reader, file)) == 0) {
@@ -1133,12 +1075,12 @@
      return result;
  }

- // cal-seq:
+ // call-seq:
  //   m.label(tokens, options = {})   # => array of labelled tokens
  //   m.label(filename, options = {}) # => array of labelled tokens
  //
  static VALUE model_label(VALUE self, VALUE data) {
-     VALUE result;
+     VALUE result = (VALUE)0;

      switch (TYPE(data)) {
      case T_STRING:
@@ -1148,7 +1090,7 @@ static VALUE model_label(VALUE self, VALUE data) {
          result = decode_sequence_array(self, data);
          break;
      default:
-         rb_raise(cNativeError, "failed to label data: invalid data (expected type String or Array)");
+         fatal("failed to label data: invalid data (expected type String or Array)");
      }

      return result;
@@ -1157,125 +1099,33 @@ static VALUE model_label(VALUE self, VALUE data) {
  static void Init_model() {
      cModel = rb_define_class_under(mWapiti, "Model", rb_cObject);
      rb_define_alloc_func(cModel, allocate_model);
-
-     rb_define_method(cModel, "initialize", initialize_model, -1);
-
      rb_define_attr(cModel, "options", 1, 0);

-
+     rb_define_method(cModel, "initialize", initialize_model, -1);
      rb_define_method(cModel, "nlbl", model_nlbl, 0);
      rb_define_method(cModel, "labels", model_labels, 0);
-
      rb_define_method(cModel, "nobs", model_nobs, 0);
      rb_define_alias(cModel, "observations", "nobs");
-
      rb_define_method(cModel, "nftr", model_nftr, 0);
      rb_define_alias(cModel, "features", "nftr");
-
-     rb_define_method(cModel, "total", model_total, 0);
-
      rb_define_method(cModel, "sync", model_sync, 0);
      rb_define_method(cModel, "compact", model_compact, 0);
      rb_define_method(cModel, "save", model_save, -1);
      rb_define_method(cModel, "load", model_load, -1);
-
-     rb_define_method(cModel, "train", model_train, 1);
+     rb_define_method(cModel, "train", model_train, 2);
      rb_define_method(cModel, "label", model_label, 1);
  }

- /* --- Top-Level Utility Methods --- */
-
-
- static VALUE label(VALUE self __attribute__((__unused__)), VALUE rb_options) {
-     if (strncmp("Wapiti::Options", rb_obj_classname(rb_options), 15) != 0) {
-         rb_raise(cNativeError, "argument must be a native options instance");
-     }
-
-     opt_t *options = get_options(rb_options);
-
-     if (options->mode != 1) {
-         rb_raise(cNativeError, "invalid options argument: mode should be set to 1 for labelling");
-     }
-
-     mdl_t *model = mdl_new(rdr_new(options->maxent));
-     model->opt = options;
-
-     dolabel(model);
-
-     mdl_free(model);
-
-     return Qnil;
- }
-
- #if defined EXTRA
- static VALUE dump(VALUE self __attribute__((__unused__)), VALUE rb_options) {
-     if (strncmp("Wapiti::Options", rb_obj_classname(rb_options), 15) != 0) {
-         rb_raise(cNativeError, "argument must be a native options instance");
-     }
-
-     opt_t *options = get_options(rb_options);
-
-     if (options->mode != 2) {
-         rb_raise(cNativeError, "invalid options argument: mode should be set to 2 for training");
-     }
-
-     mdl_t *model = mdl_new(rdr_new(options->maxent));
-     model->opt = options;
-
-     dodump(model);
-
-     mdl_free(model);
-
-     return Qnil;
- }
-
- // This function is a proxy for Wapiti's main entry point.
- static VALUE wapiti(VALUE self __attribute__((__unused__)), VALUE arguments) {
-     int result = -1, argc = 0;
-     char **ap, *argv[18], *input, *tmp;
-
-     Check_Type(arguments, T_STRING);
-     tmp = StringValueCStr(arguments);
-
-     // allocate space for argument vector
-     input = (char*)malloc(strlen(tmp) + 8);
-
-     // prepend command name
-     strncpy(input, "wapiti ", 8);
-     strncat(input, tmp, strlen(input) - 8);
-
-     // remember allocation pointer
-     tmp = input;
-
-     // turn input string into argument vector (using
-     // only the first seventeen tokens from input)
-     for (ap = argv; (*ap = strsep(&input, " \t")) != (char*)0; ++argc) {
-         if ((**ap != '\0') && (++ap >= &argv[18])) break;
-     }
-
-     // call main entry point
-     result = wapiti_main(argc, argv);
-
-     // free allocated memory
-     free(tmp);
-
-     return INT2FIX(result);
- }
- #endif
-
  /* --- Wapiti Extension Entry Point --- */

  void Init_native() {
      mWapiti = rb_const_get(rb_mKernel, rb_intern("Wapiti"));
      mNative = rb_define_module_under(mWapiti, "Native");

+     cArgumentError = rb_const_get(rb_mKernel, rb_intern("ArgumentError"));
      cNativeError = rb_const_get(mWapiti, rb_intern("NativeError"));
-     cConfigurationError = rb_const_get(mWapiti, rb_intern("ConfigurationError"));
      cLogger = rb_funcall(mWapiti, rb_intern("log"), 0);

-     rb_define_singleton_method(mNative, "label", label, 1);
-     // rb_define_singleton_method(mNative, "wapiti", wapiti, 1);
-
      rb_define_const(mNative, "VERSION", rb_str_new2(VERSION));

      Init_options();