RubyGems - wapiti - Versions diffs - 0.0.2 → 0.0.3 - Mend

wapiti 0.0.2 → 0.0.3

Files changed (8) hide show

data/README.md CHANGED

@@ -103,6 +103,42 @@ corresponding label:
     => [[["confidence nn", "b-np"], ["in in", "b-pp"], ["the dt", "b-np"],
     ["pound nn", "i-np"], [". .", "o"]]]
+Note that if you set the *:score* option (either in the Model's `#options` or
+when calling `#label`), the score for each label will be appended to
+each token/label tuple as a floating point number or passed as a third
+argument to the passed-in block.
+    model.label [['Confidence NN']], :score => true
+    => [[["Confidence NN", "B-NP", 4.642034838737357]]]
+Similarly, if you set the *:nbest* option to a value greater than one, Wapiti
+will append more label and, optionally, score values to each tuple.
+    model.label [['Confidence NN']], :score => true, :nbest => 3, :skip_tokens => true
+    => [[["B-NP", 4.642034838737357, "B-VP", 1.7040256847206927, "B-ADJP", 0.7636429298060177]]]
+Note how we also suppressed the output of the token string using the
+*:skip_tokens* option.
+### Statistics
+By setting the *:check* option you can tell Wapiti to keep statistics during
+the labelling phase (for the statistics to be meaningful you obviously need
+to provide input data that is already labelled). Wapiti does not reset the
+counters during consecutive calls to `#label` to allow you to collect
+cumulative data; however, you can reset the counters at any time by calling
+`#clear_counters`.
+After calling `#label` with the *:check* option set and appropriately labelled
+input, you can access the statistics via `#statistics` (the individual values
+are also available through the associated attribute readers).
+    model.label 'test.txt', :check => true
+    => {:tokens=>{:total=>1896, :errors=>137, :rate=>0.0007225738396624472},
+    :sequences=>{:total=>77, :errors=>50, :rate=>0.006493506493506494}}
 Citing
 ------

data/ext/wapiti/native.c CHANGED

@@ -522,10 +522,10 @@ void Init_options() {
 	rb_define_alias(cOptions, "sparse?", "sparse");
-	rb_define_method(cOptions, "label", options_label, 0);
-	rb_define_method(cOptions, "label=", options_set_label, 1);
+	rb_define_method(cOptions, "skip_tokens", options_label, 0);
+	rb_define_method(cOptions, "skip_tokens=", options_set_label, 1);
-	rb_define_alias(cOptions, "label?", "label");
+	rb_define_alias(cOptions, "skip_tokens?", "skip_tokens");
 	rb_define_method(cOptions, "check", options_check, 0);
 	rb_define_method(cOptions, "check=", options_set_check, 1);
@@ -680,7 +680,7 @@ static VALUE initialize_model(int argc, VALUE *argv, VALUE self) {
 		options = rb_funcall(cOptions, rb_intern("new"), 0);
 	}
-	// yield self if block_given?
+	// yield options if block_given?
 	if (rb_block_given_p()) {
 	 	rb_yield(options);
 	}
@@ -691,6 +691,9 @@ static VALUE initialize_model(int argc, VALUE *argv, VALUE self) {
 	if (get_options(options)->model) {
 		rb_funcall(self, rb_intern("load"), 0);
 	}
+	// initialize counters
+	rb_funcall(self, rb_intern("clear_counters"), 0);
 	return self;
 }
@@ -958,63 +961,94 @@ static VALUE model_labels(VALUE self) {
 	return labels;
 }
-static VALUE decode_sequence(mdl_t *model, raw_t *raw) {
+static VALUE decode_sequence(VALUE self, mdl_t *model, raw_t *raw) {
 	qrk_t *lbls = model->reader->lbl;
-	const size_t Y = model->nlbl;
-	const size_t N = model->opt->nbest;
+	const unsigned int Y = model->nlbl;
+	const unsigned int N = model->opt->nbest;
 	seq_t *seq = rdr_raw2seq(model->reader, raw, model->opt->check);
-	const int T = seq->len;
+	const unsigned int T = seq->len;
+	unsigned int n, t, tcnt = 0, terr = 0, scnt = 0, serr = 0, stat[3][Y];
 	size_t *out = xmalloc(sizeof(size_t) * T * N);
 	double *psc = xmalloc(sizeof(double) * T * N);
 	double *scs = xmalloc(sizeof(double) * N);
-	VALUE result = rb_ary_new2(N), sequence, tokens;
+	VALUE sequence, tokens;
 	if (N == 1) {
 		tag_viterbi(model, seq, (size_t*)out, scs, (double*)psc);
 	}
 	else {
-		tag_nbviterbi(model, seq, N, (void*)out, scs, (void*)psc);
+		tag_nbviterbi(model, seq, N, (size_t*)out, scs, (double*)psc);
 	}
-	// Next we output the raw sequence with an aditional column for
-	// the predicted labels
-	for (size_t n = 0; n < N; n++) {
-		sequence = rb_ary_new();
-		// if (model->opt->outsc)
-			// fprintf(fout, "# %d %f\n", (int)n, scs[n]);
-		for (int t = 0; t < T; t++) {
-			tokens = rb_ary_new();
-			if (!model->opt->label) {
-				rb_ary_push(tokens, rb_str_new2(raw->lines[t]));
-			}
+	sequence = rb_ary_new();
+	for (t = 0; t < T; ++t) {
+		tokens = rb_ary_new();
+		if (!model->opt->label) {
+			rb_ary_push(tokens, rb_str_new2(raw->lines[t]));
+		}
+		for (n = 0; n < N; ++n) {
 			size_t lbl = out[t * N + n];
 			rb_ary_push(tokens, rb_str_new2(qrk_id2str(lbls, lbl)));
-			// if (model->opt->outsc) {
-			// 	fprintf(fout, "\t%s", lblstr);
-			// 	fprintf(fout, "/%f", psc[t * N + n]);
-			// }
-			// yield token/label pair to block if given
-			if (rb_block_given_p()) {
-				tokens = rb_yield(tokens);
+			// output individual score
+			if (model->opt->outsc) {
+				rb_ary_push(tokens, rb_float_new(psc[t * N + n]));
 			}
-			rb_ary_push(sequence, tokens);
+		}
+		// yield token/label pair to block if given
+		if (rb_block_given_p()) {
+			tokens = rb_yield(tokens);
 		}
-		rb_ary_push(result, sequence);
+		rb_ary_push(sequence, tokens);
+		// TODO output sequence score: scs[n] (float)
+	}
+	// Statistics
+	if (model->opt->check) {
+		int err = 0;
+		for (t = 0; t < T; ++t) {
+			stat[0][seq->pos[t].lbl]++;
+			stat[1][out[t * N]]++;
+			if (seq->pos[t].lbl != out[t * N]) {
+				terr++;
+				err = 1;
+			}
+			else {
+				stat[2][out[t * N]]++;
+			}
+		}
+		tcnt = FIX2INT(rb_ivar_get(self, rb_intern("@token_count")));
+		rb_ivar_set(self, rb_intern("@token_count"), INT2FIX(tcnt + (unsigned int)T));
+		terr += FIX2INT(rb_ivar_get(self, rb_intern("@token_errors")));
+		rb_ivar_set(self, rb_intern("@token_errors"), INT2FIX(terr));
+		scnt = FIX2INT(rb_ivar_get(self, rb_intern("@sequence_count")));
+		rb_ivar_set(self, rb_intern("@sequence_count"), INT2FIX(++scnt));
+		serr = FIX2INT(rb_ivar_get(self, rb_intern("@sequence_errors")));
+		rb_ivar_set(self, rb_intern("@sequence_errors"), INT2FIX(serr + err));
 	}
 	// Cleanup memory used for this sequence
 	xfree(scs);
@@ -1023,7 +1057,7 @@ static VALUE decode_sequence(mdl_t *model, raw_t *raw) {
 	rdr_freeseq(seq);
-	return result;
+	return sequence;
 }
 static VALUE decode_sequence_array(VALUE self, VALUE array) {
@@ -1053,7 +1087,7 @@ static VALUE decode_sequence_array(VALUE self, VALUE array) {
 			raw->lines[j] = StringValueCStr(line);
 		}
-		rb_funcall(result, rb_intern("concat"), 1, decode_sequence(model, raw));
+		rb_ary_push(result, decode_sequence(self, model, raw));
 		xfree(raw);
 	}
@@ -1085,7 +1119,7 @@ static VALUE decode_sequence_file(VALUE self, VALUE path) {
 			break;
 		}
-		rb_funcall(result, rb_intern("concat"), 1, decode_sequence(model, raw));
+		rb_ary_push(result, decode_sequence(self, model, raw));
 		rdr_freeraw(raw);
 	}
@@ -1093,8 +1127,8 @@ static VALUE decode_sequence_file(VALUE self, VALUE path) {
 }
 // call-seq:
-//   m.label(tokens)  # => array of labelled tokens
-//   m.label(filename) # => array of labelled tokens
+//   m.label(tokens, options = {})  # => array of labelled tokens
+//   m.label(filename, options = {}) # => array of labelled tokens
 //
 static VALUE model_label(VALUE self, VALUE data) {
 	VALUE result;
@@ -1120,6 +1154,7 @@ static void Init_model() {
 	rb_define_method(cModel, "initialize", initialize_model, -1);
 	rb_define_attr(cModel, "options", 1, 0);
 	rb_define_method(cModel, "nlbl", model_nlbl, 0);
 	rb_define_method(cModel, "labels", model_labels, 0);

data/lib/wapiti/model.rb CHANGED

@@ -30,6 +30,8 @@ module Wapiti
 		attr_accessor :path
+		attr_reader :token_count, :token_errors, :sequence_count, :sequence_errors
 		def pattern
 			options.pattern
 		end
@@ -38,6 +40,35 @@ module Wapiti
 			options.pattern = filename
 		end
+		alias native_label label
+		def label(input, opts = nil)
+			options.update(opts) unless opts.nil?
+			block_given? ? native_label(input, &Proc.new) : native_label(input)
+		end
+		def statistics
+			s = {}
+			s[:tokens] = {
+				:total => token_count, :errors => @token_errors,
+				:rate => token_errors / (token_count * 100.0)
+			}
+			s[:sequences] = {
+				:total => sequence_count, :errors => sequence_errors,
+				:rate => sequence_errors / (sequence_count * 100.0)
+			}
+			s
+		end
+		alias stats statistics
+		def clear_counters
+			@token_count = @token_errors = @sequence_count = @sequence_errors = 0
+		end
+		alias clear clear_counters
 		private
 		def tokenize(input)

data/lib/wapiti/options.rb CHANGED

@@ -10,7 +10,7 @@ module Wapiti
 			def attribute_names
 				@attribute_names ||= %w{ stop_window convergence_window posterior
 					max_iterations jobsize threads rho1 rho2 stop_epsilon score check
-					algorithm pattern development_data maxent compact sparse label
+					algorithm pattern development_data maxent compact sparse skip_tokens
 					}.sort.map(&:to_sym).freeze
 			end
@@ -103,7 +103,15 @@ module Wapiti
 			e << "BCD not supported for training maxent models" if maxent && algorithm == 'bcd'
 			e
 		end
+		%w{ maxent compact sparse label check score posterior }.each do |m|
+			writer = "#{m}=".to_sym
+			define_method("#{m}!") do
+				send(writer, true)
+				self
+			end
+		end
 		def <=>(other)
 			other.respond_to?(:attributes) ? attributes <=> other.attributes : nil
 		end

data/lib/wapiti/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Wapiti
-  VERSION = '0.0.2'.freeze
+  VERSION = '0.0.3'.freeze
 end

data/spec/wapiti/model_spec.rb CHANGED

@@ -143,6 +143,22 @@ module Wapiti
 						labels[0].map(&:last).should == %w{ b-np o b-np o }
 				  end
+					context 'with the :score option set' do
+						before(:each) { model.options.score! }
+						it 'returns an array of token-label-score tuples' do
+							model.label(input)[0].map { |t,l,s| s.class }.uniq == [Float]
+						end
+					end
+					context 'with the :nbest option set to 2' do
+						before(:each) { model.options.nbest = 2 }
+						it 'returns an array of token-label-label tuples' do
+							model.label(input)[0][-1][1,2] == %w{ O O }
+						end
+					end
 				end
@@ -155,7 +171,7 @@ module Wapiti
 						labels[0].take(5).map(&:last).should == %w{ B-NP B-PP B-NP I-NP B-VP }
 					end
 				end
 			end
 		end

data/spec/wapiti/options_spec.rb CHANGED

@@ -139,7 +139,7 @@ module Wapiti
 		end
-		%w{ maxent compact sparse label check score posterior }.each do |m|
+		%w{ maxent compact sparse skip_tokens check score posterior }.each do |m|
 			describe "##{m}" do
 				it 'returns false by default' do
 					options.send(m).should be false

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: wapiti
 version: !ruby/object:Gem::Version
-  version: 0.0.2
+  version: 0.0.3
   prerelease:
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-09-01 00:00:00.000000000Z
+date: 2011-09-02 00:00:00.000000000Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake
-  requirement: &2156429940 !ruby/object:Gem::Requirement
+  requirement: &2156033940 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
@@ -21,10 +21,10 @@ dependencies:
         version: '0.9'
   type: :development
   prerelease: false
-  version_requirements: *2156429940
+  version_requirements: *2156033940
 - !ruby/object:Gem::Dependency
   name: rake-compiler
-  requirement: &2156428740 !ruby/object:Gem::Requirement
+  requirement: &2156032240 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
@@ -32,10 +32,10 @@ dependencies:
         version: '0.7'
   type: :development
   prerelease: false
-  version_requirements: *2156428740
+  version_requirements: *2156032240
 - !ruby/object:Gem::Dependency
   name: ZenTest
-  requirement: &2156427300 !ruby/object:Gem::Requirement
+  requirement: &2156031260 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
@@ -43,10 +43,10 @@ dependencies:
         version: '4.6'
   type: :development
   prerelease: false
-  version_requirements: *2156427300
+  version_requirements: *2156031260
 - !ruby/object:Gem::Dependency
   name: rspec
-  requirement: &2156425920 !ruby/object:Gem::Requirement
+  requirement: &2156022680 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
@@ -54,7 +54,7 @@ dependencies:
         version: '2.6'
   type: :development
   prerelease: false
-  version_requirements: *2156425920
+  version_requirements: *2156022680
 description: This gem provides a Ruby API for Conditional Random Fields (CRF). It
   is implemented as a C extension and based on the wicked fast "wapiti" package.
 email: