RubyGems - wapiti - Versions diffs - 0.0.2 → 0.0.3 - Mend

wapiti 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

data/README.md CHANGED

@@ -103,6 +103,42 @@ corresponding label:
     => [[["confidence nn", "b-np"], ["in in", "b-pp"], ["the dt", "b-np"],
     ["pound nn", "i-np"], [". .", "o"]]]
+Note that if you set the *:score* option (either in the Model's `#options` or
+when calling `#label`), the score for each label will be appended to
+each token/label tuple as a floating point number or passed as a third
+argument to the passed-in block.
+    model.label [['Confidence NN']], :score => true
+    => [[["Confidence NN", "B-NP", 4.642034838737357]]]
+Similarly, if you set the *:nbest* option to a value greater than one, Wapiti
+will append more label and, optionally, score values to each tuple.
+    model.label [['Confidence NN']], :score => true, :nbest => 3, :skip_tokens => true
+    => [[["B-NP", 4.642034838737357, "B-VP", 1.7040256847206927, "B-ADJP", 0.7636429298060177]]]
+Note how we also suppressed the output of the token string using the
+*:skip_tokens* option.
+### Statistics
+By setting the *:check* option you can tell Wapiti to keep statistics during
+the labelling phase (for the statistics to be meaningful you obviously need
+to provide input data that is already labelled). Wapiti does not reset the
+counters during consecutive calls to `#label` to allow you to collect
+cumulative data; however, you can reset the counters at any time by calling
+`#clear_counters`.
+After calling `#label` with the *:check* option set and appropriately labelled
+input, you can access the statistics via `#statistics` (the individual values
+are also available through the associated attribute readers).
+    model.label 'test.txt', :check => true
+    => {:tokens=>{:total=>1896, :errors=>137, :rate=>0.0007225738396624472},
+    :sequences=>{:total=>77, :errors=>50, :rate=>0.006493506493506494}}
 Citing
 ------

data/ext/wapiti/native.c CHANGED

@@ -522,10 +522,10 @@ void Init_options() {
 	rb_define_alias(cOptions, "sparse?", "sparse");
-	rb_define_method(cOptions, "label", options_label, 0);
-	rb_define_method(cOptions, "label=", options_set_label, 1);
+	rb_define_method(cOptions, "skip_tokens", options_label, 0);
+	rb_define_method(cOptions, "skip_tokens=", options_set_label, 1);
-	rb_define_alias(cOptions, "label?", "label");
+	rb_define_alias(cOptions, "skip_tokens?", "skip_tokens");
 	rb_define_method(cOptions, "check", options_check, 0);
 	rb_define_method(cOptions, "check=", options_set_check, 1);
@@ -680,7 +680,7 @@ static VALUE initialize_model(int argc, VALUE *argv, VALUE self) {
 		options = rb_funcall(cOptions, rb_intern("new"), 0);
 	}
-	// yield self if block_given?
+	// yield options if block_given?
 	if (rb_block_given_p()) {
 	 	rb_yield(options);
 	}
@@ -691,6 +691,9 @@ static VALUE initialize_model(int argc, VALUE *argv, VALUE self) {
 	if (get_options(options)->model) {
 		rb_funcall(self, rb_intern("load"), 0);
 	}
+	// initialize counters
+	rb_funcall(self, rb_intern("clear_counters"), 0);
 	return self;
 }
@@ -958,63 +961,94 @@ static VALUE model_labels(VALUE self) {
 	return labels;
 }
-static VALUE decode_sequence(mdl_t *model, raw_t *raw) {
+static VALUE decode_sequence(VALUE self, mdl_t *model, raw_t *raw) {
 	qrk_t *lbls = model->reader->lbl;
-	const size_t Y = model->nlbl;
-	const size_t N = model->opt->nbest;
+	const unsigned int Y = model->nlbl;
+	const unsigned int N = model->opt->nbest;
 	seq_t *seq = rdr_raw2seq(model->reader, raw, model->opt->check);
-	const int T = seq->len;
+	const unsigned int T = seq->len;
+	unsigned int n, t, tcnt = 0, terr = 0, scnt = 0, serr = 0, stat[3][Y];
 	size_t *out = xmalloc(sizeof(size_t) * T * N);
 	double *psc = xmalloc(sizeof(double) * T * N);
 	double *scs = xmalloc(sizeof(double) * N);
-	VALUE result = rb_ary_new2(N), sequence, tokens;
+	VALUE sequence, tokens;
 	if (N == 1) {
 		tag_viterbi(model, seq, (size_t*)out, scs, (double*)psc);
 	}
 	else {
-		tag_nbviterbi(model, seq, N, (void*)out, scs, (void*)psc);
+		tag_nbviterbi(model, seq, N, (size_t*)out, scs, (double*)psc);
 	}
-	// Next we output the raw sequence with an aditional column for
-	// the predicted labels
-	for (size_t n = 0; n < N; n++) {
-		sequence = rb_ary_new();
-		// if (model->opt->outsc)
-			// fprintf(fout, "# %d %f\n", (int)n, scs[n]);
-		for (int t = 0; t < T; t++) {
-			tokens = rb_ary_new();
-			if (!model->opt->label) {
-				rb_ary_push(tokens, rb_str_new2(raw->lines[t]));
-			}
+	sequence = rb_ary_new();
+	for (t = 0; t < T; ++t) {
+		tokens = rb_ary_new();
+		if (!model->opt->label) {
+			rb_ary_push(tokens, rb_str_new2(raw->lines[t]));
+		}
+		for (n = 0; n < N; ++n) {
 			size_t lbl = out[t * N + n];
 			rb_ary_push(tokens, rb_str_new2(qrk_id2str(lbls, lbl)));
-			// if (model->opt->outsc) {
-			// 	fprintf(fout, "\t%s", lblstr);
-			// 	fprintf(fout, "/%f", psc[t * N + n]);
-			// }
-			// yield token/label pair to block if given
-			if (rb_block_given_p()) {
-				tokens = rb_yield(tokens);
+			// output individual score
+			if (model->opt->outsc) {
+				rb_ary_push(tokens, rb_float_new(psc[t * N + n]));
 			}
-			rb_ary_push(sequence, tokens);
+		}
+		// yield token/label pair to block if given
+		if (rb_block_given_p()) {
+			tokens = rb_yield(tokens);
 		}
-		rb_ary_push(result, sequence);
+		rb_ary_push(sequence, tokens);
+		// TODO output sequence score: scs[n] (float)
+	}
+	// Statistics
+	if (model->opt->check) {
+		int err = 0;
+		for (t = 0; t < T; ++t) {
+			stat[0][seq->pos[t].lbl]++;
+			stat[1][out[t * N]]++;
+			if (seq->pos[t].lbl != out[t * N]) {
+				terr++;
+				err = 1;
+			}
+			else {
+				stat[2][out[t * N]]++;
+			}
+		}
+		tcnt = FIX2INT(rb_ivar_get(self, rb_intern("@token_count")));
+		rb_ivar_set(self, rb_intern("@token_count"), INT2FIX(tcnt + (unsigned int)T));
+		terr += FIX2INT(rb_ivar_get(self, rb_intern("@token_errors")));
+		rb_ivar_set(self, rb_intern("@token_errors"), INT2FIX(terr));
+		scnt = FIX2INT(rb_ivar_get(self, rb_intern("@sequence_count")));
+		rb_ivar_set(self, rb_intern("@sequence_count"), INT2FIX(++scnt));
+		serr = FIX2INT(rb_ivar_get(self, rb_intern("@sequence_errors")));
+		rb_ivar_set(self, rb_intern("@sequence_errors"), INT2FIX(serr + err));
 	}
 	// Cleanup memory used for this sequence
 	xfree(scs);
@@ -1023,7 +1057,7 @@ static VALUE decode_sequence(mdl_t *model, raw_t *raw) {
 	rdr_freeseq(seq);
-	return result;
+	return sequence;
 }
 static VALUE decode_sequence_array(VALUE self, VALUE array) {
@@ -1053,7 +1087,7 @@ static VALUE decode_sequence_array(VALUE self, VALUE array) {
 			raw->lines[j] = StringValueCStr(line);
 		}
-		rb_funcall(result, rb_intern("concat"), 1, decode_sequence(model, raw));
+		rb_ary_push(result, decode_sequence(self, model, raw));
 		xfree(raw);
 	}
@@ -1085,7 +1119,7 @@ static VALUE decode_sequence_file(VALUE self, VALUE path) {
 			break;
 		}
-		rb_funcall(result, rb_intern("concat"), 1, decode_sequence(model, raw));
+		rb_ary_push(result, decode_sequence(self, model, raw));
 		rdr_freeraw(raw);
 	}
@@ -1093,8 +1127,8 @@ static VALUE decode_sequence_file(VALUE self, VALUE path) {
 }
 // call-seq:
-//   m.label(tokens)  # => array of labelled tokens
-//   m.label(filename) # => array of labelled tokens
+//   m.label(tokens, options = {})  # => array of labelled tokens
+//   m.label(filename, options = {}) # => array of labelled tokens
 //
 static VALUE model_label(VALUE self, VALUE data) {
 	VALUE result;
@@ -1120,6 +1154,7 @@ static void Init_model() {
 	rb_define_method(cModel, "initialize", initialize_model, -1);
 	rb_define_attr(cModel, "options", 1, 0);
 	rb_define_method(cModel, "nlbl", model_nlbl, 0);
 	rb_define_method(cModel, "labels", model_labels, 0);

data/lib/wapiti/model.rb CHANGED

@@ -30,6 +30,8 @@ module Wapiti
 		attr_accessor :path
+		attr_reader :token_count, :token_errors, :sequence_count, :sequence_errors
 		def pattern
 			options.pattern
 		end
@@ -38,6 +40,35 @@ module Wapiti
 			options.pattern = filename
 		end
+		alias native_label label
+		def label(input, opts = nil)
+			options.update(opts) unless opts.nil?
+			block_given? ? native_label(input, &Proc.new) : native_label(input)
+		end
+		def statistics
+			s = {}
+			s[:tokens] = {
+				:total => token_count, :errors => @token_errors,
+				:rate => token_errors / (token_count * 100.0)
+			}
+			s[:sequences] = {
+				:total => sequence_count, :errors => sequence_errors,
+				:rate => sequence_errors / (sequence_count * 100.0)
+			}
+			s
+		end
+		alias stats statistics
+		def clear_counters
+			@token_count = @token_errors = @sequence_count = @sequence_errors = 0
+		end
+		alias clear clear_counters
 		private
 		def tokenize(input)

data/lib/wapiti/options.rb CHANGED

@@ -10,7 +10,7 @@ module Wapiti
 			def attribute_names
 				@attribute_names ||= %w{ stop_window convergence_window posterior
 					max_iterations jobsize threads rho1 rho2 stop_epsilon score check
-					algorithm pattern development_data maxent compact sparse label
+					algorithm pattern development_data maxent compact sparse skip_tokens
 					}.sort.map(&:to_sym).freeze
 			end
@@ -103,7 +103,15 @@ module Wapiti
 			e << "BCD not supported for training maxent models" if maxent && algorithm == 'bcd'
 			e
 		end
+		%w{ maxent compact sparse label check score posterior }.each do |m|
+			writer = "#{m}=".to_sym
+			define_method("#{m}!") do
+				send(writer, true)
+				self
+			end
+		end
 		def <=>(other)
 			other.respond_to?(:attributes) ? attributes <=> other.attributes : nil
 		end

data/lib/wapiti/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Wapiti
-  VERSION = '0.0.2'.freeze
+  VERSION = '0.0.3'.freeze
 end

data/spec/wapiti/model_spec.rb CHANGED

@@ -143,6 +143,22 @@ module Wapiti
 						labels[0].map(&:last).should == %w{ b-np o b-np o }
 				  end
+					context 'with the :score option set' do
+						before(:each) { model.options.score! }
+						it 'returns an array of token-label-score tuples' do
+							model.label(input)[0].map { |t,l,s| s.class }.uniq == [Float]
+						end
+					end
+					context 'with the :nbest option set to 2' do
+						before(:each) { model.options.nbest = 2 }
+						it 'returns an array of token-label-label tuples' do
+							model.label(input)[0][-1][1,2] == %w{ O O }
+						end
+					end
 				end
@@ -155,7 +171,7 @@ module Wapiti
 						labels[0].take(5).map(&:last).should == %w{ B-NP B-PP B-NP I-NP B-VP }
 					end
 				end
 			end
 		end

data/spec/wapiti/options_spec.rb CHANGED

@@ -139,7 +139,7 @@ module Wapiti
 		end
-		%w{ maxent compact sparse label check score posterior }.each do |m|
+		%w{ maxent compact sparse skip_tokens check score posterior }.each do |m|
 			describe "##{m}" do
 				it 'returns false by default' do
 					options.send(m).should be false

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: wapiti
 version: !ruby/object:Gem::Version
-  version: 0.0.2
+  version: 0.0.3
   prerelease:
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-09-01 00:00:00.000000000Z
+date: 2011-09-02 00:00:00.000000000Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake
-  requirement: &2156429940 !ruby/object:Gem::Requirement
+  requirement: &2156033940 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
@@ -21,10 +21,10 @@ dependencies:
         version: '0.9'
   type: :development
   prerelease: false
-  version_requirements: *2156429940
+  version_requirements: *2156033940
 - !ruby/object:Gem::Dependency
   name: rake-compiler
-  requirement: &2156428740 !ruby/object:Gem::Requirement
+  requirement: &2156032240 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
@@ -32,10 +32,10 @@ dependencies:
         version: '0.7'
   type: :development
   prerelease: false
-  version_requirements: *2156428740
+  version_requirements: *2156032240
 - !ruby/object:Gem::Dependency
   name: ZenTest
-  requirement: &2156427300 !ruby/object:Gem::Requirement
+  requirement: &2156031260 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
@@ -43,10 +43,10 @@ dependencies:
         version: '4.6'
   type: :development
   prerelease: false
-  version_requirements: *2156427300
+  version_requirements: *2156031260
 - !ruby/object:Gem::Dependency
   name: rspec
-  requirement: &2156425920 !ruby/object:Gem::Requirement
+  requirement: &2156022680 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
@@ -54,7 +54,7 @@ dependencies:
         version: '2.6'
   type: :development
   prerelease: false
-  version_requirements: *2156425920
+  version_requirements: *2156022680
 description: This gem provides a Ruby API for Conditional Random Fields (CRF). It
   is implemented as a C extension and based on the wicked fast "wapiti" package.
 email: