wapiti 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -103,6 +103,42 @@ corresponding label:
103
103
  => [[["confidence nn", "b-np"], ["in in", "b-pp"], ["the dt", "b-np"],
104
104
  ["pound nn", "i-np"], [". .", "o"]]]
105
105
 
106
+ Note that if you set the *:score* option (either in the Model's `#options` or
107
+ when calling `#label`), the score for each label will be appended to
108
+ each token/label tuple as a floating point number or passed as a third
109
+ argument to the passed-in block.
110
+
111
+ model.label [['Confidence NN']], :score => true
112
+ => [[["Confidence NN", "B-NP", 4.642034838737357]]]
113
+
114
+ Similarly, if you set the *:nbest* option to a value greater than one, Wapiti
115
+ will append additional labels and, optionally, their scores to each tuple.
116
+
117
+ model.label [['Confidence NN']], :score => true, :nbest => 3, :skip_tokens => true
118
+ => [[["B-NP", 4.642034838737357, "B-VP", 1.7040256847206927, "B-ADJP", 0.7636429298060177]]]
119
+
120
+ Note how we also suppressed the output of the token string using the
121
+ *:skip_tokens* option.
122
+
123
+
124
+ ### Statistics
125
+
126
+ By setting the *:check* option you can tell Wapiti to keep statistics during
127
+ the labelling phase (for the statistics to be meaningful you obviously need
128
+ to provide input data that is already labelled). Wapiti does not reset the
129
+ counters during consecutive calls to `#label` to allow you to collect
130
+ cumulative data; however, you can reset the counters at any time by calling
131
+ `#clear_counters`.
132
+
133
+ After calling `#label` with the *:check* option set and appropriately labelled
134
+ input, you can access the statistics via `#statistics` (the individual values
135
+ are also available through the associated attribute readers).
136
+
137
+ model.label 'test.txt', :check => true
138
+ => {:tokens=>{:total=>1896, :errors=>137, :rate=>0.0007225738396624472},
139
+ :sequences=>{:total=>77, :errors=>50, :rate=>0.006493506493506494}}
140
+
141
+
106
142
 
107
143
  Citing
108
144
  ------
@@ -522,10 +522,10 @@ void Init_options() {
522
522
 
523
523
  rb_define_alias(cOptions, "sparse?", "sparse");
524
524
 
525
- rb_define_method(cOptions, "label", options_label, 0);
526
- rb_define_method(cOptions, "label=", options_set_label, 1);
525
+ rb_define_method(cOptions, "skip_tokens", options_label, 0);
526
+ rb_define_method(cOptions, "skip_tokens=", options_set_label, 1);
527
527
 
528
- rb_define_alias(cOptions, "label?", "label");
528
+ rb_define_alias(cOptions, "skip_tokens?", "skip_tokens");
529
529
 
530
530
  rb_define_method(cOptions, "check", options_check, 0);
531
531
  rb_define_method(cOptions, "check=", options_set_check, 1);
@@ -680,7 +680,7 @@ static VALUE initialize_model(int argc, VALUE *argv, VALUE self) {
680
680
  options = rb_funcall(cOptions, rb_intern("new"), 0);
681
681
  }
682
682
 
683
- // yield self if block_given?
683
+ // yield options if block_given?
684
684
  if (rb_block_given_p()) {
685
685
  rb_yield(options);
686
686
  }
@@ -691,6 +691,9 @@ static VALUE initialize_model(int argc, VALUE *argv, VALUE self) {
691
691
  if (get_options(options)->model) {
692
692
  rb_funcall(self, rb_intern("load"), 0);
693
693
  }
694
+
695
+ // initialize counters
696
+ rb_funcall(self, rb_intern("clear_counters"), 0);
694
697
 
695
698
  return self;
696
699
  }
@@ -958,63 +961,94 @@ static VALUE model_labels(VALUE self) {
958
961
  return labels;
959
962
  }
960
963
 
961
- static VALUE decode_sequence(mdl_t *model, raw_t *raw) {
964
+ static VALUE decode_sequence(VALUE self, mdl_t *model, raw_t *raw) {
962
965
  qrk_t *lbls = model->reader->lbl;
963
966
 
964
- const size_t Y = model->nlbl;
965
- const size_t N = model->opt->nbest;
967
+ const unsigned int Y = model->nlbl;
968
+ const unsigned int N = model->opt->nbest;
966
969
 
967
970
  seq_t *seq = rdr_raw2seq(model->reader, raw, model->opt->check);
968
971
 
969
- const int T = seq->len;
972
+ const unsigned int T = seq->len;
973
+ unsigned int n, t, tcnt = 0, terr = 0, scnt = 0, serr = 0, stat[3][Y];
970
974
 
971
975
  size_t *out = xmalloc(sizeof(size_t) * T * N);
972
976
  double *psc = xmalloc(sizeof(double) * T * N);
973
977
  double *scs = xmalloc(sizeof(double) * N);
974
978
 
975
- VALUE result = rb_ary_new2(N), sequence, tokens;
979
+ VALUE sequence, tokens;
976
980
 
977
981
  if (N == 1) {
978
982
  tag_viterbi(model, seq, (size_t*)out, scs, (double*)psc);
979
983
  }
980
984
  else {
981
- tag_nbviterbi(model, seq, N, (void*)out, scs, (void*)psc);
985
+ tag_nbviterbi(model, seq, N, (size_t*)out, scs, (double*)psc);
982
986
  }
983
-
984
- // Next we output the raw sequence with an aditional column for
985
- // the predicted labels
986
- for (size_t n = 0; n < N; n++) {
987
987
 
988
- sequence = rb_ary_new();
989
-
990
- // if (model->opt->outsc)
991
- // fprintf(fout, "# %d %f\n", (int)n, scs[n]);
992
-
993
- for (int t = 0; t < T; t++) {
994
- tokens = rb_ary_new();
995
-
996
- if (!model->opt->label) {
997
- rb_ary_push(tokens, rb_str_new2(raw->lines[t]));
998
- }
988
+ sequence = rb_ary_new();
989
+
990
+ for (t = 0; t < T; ++t) {
991
+ tokens = rb_ary_new();
992
+
993
+ if (!model->opt->label) {
994
+ rb_ary_push(tokens, rb_str_new2(raw->lines[t]));
995
+ }
996
+
997
+ for (n = 0; n < N; ++n) {
999
998
 
1000
999
  size_t lbl = out[t * N + n];
1001
1000
  rb_ary_push(tokens, rb_str_new2(qrk_id2str(lbls, lbl)));
1002
1001
 
1003
- // if (model->opt->outsc) {
1004
- // fprintf(fout, "\t%s", lblstr);
1005
- // fprintf(fout, "/%f", psc[t * N + n]);
1006
- // }
1007
-
1008
- // yield token/label pair to block if given
1009
- if (rb_block_given_p()) {
1010
- tokens = rb_yield(tokens);
1002
+ // output individual score
1003
+ if (model->opt->outsc) {
1004
+ rb_ary_push(tokens, rb_float_new(psc[t * N + n]));
1011
1005
  }
1012
1006
 
1013
- rb_ary_push(sequence, tokens);
1007
+ }
1008
+
1009
+ // yield token/label pair to block if given
1010
+ if (rb_block_given_p()) {
1011
+ tokens = rb_yield(tokens);
1014
1012
  }
1015
1013
 
1016
- rb_ary_push(result, sequence);
1014
+ rb_ary_push(sequence, tokens);
1015
+
1016
+
1017
+ // TODO output sequence score: scs[n] (float)
1018
+
1019
+ }
1020
+
1021
+ // Statistics
1022
+ if (model->opt->check) {
1023
+ int err = 0;
1024
+
1025
+ for (t = 0; t < T; ++t) {
1026
+ stat[0][seq->pos[t].lbl]++;
1027
+ stat[1][out[t * N]]++;
1028
+
1029
+ if (seq->pos[t].lbl != out[t * N]) {
1030
+ terr++;
1031
+ err = 1;
1032
+ }
1033
+ else {
1034
+ stat[2][out[t * N]]++;
1035
+ }
1036
+ }
1037
+
1038
+ tcnt = FIX2INT(rb_ivar_get(self, rb_intern("@token_count")));
1039
+ rb_ivar_set(self, rb_intern("@token_count"), INT2FIX(tcnt + (unsigned int)T));
1040
+
1041
+ terr += FIX2INT(rb_ivar_get(self, rb_intern("@token_errors")));
1042
+ rb_ivar_set(self, rb_intern("@token_errors"), INT2FIX(terr));
1043
+
1044
+ scnt = FIX2INT(rb_ivar_get(self, rb_intern("@sequence_count")));
1045
+ rb_ivar_set(self, rb_intern("@sequence_count"), INT2FIX(++scnt));
1046
+
1047
+ serr = FIX2INT(rb_ivar_get(self, rb_intern("@sequence_errors")));
1048
+ rb_ivar_set(self, rb_intern("@sequence_errors"), INT2FIX(serr + err));
1049
+
1017
1050
  }
1051
+
1018
1052
 
1019
1053
  // Cleanup memory used for this sequence
1020
1054
  xfree(scs);
@@ -1023,7 +1057,7 @@ static VALUE decode_sequence(mdl_t *model, raw_t *raw) {
1023
1057
 
1024
1058
  rdr_freeseq(seq);
1025
1059
 
1026
- return result;
1060
+ return sequence;
1027
1061
  }
1028
1062
 
1029
1063
  static VALUE decode_sequence_array(VALUE self, VALUE array) {
@@ -1053,7 +1087,7 @@ static VALUE decode_sequence_array(VALUE self, VALUE array) {
1053
1087
  raw->lines[j] = StringValueCStr(line);
1054
1088
  }
1055
1089
 
1056
- rb_funcall(result, rb_intern("concat"), 1, decode_sequence(model, raw));
1090
+ rb_ary_push(result, decode_sequence(self, model, raw));
1057
1091
 
1058
1092
  xfree(raw);
1059
1093
  }
@@ -1085,7 +1119,7 @@ static VALUE decode_sequence_file(VALUE self, VALUE path) {
1085
1119
  break;
1086
1120
  }
1087
1121
 
1088
- rb_funcall(result, rb_intern("concat"), 1, decode_sequence(model, raw));
1122
+ rb_ary_push(result, decode_sequence(self, model, raw));
1089
1123
  rdr_freeraw(raw);
1090
1124
  }
1091
1125
 
@@ -1093,8 +1127,8 @@ static VALUE decode_sequence_file(VALUE self, VALUE path) {
1093
1127
  }
1094
1128
 
1095
1129
  // call-seq:
1096
- // m.label(tokens) # => array of labelled tokens
1097
- // m.label(filename) # => array of labelled tokens
1130
+ // m.label(tokens, options = {}) # => array of labelled tokens
1131
+ // m.label(filename, options = {}) # => array of labelled tokens
1098
1132
  //
1099
1133
  static VALUE model_label(VALUE self, VALUE data) {
1100
1134
  VALUE result;
@@ -1120,6 +1154,7 @@ static void Init_model() {
1120
1154
  rb_define_method(cModel, "initialize", initialize_model, -1);
1121
1155
 
1122
1156
  rb_define_attr(cModel, "options", 1, 0);
1157
+
1123
1158
 
1124
1159
  rb_define_method(cModel, "nlbl", model_nlbl, 0);
1125
1160
  rb_define_method(cModel, "labels", model_labels, 0);
@@ -30,6 +30,8 @@ module Wapiti
30
30
 
31
31
  attr_accessor :path
32
32
 
33
+ attr_reader :token_count, :token_errors, :sequence_count, :sequence_errors
34
+
33
35
  def pattern
34
36
  options.pattern
35
37
  end
@@ -38,6 +40,35 @@ module Wapiti
38
40
  options.pattern = filename
39
41
  end
40
42
 
43
+ alias native_label label
44
+
45
+ def label(input, opts = nil)
46
+ options.update(opts) unless opts.nil?
47
+ block_given? ? native_label(input, &Proc.new) : native_label(input)
48
+ end
49
+
50
+ def statistics
51
+ s = {}
52
+ s[:tokens] = {
53
+ :total => token_count, :errors => @token_errors,
54
+ :rate => token_errors / (token_count * 100.0)
55
+ }
56
+ s[:sequences] = {
57
+ :total => sequence_count, :errors => sequence_errors,
58
+ :rate => sequence_errors / (sequence_count * 100.0)
59
+ }
60
+ s
61
+ end
62
+
63
+ alias stats statistics
64
+
65
+ def clear_counters
66
+ @token_count = @token_errors = @sequence_count = @sequence_errors = 0
67
+ end
68
+
69
+ alias clear clear_counters
70
+
71
+
41
72
  private
42
73
 
43
74
  def tokenize(input)
@@ -10,7 +10,7 @@ module Wapiti
10
10
  def attribute_names
11
11
  @attribute_names ||= %w{ stop_window convergence_window posterior
12
12
  max_iterations jobsize threads rho1 rho2 stop_epsilon score check
13
- algorithm pattern development_data maxent compact sparse label
13
+ algorithm pattern development_data maxent compact sparse skip_tokens
14
14
  }.sort.map(&:to_sym).freeze
15
15
  end
16
16
 
@@ -103,7 +103,15 @@ module Wapiti
103
103
  e << "BCD not supported for training maxent models" if maxent && algorithm == 'bcd'
104
104
  e
105
105
  end
106
-
106
+
107
+ %w{ maxent compact sparse label check score posterior }.each do |m|
108
+ writer = "#{m}=".to_sym
109
+ define_method("#{m}!") do
110
+ send(writer, true)
111
+ self
112
+ end
113
+ end
114
+
107
115
  def <=>(other)
108
116
  other.respond_to?(:attributes) ? attributes <=> other.attributes : nil
109
117
  end
@@ -1,3 +1,3 @@
1
1
  module Wapiti
2
- VERSION = '0.0.2'.freeze
2
+ VERSION = '0.0.3'.freeze
3
3
  end
@@ -143,6 +143,22 @@ module Wapiti
143
143
  labels[0].map(&:last).should == %w{ b-np o b-np o }
144
144
  end
145
145
 
146
+ context 'with the :score option set' do
147
+ before(:each) { model.options.score! }
148
+
149
+ it 'returns an array of token-label-score tuples' do
150
+ model.label(input)[0].map { |t,l,s| s.class }.uniq == [Float]
151
+ end
152
+ end
153
+
154
+ context 'with the :nbest option set to 2' do
155
+ before(:each) { model.options.nbest = 2 }
156
+
157
+ it 'returns an array of token-label-label tuples' do
158
+ model.label(input)[0][-1][1,2] == %w{ O O }
159
+ end
160
+ end
161
+
146
162
  end
147
163
 
148
164
 
@@ -155,7 +171,7 @@ module Wapiti
155
171
  labels[0].take(5).map(&:last).should == %w{ B-NP B-PP B-NP I-NP B-VP }
156
172
  end
157
173
  end
158
-
174
+
159
175
  end
160
176
 
161
177
  end
@@ -139,7 +139,7 @@ module Wapiti
139
139
  end
140
140
 
141
141
 
142
- %w{ maxent compact sparse label check score posterior }.each do |m|
142
+ %w{ maxent compact sparse skip_tokens check score posterior }.each do |m|
143
143
  describe "##{m}" do
144
144
  it 'returns false by default' do
145
145
  options.send(m).should be false
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wapiti
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2011-09-01 00:00:00.000000000Z
12
+ date: 2011-09-02 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rake
16
- requirement: &2156429940 !ruby/object:Gem::Requirement
16
+ requirement: &2156033940 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0.9'
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *2156429940
24
+ version_requirements: *2156033940
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: rake-compiler
27
- requirement: &2156428740 !ruby/object:Gem::Requirement
27
+ requirement: &2156032240 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0.7'
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *2156428740
35
+ version_requirements: *2156032240
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: ZenTest
38
- requirement: &2156427300 !ruby/object:Gem::Requirement
38
+ requirement: &2156031260 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '4.6'
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *2156427300
46
+ version_requirements: *2156031260
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: rspec
49
- requirement: &2156425920 !ruby/object:Gem::Requirement
49
+ requirement: &2156022680 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ~>
@@ -54,7 +54,7 @@ dependencies:
54
54
  version: '2.6'
55
55
  type: :development
56
56
  prerelease: false
57
- version_requirements: *2156425920
57
+ version_requirements: *2156022680
58
58
  description: This gem provides a Ruby API for Conditional Random Fields (CRF). It
59
59
  is implemented as a C extension and based on the wicked fast "wapiti" package.
60
60
  email: