wapiti 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -103,6 +103,42 @@ corresponding label:
103
103
  => [[["confidence nn", "b-np"], ["in in", "b-pp"], ["the dt", "b-np"],
104
104
  ["pound nn", "i-np"], [". .", "o"]]]
105
105
 
106
+ Note that if you set the *:score* option (either in the Model's `#options` or
107
+ when calling `#label`), the score for each label will be appended to
108
+ each token/label tuple as a floating point number or passed as a third
109
+ argument to the passed-in block.
110
+
111
+ model.label [['Confidence NN']], :score => true
112
+ => [[["Confidence NN", "B-NP", 4.642034838737357]]]
113
+
114
+ Similarly, if you set the *:nbest* option to a value greater than one, Wapiti
115
+ will append additional labels and, optionally, scores to each tuple.
116
+
117
+ model.label [['Confidence NN']], :score => true, :nbest => 3, :skip_tokens => true
118
+ => [[["B-NP", 4.642034838737357, "B-VP", 1.7040256847206927, "B-ADJP", 0.7636429298060177]]]
119
+
120
+ Note how we also suppressed the output of the token string using the
121
+ *:skip_tokens* option.
122
+
123
+
124
+ ### Statistics
125
+
126
+ By setting the *:check* option you can tell Wapiti to keep statistics during
127
+ the labelling phase (for the statistics to be meaningful you obviously need
128
+ to provide input data that is already labelled). Wapiti does not reset the
129
+ counters during consecutive calls to `#label` to allow you to collect
130
+ cumulative data; however, you can reset the counters at any time by calling
131
+ `#clear_counters`.
132
+
133
+ After calling `#label` with the *:check* option set and appropriately labelled
134
+ input, you can access the statistics via `#statistics` (the individual values
135
+ are also available through the associated attribute readers).
136
+
137
+ model.label 'test.txt', :check => true
138
+ => {:tokens=>{:total=>1896, :errors=>137, :rate=>0.0007225738396624472},
139
+ :sequences=>{:total=>77, :errors=>50, :rate=>0.006493506493506494}}
140
+
141
+
106
142
 
107
143
  Citing
108
144
  ------
@@ -522,10 +522,10 @@ void Init_options() {
522
522
 
523
523
  rb_define_alias(cOptions, "sparse?", "sparse");
524
524
 
525
- rb_define_method(cOptions, "label", options_label, 0);
526
- rb_define_method(cOptions, "label=", options_set_label, 1);
525
+ rb_define_method(cOptions, "skip_tokens", options_label, 0);
526
+ rb_define_method(cOptions, "skip_tokens=", options_set_label, 1);
527
527
 
528
- rb_define_alias(cOptions, "label?", "label");
528
+ rb_define_alias(cOptions, "skip_tokens?", "skip_tokens");
529
529
 
530
530
  rb_define_method(cOptions, "check", options_check, 0);
531
531
  rb_define_method(cOptions, "check=", options_set_check, 1);
@@ -680,7 +680,7 @@ static VALUE initialize_model(int argc, VALUE *argv, VALUE self) {
680
680
  options = rb_funcall(cOptions, rb_intern("new"), 0);
681
681
  }
682
682
 
683
- // yield self if block_given?
683
+ // yield options if block_given?
684
684
  if (rb_block_given_p()) {
685
685
  rb_yield(options);
686
686
  }
@@ -691,6 +691,9 @@ static VALUE initialize_model(int argc, VALUE *argv, VALUE self) {
691
691
  if (get_options(options)->model) {
692
692
  rb_funcall(self, rb_intern("load"), 0);
693
693
  }
694
+
695
+ // initialize counters
696
+ rb_funcall(self, rb_intern("clear_counters"), 0);
694
697
 
695
698
  return self;
696
699
  }
@@ -958,63 +961,94 @@ static VALUE model_labels(VALUE self) {
958
961
  return labels;
959
962
  }
960
963
 
961
- static VALUE decode_sequence(mdl_t *model, raw_t *raw) {
964
+ static VALUE decode_sequence(VALUE self, mdl_t *model, raw_t *raw) {
962
965
  qrk_t *lbls = model->reader->lbl;
963
966
 
964
- const size_t Y = model->nlbl;
965
- const size_t N = model->opt->nbest;
967
+ const unsigned int Y = model->nlbl;
968
+ const unsigned int N = model->opt->nbest;
966
969
 
967
970
  seq_t *seq = rdr_raw2seq(model->reader, raw, model->opt->check);
968
971
 
969
- const int T = seq->len;
972
+ const unsigned int T = seq->len;
973
+ unsigned int n, t, tcnt = 0, terr = 0, scnt = 0, serr = 0, stat[3][Y];
970
974
 
971
975
  size_t *out = xmalloc(sizeof(size_t) * T * N);
972
976
  double *psc = xmalloc(sizeof(double) * T * N);
973
977
  double *scs = xmalloc(sizeof(double) * N);
974
978
 
975
- VALUE result = rb_ary_new2(N), sequence, tokens;
979
+ VALUE sequence, tokens;
976
980
 
977
981
  if (N == 1) {
978
982
  tag_viterbi(model, seq, (size_t*)out, scs, (double*)psc);
979
983
  }
980
984
  else {
981
- tag_nbviterbi(model, seq, N, (void*)out, scs, (void*)psc);
985
+ tag_nbviterbi(model, seq, N, (size_t*)out, scs, (double*)psc);
982
986
  }
983
-
984
- // Next we output the raw sequence with an aditional column for
985
- // the predicted labels
986
- for (size_t n = 0; n < N; n++) {
987
987
 
988
- sequence = rb_ary_new();
989
-
990
- // if (model->opt->outsc)
991
- // fprintf(fout, "# %d %f\n", (int)n, scs[n]);
992
-
993
- for (int t = 0; t < T; t++) {
994
- tokens = rb_ary_new();
995
-
996
- if (!model->opt->label) {
997
- rb_ary_push(tokens, rb_str_new2(raw->lines[t]));
998
- }
988
+ sequence = rb_ary_new();
989
+
990
+ for (t = 0; t < T; ++t) {
991
+ tokens = rb_ary_new();
992
+
993
+ if (!model->opt->label) {
994
+ rb_ary_push(tokens, rb_str_new2(raw->lines[t]));
995
+ }
996
+
997
+ for (n = 0; n < N; ++n) {
999
998
 
1000
999
  size_t lbl = out[t * N + n];
1001
1000
  rb_ary_push(tokens, rb_str_new2(qrk_id2str(lbls, lbl)));
1002
1001
 
1003
- // if (model->opt->outsc) {
1004
- // fprintf(fout, "\t%s", lblstr);
1005
- // fprintf(fout, "/%f", psc[t * N + n]);
1006
- // }
1007
-
1008
- // yield token/label pair to block if given
1009
- if (rb_block_given_p()) {
1010
- tokens = rb_yield(tokens);
1002
+ // output individual score
1003
+ if (model->opt->outsc) {
1004
+ rb_ary_push(tokens, rb_float_new(psc[t * N + n]));
1011
1005
  }
1012
1006
 
1013
- rb_ary_push(sequence, tokens);
1007
+ }
1008
+
1009
+ // yield token/label pair to block if given
1010
+ if (rb_block_given_p()) {
1011
+ tokens = rb_yield(tokens);
1014
1012
  }
1015
1013
 
1016
- rb_ary_push(result, sequence);
1014
+ rb_ary_push(sequence, tokens);
1015
+
1016
+
1017
+ // TODO output sequence score: scs[n] (float)
1018
+
1019
+ }
1020
+
1021
+ // Statistics
1022
+ if (model->opt->check) {
1023
+ int err = 0;
1024
+
1025
+ for (t = 0; t < T; ++t) {
1026
+ stat[0][seq->pos[t].lbl]++;
1027
+ stat[1][out[t * N]]++;
1028
+
1029
+ if (seq->pos[t].lbl != out[t * N]) {
1030
+ terr++;
1031
+ err = 1;
1032
+ }
1033
+ else {
1034
+ stat[2][out[t * N]]++;
1035
+ }
1036
+ }
1037
+
1038
+ tcnt = FIX2INT(rb_ivar_get(self, rb_intern("@token_count")));
1039
+ rb_ivar_set(self, rb_intern("@token_count"), INT2FIX(tcnt + (unsigned int)T));
1040
+
1041
+ terr += FIX2INT(rb_ivar_get(self, rb_intern("@token_errors")));
1042
+ rb_ivar_set(self, rb_intern("@token_errors"), INT2FIX(terr));
1043
+
1044
+ scnt = FIX2INT(rb_ivar_get(self, rb_intern("@sequence_count")));
1045
+ rb_ivar_set(self, rb_intern("@sequence_count"), INT2FIX(++scnt));
1046
+
1047
+ serr = FIX2INT(rb_ivar_get(self, rb_intern("@sequence_errors")));
1048
+ rb_ivar_set(self, rb_intern("@sequence_errors"), INT2FIX(serr + err));
1049
+
1017
1050
  }
1051
+
1018
1052
 
1019
1053
  // Cleanup memory used for this sequence
1020
1054
  xfree(scs);
@@ -1023,7 +1057,7 @@ static VALUE decode_sequence(mdl_t *model, raw_t *raw) {
1023
1057
 
1024
1058
  rdr_freeseq(seq);
1025
1059
 
1026
- return result;
1060
+ return sequence;
1027
1061
  }
1028
1062
 
1029
1063
  static VALUE decode_sequence_array(VALUE self, VALUE array) {
@@ -1053,7 +1087,7 @@ static VALUE decode_sequence_array(VALUE self, VALUE array) {
1053
1087
  raw->lines[j] = StringValueCStr(line);
1054
1088
  }
1055
1089
 
1056
- rb_funcall(result, rb_intern("concat"), 1, decode_sequence(model, raw));
1090
+ rb_ary_push(result, decode_sequence(self, model, raw));
1057
1091
 
1058
1092
  xfree(raw);
1059
1093
  }
@@ -1085,7 +1119,7 @@ static VALUE decode_sequence_file(VALUE self, VALUE path) {
1085
1119
  break;
1086
1120
  }
1087
1121
 
1088
- rb_funcall(result, rb_intern("concat"), 1, decode_sequence(model, raw));
1122
+ rb_ary_push(result, decode_sequence(self, model, raw));
1089
1123
  rdr_freeraw(raw);
1090
1124
  }
1091
1125
 
@@ -1093,8 +1127,8 @@ static VALUE decode_sequence_file(VALUE self, VALUE path) {
1093
1127
  }
1094
1128
 
1095
1129
  // call-seq:
1096
- // m.label(tokens) # => array of labelled tokens
1097
- // m.label(filename) # => array of labelled tokens
1130
+ // m.label(tokens, options = {}) # => array of labelled tokens
1131
+ // m.label(filename, options = {}) # => array of labelled tokens
1098
1132
  //
1099
1133
  static VALUE model_label(VALUE self, VALUE data) {
1100
1134
  VALUE result;
@@ -1120,6 +1154,7 @@ static void Init_model() {
1120
1154
  rb_define_method(cModel, "initialize", initialize_model, -1);
1121
1155
 
1122
1156
  rb_define_attr(cModel, "options", 1, 0);
1157
+
1123
1158
 
1124
1159
  rb_define_method(cModel, "nlbl", model_nlbl, 0);
1125
1160
  rb_define_method(cModel, "labels", model_labels, 0);
@@ -30,6 +30,8 @@ module Wapiti
30
30
 
31
31
  attr_accessor :path
32
32
 
33
+ attr_reader :token_count, :token_errors, :sequence_count, :sequence_errors
34
+
33
35
  def pattern
34
36
  options.pattern
35
37
  end
@@ -38,6 +40,35 @@ module Wapiti
38
40
  options.pattern = filename
39
41
  end
40
42
 
43
+ alias native_label label
44
+
45
# Labels +input+ via +native_label+, first merging +opts+ (when given) into
# the model's options.
#
# input - handed through unchanged to +native_label+.
# opts  - optional settings passed to +options.update+; skipped when nil.
#
# A block given to this method is forwarded to +native_label+.
# Returns whatever +native_label+ returns.
def label(input, opts = nil, &block)
  options.update(opts) unless opts.nil?

  # Forward the block explicitly. A bare Proc.new no longer captures the
  # caller's block on Ruby 3.0+ (it raises ArgumentError), and passing
  # `&nil` is equivalent to passing no block at all.
  native_label(input, &block)
end
49
+
50
# Returns a Hash summarising the counters collected while labelling.
#
# The :tokens and :sequences entries each hold the :total count, the
# number of :errors and the error :rate as a percentage of the total.
def statistics
  # errors / total as a percentage. Yields 0.0 while nothing has been
  # counted so that freshly cleared counters do not produce NaN.
  percentage = lambda do |errors, total|
    total.zero? ? 0.0 : errors * 100.0 / total
  end

  {
    :tokens => {
      :total => token_count, :errors => token_errors,
      :rate => percentage.call(token_errors, token_count)
    },
    :sequences => {
      :total => sequence_count, :errors => sequence_errors,
      :rate => percentage.call(sequence_errors, sequence_count)
    }
  }
end
62
+
63
+ alias stats statistics
64
+
65
# Resets the four labelling counters (token and sequence totals and
# errors) to zero. Returns 0.
def clear_counters
  @sequence_errors = 0
  @sequence_count = 0
  @token_errors = 0
  @token_count = 0
end
68
+
69
+ alias clear clear_counters
70
+
71
+
41
72
  private
42
73
 
43
74
  def tokenize(input)
@@ -10,7 +10,7 @@ module Wapiti
10
10
  def attribute_names
11
11
  @attribute_names ||= %w{ stop_window convergence_window posterior
12
12
  max_iterations jobsize threads rho1 rho2 stop_epsilon score check
13
- algorithm pattern development_data maxent compact sparse label
13
+ algorithm pattern development_data maxent compact sparse skip_tokens
14
14
  }.sort.map(&:to_sym).freeze
15
15
  end
16
16
 
@@ -103,7 +103,15 @@ module Wapiti
103
103
  e << "BCD not supported for training maxent models" if maxent && algorithm == 'bcd'
104
104
  e
105
105
  end
106
-
106
+
107
# Defines a chainable bang method for each flag option: e.g. +score!+
# sends +score=+ with true and returns the receiver.
#
# The list uses +skip_tokens+ (not the former +label+) so that it matches
# the accessor names actually defined on the options object.
%w{ maxent compact sparse skip_tokens check score posterior }.each do |m|
  writer = "#{m}=".to_sym
  define_method("#{m}!") do
    send(writer, true)
    self
  end
end
114
+
107
115
  def <=>(other)
108
116
  other.respond_to?(:attributes) ? attributes <=> other.attributes : nil
109
117
  end
@@ -1,3 +1,3 @@
1
1
  module Wapiti
2
- VERSION = '0.0.2'.freeze
2
+ VERSION = '0.0.3'.freeze
3
3
  end
@@ -143,6 +143,22 @@ module Wapiti
143
143
  labels[0].map(&:last).should == %w{ b-np o b-np o }
144
144
  end
145
145
 
146
context 'with the :score option set' do
  before(:each) { model.options.score! }

  it 'returns an array of token-label-score tuples' do
    # A bare `==` discards its result, so the example could never fail;
    # `.should` turns the comparison into a real expectation.
    model.label(input)[0].map { |t,l,s| s.class }.uniq.should == [Float]
  end
end
153
+
154
context 'with the :nbest option set to 2' do
  before(:each) { model.options.nbest = 2 }

  it 'returns an array of token-label-label tuples' do
    # A bare `==` discards its result, so the example could never fail;
    # `.should` turns the comparison into a real expectation.
    model.label(input)[0][-1][1,2].should == %w{ O O }
  end
end
161
+
146
162
  end
147
163
 
148
164
 
@@ -155,7 +171,7 @@ module Wapiti
155
171
  labels[0].take(5).map(&:last).should == %w{ B-NP B-PP B-NP I-NP B-VP }
156
172
  end
157
173
  end
158
-
174
+
159
175
  end
160
176
 
161
177
  end
@@ -139,7 +139,7 @@ module Wapiti
139
139
  end
140
140
 
141
141
 
142
- %w{ maxent compact sparse label check score posterior }.each do |m|
142
+ %w{ maxent compact sparse skip_tokens check score posterior }.each do |m|
143
143
  describe "##{m}" do
144
144
  it 'returns false by default' do
145
145
  options.send(m).should be false
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wapiti
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2011-09-01 00:00:00.000000000Z
12
+ date: 2011-09-02 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rake
16
- requirement: &2156429940 !ruby/object:Gem::Requirement
16
+ requirement: &2156033940 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0.9'
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *2156429940
24
+ version_requirements: *2156033940
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: rake-compiler
27
- requirement: &2156428740 !ruby/object:Gem::Requirement
27
+ requirement: &2156032240 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0.7'
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *2156428740
35
+ version_requirements: *2156032240
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: ZenTest
38
- requirement: &2156427300 !ruby/object:Gem::Requirement
38
+ requirement: &2156031260 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '4.6'
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *2156427300
46
+ version_requirements: *2156031260
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: rspec
49
- requirement: &2156425920 !ruby/object:Gem::Requirement
49
+ requirement: &2156022680 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ~>
@@ -54,7 +54,7 @@ dependencies:
54
54
  version: '2.6'
55
55
  type: :development
56
56
  prerelease: false
57
- version_requirements: *2156425920
57
+ version_requirements: *2156022680
58
58
  description: This gem provides a Ruby API for Conditional Random Fields (CRF). It
59
59
  is implemented as a C extension and based on the wicked fast "wapiti" package.
60
60
  email: