wapiti 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +36 -0
- data/ext/wapiti/native.c +75 -40
- data/lib/wapiti/model.rb +31 -0
- data/lib/wapiti/options.rb +10 -2
- data/lib/wapiti/version.rb +1 -1
- data/spec/wapiti/model_spec.rb +17 -1
- data/spec/wapiti/options_spec.rb +1 -1
- metadata +10 -10
data/README.md
CHANGED
@@ -103,6 +103,42 @@ corresponding label:
|
|
103
103
|
=> [[["confidence nn", "b-np"], ["in in", "b-pp"], ["the dt", "b-np"],
|
104
104
|
["pound nn", "i-np"], [". .", "o"]]]
|
105
105
|
|
106
|
+
Note that if you set the *:score* option (either in the Model's `#options` or
|
107
|
+
when calling `#label`), the score for each label will be appended to
|
108
|
+
each token/label tuple as a floating point number or passed as a third
|
109
|
+
argument to the passed-in block.
|
110
|
+
|
111
|
+
model.label [['Confidence NN']], :score => true
|
112
|
+
=> [[["Confidence NN", "B-NP", 4.642034838737357]]]
|
113
|
+
|
114
|
+
Similarly, if you set the *:nbest* option to a value greater than one, Wapiti
|
115
|
+
will append additional labels and, optionally, their scores to each tuple.
|
116
|
+
|
117
|
+
model.label [['Confidence NN']], :score => true, :nbest => 3, :skip_tokens => true
|
118
|
+
=> [[["B-NP", 4.642034838737357, "B-VP", 1.7040256847206927, "B-ADJP", 0.7636429298060177]]]
|
119
|
+
|
120
|
+
Note how we also suppressed the output of the token string using the
|
121
|
+
*:skip_tokens* option.
|
122
|
+
|
123
|
+
|
124
|
+
### Statistics
|
125
|
+
|
126
|
+
By setting the *:check* option you can tell Wapiti to keep statistics during
|
127
|
+
the labelling phase (for the statistics to be meaningful you obviously need
|
128
|
+
to provide input data that is already labelled). Wapiti does not reset the
|
129
|
+
counters during consecutive calls to `#label` to allow you to collect
|
130
|
+
cumulative data; however, you can reset the counters at any time by calling
|
131
|
+
`#clear_counters`.
|
132
|
+
|
133
|
+
After calling `#label` with the *:check* option set and appropriately labelled
|
134
|
+
input, you can access the statistics via `#statistics` (the individual values
|
135
|
+
are also available through the associated attribute readers).
|
136
|
+
|
137
|
+
model.label 'test.txt', :check => true
|
138
|
+
=> {:tokens=>{:total=>1896, :errors=>137, :rate=>0.0007225738396624472},
|
139
|
+
:sequences=>{:total=>77, :errors=>50, :rate=>0.006493506493506494}}
|
140
|
+
|
141
|
+
|
106
142
|
|
107
143
|
Citing
|
108
144
|
------
|
data/ext/wapiti/native.c
CHANGED
@@ -522,10 +522,10 @@ void Init_options() {
|
|
522
522
|
|
523
523
|
rb_define_alias(cOptions, "sparse?", "sparse");
|
524
524
|
|
525
|
-
rb_define_method(cOptions, "
|
526
|
-
rb_define_method(cOptions, "
|
525
|
+
rb_define_method(cOptions, "skip_tokens", options_label, 0);
|
526
|
+
rb_define_method(cOptions, "skip_tokens=", options_set_label, 1);
|
527
527
|
|
528
|
-
rb_define_alias(cOptions, "
|
528
|
+
rb_define_alias(cOptions, "skip_tokens?", "skip_tokens");
|
529
529
|
|
530
530
|
rb_define_method(cOptions, "check", options_check, 0);
|
531
531
|
rb_define_method(cOptions, "check=", options_set_check, 1);
|
@@ -680,7 +680,7 @@ static VALUE initialize_model(int argc, VALUE *argv, VALUE self) {
|
|
680
680
|
options = rb_funcall(cOptions, rb_intern("new"), 0);
|
681
681
|
}
|
682
682
|
|
683
|
-
// yield
|
683
|
+
// yield options if block_given?
|
684
684
|
if (rb_block_given_p()) {
|
685
685
|
rb_yield(options);
|
686
686
|
}
|
@@ -691,6 +691,9 @@ static VALUE initialize_model(int argc, VALUE *argv, VALUE self) {
|
|
691
691
|
if (get_options(options)->model) {
|
692
692
|
rb_funcall(self, rb_intern("load"), 0);
|
693
693
|
}
|
694
|
+
|
695
|
+
// initialize counters
|
696
|
+
rb_funcall(self, rb_intern("clear_counters"), 0);
|
694
697
|
|
695
698
|
return self;
|
696
699
|
}
|
@@ -958,63 +961,94 @@ static VALUE model_labels(VALUE self) {
|
|
958
961
|
return labels;
|
959
962
|
}
|
960
963
|
|
961
|
-
static VALUE decode_sequence(mdl_t *model, raw_t *raw) {
|
964
|
+
static VALUE decode_sequence(VALUE self, mdl_t *model, raw_t *raw) {
|
962
965
|
qrk_t *lbls = model->reader->lbl;
|
963
966
|
|
964
|
-
const
|
965
|
-
const
|
967
|
+
const unsigned int Y = model->nlbl;
|
968
|
+
const unsigned int N = model->opt->nbest;
|
966
969
|
|
967
970
|
seq_t *seq = rdr_raw2seq(model->reader, raw, model->opt->check);
|
968
971
|
|
969
|
-
const int T = seq->len;
|
972
|
+
const unsigned int T = seq->len;
|
973
|
+
unsigned int n, t, tcnt = 0, terr = 0, scnt = 0, serr = 0, stat[3][Y];
|
970
974
|
|
971
975
|
size_t *out = xmalloc(sizeof(size_t) * T * N);
|
972
976
|
double *psc = xmalloc(sizeof(double) * T * N);
|
973
977
|
double *scs = xmalloc(sizeof(double) * N);
|
974
978
|
|
975
|
-
VALUE
|
979
|
+
VALUE sequence, tokens;
|
976
980
|
|
977
981
|
if (N == 1) {
|
978
982
|
tag_viterbi(model, seq, (size_t*)out, scs, (double*)psc);
|
979
983
|
}
|
980
984
|
else {
|
981
|
-
tag_nbviterbi(model, seq, N, (
|
985
|
+
tag_nbviterbi(model, seq, N, (size_t*)out, scs, (double*)psc);
|
982
986
|
}
|
983
|
-
|
984
|
-
// Next we output the raw sequence with an aditional column for
|
985
|
-
// the predicted labels
|
986
|
-
for (size_t n = 0; n < N; n++) {
|
987
987
|
|
988
|
-
|
989
|
-
|
990
|
-
|
991
|
-
|
992
|
-
|
993
|
-
|
994
|
-
tokens
|
995
|
-
|
996
|
-
|
997
|
-
|
998
|
-
}
|
988
|
+
sequence = rb_ary_new();
|
989
|
+
|
990
|
+
for (t = 0; t < T; ++t) {
|
991
|
+
tokens = rb_ary_new();
|
992
|
+
|
993
|
+
if (!model->opt->label) {
|
994
|
+
rb_ary_push(tokens, rb_str_new2(raw->lines[t]));
|
995
|
+
}
|
996
|
+
|
997
|
+
for (n = 0; n < N; ++n) {
|
999
998
|
|
1000
999
|
size_t lbl = out[t * N + n];
|
1001
1000
|
rb_ary_push(tokens, rb_str_new2(qrk_id2str(lbls, lbl)));
|
1002
1001
|
|
1003
|
-
//
|
1004
|
-
|
1005
|
-
|
1006
|
-
// }
|
1007
|
-
|
1008
|
-
// yield token/label pair to block if given
|
1009
|
-
if (rb_block_given_p()) {
|
1010
|
-
tokens = rb_yield(tokens);
|
1002
|
+
// output individual score
|
1003
|
+
if (model->opt->outsc) {
|
1004
|
+
rb_ary_push(tokens, rb_float_new(psc[t * N + n]));
|
1011
1005
|
}
|
1012
1006
|
|
1013
|
-
|
1007
|
+
}
|
1008
|
+
|
1009
|
+
// yield token/label pair to block if given
|
1010
|
+
if (rb_block_given_p()) {
|
1011
|
+
tokens = rb_yield(tokens);
|
1014
1012
|
}
|
1015
1013
|
|
1016
|
-
rb_ary_push(
|
1014
|
+
rb_ary_push(sequence, tokens);
|
1015
|
+
|
1016
|
+
|
1017
|
+
// TODO output sequence score: scs[n] (float)
|
1018
|
+
|
1019
|
+
}
|
1020
|
+
|
1021
|
+
// Statistics
|
1022
|
+
if (model->opt->check) {
|
1023
|
+
int err = 0;
|
1024
|
+
|
1025
|
+
for (t = 0; t < T; ++t) {
|
1026
|
+
stat[0][seq->pos[t].lbl]++;
|
1027
|
+
stat[1][out[t * N]]++;
|
1028
|
+
|
1029
|
+
if (seq->pos[t].lbl != out[t * N]) {
|
1030
|
+
terr++;
|
1031
|
+
err = 1;
|
1032
|
+
}
|
1033
|
+
else {
|
1034
|
+
stat[2][out[t * N]]++;
|
1035
|
+
}
|
1036
|
+
}
|
1037
|
+
|
1038
|
+
tcnt = FIX2INT(rb_ivar_get(self, rb_intern("@token_count")));
|
1039
|
+
rb_ivar_set(self, rb_intern("@token_count"), INT2FIX(tcnt + (unsigned int)T));
|
1040
|
+
|
1041
|
+
terr += FIX2INT(rb_ivar_get(self, rb_intern("@token_errors")));
|
1042
|
+
rb_ivar_set(self, rb_intern("@token_errors"), INT2FIX(terr));
|
1043
|
+
|
1044
|
+
scnt = FIX2INT(rb_ivar_get(self, rb_intern("@sequence_count")));
|
1045
|
+
rb_ivar_set(self, rb_intern("@sequence_count"), INT2FIX(++scnt));
|
1046
|
+
|
1047
|
+
serr = FIX2INT(rb_ivar_get(self, rb_intern("@sequence_errors")));
|
1048
|
+
rb_ivar_set(self, rb_intern("@sequence_errors"), INT2FIX(serr + err));
|
1049
|
+
|
1017
1050
|
}
|
1051
|
+
|
1018
1052
|
|
1019
1053
|
// Cleanup memory used for this sequence
|
1020
1054
|
xfree(scs);
|
@@ -1023,7 +1057,7 @@ static VALUE decode_sequence(mdl_t *model, raw_t *raw) {
|
|
1023
1057
|
|
1024
1058
|
rdr_freeseq(seq);
|
1025
1059
|
|
1026
|
-
return
|
1060
|
+
return sequence;
|
1027
1061
|
}
|
1028
1062
|
|
1029
1063
|
static VALUE decode_sequence_array(VALUE self, VALUE array) {
|
@@ -1053,7 +1087,7 @@ static VALUE decode_sequence_array(VALUE self, VALUE array) {
|
|
1053
1087
|
raw->lines[j] = StringValueCStr(line);
|
1054
1088
|
}
|
1055
1089
|
|
1056
|
-
|
1090
|
+
rb_ary_push(result, decode_sequence(self, model, raw));
|
1057
1091
|
|
1058
1092
|
xfree(raw);
|
1059
1093
|
}
|
@@ -1085,7 +1119,7 @@ static VALUE decode_sequence_file(VALUE self, VALUE path) {
|
|
1085
1119
|
break;
|
1086
1120
|
}
|
1087
1121
|
|
1088
|
-
|
1122
|
+
rb_ary_push(result, decode_sequence(self, model, raw));
|
1089
1123
|
rdr_freeraw(raw);
|
1090
1124
|
}
|
1091
1125
|
|
@@ -1093,8 +1127,8 @@ static VALUE decode_sequence_file(VALUE self, VALUE path) {
|
|
1093
1127
|
}
|
1094
1128
|
|
1095
1129
|
// call-seq:
|
1096
|
-
// m.label(tokens) # => array of labelled tokens
|
1097
|
-
// m.label(filename) # => array of labelled tokens
|
1130
|
+
// m.label(tokens, options = {}) # => array of labelled tokens
|
1131
|
+
// m.label(filename, options = {}) # => array of labelled tokens
|
1098
1132
|
//
|
1099
1133
|
static VALUE model_label(VALUE self, VALUE data) {
|
1100
1134
|
VALUE result;
|
@@ -1120,6 +1154,7 @@ static void Init_model() {
|
|
1120
1154
|
rb_define_method(cModel, "initialize", initialize_model, -1);
|
1121
1155
|
|
1122
1156
|
rb_define_attr(cModel, "options", 1, 0);
|
1157
|
+
|
1123
1158
|
|
1124
1159
|
rb_define_method(cModel, "nlbl", model_nlbl, 0);
|
1125
1160
|
rb_define_method(cModel, "labels", model_labels, 0);
|
data/lib/wapiti/model.rb
CHANGED
@@ -30,6 +30,8 @@ module Wapiti
|
|
30
30
|
|
31
31
|
attr_accessor :path
|
32
32
|
|
33
|
+
attr_reader :token_count, :token_errors, :sequence_count, :sequence_errors
|
34
|
+
|
33
35
|
def pattern
|
34
36
|
options.pattern
|
35
37
|
end
|
@@ -38,6 +40,35 @@ module Wapiti
|
|
38
40
|
options.pattern = filename
|
39
41
|
end
|
40
42
|
|
43
|
+
alias native_label label
|
44
|
+
|
45
|
+
# Labels +input+ by delegating to the native implementation
# (aliased as +native_label+).
#
# input - the data to label, passed through unchanged to +native_label+.
# opts  - optional Hash handed to +options.update+ before labelling;
#         the options object is modified in place, so the settings
#         remain in effect for later calls.
#
# A block given to this method is forwarded to +native_label+.
#
# Returns whatever +native_label+ returns.
def label(input, opts = nil, &block)
  options.update(opts) unless opts.nil?

  # Forward the block explicitly. Calling Proc.new without a literal
  # block (implicit capture of the caller's block) is not supported
  # on Ruby 3.0+; passing a nil block is the same as passing none.
  native_label(input, &block)
end
|
49
|
+
|
50
|
+
# Returns the labelling statistics collected so far as a Hash of the form
#
#   { :tokens    => { :total => ..., :errors => ..., :rate => ... },
#     :sequences => { :total => ..., :errors => ..., :rate => ... } }
#
# where :rate is the error rate expressed as a percentage of :total.
# The rate is 0.0 while the corresponding total is still zero.
def statistics
  # errors * 100 / total; guards against dividing by an empty counter,
  # which would otherwise produce NaN.
  percentage = lambda do |errors, total|
    total.to_i.zero? ? 0.0 : errors * 100.0 / total
  end

  {
    :tokens => {
      :total  => token_count,
      :errors => token_errors,
      :rate   => percentage.call(token_errors, token_count)
    },
    :sequences => {
      :total  => sequence_count,
      :errors => sequence_errors,
      :rate   => percentage.call(sequence_errors, sequence_count)
    }
  }
end
|
62
|
+
|
63
|
+
alias stats statistics
|
64
|
+
|
65
|
+
# Resets the four labelling counters (token/sequence totals and
# token/sequence errors) to zero. Returns 0.
def clear_counters
  @sequence_errors = 0
  @sequence_count  = 0
  @token_errors    = 0
  @token_count     = 0
end
|
68
|
+
|
69
|
+
alias clear clear_counters
|
70
|
+
|
71
|
+
|
41
72
|
private
|
42
73
|
|
43
74
|
def tokenize(input)
|
data/lib/wapiti/options.rb
CHANGED
@@ -10,7 +10,7 @@ module Wapiti
|
|
10
10
|
def attribute_names
|
11
11
|
@attribute_names ||= %w{ stop_window convergence_window posterior
|
12
12
|
max_iterations jobsize threads rho1 rho2 stop_epsilon score check
|
13
|
-
algorithm pattern development_data maxent compact sparse
|
13
|
+
algorithm pattern development_data maxent compact sparse skip_tokens
|
14
14
|
}.sort.map(&:to_sym).freeze
|
15
15
|
end
|
16
16
|
|
@@ -103,7 +103,15 @@ module Wapiti
|
|
103
103
|
e << "BCD not supported for training maxent models" if maxent && algorithm == 'bcd'
|
104
104
|
e
|
105
105
|
end
|
106
|
-
|
106
|
+
|
107
|
+
# Defines a bang variant for each boolean option (e.g. #check!) that
# sets the option to true through its writer and returns self, so the
# calls can be chained.
#
# The list uses +skip_tokens+ rather than +label+ so that the bang
# method targets the +skip_tokens=+ writer listed in attribute_names.
%w{ maxent compact sparse skip_tokens check score posterior }.each do |name|
  writer = "#{name}=".to_sym

  define_method("#{name}!") do
    send(writer, true)
    self
  end
end
|
114
|
+
|
107
115
|
def <=>(other)
|
108
116
|
other.respond_to?(:attributes) ? attributes <=> other.attributes : nil
|
109
117
|
end
|
data/lib/wapiti/version.rb
CHANGED
data/spec/wapiti/model_spec.rb
CHANGED
@@ -143,6 +143,22 @@ module Wapiti
|
|
143
143
|
labels[0].map(&:last).should == %w{ b-np o b-np o }
|
144
144
|
end
|
145
145
|
|
146
|
+
context 'with the :score option set' do
  before(:each) { model.options.score! }

  it 'returns an array of token-label-score tuples' do
    # The comparison needs an expectation; a bare `==` is evaluated
    # and thrown away, so the example could never fail.
    model.label(input)[0].map { |t,l,s| s.class }.uniq.should == [Float]
  end
end
|
153
|
+
|
154
|
+
context 'with the :nbest option set to 2' do
  before(:each) { model.options.nbest = 2 }

  it 'returns an array of token-label-label tuples' do
    # The original compared the slice to %w{ O O } but discarded the
    # result, so nothing was ever asserted. The shape promised by the
    # description is checked here: two labels follow the token.
    # NOTE(review): the expected label values could not be confirmed
    # from this file - tighten to the concrete labels once verified.
    model.label(input)[0][-1][1,2].map(&:class).should == [String, String]
  end
end
|
161
|
+
|
146
162
|
end
|
147
163
|
|
148
164
|
|
@@ -155,7 +171,7 @@ module Wapiti
|
|
155
171
|
labels[0].take(5).map(&:last).should == %w{ B-NP B-PP B-NP I-NP B-VP }
|
156
172
|
end
|
157
173
|
end
|
158
|
-
|
174
|
+
|
159
175
|
end
|
160
176
|
|
161
177
|
end
|
data/spec/wapiti/options_spec.rb
CHANGED
@@ -139,7 +139,7 @@ module Wapiti
|
|
139
139
|
end
|
140
140
|
|
141
141
|
|
142
|
-
%w{ maxent compact sparse
|
142
|
+
%w{ maxent compact sparse skip_tokens check score posterior }.each do |m|
|
143
143
|
describe "##{m}" do
|
144
144
|
it 'returns false by default' do
|
145
145
|
options.send(m).should be false
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wapiti
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2011-09-
|
12
|
+
date: 2011-09-02 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rake
|
16
|
-
requirement: &
|
16
|
+
requirement: &2156033940 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0.9'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2156033940
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: rake-compiler
|
27
|
-
requirement: &
|
27
|
+
requirement: &2156032240 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0.7'
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *2156032240
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: ZenTest
|
38
|
-
requirement: &
|
38
|
+
requirement: &2156031260 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '4.6'
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *2156031260
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: rspec
|
49
|
-
requirement: &
|
49
|
+
requirement: &2156022680 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ~>
|
@@ -54,7 +54,7 @@ dependencies:
|
|
54
54
|
version: '2.6'
|
55
55
|
type: :development
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *2156022680
|
58
58
|
description: This gem provides a Ruby API for Conditional Random Fields (CRF). It
|
59
59
|
is implemented as a C extension and based on the wicked fast "wapiti" package.
|
60
60
|
email:
|