wapiti 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +36 -0
- data/ext/wapiti/native.c +75 -40
- data/lib/wapiti/model.rb +31 -0
- data/lib/wapiti/options.rb +10 -2
- data/lib/wapiti/version.rb +1 -1
- data/spec/wapiti/model_spec.rb +17 -1
- data/spec/wapiti/options_spec.rb +1 -1
- metadata +10 -10
data/README.md
CHANGED
@@ -103,6 +103,42 @@ corresponding label:
|
|
103
103
|
=> [[["confidence nn", "b-np"], ["in in", "b-pp"], ["the dt", "b-np"],
|
104
104
|
["pound nn", "i-np"], [". .", "o"]]]
|
105
105
|
|
106
|
+
Note that if you set the *:score* option (either in the Model's `#options` or
|
107
|
+
when calling `#label`), the score for each label will be appended to
|
108
|
+
each token/label tuple as a floating point number or passed as a third
|
109
|
+
argument to the passed-in block.
|
110
|
+
|
111
|
+
model.label [['Confidence NN']], :score => true
|
112
|
+
=> [[["Confidence NN", "B-NP", 4.642034838737357]]]
|
113
|
+
|
114
|
+
Similarly, if you set the *:nbest* option to a value greater than one, Wapiti
|
115
|
+
will append additional labels and, optionally, their score values to each tuple.
|
116
|
+
|
117
|
+
model.label [['Confidence NN']], :score => true, :nbest => 3, :skip_tokens => true
|
118
|
+
=> [[["B-NP", 4.642034838737357, "B-VP", 1.7040256847206927, "B-ADJP", 0.7636429298060177]]]
|
119
|
+
|
120
|
+
Note how we also suppressed the output of the token string using the
|
121
|
+
*:skip_tokens* option.
|
122
|
+
|
123
|
+
|
124
|
+
### Statistics
|
125
|
+
|
126
|
+
By setting the *:check* option you can tell Wapiti to keep statistics during
|
127
|
+
the labelling phase (for the statistics to be meaningful you obviously need
|
128
|
+
to provide input data that is already labelled). Wapiti does not reset the
|
129
|
+
counters during consecutive calls to `#label` to allow you to collect
|
130
|
+
cumulative data; however, you can reset the counters at any time by calling
|
131
|
+
`#clear_counters`.
|
132
|
+
|
133
|
+
After calling `#label` with the *:check* option set and appropriately labelled
|
134
|
+
input, you can access the statistics via `#statistics` (the individual values
|
135
|
+
are also available through the associated attribute readers).
|
136
|
+
|
137
|
+
model.label 'test.txt', :check => true
|
138
|
+
=> {:tokens=>{:total=>1896, :errors=>137, :rate=>0.0007225738396624472},
|
139
|
+
:sequences=>{:total=>77, :errors=>50, :rate=>0.006493506493506494}}
|
140
|
+
|
141
|
+
|
106
142
|
|
107
143
|
Citing
|
108
144
|
------
|
data/ext/wapiti/native.c
CHANGED
@@ -522,10 +522,10 @@ void Init_options() {
|
|
522
522
|
|
523
523
|
rb_define_alias(cOptions, "sparse?", "sparse");
|
524
524
|
|
525
|
-
rb_define_method(cOptions, "
|
526
|
-
rb_define_method(cOptions, "
|
525
|
+
rb_define_method(cOptions, "skip_tokens", options_label, 0);
|
526
|
+
rb_define_method(cOptions, "skip_tokens=", options_set_label, 1);
|
527
527
|
|
528
|
-
rb_define_alias(cOptions, "
|
528
|
+
rb_define_alias(cOptions, "skip_tokens?", "skip_tokens");
|
529
529
|
|
530
530
|
rb_define_method(cOptions, "check", options_check, 0);
|
531
531
|
rb_define_method(cOptions, "check=", options_set_check, 1);
|
@@ -680,7 +680,7 @@ static VALUE initialize_model(int argc, VALUE *argv, VALUE self) {
|
|
680
680
|
options = rb_funcall(cOptions, rb_intern("new"), 0);
|
681
681
|
}
|
682
682
|
|
683
|
-
// yield
|
683
|
+
// yield options if block_given?
|
684
684
|
if (rb_block_given_p()) {
|
685
685
|
rb_yield(options);
|
686
686
|
}
|
@@ -691,6 +691,9 @@ static VALUE initialize_model(int argc, VALUE *argv, VALUE self) {
|
|
691
691
|
if (get_options(options)->model) {
|
692
692
|
rb_funcall(self, rb_intern("load"), 0);
|
693
693
|
}
|
694
|
+
|
695
|
+
// initialize counters
|
696
|
+
rb_funcall(self, rb_intern("clear_counters"), 0);
|
694
697
|
|
695
698
|
return self;
|
696
699
|
}
|
@@ -958,63 +961,94 @@ static VALUE model_labels(VALUE self) {
|
|
958
961
|
return labels;
|
959
962
|
}
|
960
963
|
|
961
|
-
static VALUE decode_sequence(mdl_t *model, raw_t *raw) {
|
964
|
+
static VALUE decode_sequence(VALUE self, mdl_t *model, raw_t *raw) {
|
962
965
|
qrk_t *lbls = model->reader->lbl;
|
963
966
|
|
964
|
-
const
|
965
|
-
const
|
967
|
+
const unsigned int Y = model->nlbl;
|
968
|
+
const unsigned int N = model->opt->nbest;
|
966
969
|
|
967
970
|
seq_t *seq = rdr_raw2seq(model->reader, raw, model->opt->check);
|
968
971
|
|
969
|
-
const int T = seq->len;
|
972
|
+
const unsigned int T = seq->len;
|
973
|
+
unsigned int n, t, tcnt = 0, terr = 0, scnt = 0, serr = 0, stat[3][Y];
|
970
974
|
|
971
975
|
size_t *out = xmalloc(sizeof(size_t) * T * N);
|
972
976
|
double *psc = xmalloc(sizeof(double) * T * N);
|
973
977
|
double *scs = xmalloc(sizeof(double) * N);
|
974
978
|
|
975
|
-
VALUE
|
979
|
+
VALUE sequence, tokens;
|
976
980
|
|
977
981
|
if (N == 1) {
|
978
982
|
tag_viterbi(model, seq, (size_t*)out, scs, (double*)psc);
|
979
983
|
}
|
980
984
|
else {
|
981
|
-
tag_nbviterbi(model, seq, N, (
|
985
|
+
tag_nbviterbi(model, seq, N, (size_t*)out, scs, (double*)psc);
|
982
986
|
}
|
983
|
-
|
984
|
-
// Next we output the raw sequence with an aditional column for
|
985
|
-
// the predicted labels
|
986
|
-
for (size_t n = 0; n < N; n++) {
|
987
987
|
|
988
|
-
|
989
|
-
|
990
|
-
|
991
|
-
|
992
|
-
|
993
|
-
|
994
|
-
tokens
|
995
|
-
|
996
|
-
|
997
|
-
|
998
|
-
}
|
988
|
+
sequence = rb_ary_new();
|
989
|
+
|
990
|
+
for (t = 0; t < T; ++t) {
|
991
|
+
tokens = rb_ary_new();
|
992
|
+
|
993
|
+
if (!model->opt->label) {
|
994
|
+
rb_ary_push(tokens, rb_str_new2(raw->lines[t]));
|
995
|
+
}
|
996
|
+
|
997
|
+
for (n = 0; n < N; ++n) {
|
999
998
|
|
1000
999
|
size_t lbl = out[t * N + n];
|
1001
1000
|
rb_ary_push(tokens, rb_str_new2(qrk_id2str(lbls, lbl)));
|
1002
1001
|
|
1003
|
-
//
|
1004
|
-
|
1005
|
-
|
1006
|
-
// }
|
1007
|
-
|
1008
|
-
// yield token/label pair to block if given
|
1009
|
-
if (rb_block_given_p()) {
|
1010
|
-
tokens = rb_yield(tokens);
|
1002
|
+
// output individual score
|
1003
|
+
if (model->opt->outsc) {
|
1004
|
+
rb_ary_push(tokens, rb_float_new(psc[t * N + n]));
|
1011
1005
|
}
|
1012
1006
|
|
1013
|
-
|
1007
|
+
}
|
1008
|
+
|
1009
|
+
// yield token/label pair to block if given
|
1010
|
+
if (rb_block_given_p()) {
|
1011
|
+
tokens = rb_yield(tokens);
|
1014
1012
|
}
|
1015
1013
|
|
1016
|
-
rb_ary_push(
|
1014
|
+
rb_ary_push(sequence, tokens);
|
1015
|
+
|
1016
|
+
|
1017
|
+
// TODO output sequence score: scs[n] (float)
|
1018
|
+
|
1019
|
+
}
|
1020
|
+
|
1021
|
+
// Statistics
|
1022
|
+
if (model->opt->check) {
|
1023
|
+
int err = 0;
|
1024
|
+
|
1025
|
+
for (t = 0; t < T; ++t) {
|
1026
|
+
stat[0][seq->pos[t].lbl]++;
|
1027
|
+
stat[1][out[t * N]]++;
|
1028
|
+
|
1029
|
+
if (seq->pos[t].lbl != out[t * N]) {
|
1030
|
+
terr++;
|
1031
|
+
err = 1;
|
1032
|
+
}
|
1033
|
+
else {
|
1034
|
+
stat[2][out[t * N]]++;
|
1035
|
+
}
|
1036
|
+
}
|
1037
|
+
|
1038
|
+
tcnt = FIX2INT(rb_ivar_get(self, rb_intern("@token_count")));
|
1039
|
+
rb_ivar_set(self, rb_intern("@token_count"), INT2FIX(tcnt + (unsigned int)T));
|
1040
|
+
|
1041
|
+
terr += FIX2INT(rb_ivar_get(self, rb_intern("@token_errors")));
|
1042
|
+
rb_ivar_set(self, rb_intern("@token_errors"), INT2FIX(terr));
|
1043
|
+
|
1044
|
+
scnt = FIX2INT(rb_ivar_get(self, rb_intern("@sequence_count")));
|
1045
|
+
rb_ivar_set(self, rb_intern("@sequence_count"), INT2FIX(++scnt));
|
1046
|
+
|
1047
|
+
serr = FIX2INT(rb_ivar_get(self, rb_intern("@sequence_errors")));
|
1048
|
+
rb_ivar_set(self, rb_intern("@sequence_errors"), INT2FIX(serr + err));
|
1049
|
+
|
1017
1050
|
}
|
1051
|
+
|
1018
1052
|
|
1019
1053
|
// Cleanup memory used for this sequence
|
1020
1054
|
xfree(scs);
|
@@ -1023,7 +1057,7 @@ static VALUE decode_sequence(mdl_t *model, raw_t *raw) {
|
|
1023
1057
|
|
1024
1058
|
rdr_freeseq(seq);
|
1025
1059
|
|
1026
|
-
return
|
1060
|
+
return sequence;
|
1027
1061
|
}
|
1028
1062
|
|
1029
1063
|
static VALUE decode_sequence_array(VALUE self, VALUE array) {
|
@@ -1053,7 +1087,7 @@ static VALUE decode_sequence_array(VALUE self, VALUE array) {
|
|
1053
1087
|
raw->lines[j] = StringValueCStr(line);
|
1054
1088
|
}
|
1055
1089
|
|
1056
|
-
|
1090
|
+
rb_ary_push(result, decode_sequence(self, model, raw));
|
1057
1091
|
|
1058
1092
|
xfree(raw);
|
1059
1093
|
}
|
@@ -1085,7 +1119,7 @@ static VALUE decode_sequence_file(VALUE self, VALUE path) {
|
|
1085
1119
|
break;
|
1086
1120
|
}
|
1087
1121
|
|
1088
|
-
|
1122
|
+
rb_ary_push(result, decode_sequence(self, model, raw));
|
1089
1123
|
rdr_freeraw(raw);
|
1090
1124
|
}
|
1091
1125
|
|
@@ -1093,8 +1127,8 @@ static VALUE decode_sequence_file(VALUE self, VALUE path) {
|
|
1093
1127
|
}
|
1094
1128
|
|
1095
1129
|
// call-seq:
|
1096
|
-
// m.label(tokens) # => array of labelled tokens
|
1097
|
-
// m.label(filename) # => array of labelled tokens
|
1130
|
+
// m.label(tokens, options = {}) # => array of labelled tokens
|
1131
|
+
// m.label(filename, options = {}) # => array of labelled tokens
|
1098
1132
|
//
|
1099
1133
|
static VALUE model_label(VALUE self, VALUE data) {
|
1100
1134
|
VALUE result;
|
@@ -1120,6 +1154,7 @@ static void Init_model() {
|
|
1120
1154
|
rb_define_method(cModel, "initialize", initialize_model, -1);
|
1121
1155
|
|
1122
1156
|
rb_define_attr(cModel, "options", 1, 0);
|
1157
|
+
|
1123
1158
|
|
1124
1159
|
rb_define_method(cModel, "nlbl", model_nlbl, 0);
|
1125
1160
|
rb_define_method(cModel, "labels", model_labels, 0);
|
data/lib/wapiti/model.rb
CHANGED
@@ -30,6 +30,8 @@ module Wapiti
|
|
30
30
|
|
31
31
|
attr_accessor :path
|
32
32
|
|
33
|
+
attr_reader :token_count, :token_errors, :sequence_count, :sequence_errors
|
34
|
+
|
33
35
|
def pattern
|
34
36
|
options.pattern
|
35
37
|
end
|
@@ -38,6 +40,35 @@ module Wapiti
|
|
38
40
|
options.pattern = filename
|
39
41
|
end
|
40
42
|
|
43
|
+
alias native_label label
|
44
|
+
|
45
|
+
def label(input, opts = nil)
|
46
|
+
options.update(opts) unless opts.nil?
|
47
|
+
block_given? ? native_label(input, &Proc.new) : native_label(input)
|
48
|
+
end
|
49
|
+
|
50
|
+
def statistics
|
51
|
+
s = {}
|
52
|
+
s[:tokens] = {
|
53
|
+
:total => token_count, :errors => @token_errors,
|
54
|
+
:rate => token_errors / (token_count * 100.0)
|
55
|
+
}
|
56
|
+
s[:sequences] = {
|
57
|
+
:total => sequence_count, :errors => sequence_errors,
|
58
|
+
:rate => sequence_errors / (sequence_count * 100.0)
|
59
|
+
}
|
60
|
+
s
|
61
|
+
end
|
62
|
+
|
63
|
+
alias stats statistics
|
64
|
+
|
65
|
+
def clear_counters
|
66
|
+
@token_count = @token_errors = @sequence_count = @sequence_errors = 0
|
67
|
+
end
|
68
|
+
|
69
|
+
alias clear clear_counters
|
70
|
+
|
71
|
+
|
41
72
|
private
|
42
73
|
|
43
74
|
def tokenize(input)
|
data/lib/wapiti/options.rb
CHANGED
@@ -10,7 +10,7 @@ module Wapiti
|
|
10
10
|
def attribute_names
|
11
11
|
@attribute_names ||= %w{ stop_window convergence_window posterior
|
12
12
|
max_iterations jobsize threads rho1 rho2 stop_epsilon score check
|
13
|
-
algorithm pattern development_data maxent compact sparse
|
13
|
+
algorithm pattern development_data maxent compact sparse skip_tokens
|
14
14
|
}.sort.map(&:to_sym).freeze
|
15
15
|
end
|
16
16
|
|
@@ -103,7 +103,15 @@ module Wapiti
|
|
103
103
|
e << "BCD not supported for training maxent models" if maxent && algorithm == 'bcd'
|
104
104
|
e
|
105
105
|
end
|
106
|
-
|
106
|
+
|
107
|
+
%w{ maxent compact sparse label check score posterior }.each do |m|
|
108
|
+
writer = "#{m}=".to_sym
|
109
|
+
define_method("#{m}!") do
|
110
|
+
send(writer, true)
|
111
|
+
self
|
112
|
+
end
|
113
|
+
end
|
114
|
+
|
107
115
|
def <=>(other)
|
108
116
|
other.respond_to?(:attributes) ? attributes <=> other.attributes : nil
|
109
117
|
end
|
data/lib/wapiti/version.rb
CHANGED
data/spec/wapiti/model_spec.rb
CHANGED
@@ -143,6 +143,22 @@ module Wapiti
|
|
143
143
|
labels[0].map(&:last).should == %w{ b-np o b-np o }
|
144
144
|
end
|
145
145
|
|
146
|
+
context 'with the :score option set' do
|
147
|
+
before(:each) { model.options.score! }
|
148
|
+
|
149
|
+
it 'returns an array of token-label-score tuples' do
|
150
|
+
model.label(input)[0].map { |t,l,s| s.class }.uniq == [Float]
|
151
|
+
end
|
152
|
+
end
|
153
|
+
|
154
|
+
context 'with the :nbest option set to 2' do
|
155
|
+
before(:each) { model.options.nbest = 2 }
|
156
|
+
|
157
|
+
it 'returns an array of token-label-label tuples' do
|
158
|
+
model.label(input)[0][-1][1,2] == %w{ O O }
|
159
|
+
end
|
160
|
+
end
|
161
|
+
|
146
162
|
end
|
147
163
|
|
148
164
|
|
@@ -155,7 +171,7 @@ module Wapiti
|
|
155
171
|
labels[0].take(5).map(&:last).should == %w{ B-NP B-PP B-NP I-NP B-VP }
|
156
172
|
end
|
157
173
|
end
|
158
|
-
|
174
|
+
|
159
175
|
end
|
160
176
|
|
161
177
|
end
|
data/spec/wapiti/options_spec.rb
CHANGED
@@ -139,7 +139,7 @@ module Wapiti
|
|
139
139
|
end
|
140
140
|
|
141
141
|
|
142
|
-
%w{ maxent compact sparse
|
142
|
+
%w{ maxent compact sparse skip_tokens check score posterior }.each do |m|
|
143
143
|
describe "##{m}" do
|
144
144
|
it 'returns false by default' do
|
145
145
|
options.send(m).should be false
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wapiti
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2011-09-
|
12
|
+
date: 2011-09-02 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rake
|
16
|
-
requirement: &
|
16
|
+
requirement: &2156033940 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0.9'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2156033940
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: rake-compiler
|
27
|
-
requirement: &
|
27
|
+
requirement: &2156032240 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0.7'
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *2156032240
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: ZenTest
|
38
|
-
requirement: &
|
38
|
+
requirement: &2156031260 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '4.6'
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *2156031260
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: rspec
|
49
|
-
requirement: &
|
49
|
+
requirement: &2156022680 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ~>
|
@@ -54,7 +54,7 @@ dependencies:
|
|
54
54
|
version: '2.6'
|
55
55
|
type: :development
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *2156022680
|
58
58
|
description: This gem provides a Ruby API for Conditional Random Fields (CRF). It
|
59
59
|
is implemented as a C extension and based on the wicked fast "wapiti" package.
|
60
60
|
email:
|