wapiti 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.autotest +13 -0
- data/.gitignore +5 -0
- data/.rspec +3 -0
- data/Gemfile +6 -0
- data/LICENSE +30 -0
- data/README.md +153 -0
- data/Rakefile +33 -0
- data/ext/wapiti/bcd.c +392 -0
- data/ext/wapiti/decoder.c +535 -0
- data/ext/wapiti/decoder.h +46 -0
- data/ext/wapiti/extconf.rb +8 -0
- data/ext/wapiti/gradient.c +818 -0
- data/ext/wapiti/gradient.h +81 -0
- data/ext/wapiti/lbfgs.c +294 -0
- data/ext/wapiti/model.c +296 -0
- data/ext/wapiti/model.h +100 -0
- data/ext/wapiti/native.c +1238 -0
- data/ext/wapiti/native.h +15 -0
- data/ext/wapiti/options.c +278 -0
- data/ext/wapiti/options.h +91 -0
- data/ext/wapiti/pattern.c +395 -0
- data/ext/wapiti/pattern.h +56 -0
- data/ext/wapiti/progress.c +167 -0
- data/ext/wapiti/progress.h +43 -0
- data/ext/wapiti/quark.c +272 -0
- data/ext/wapiti/quark.h +46 -0
- data/ext/wapiti/reader.c +553 -0
- data/ext/wapiti/reader.h +73 -0
- data/ext/wapiti/rprop.c +191 -0
- data/ext/wapiti/sequence.h +148 -0
- data/ext/wapiti/sgdl1.c +218 -0
- data/ext/wapiti/thread.c +171 -0
- data/ext/wapiti/thread.h +42 -0
- data/ext/wapiti/tools.c +202 -0
- data/ext/wapiti/tools.h +54 -0
- data/ext/wapiti/trainers.h +39 -0
- data/ext/wapiti/vmath.c +372 -0
- data/ext/wapiti/vmath.h +51 -0
- data/ext/wapiti/wapiti.c +288 -0
- data/ext/wapiti/wapiti.h +45 -0
- data/lib/wapiti.rb +30 -0
- data/lib/wapiti/errors.rb +17 -0
- data/lib/wapiti/model.rb +49 -0
- data/lib/wapiti/options.rb +113 -0
- data/lib/wapiti/utility.rb +15 -0
- data/lib/wapiti/version.rb +3 -0
- data/spec/fixtures/ch.mod +18550 -0
- data/spec/fixtures/chpattern.txt +52 -0
- data/spec/fixtures/chtest.txt +1973 -0
- data/spec/fixtures/chtrain.txt +19995 -0
- data/spec/fixtures/nppattern.txt +52 -0
- data/spec/fixtures/nptest.txt +1973 -0
- data/spec/fixtures/nptrain.txt +19995 -0
- data/spec/fixtures/pattern.txt +14 -0
- data/spec/fixtures/test.txt +60000 -0
- data/spec/fixtures/train.txt +1200 -0
- data/spec/spec_helper.rb +21 -0
- data/spec/wapiti/model_spec.rb +173 -0
- data/spec/wapiti/native_spec.rb +12 -0
- data/spec/wapiti/options_spec.rb +175 -0
- data/spec/wapiti/utility_spec.rb +22 -0
- data/wapiti.gemspec +35 -0
- metadata +178 -0
@@ -0,0 +1,535 @@
|
|
1
|
+
/*
|
2
|
+
* Wapiti - A linear-chain CRF tool
|
3
|
+
*
|
4
|
+
* Copyright (c) 2009-2011 CNRS
|
5
|
+
* All rights reserved.
|
6
|
+
*
|
7
|
+
* Redistribution and use in source and binary forms, with or without
|
8
|
+
* modification, are permitted provided that the following conditions are met:
|
9
|
+
* * Redistributions of source code must retain the above copyright
|
10
|
+
* notice, this list of conditions and the following disclaimer.
|
11
|
+
* * Redistributions in binary form must reproduce the above copyright
|
12
|
+
* notice, this list of conditions and the following disclaimer in the
|
13
|
+
* documentation and/or other materials provided with the distribution.
|
14
|
+
*
|
15
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
16
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
18
|
+
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
19
|
+
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
20
|
+
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
21
|
+
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
22
|
+
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
23
|
+
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
24
|
+
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
25
|
+
* POSSIBILITY OF SUCH DAMAGE.
|
26
|
+
*/
|
27
|
+
|
28
|
+
#include <float.h>
|
29
|
+
#include <stddef.h>
|
30
|
+
#include <stdlib.h>
|
31
|
+
#include <stdio.h>
|
32
|
+
|
33
|
+
#include "wapiti.h"
|
34
|
+
#include "gradient.h"
|
35
|
+
#include "model.h"
|
36
|
+
#include "quark.h"
|
37
|
+
#include "reader.h"
|
38
|
+
#include "sequence.h"
|
39
|
+
#include "thread.h"
|
40
|
+
#include "tools.h"
|
41
|
+
#include "decoder.h"
|
42
|
+
|
43
|
+
/******************************************************************************
|
44
|
+
* Sequence tagging
|
45
|
+
*
|
46
|
+
* This module implements sequence tagging using a trained model and model
|
47
|
+
* evaluation on a development set.
|
48
|
+
*
|
49
|
+
* The viterbi can be quite intensive on the stack if you push in it long
|
50
|
+
* sequence and use large labels set. It's less a problem than in gradient
|
51
|
+
* computations but it can show up in particular cases. The fix is to call it
|
52
|
+
* through the mth_spawn function and request enough stack space, this will be
|
53
|
+
* fixed in next version.
|
54
|
+
******************************************************************************/
|
55
|
+
|
56
|
+
/* tag_expsc:
|
57
|
+
* Compute the score lattice for classical Viterbi decoding. This is the same
|
58
|
+
* as for the first step of the gradient computation with the exception that
|
59
|
+
* we don't need to take the exponential of the scores as the Viterbi decoding
|
60
|
+
* works in log-space.
|
61
|
+
*/
|
62
|
+
static int tag_expsc(mdl_t *mdl, const seq_t *seq, double *vpsi) {
|
63
|
+
const double *x = mdl->theta;
|
64
|
+
const size_t Y = mdl->nlbl;
|
65
|
+
const int T = seq->len;
|
66
|
+
double (*psi)[T][Y][Y] = (void *)vpsi;
|
67
|
+
// We first have to compute the Ψ_t(y',y,x_t) weights defined as
|
68
|
+
// Ψ_t(y',y,x_t) = \exp( ∑_k θ_k f_k(y',y,x_t) )
|
69
|
+
// So at position 't' in the sequence, for each couple (y',y) we have
|
70
|
+
// to sum weights of all features.
|
71
|
+
// This is the same than what we do for computing the gradient but, as
|
72
|
+
// the viterbi algorithm also work in the logarithmic space, we can
|
73
|
+
// remove the exponential.
|
74
|
+
//
|
75
|
+
// Only the observations present at this position will have a non-nul
|
76
|
+
// weight so we can sum only on thoses.
|
77
|
+
//
|
78
|
+
// As we use only two kind of features: unigram and bigram, we can
|
79
|
+
// rewrite this as
|
80
|
+
// ∑_k μ_k(y, x_t) f_k(y, x_t) + ∑_k λ_k(y', y, x_t) f_k(y', y, x_t)
|
81
|
+
// Where the first sum is over the unigrams features and the second is
|
82
|
+
// over bigrams ones.
|
83
|
+
//
|
84
|
+
// This allow us to compute Ψ efficiently in two steps
|
85
|
+
// 1/ we sum the unigrams features weights by looping over actives
|
86
|
+
// unigrams observations. (we compute this sum once and use it
|
87
|
+
// for each value of y')
|
88
|
+
// 2/ we add the bigrams features weights by looping over actives
|
89
|
+
// bigrams observations (we don't have to do this for t=0 since
|
90
|
+
// there is no bigrams here)
|
91
|
+
for (int t = 0; t < T; t++) {
|
92
|
+
const pos_t *pos = &(seq->pos[t]);
|
93
|
+
for (size_t y = 0; y < Y; y++) {
|
94
|
+
double sum = 0.0;
|
95
|
+
for (size_t n = 0; n < pos->ucnt; n++) {
|
96
|
+
const size_t o = pos->uobs[n];
|
97
|
+
sum += x[mdl->uoff[o] + y];
|
98
|
+
}
|
99
|
+
for (size_t yp = 0; yp < Y; yp++)
|
100
|
+
(*psi)[t][yp][y] = sum;
|
101
|
+
}
|
102
|
+
}
|
103
|
+
for (int t = 1; t < T; t++) {
|
104
|
+
const pos_t *pos = &(seq->pos[t]);
|
105
|
+
for (size_t yp = 0, d = 0; yp < Y; yp++) {
|
106
|
+
for (size_t y = 0; y < Y; y++, d++) {
|
107
|
+
double sum = 0.0;
|
108
|
+
for (size_t n = 0; n < pos->bcnt; n++) {
|
109
|
+
const size_t o = pos->bobs[n];
|
110
|
+
sum += x[mdl->boff[o] + d];
|
111
|
+
}
|
112
|
+
(*psi)[t][yp][y] += sum;
|
113
|
+
}
|
114
|
+
}
|
115
|
+
}
|
116
|
+
return 0;
|
117
|
+
}
|
118
|
+
|
119
|
+
/* tag_postsc:
|
120
|
+
* This function compute score lattice with posteriors. This generally result
|
121
|
+
* in a slightly best labelling and allow to output normalized score for the
|
122
|
+
* sequence and for each labels but this is more costly as we have to perform
|
123
|
+
* a full forward backward instead of just the forward pass.
|
124
|
+
*/
|
125
|
+
static int tag_postsc(mdl_t *mdl, const seq_t *seq, double *vpsi) {
|
126
|
+
const size_t Y = mdl->nlbl;
|
127
|
+
const int T = seq->len;
|
128
|
+
double (*psi)[T][Y][Y] = (void *)vpsi;
|
129
|
+
grd_t *grd = grd_new(mdl, NULL);
|
130
|
+
grd->first = 0;
|
131
|
+
grd->last = T - 1;
|
132
|
+
grd_check(grd, seq->len);
|
133
|
+
if (mdl->opt->sparse) {
|
134
|
+
grd_spdopsi(grd, seq);
|
135
|
+
grd_spfwdbwd(grd, seq);
|
136
|
+
} else {
|
137
|
+
grd_fldopsi(grd, seq);
|
138
|
+
grd_flfwdbwd(grd, seq);
|
139
|
+
}
|
140
|
+
double (*alpha)[T][Y] = (void *)grd->alpha;
|
141
|
+
double (*beta )[T][Y] = (void *)grd->beta;
|
142
|
+
double *unorm = grd->unorm;
|
143
|
+
for (int t = 0; t < T; t++) {
|
144
|
+
for (size_t y = 0; y < Y; y++) {
|
145
|
+
double e = (*alpha)[t][y] * (*beta)[t][y] * unorm[t];
|
146
|
+
for (size_t yp = 0; yp < Y; yp++)
|
147
|
+
(*psi)[t][yp][y] = e;
|
148
|
+
}
|
149
|
+
}
|
150
|
+
grd_free(grd);
|
151
|
+
return 1;
|
152
|
+
}
|
153
|
+
|
154
|
+
/* tag_viterbi:
|
155
|
+
* This function implement the Viterbi algorithm in order to decode the most
|
156
|
+
* probable sequence of labels according to the model. Some part of this code
|
157
|
+
* is very similar to the computation of the gradient as expected.
|
158
|
+
*
|
159
|
+
* And like for the gradient, the caller is responsible to ensure there is
|
160
|
+
* enough stack space.
|
161
|
+
*/
|
162
|
+
void tag_viterbi(mdl_t *mdl, const seq_t *seq,
|
163
|
+
size_t out[], double *sc, double psc[]) {
|
164
|
+
const size_t Y = mdl->nlbl;
|
165
|
+
const int T = seq->len;
|
166
|
+
double *vpsi = wapiti_xmalloc(sizeof(double) * T * Y * Y);
|
167
|
+
size_t *vback = wapiti_xmalloc(sizeof(size_t) * T * Y);
|
168
|
+
double (*psi) [T][Y][Y] = (void *)vpsi;
|
169
|
+
size_t (*back)[T][Y] = (void *)vback;
|
170
|
+
double *cur = wapiti_xmalloc(sizeof(double) * Y);
|
171
|
+
double *old = wapiti_xmalloc(sizeof(double) * Y);
|
172
|
+
// We first compute the scores for each transitions in the lattice of
|
173
|
+
// labels.
|
174
|
+
int op;
|
175
|
+
if (mdl->opt->lblpost)
|
176
|
+
op = tag_postsc(mdl, seq, vpsi);
|
177
|
+
else
|
178
|
+
op = tag_expsc(mdl, seq, vpsi);
|
179
|
+
// Now we can do the Viterbi algorithm. This is very similar to the
|
180
|
+
// forward pass
|
181
|
+
// | α_1(y) = Ψ_1(y,x_1)
|
182
|
+
// | α_t(y) = max_{y'} α_{t-1}(y') + Ψ_t(y',y,x_t)
|
183
|
+
// We just replace the sum by a max and as we do the computation in the
|
184
|
+
// logarithmic space the product become a sum. (this also mean that we
|
185
|
+
// don't have to worry about numerical problems)
|
186
|
+
//
|
187
|
+
// Next we have to walk backward over the α in order to find the best
|
188
|
+
// path. In order to do this efficiently, we keep in the 'back' array
|
189
|
+
// the indice of the y value selected by the max. This also mean that
|
190
|
+
// we only need the current and previous value of the α vectors, not
|
191
|
+
// the full matrix.
|
192
|
+
for (size_t y = 0; y < Y; y++)
|
193
|
+
cur[y] = (*psi)[0][0][y];
|
194
|
+
for (int t = 1; t < T; t++) {
|
195
|
+
for (size_t y = 0; y < Y; y++)
|
196
|
+
old[y] = cur[y];
|
197
|
+
for (size_t y = 0; y < Y; y++) {
|
198
|
+
double bst = -1.0;
|
199
|
+
int idx = 0;
|
200
|
+
for (size_t yp = 0; yp < Y; yp++) {
|
201
|
+
double val = old[yp];
|
202
|
+
if (op)
|
203
|
+
val *= (*psi)[t][yp][y];
|
204
|
+
else
|
205
|
+
val += (*psi)[t][yp][y];
|
206
|
+
if (val > bst) {
|
207
|
+
bst = val;
|
208
|
+
idx = yp;
|
209
|
+
}
|
210
|
+
}
|
211
|
+
(*back)[t][y] = idx;
|
212
|
+
cur[y] = bst;
|
213
|
+
}
|
214
|
+
}
|
215
|
+
// We can now build the sequence of labels predicted by the model. For
|
216
|
+
// this we search in the last α vector the best value. Using this index
|
217
|
+
// as a starting point in the back-pointer array we finally can decode
|
218
|
+
// the best sequence.
|
219
|
+
int bst = 0;
|
220
|
+
for (size_t y = 1; y < Y; y++)
|
221
|
+
if (cur[y] > cur[bst])
|
222
|
+
bst = y;
|
223
|
+
if (sc != NULL)
|
224
|
+
*sc = cur[bst];
|
225
|
+
for (int t = T; t > 0; t--) {
|
226
|
+
const size_t yp = (t != 1) ? (*back)[t - 1][bst] : 0;
|
227
|
+
const size_t y = bst;
|
228
|
+
out[t - 1] = y;
|
229
|
+
if (psc != NULL)
|
230
|
+
psc[t - 1] = (*psi)[t - 1][yp][y];
|
231
|
+
bst = yp;
|
232
|
+
}
|
233
|
+
free(old);
|
234
|
+
free(cur);
|
235
|
+
free(vback);
|
236
|
+
free(vpsi);
|
237
|
+
}
|
238
|
+
|
239
|
+
/* tag_nbviterbi:
|
240
|
+
* This function implement the Viterbi algorithm in order to decode the N-most
|
241
|
+
* probable sequences of labels according to the model. It can be used to
|
242
|
+
* compute only the best one and will return the same sequence than the
|
243
|
+
* previous function but will be slower to do it.
|
244
|
+
*/
|
245
|
+
void tag_nbviterbi(mdl_t *mdl, const seq_t *seq, size_t N,
|
246
|
+
size_t out[][N], double sc[], double psc[][N]) {
|
247
|
+
const size_t Y = mdl->nlbl;
|
248
|
+
const int T = seq->len;
|
249
|
+
double *vpsi = wapiti_xmalloc(sizeof(double) * T * Y * Y);
|
250
|
+
size_t *vback = wapiti_xmalloc(sizeof(size_t) * T * Y * N);
|
251
|
+
double (*psi) [T][Y ][Y] = (void *)vpsi;
|
252
|
+
size_t (*back)[T][Y * N] = (void *)vback;
|
253
|
+
double *cur = wapiti_xmalloc(sizeof(double) * Y * N);
|
254
|
+
double *old = wapiti_xmalloc(sizeof(double) * Y * N);
|
255
|
+
// We first compute the scores for each transitions in the lattice of
|
256
|
+
// labels.
|
257
|
+
int op;
|
258
|
+
if (mdl->opt->lblpost)
|
259
|
+
op = tag_postsc(mdl, seq, (double *)psi);
|
260
|
+
else
|
261
|
+
op = tag_expsc(mdl, seq, (double *)psi);
|
262
|
+
// Here also, it's classical but we have to keep the N best paths
|
263
|
+
// leading to each nodes of the lattice instead of only the best one.
|
264
|
+
// This mean that code is less trivial and the current implementation is
|
265
|
+
// not the most efficient way to do this but it works well and is good
|
266
|
+
// enough for the moment.
|
267
|
+
// We first build the list of all incoming arcs from all paths from all
|
268
|
+
// N-best nodes and next select the N-best one. There is a lot of room
|
269
|
+
// here for later optimisations if needed.
|
270
|
+
for (size_t y = 0, d = 0; y < Y; y++) {
|
271
|
+
cur[d++] = (*psi)[0][0][y];
|
272
|
+
for (size_t n = 1; n < N; n++)
|
273
|
+
cur[d++] = -DBL_MAX;
|
274
|
+
}
|
275
|
+
for (int t = 1; t < T; t++) {
|
276
|
+
for (size_t d = 0; d < Y * N; d++)
|
277
|
+
old[d] = cur[d];
|
278
|
+
for (size_t y = 0; y < Y; y++) {
|
279
|
+
// 1st, build the list of all incoming
|
280
|
+
double lst[Y * N];
|
281
|
+
for (size_t yp = 0, d = 0; yp < Y; yp++) {
|
282
|
+
for (size_t n = 0; n < N; n++, d++) {
|
283
|
+
lst[d] = old[d];
|
284
|
+
if (op)
|
285
|
+
lst[d] *= (*psi)[t][yp][y];
|
286
|
+
else
|
287
|
+
lst[d] += (*psi)[t][yp][y];
|
288
|
+
}
|
289
|
+
}
|
290
|
+
// 2nd, init the back with the N first
|
291
|
+
size_t *bk = &(*back)[t][y * N];
|
292
|
+
for (size_t n = 0; n < N; n++)
|
293
|
+
bk[n] = n;
|
294
|
+
// 3rd, search the N highest values
|
295
|
+
for (size_t i = N; i < N * Y; i++) {
|
296
|
+
// Search the smallest current value
|
297
|
+
size_t idx = 0;
|
298
|
+
for (size_t n = 1; n < N; n++)
|
299
|
+
if (lst[bk[n]] < lst[bk[idx]])
|
300
|
+
idx = n;
|
301
|
+
// And replace it if needed
|
302
|
+
if (lst[i] > lst[bk[idx]])
|
303
|
+
bk[idx] = i;
|
304
|
+
}
|
305
|
+
// 4th, get the new scores
|
306
|
+
for (size_t n = 0; n < N; n++)
|
307
|
+
cur[y * N + n] = lst[bk[n]];
|
308
|
+
}
|
309
|
+
}
|
310
|
+
// Retrieving the best paths is similar to classical Viterbi except that
|
311
|
+
// we have to search for the N bet ones and there is N time more
|
312
|
+
// possibles starts.
|
313
|
+
for (size_t n = 0; n < N; n++) {
|
314
|
+
int bst = 0;
|
315
|
+
for (size_t d = 1; d < Y * N; d++)
|
316
|
+
if (cur[d] > cur[bst])
|
317
|
+
bst = d;
|
318
|
+
if (sc != NULL)
|
319
|
+
sc[n] = cur[bst];
|
320
|
+
cur[bst] = -DBL_MAX;
|
321
|
+
for (int t = T; t > 0; t--) {
|
322
|
+
const size_t yp = (t != 1) ? (*back)[t - 1][bst] / N: 0;
|
323
|
+
const size_t y = bst / N;
|
324
|
+
out[t - 1][n] = y;
|
325
|
+
if (psc != NULL)
|
326
|
+
psc[t - 1][n] = (*psi)[t - 1][yp][y];
|
327
|
+
bst = (*back)[t - 1][bst];
|
328
|
+
}
|
329
|
+
}
|
330
|
+
free(old);
|
331
|
+
free(cur);
|
332
|
+
free(vback);
|
333
|
+
free(vpsi);
|
334
|
+
}
|
335
|
+
|
336
|
+
/* tag_label:
|
337
|
+
* Label a data file using the current model. This output an almost exact copy
|
338
|
+
* of the input file with an additional column with the predicted label. If
|
339
|
+
* the check option is specified, the input file must be labelled and the
|
340
|
+
* predicted labels will be checked against the provided ones. This will
|
341
|
+
* output error rates during the labelling and detailed statistics per label
|
342
|
+
* at the end.
|
343
|
+
*/
|
344
|
+
void tag_label(mdl_t *mdl, FILE *fin, FILE *fout) {
|
345
|
+
qrk_t *lbls = mdl->reader->lbl;
|
346
|
+
const size_t Y = mdl->nlbl;
|
347
|
+
const size_t N = mdl->opt->nbest;
|
348
|
+
// We start by preparing the statistic collection to be ready if check
|
349
|
+
// option is used. The stat array hold the following for each label
|
350
|
+
// [0] # of reference with this label
|
351
|
+
// [1] # of token we have taged with this label
|
352
|
+
// [2] # of match of the two preceding
|
353
|
+
size_t tcnt = 0, terr = 0;
|
354
|
+
size_t scnt = 0, serr = 0;
|
355
|
+
size_t stat[3][Y];
|
356
|
+
for (size_t y = 0; y < Y; y++)
|
357
|
+
stat[0][y] = stat[1][y] = stat[2][y] = 0;
|
358
|
+
// Next read the input file sequence by sequence and label them, we have
|
359
|
+
// to take care of not discarding the raw input as we want to send it
|
360
|
+
// back to the output with the additional predicted labels.
|
361
|
+
while (!feof(fin)) {
|
362
|
+
// So, first read an input sequence keeping the raw_t object
|
363
|
+
// available, and label it with Viterbi.
|
364
|
+
raw_t *raw = rdr_readraw(mdl->reader, fin);
|
365
|
+
if (raw == NULL)
|
366
|
+
break;
|
367
|
+
seq_t *seq = rdr_raw2seq(mdl->reader, raw, mdl->opt->check);
|
368
|
+
const int T = seq->len;
|
369
|
+
size_t *out = wapiti_xmalloc(sizeof(size_t) * T * N);
|
370
|
+
double *psc = wapiti_xmalloc(sizeof(double) * T * N);
|
371
|
+
double *scs = wapiti_xmalloc(sizeof(double) * N);
|
372
|
+
if (N == 1)
|
373
|
+
tag_viterbi(mdl, seq, (size_t*)out, scs, (double*)psc);
|
374
|
+
else
|
375
|
+
tag_nbviterbi(mdl, seq, N, (void*)out, scs, (void*)psc);
|
376
|
+
// Next we output the raw sequence with an aditional column for
|
377
|
+
// the predicted labels
|
378
|
+
for (size_t n = 0; n < N; n++) {
|
379
|
+
if (mdl->opt->outsc)
|
380
|
+
fprintf(fout, "# %d %f\n", (int)n, scs[n]);
|
381
|
+
for (int t = 0; t < T; t++) {
|
382
|
+
if (!mdl->opt->label)
|
383
|
+
fprintf(fout, "%s\t", raw->lines[t]);
|
384
|
+
size_t lbl = out[t * N + n];
|
385
|
+
const char *lblstr = qrk_id2str(lbls, lbl);
|
386
|
+
fprintf(fout, "%s", lblstr);
|
387
|
+
if (mdl->opt->outsc) {
|
388
|
+
fprintf(fout, "\t%s", lblstr);
|
389
|
+
fprintf(fout, "/%f", psc[t * N + n]);
|
390
|
+
}
|
391
|
+
fprintf(fout, "\n");
|
392
|
+
}
|
393
|
+
fprintf(fout, "\n");
|
394
|
+
}
|
395
|
+
fflush(fout);
|
396
|
+
// If user provided reference labels, use them to collect
|
397
|
+
// statistics about how well we have performed here.
|
398
|
+
if (mdl->opt->check) {
|
399
|
+
bool err = false;
|
400
|
+
for (int t = 0; t < T; t++) {
|
401
|
+
stat[0][seq->pos[t].lbl]++;
|
402
|
+
stat[1][out[t * N]]++;
|
403
|
+
if (seq->pos[t].lbl != out[t * N])
|
404
|
+
terr++, err = true;
|
405
|
+
else
|
406
|
+
stat[2][out[t * N]]++;
|
407
|
+
}
|
408
|
+
tcnt += (size_t)T;
|
409
|
+
serr += err;
|
410
|
+
}
|
411
|
+
// Cleanup memory used for this sequence
|
412
|
+
free(scs);
|
413
|
+
free(psc);
|
414
|
+
free(out);
|
415
|
+
rdr_freeseq(seq);
|
416
|
+
rdr_freeraw(raw);
|
417
|
+
// And report our progress, at regular interval we display how
|
418
|
+
// much sequence are labelled and if possible the current tokens
|
419
|
+
// and sequence error rates.
|
420
|
+
if (++scnt % 1000 == 0) {
|
421
|
+
info("%10zu sequences labeled", scnt);
|
422
|
+
if (mdl->opt->check) {
|
423
|
+
const double te = (double)terr / tcnt * 100.0;
|
424
|
+
const double se = (double)serr / scnt * 100.0;
|
425
|
+
info("\t%5.2f%%/%5.2f%%", te, se);
|
426
|
+
}
|
427
|
+
info("\n");
|
428
|
+
}
|
429
|
+
}
|
430
|
+
// If user have provided reference labels, we have collected a lot of
|
431
|
+
// statistics and we can repport global token and sequence error rate as
|
432
|
+
// well as precision recall and f-measure for each labels.
|
433
|
+
if (mdl->opt->check) {
|
434
|
+
const double te = (double)terr / tcnt * 100.0;
|
435
|
+
const double se = (double)serr / scnt * 100.0;
|
436
|
+
info(" Nb sequences : %zu\n", scnt);
|
437
|
+
info(" Token error : %5.2f%%\n", te);
|
438
|
+
info(" Sequence error: %5.2f%%\n", se);
|
439
|
+
info("* Per label statistics\n");
|
440
|
+
for (size_t y = 0; y < Y; y++) {
|
441
|
+
const char *lbl = qrk_id2str(lbls, y);
|
442
|
+
const double Rc = (double)stat[2][y] / stat[0][y];
|
443
|
+
const double Pr = (double)stat[2][y] / stat[1][y];
|
444
|
+
const double F1 = 2.0 * (Pr * Rc) / (Pr + Rc);
|
445
|
+
info(" %-6s", lbl);
|
446
|
+
info(" Pr=%.2f", Pr);
|
447
|
+
info(" Rc=%.2f", Rc);
|
448
|
+
info(" F1=%.2f\n", F1);
|
449
|
+
}
|
450
|
+
}
|
451
|
+
}
|
452
|
+
|
453
|
+
/* eval_t:
|
454
|
+
* This a state tracker used to communicate between the main eval function and
|
455
|
+
* its workers threads, the <mdl> and <dat> fields are used to transmit to the
|
456
|
+
* workers informations needed to make the computation, the other fields are
|
457
|
+
* for returning the partial results.
|
458
|
+
*/
|
459
|
+
typedef struct eval_s eval_t;
|
460
|
+
struct eval_s {
|
461
|
+
mdl_t *mdl;
|
462
|
+
dat_t *dat;
|
463
|
+
size_t tcnt; // Processed tokens count
|
464
|
+
size_t terr; // Tokens error found
|
465
|
+
size_t scnt; // Processes sequences count
|
466
|
+
size_t serr; // Sequence error found
|
467
|
+
};
|
468
|
+
|
469
|
+
/* tag_evalsub:
|
470
|
+
* This is where the real evaluation is done by the workers, we process data
|
471
|
+
* by batch and for each batch do a simple Viterbi and scan the result to find
|
472
|
+
* errors.
|
473
|
+
*/
|
474
|
+
static void tag_evalsub(job_t *job, int id, int cnt, eval_t *eval) {
|
475
|
+
unused(id && cnt);
|
476
|
+
mdl_t *mdl = eval->mdl;
|
477
|
+
dat_t *dat = eval->dat;
|
478
|
+
eval->tcnt = 0;
|
479
|
+
eval->terr = 0;
|
480
|
+
eval->scnt = 0;
|
481
|
+
eval->serr = 0;
|
482
|
+
// We just get a job a process all the squence in it.
|
483
|
+
size_t count, pos;
|
484
|
+
while (mth_getjob(job, &count, &pos)) {
|
485
|
+
for (size_t s = pos; s < pos + count; s++) {
|
486
|
+
// Tag the sequence with the viterbi
|
487
|
+
const seq_t *seq = dat->seq[s];
|
488
|
+
const int T = seq->len;
|
489
|
+
size_t out[T];
|
490
|
+
tag_viterbi(mdl, seq, out, NULL, NULL);
|
491
|
+
// And check for eventual (probable ?) errors
|
492
|
+
bool err = false;
|
493
|
+
for (int t = 0; t < T; t++)
|
494
|
+
if (seq->pos[t].lbl != out[t])
|
495
|
+
eval->terr++, err = true;
|
496
|
+
eval->tcnt += (size_t)T;
|
497
|
+
eval->scnt += 1;
|
498
|
+
eval->serr += err;
|
499
|
+
}
|
500
|
+
}
|
501
|
+
}
|
502
|
+
|
503
|
+
/* tag_eval:
|
504
|
+
* Compute the token error rate and sequence error rate over the devel set (or
|
505
|
+
* taining set if not available).
|
506
|
+
*/
|
507
|
+
void tag_eval(mdl_t *mdl, double *te, double *se) {
|
508
|
+
const size_t W = mdl->opt->nthread;
|
509
|
+
dat_t *dat = (mdl->devel == NULL) ? mdl->train : mdl->devel;
|
510
|
+
// First we prepare the eval state for all the workers threads, we just
|
511
|
+
// have to give them the model and dataset to use. This state will be
|
512
|
+
// used to retrieve partial result they computed.
|
513
|
+
eval_t *eval[W];
|
514
|
+
for (size_t w = 0; w < W; w++) {
|
515
|
+
eval[w] = wapiti_xmalloc(sizeof(eval_t));
|
516
|
+
eval[w]->mdl = mdl;
|
517
|
+
eval[w]->dat = dat;
|
518
|
+
}
|
519
|
+
// And next, we call the workers to do the job and reduce the partial
|
520
|
+
// result by summing them and computing the final error rates.
|
521
|
+
mth_spawn((func_t *)tag_evalsub, W, (void *)eval, dat->nseq,
|
522
|
+
mdl->opt->jobsize);
|
523
|
+
size_t tcnt = 0, terr = 0;
|
524
|
+
size_t scnt = 0, serr = 0;
|
525
|
+
for (size_t w = 0; w < W; w++) {
|
526
|
+
tcnt += eval[w]->tcnt;
|
527
|
+
terr += eval[w]->terr;
|
528
|
+
scnt += eval[w]->scnt;
|
529
|
+
serr += eval[w]->serr;
|
530
|
+
free(eval[w]);
|
531
|
+
}
|
532
|
+
*te = (double)terr / tcnt * 100.0;
|
533
|
+
*se = (double)serr / scnt * 100.0;
|
534
|
+
}
|
535
|
+
|