wapiti 0.0.5 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.simplecov +3 -0
- data/Gemfile +25 -2
- data/HISTORY.md +5 -1
- data/LICENSE +14 -13
- data/README.md +9 -16
- data/Rakefile +38 -8
- data/ext/wapiti/bcd.c +126 -124
- data/ext/wapiti/decoder.c +203 -124
- data/ext/wapiti/decoder.h +6 -4
- data/ext/wapiti/extconf.rb +2 -2
- data/ext/wapiti/gradient.c +491 -320
- data/ext/wapiti/gradient.h +52 -34
- data/ext/wapiti/lbfgs.c +74 -33
- data/ext/wapiti/model.c +47 -37
- data/ext/wapiti/model.h +22 -20
- data/ext/wapiti/native.c +850 -839
- data/ext/wapiti/native.h +1 -1
- data/ext/wapiti/options.c +52 -20
- data/ext/wapiti/options.h +37 -30
- data/ext/wapiti/pattern.c +35 -33
- data/ext/wapiti/pattern.h +12 -11
- data/ext/wapiti/progress.c +14 -13
- data/ext/wapiti/progress.h +3 -2
- data/ext/wapiti/quark.c +14 -16
- data/ext/wapiti/quark.h +6 -5
- data/ext/wapiti/reader.c +83 -69
- data/ext/wapiti/reader.h +11 -9
- data/ext/wapiti/rprop.c +84 -43
- data/ext/wapiti/sequence.h +18 -16
- data/ext/wapiti/sgdl1.c +45 -43
- data/ext/wapiti/thread.c +19 -17
- data/ext/wapiti/thread.h +5 -4
- data/ext/wapiti/tools.c +7 -7
- data/ext/wapiti/tools.h +3 -4
- data/ext/wapiti/trainers.h +1 -1
- data/ext/wapiti/vmath.c +40 -38
- data/ext/wapiti/vmath.h +12 -11
- data/ext/wapiti/wapiti.c +159 -37
- data/ext/wapiti/wapiti.h +18 -4
- data/lib/wapiti.rb +15 -15
- data/lib/wapiti/errors.rb +15 -15
- data/lib/wapiti/model.rb +92 -84
- data/lib/wapiti/options.rb +123 -124
- data/lib/wapiti/utility.rb +14 -14
- data/lib/wapiti/version.rb +2 -2
- data/spec/spec_helper.rb +29 -9
- data/spec/wapiti/model_spec.rb +230 -194
- data/spec/wapiti/native_spec.rb +7 -8
- data/spec/wapiti/options_spec.rb +184 -174
- data/wapiti.gemspec +22 -8
- metadata +38 -42
- data/.gitignore +0 -5
data/ext/wapiti/decoder.c
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
/*
|
2
2
|
* Wapiti - A linear-chain CRF tool
|
3
3
|
*
|
4
|
-
* Copyright (c) 2009-
|
4
|
+
* Copyright (c) 2009-2013 CNRS
|
5
5
|
* All rights reserved.
|
6
6
|
*
|
7
7
|
* Redistribution and use in source and binary forms, with or without
|
@@ -25,10 +25,14 @@
|
|
25
25
|
* POSSIBILITY OF SUCH DAMAGE.
|
26
26
|
*/
|
27
27
|
|
28
|
+
#include <inttypes.h>
|
28
29
|
#include <float.h>
|
30
|
+
#include <stdint.h>
|
29
31
|
#include <stddef.h>
|
30
32
|
#include <stdlib.h>
|
31
33
|
#include <stdio.h>
|
34
|
+
#include <string.h>
|
35
|
+
#include <math.h>
|
32
36
|
|
33
37
|
#include "wapiti.h"
|
34
38
|
#include "gradient.h"
|
@@ -39,6 +43,7 @@
|
|
39
43
|
#include "thread.h"
|
40
44
|
#include "tools.h"
|
41
45
|
#include "decoder.h"
|
46
|
+
#include "vmath.h"
|
42
47
|
|
43
48
|
/******************************************************************************
|
44
49
|
* Sequence tagging
|
@@ -60,9 +65,9 @@
|
|
60
65
|
* works in log-space.
|
61
66
|
*/
|
62
67
|
static int tag_expsc(mdl_t *mdl, const seq_t *seq, double *vpsi) {
|
63
|
-
const double
|
64
|
-
const
|
65
|
-
const
|
68
|
+
const double *x = mdl->theta;
|
69
|
+
const uint32_t Y = mdl->nlbl;
|
70
|
+
const uint32_t T = seq->len;
|
66
71
|
double (*psi)[T][Y][Y] = (void *)vpsi;
|
67
72
|
// We first have to compute the Ψ_t(y',y,x_t) weights defined as
|
68
73
|
// Ψ_t(y',y,x_t) = \exp( ∑_k θ_k f_k(y',y,x_t) )
|
@@ -88,25 +93,25 @@ static int tag_expsc(mdl_t *mdl, const seq_t *seq, double *vpsi) {
|
|
88
93
|
// 2/ we add the bigrams features weights by looping over actives
|
89
94
|
// bigrams observations (we don't have to do this for t=0 since
|
90
95
|
// there are no bigrams here)
|
91
|
-
for (
|
96
|
+
for (uint32_t t = 0; t < T; t++) {
|
92
97
|
const pos_t *pos = &(seq->pos[t]);
|
93
|
-
for (
|
98
|
+
for (uint32_t y = 0; y < Y; y++) {
|
94
99
|
double sum = 0.0;
|
95
|
-
for (
|
96
|
-
const
|
100
|
+
for (uint32_t n = 0; n < pos->ucnt; n++) {
|
101
|
+
const uint64_t o = pos->uobs[n];
|
97
102
|
sum += x[mdl->uoff[o] + y];
|
98
103
|
}
|
99
|
-
for (
|
104
|
+
for (uint32_t yp = 0; yp < Y; yp++)
|
100
105
|
(*psi)[t][yp][y] = sum;
|
101
106
|
}
|
102
107
|
}
|
103
|
-
for (
|
108
|
+
for (uint32_t t = 1; t < T; t++) {
|
104
109
|
const pos_t *pos = &(seq->pos[t]);
|
105
|
-
for (
|
106
|
-
for (
|
110
|
+
for (uint32_t yp = 0, d = 0; yp < Y; yp++) {
|
111
|
+
for (uint32_t y = 0; y < Y; y++, d++) {
|
107
112
|
double sum = 0.0;
|
108
|
-
for (
|
109
|
-
const
|
113
|
+
for (uint32_t n = 0; n < pos->bcnt; n++) {
|
114
|
+
const uint64_t o = pos->bobs[n];
|
110
115
|
sum += x[mdl->boff[o] + d];
|
111
116
|
}
|
112
117
|
(*psi)[t][yp][y] += sum;
|
@@ -116,6 +121,30 @@ static int tag_expsc(mdl_t *mdl, const seq_t *seq, double *vpsi) {
|
|
116
121
|
return 0;
|
117
122
|
}
|
118
123
|
|
124
|
+
/* tag_memmsc:
|
125
|
+
* Compute the score for viterbi decoding of MEMM models. This use the
|
126
|
+
* previous function to compute the classical scores and then normalize them
|
127
|
+
* relative to the previous label. This normalization must be done in linear
|
128
|
+
* space, not in logarithmic space.
|
129
|
+
*/
|
130
|
+
static int tag_memmsc(mdl_t *mdl, const seq_t *seq, double *vpsi) {
|
131
|
+
const uint32_t Y = mdl->nlbl;
|
132
|
+
const uint32_t T = seq->len;
|
133
|
+
tag_expsc(mdl, seq, vpsi);
|
134
|
+
xvm_expma(vpsi, vpsi, 0.0, T * Y * Y);
|
135
|
+
double (*psi)[T][Y][Y] = (void *)vpsi;
|
136
|
+
for (uint32_t t = 0; t < T; t++) {
|
137
|
+
for (uint32_t yp = 0; yp < Y; yp++) {
|
138
|
+
double sum = 0.0;
|
139
|
+
for (uint32_t y = 0; y < Y; y++)
|
140
|
+
sum += (*psi)[t][yp][y];
|
141
|
+
for (uint32_t y = 0; y < Y; y++)
|
142
|
+
(*psi)[t][yp][y] /= sum;
|
143
|
+
}
|
144
|
+
}
|
145
|
+
return 1;
|
146
|
+
}
|
147
|
+
|
119
148
|
/* tag_postsc:
|
120
149
|
* This function computes the score lattice with posteriors. This generally results
|
121
150
|
* in a slightly better labelling and allows outputting normalized scores for the
|
@@ -123,34 +152,71 @@ static int tag_expsc(mdl_t *mdl, const seq_t *seq, double *vpsi) {
|
|
123
152
|
* a full forward backward instead of just the forward pass.
|
124
153
|
*/
|
125
154
|
static int tag_postsc(mdl_t *mdl, const seq_t *seq, double *vpsi) {
|
126
|
-
const
|
127
|
-
const
|
155
|
+
const uint32_t Y = mdl->nlbl;
|
156
|
+
const uint32_t T = seq->len;
|
128
157
|
double (*psi)[T][Y][Y] = (void *)vpsi;
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
158
|
+
grd_st_t *grd_st = grd_stnew(mdl, NULL);
|
159
|
+
grd_st->first = 0;
|
160
|
+
grd_st->last = T - 1;
|
161
|
+
grd_stcheck(grd_st, seq->len);
|
133
162
|
if (mdl->opt->sparse) {
|
134
|
-
grd_spdopsi(
|
135
|
-
grd_spfwdbwd(
|
163
|
+
grd_spdopsi(grd_st, seq);
|
164
|
+
grd_spfwdbwd(grd_st, seq);
|
136
165
|
} else {
|
137
|
-
grd_fldopsi(
|
138
|
-
grd_flfwdbwd(
|
166
|
+
grd_fldopsi(grd_st, seq);
|
167
|
+
grd_flfwdbwd(grd_st, seq);
|
139
168
|
}
|
140
|
-
double (*alpha)[T][Y] = (void *)
|
141
|
-
double (*beta )[T][Y] = (void *)
|
142
|
-
double *unorm =
|
143
|
-
for (
|
144
|
-
for (
|
169
|
+
double (*alpha)[T][Y] = (void *)grd_st->alpha;
|
170
|
+
double (*beta )[T][Y] = (void *)grd_st->beta;
|
171
|
+
double *unorm = grd_st->unorm;
|
172
|
+
for (uint32_t t = 0; t < T; t++) {
|
173
|
+
for (uint32_t y = 0; y < Y; y++) {
|
145
174
|
double e = (*alpha)[t][y] * (*beta)[t][y] * unorm[t];
|
146
|
-
for (
|
175
|
+
for (uint32_t yp = 0; yp < Y; yp++)
|
147
176
|
(*psi)[t][yp][y] = e;
|
148
177
|
}
|
149
178
|
}
|
150
|
-
|
179
|
+
grd_stfree(grd_st);
|
151
180
|
return 1;
|
152
181
|
}
|
153
182
|
|
183
|
+
/* tag_forced:
|
184
|
+
* This function applies corrections to the psi table to take account of already
|
185
|
+
* known labels. If a label is known, all arcs leading to or coming from other
|
186
|
+
* labels at this position are NULLified and will not be selected by the
|
187
|
+
* decoder.
|
188
|
+
*/
|
189
|
+
static void tag_forced(mdl_t *mdl, const seq_t *seq, double *vpsi, int op) {
|
190
|
+
const uint32_t Y = mdl->nlbl;
|
191
|
+
const uint32_t T = seq->len;
|
192
|
+
const double v = op ? 0.0 : -HUGE_VAL;
|
193
|
+
double (*psi)[T][Y][Y] = (void *)vpsi;
|
194
|
+
for (uint32_t t = 0; t < T; t++) {
|
195
|
+
const uint32_t yr = seq->pos[t].lbl;
|
196
|
+
if (yr == (uint32_t)-1)
|
197
|
+
continue;
|
198
|
+
if (t != 0)
|
199
|
+
for (uint32_t y = 0; y < Y; y++)
|
200
|
+
if (y != yr)
|
201
|
+
for (uint32_t yp = 0; yp < Y; yp++)
|
202
|
+
(*psi)[t][yp][y] = v;
|
203
|
+
if (t != T - 1)
|
204
|
+
for (uint32_t y = 0; y < Y; y++)
|
205
|
+
if (y != yr)
|
206
|
+
for (uint32_t yn = 0; yn < Y; yn++)
|
207
|
+
(*psi)[t + 1][y][yn] = v;
|
208
|
+
}
|
209
|
+
const uint32_t yr = seq->pos[0].lbl;
|
210
|
+
if (yr != (uint32_t)-1) {
|
211
|
+
for (uint32_t y = 0; y < Y; y++) {
|
212
|
+
if (yr == y)
|
213
|
+
continue;
|
214
|
+
for (uint32_t yp = 0; yp < Y; yp++)
|
215
|
+
(*psi)[0][yp][y] = v;
|
216
|
+
}
|
217
|
+
}
|
218
|
+
}
|
219
|
+
|
154
220
|
/* tag_viterbi:
|
155
221
|
* This function implements the Viterbi algorithm in order to decode the most
|
156
222
|
* probable sequence of labels according to the model. Some part of this code
|
@@ -160,22 +226,26 @@ static int tag_postsc(mdl_t *mdl, const seq_t *seq, double *vpsi) {
|
|
160
226
|
* enough stack space.
|
161
227
|
*/
|
162
228
|
void tag_viterbi(mdl_t *mdl, const seq_t *seq,
|
163
|
-
|
164
|
-
const
|
165
|
-
const
|
166
|
-
double
|
167
|
-
|
168
|
-
double
|
169
|
-
|
170
|
-
double
|
171
|
-
double
|
229
|
+
uint32_t out[], double *sc, double psc[]) {
|
230
|
+
const uint32_t Y = mdl->nlbl;
|
231
|
+
const uint32_t T = seq->len;
|
232
|
+
double *vpsi = xvm_new(T * Y * Y);
|
233
|
+
uint32_t *vback = wapiti_xmalloc(sizeof(uint32_t) * T * Y);
|
234
|
+
double (*psi) [T][Y][Y] = (void *)vpsi;
|
235
|
+
uint32_t (*back)[T][Y] = (void *)vback;
|
236
|
+
double *cur = wapiti_xmalloc(sizeof(double) * Y);
|
237
|
+
double *old = wapiti_xmalloc(sizeof(double) * Y);
|
172
238
|
// We first compute the scores for each transitions in the lattice of
|
173
239
|
// labels.
|
174
240
|
int op;
|
175
|
-
if (mdl->
|
241
|
+
if (mdl->type == 1)
|
242
|
+
op = tag_memmsc(mdl, seq, vpsi);
|
243
|
+
else if (mdl->opt->lblpost)
|
176
244
|
op = tag_postsc(mdl, seq, vpsi);
|
177
245
|
else
|
178
246
|
op = tag_expsc(mdl, seq, vpsi);
|
247
|
+
if (mdl->opt->force)
|
248
|
+
tag_forced(mdl, seq, vpsi, op);
|
179
249
|
// Now we can do the Viterbi algorithm. This is very similar to the
|
180
250
|
// forward pass
|
181
251
|
// | α_1(y) = Ψ_1(y,x_1)
|
@@ -189,15 +259,15 @@ void tag_viterbi(mdl_t *mdl, const seq_t *seq,
|
|
189
259
|
// the index of the y value selected by the max. This also means that
|
190
260
|
// we only need the current and previous value of the α vectors, not
|
191
261
|
// the full matrix.
|
192
|
-
for (
|
262
|
+
for (uint32_t y = 0; y < Y; y++)
|
193
263
|
cur[y] = (*psi)[0][0][y];
|
194
|
-
for (
|
195
|
-
for (
|
264
|
+
for (uint32_t t = 1; t < T; t++) {
|
265
|
+
for (uint32_t y = 0; y < Y; y++)
|
196
266
|
old[y] = cur[y];
|
197
|
-
for (
|
198
|
-
double
|
199
|
-
|
200
|
-
for (
|
267
|
+
for (uint32_t y = 0; y < Y; y++) {
|
268
|
+
double bst = -HUGE_VAL;
|
269
|
+
uint32_t idx = 0;
|
270
|
+
for (uint32_t yp = 0; yp < Y; yp++) {
|
201
271
|
double val = old[yp];
|
202
272
|
if (op)
|
203
273
|
val *= (*psi)[t][yp][y];
|
@@ -216,15 +286,15 @@ void tag_viterbi(mdl_t *mdl, const seq_t *seq,
|
|
216
286
|
// this we search in the last α vector the best value. Using this index
|
217
287
|
// as a starting point in the back-pointer array we finally can decode
|
218
288
|
// the best sequence.
|
219
|
-
|
220
|
-
for (
|
289
|
+
uint32_t bst = 0;
|
290
|
+
for (uint32_t y = 1; y < Y; y++)
|
221
291
|
if (cur[y] > cur[bst])
|
222
292
|
bst = y;
|
223
293
|
if (sc != NULL)
|
224
294
|
*sc = cur[bst];
|
225
|
-
for (
|
226
|
-
const
|
227
|
-
const
|
295
|
+
for (uint32_t t = T; t > 0; t--) {
|
296
|
+
const uint32_t yp = (t != 1) ? (*back)[t - 1][bst] : 0;
|
297
|
+
const uint32_t y = bst;
|
228
298
|
out[t - 1] = y;
|
229
299
|
if (psc != NULL)
|
230
300
|
psc[t - 1] = (*psi)[t - 1][yp][y];
|
@@ -233,7 +303,7 @@ void tag_viterbi(mdl_t *mdl, const seq_t *seq,
|
|
233
303
|
free(old);
|
234
304
|
free(cur);
|
235
305
|
free(vback);
|
236
|
-
|
306
|
+
xvm_free(vpsi);
|
237
307
|
}
|
238
308
|
|
239
309
|
/* tag_nbviterbi:
|
@@ -242,23 +312,27 @@ void tag_viterbi(mdl_t *mdl, const seq_t *seq,
|
|
242
312
|
* compute only the best one and will return the same sequence as the
|
243
313
|
* previous function but will be slower to do it.
|
244
314
|
*/
|
245
|
-
void tag_nbviterbi(mdl_t *mdl, const seq_t *seq,
|
246
|
-
|
247
|
-
const
|
248
|
-
const
|
249
|
-
double
|
250
|
-
|
251
|
-
double
|
252
|
-
|
253
|
-
double
|
254
|
-
double
|
315
|
+
void tag_nbviterbi(mdl_t *mdl, const seq_t *seq, uint32_t N,
|
316
|
+
uint32_t out[][N], double sc[], double psc[][N]) {
|
317
|
+
const uint32_t Y = mdl->nlbl;
|
318
|
+
const uint32_t T = seq->len;
|
319
|
+
double *vpsi = xvm_new(T * Y * Y);
|
320
|
+
uint32_t *vback = wapiti_xmalloc(sizeof(uint32_t) * T * Y * N);
|
321
|
+
double (*psi) [T][Y ][Y] = (void *)vpsi;
|
322
|
+
uint32_t (*back)[T][Y * N] = (void *)vback;
|
323
|
+
double *cur = wapiti_xmalloc(sizeof(double) * Y * N);
|
324
|
+
double *old = wapiti_xmalloc(sizeof(double) * Y * N);
|
255
325
|
// We first compute the scores for each transitions in the lattice of
|
256
326
|
// labels.
|
257
327
|
int op;
|
258
|
-
if (mdl->
|
328
|
+
if (mdl->type == 1)
|
329
|
+
op = tag_memmsc(mdl, seq, vpsi);
|
330
|
+
else if (mdl->opt->lblpost)
|
259
331
|
op = tag_postsc(mdl, seq, (double *)psi);
|
260
332
|
else
|
261
333
|
op = tag_expsc(mdl, seq, (double *)psi);
|
334
|
+
if (mdl->opt->force)
|
335
|
+
tag_forced(mdl, seq, vpsi, op);
|
262
336
|
// Here also, it's classical but we have to keep the N best paths
|
263
337
|
// leading to each nodes of the lattice instead of only the best one.
|
264
338
|
// This mean that code is less trivial and the current implementation is
|
@@ -267,19 +341,19 @@ void tag_nbviterbi(mdl_t *mdl, const seq_t *seq, size_t N,
|
|
267
341
|
// We first build the list of all incoming arcs from all paths from all
|
268
342
|
// N-best nodes and next select the N-best one. There is a lot of room
|
269
343
|
// here for later optimisations if needed.
|
270
|
-
for (
|
344
|
+
for (uint32_t y = 0, d = 0; y < Y; y++) {
|
271
345
|
cur[d++] = (*psi)[0][0][y];
|
272
|
-
for (
|
346
|
+
for (uint32_t n = 1; n < N; n++)
|
273
347
|
cur[d++] = -DBL_MAX;
|
274
348
|
}
|
275
|
-
for (
|
276
|
-
for (
|
349
|
+
for (uint32_t t = 1; t < T; t++) {
|
350
|
+
for (uint32_t d = 0; d < Y * N; d++)
|
277
351
|
old[d] = cur[d];
|
278
|
-
for (
|
352
|
+
for (uint32_t y = 0; y < Y; y++) {
|
279
353
|
// 1st, build the list of all incoming
|
280
354
|
double lst[Y * N];
|
281
|
-
for (
|
282
|
-
for (
|
355
|
+
for (uint32_t yp = 0, d = 0; yp < Y; yp++) {
|
356
|
+
for (uint32_t n = 0; n < N; n++, d++) {
|
283
357
|
lst[d] = old[d];
|
284
358
|
if (op)
|
285
359
|
lst[d] *= (*psi)[t][yp][y];
|
@@ -288,14 +362,14 @@ void tag_nbviterbi(mdl_t *mdl, const seq_t *seq, size_t N,
|
|
288
362
|
}
|
289
363
|
}
|
290
364
|
// 2nd, init the back with the N first
|
291
|
-
|
292
|
-
for (
|
365
|
+
uint32_t *bk = &(*back)[t][y * N];
|
366
|
+
for (uint32_t n = 0; n < N; n++)
|
293
367
|
bk[n] = n;
|
294
368
|
// 3rd, search the N highest values
|
295
|
-
for (
|
369
|
+
for (uint32_t i = N; i < N * Y; i++) {
|
296
370
|
// Search the smallest current value
|
297
|
-
|
298
|
-
for (
|
371
|
+
uint32_t idx = 0;
|
372
|
+
for (uint32_t n = 1; n < N; n++)
|
299
373
|
if (lst[bk[n]] < lst[bk[idx]])
|
300
374
|
idx = n;
|
301
375
|
// And replace it if needed
|
@@ -303,24 +377,24 @@ void tag_nbviterbi(mdl_t *mdl, const seq_t *seq, size_t N,
|
|
303
377
|
bk[idx] = i;
|
304
378
|
}
|
305
379
|
// 4th, get the new scores
|
306
|
-
for (
|
380
|
+
for (uint32_t n = 0; n < N; n++)
|
307
381
|
cur[y * N + n] = lst[bk[n]];
|
308
382
|
}
|
309
383
|
}
|
310
384
|
// Retrieving the best paths is similar to classical Viterbi except that
|
311
385
|
// we have to search for the N bet ones and there is N time more
|
312
386
|
// possibles starts.
|
313
|
-
for (
|
314
|
-
|
315
|
-
for (
|
387
|
+
for (uint32_t n = 0; n < N; n++) {
|
388
|
+
uint32_t bst = 0;
|
389
|
+
for (uint32_t d = 1; d < Y * N; d++)
|
316
390
|
if (cur[d] > cur[bst])
|
317
391
|
bst = d;
|
318
392
|
if (sc != NULL)
|
319
393
|
sc[n] = cur[bst];
|
320
394
|
cur[bst] = -DBL_MAX;
|
321
|
-
for (
|
322
|
-
const
|
323
|
-
const
|
395
|
+
for (uint32_t t = T; t > 0; t--) {
|
396
|
+
const uint32_t yp = (t != 1) ? (*back)[t - 1][bst] / N: 0;
|
397
|
+
const uint32_t y = bst / N;
|
324
398
|
out[t - 1][n] = y;
|
325
399
|
if (psc != NULL)
|
326
400
|
psc[t - 1][n] = (*psi)[t - 1][yp][y];
|
@@ -330,7 +404,7 @@ void tag_nbviterbi(mdl_t *mdl, const seq_t *seq, size_t N,
|
|
330
404
|
free(old);
|
331
405
|
free(cur);
|
332
406
|
free(vback);
|
333
|
-
|
407
|
+
xvm_free(vpsi);
|
334
408
|
}
|
335
409
|
|
336
410
|
/* tag_label:
|
@@ -343,17 +417,17 @@ void tag_nbviterbi(mdl_t *mdl, const seq_t *seq, size_t N,
|
|
343
417
|
*/
|
344
418
|
void tag_label(mdl_t *mdl, FILE *fin, FILE *fout) {
|
345
419
|
qrk_t *lbls = mdl->reader->lbl;
|
346
|
-
const
|
347
|
-
const
|
420
|
+
const uint32_t Y = mdl->nlbl;
|
421
|
+
const uint32_t N = mdl->opt->nbest;
|
348
422
|
// We start by preparing the statistic collection to be ready if check
|
349
423
|
// option is used. The stat array holds the following for each label
|
350
424
|
// [0] # of reference with this label
|
351
425
|
// [1] # of tokens we have tagged with this label
|
352
426
|
// [2] # of match of the two preceding
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
for (
|
427
|
+
uint64_t tcnt = 0, terr = 0;
|
428
|
+
uint64_t scnt = 0, serr = 0;
|
429
|
+
uint64_t stat[3][Y];
|
430
|
+
for (uint32_t y = 0; y < Y; y++)
|
357
431
|
stat[0][y] = stat[1][y] = stat[2][y] = 0;
|
358
432
|
// Next read the input file sequence by sequence and label them, we have
|
359
433
|
// to take care of not discarding the raw input as we want to send it
|
@@ -364,24 +438,25 @@ void tag_label(mdl_t *mdl, FILE *fin, FILE *fout) {
|
|
364
438
|
raw_t *raw = rdr_readraw(mdl->reader, fin);
|
365
439
|
if (raw == NULL)
|
366
440
|
break;
|
367
|
-
seq_t *seq = rdr_raw2seq(mdl->reader, raw,
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
double
|
441
|
+
seq_t *seq = rdr_raw2seq(mdl->reader, raw,
|
442
|
+
mdl->opt->check | mdl->opt->force);
|
443
|
+
const uint32_t T = seq->len;
|
444
|
+
uint32_t *out = wapiti_xmalloc(sizeof(uint32_t) * T * N);
|
445
|
+
double *psc = wapiti_xmalloc(sizeof(double ) * T * N);
|
446
|
+
double *scs = wapiti_xmalloc(sizeof(double ) * N);
|
372
447
|
if (N == 1)
|
373
|
-
tag_viterbi(mdl, seq, (
|
448
|
+
tag_viterbi(mdl, seq, (uint32_t*)out, scs, (double*)psc);
|
374
449
|
else
|
375
450
|
tag_nbviterbi(mdl, seq, N, (void*)out, scs, (void*)psc);
|
376
451
|
// Next we output the raw sequence with an additional column for
|
377
452
|
// the predicted labels
|
378
|
-
for (
|
453
|
+
for (uint32_t n = 0; n < N; n++) {
|
379
454
|
if (mdl->opt->outsc)
|
380
455
|
fprintf(fout, "# %d %f\n", (int)n, scs[n]);
|
381
|
-
for (
|
456
|
+
for (uint32_t t = 0; t < T; t++) {
|
382
457
|
if (!mdl->opt->label)
|
383
458
|
fprintf(fout, "%s\t", raw->lines[t]);
|
384
|
-
|
459
|
+
uint32_t lbl = out[t * N + n];
|
385
460
|
const char *lblstr = qrk_id2str(lbls, lbl);
|
386
461
|
fprintf(fout, "%s", lblstr);
|
387
462
|
if (mdl->opt->outsc) {
|
@@ -394,10 +469,13 @@ void tag_label(mdl_t *mdl, FILE *fin, FILE *fout) {
|
|
394
469
|
}
|
395
470
|
fflush(fout);
|
396
471
|
// If user provided reference labels, use them to collect
|
397
|
-
// statistics about how well we have performed here.
|
472
|
+
// statistics about how well we have performed here. Labels
|
473
|
+
// unseen at training time are discarded.
|
398
474
|
if (mdl->opt->check) {
|
399
475
|
bool err = false;
|
400
|
-
for (
|
476
|
+
for (uint32_t t = 0; t < T; t++) {
|
477
|
+
if (seq->pos[t].lbl == (uint32_t)-1)
|
478
|
+
continue;
|
401
479
|
stat[0][seq->pos[t].lbl]++;
|
402
480
|
stat[1][out[t * N]]++;
|
403
481
|
if (seq->pos[t].lbl != out[t * N])
|
@@ -405,7 +483,7 @@ void tag_label(mdl_t *mdl, FILE *fin, FILE *fout) {
|
|
405
483
|
else
|
406
484
|
stat[2][out[t * N]]++;
|
407
485
|
}
|
408
|
-
tcnt +=
|
486
|
+
tcnt += T;
|
409
487
|
serr += err;
|
410
488
|
}
|
411
489
|
// Cleanup memory used for this sequence
|
@@ -418,7 +496,7 @@ void tag_label(mdl_t *mdl, FILE *fin, FILE *fout) {
|
|
418
496
|
// much sequence are labelled and if possible the current tokens
|
419
497
|
// and sequence error rates.
|
420
498
|
if (++scnt % 1000 == 0) {
|
421
|
-
info("%
|
499
|
+
info("%10"PRIu64" sequences labeled", scnt);
|
422
500
|
if (mdl->opt->check) {
|
423
501
|
const double te = (double)terr / tcnt * 100.0;
|
424
502
|
const double se = (double)serr / scnt * 100.0;
|
@@ -433,11 +511,11 @@ void tag_label(mdl_t *mdl, FILE *fin, FILE *fout) {
|
|
433
511
|
if (mdl->opt->check) {
|
434
512
|
const double te = (double)terr / tcnt * 100.0;
|
435
513
|
const double se = (double)serr / scnt * 100.0;
|
436
|
-
info(" Nb sequences : %
|
514
|
+
info(" Nb sequences : %"PRIu64"\n", scnt);
|
437
515
|
info(" Token error : %5.2f%%\n", te);
|
438
516
|
info(" Sequence error: %5.2f%%\n", se);
|
439
517
|
info("* Per label statistics\n");
|
440
|
-
for (
|
518
|
+
for (uint32_t y = 0; y < Y; y++) {
|
441
519
|
const char *lbl = qrk_id2str(lbls, y);
|
442
520
|
const double Rc = (double)stat[2][y] / stat[0][y];
|
443
521
|
const double Pr = (double)stat[2][y] / stat[1][y];
|
@@ -458,12 +536,12 @@ void tag_label(mdl_t *mdl, FILE *fin, FILE *fout) {
|
|
458
536
|
*/
|
459
537
|
typedef struct eval_s eval_t;
|
460
538
|
struct eval_s {
|
461
|
-
mdl_t
|
462
|
-
dat_t
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
539
|
+
mdl_t *mdl;
|
540
|
+
dat_t *dat;
|
541
|
+
uint64_t tcnt; // Processed tokens count
|
542
|
+
uint64_t terr; // Tokens error found
|
543
|
+
uint64_t scnt; // Processes sequences count
|
544
|
+
uint64_t serr; // Sequence error found
|
467
545
|
};
|
468
546
|
|
469
547
|
/* tag_evalsub:
|
@@ -471,7 +549,7 @@ struct eval_s {
|
|
471
549
|
* by batch and for each batch do a simple Viterbi and scan the result to find
|
472
550
|
* errors.
|
473
551
|
*/
|
474
|
-
static void tag_evalsub(job_t *job,
|
552
|
+
static void tag_evalsub(job_t *job, uint32_t id, uint32_t cnt, eval_t *eval) {
|
475
553
|
unused(id && cnt);
|
476
554
|
mdl_t *mdl = eval->mdl;
|
477
555
|
dat_t *dat = eval->dat;
|
@@ -480,22 +558,23 @@ static void tag_evalsub(job_t *job, int id, int cnt, eval_t *eval) {
|
|
480
558
|
eval->scnt = 0;
|
481
559
|
eval->serr = 0;
|
482
560
|
// We just get a job and process all the sequences in it.
|
483
|
-
|
561
|
+
uint32_t count, pos;
|
484
562
|
while (mth_getjob(job, &count, &pos)) {
|
485
|
-
for (
|
563
|
+
for (uint32_t s = pos; s < pos + count; s++) {
|
486
564
|
// Tag the sequence with the viterbi
|
487
565
|
const seq_t *seq = dat->seq[s];
|
488
|
-
const
|
489
|
-
|
566
|
+
const uint32_t T = seq->len;
|
567
|
+
uint32_t *out = wapiti_xmalloc(sizeof(uint32_t) * T);
|
490
568
|
tag_viterbi(mdl, seq, out, NULL, NULL);
|
491
569
|
// And check for eventual (probable ?) errors
|
492
570
|
bool err = false;
|
493
|
-
for (
|
571
|
+
for (uint32_t t = 0; t < T; t++)
|
494
572
|
if (seq->pos[t].lbl != out[t])
|
495
573
|
eval->terr++, err = true;
|
496
|
-
eval->tcnt +=
|
574
|
+
eval->tcnt += T;
|
497
575
|
eval->scnt += 1;
|
498
576
|
eval->serr += err;
|
577
|
+
free(out);
|
499
578
|
}
|
500
579
|
}
|
501
580
|
}
|
@@ -505,13 +584,13 @@ static void tag_evalsub(job_t *job, int id, int cnt, eval_t *eval) {
|
|
505
584
|
* training set if not available).
|
506
585
|
*/
|
507
586
|
void tag_eval(mdl_t *mdl, double *te, double *se) {
|
508
|
-
const
|
587
|
+
const uint32_t W = mdl->opt->nthread;
|
509
588
|
dat_t *dat = (mdl->devel == NULL) ? mdl->train : mdl->devel;
|
510
589
|
// First we prepare the eval state for all the workers threads, we just
|
511
590
|
// have to give them the model and dataset to use. This state will be
|
512
591
|
// used to retrieve partial result they computed.
|
513
592
|
eval_t *eval[W];
|
514
|
-
for (
|
593
|
+
for (uint32_t w = 0; w < W; w++) {
|
515
594
|
eval[w] = wapiti_xmalloc(sizeof(eval_t));
|
516
595
|
eval[w]->mdl = mdl;
|
517
596
|
eval[w]->dat = dat;
|
@@ -520,9 +599,9 @@ void tag_eval(mdl_t *mdl, double *te, double *se) {
|
|
520
599
|
// result by summing them and computing the final error rates.
|
521
600
|
mth_spawn((func_t *)tag_evalsub, W, (void *)eval, dat->nseq,
|
522
601
|
mdl->opt->jobsize);
|
523
|
-
|
524
|
-
|
525
|
-
for (
|
602
|
+
uint64_t tcnt = 0, terr = 0;
|
603
|
+
uint64_t scnt = 0, serr = 0;
|
604
|
+
for (uint32_t w = 0; w < W; w++) {
|
526
605
|
tcnt += eval[w]->tcnt;
|
527
606
|
terr += eval[w]->terr;
|
528
607
|
scnt += eval[w]->scnt;
|