wapiti 0.0.5 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.simplecov +3 -0
- data/Gemfile +25 -2
- data/HISTORY.md +5 -1
- data/LICENSE +14 -13
- data/README.md +9 -16
- data/Rakefile +38 -8
- data/ext/wapiti/bcd.c +126 -124
- data/ext/wapiti/decoder.c +203 -124
- data/ext/wapiti/decoder.h +6 -4
- data/ext/wapiti/extconf.rb +2 -2
- data/ext/wapiti/gradient.c +491 -320
- data/ext/wapiti/gradient.h +52 -34
- data/ext/wapiti/lbfgs.c +74 -33
- data/ext/wapiti/model.c +47 -37
- data/ext/wapiti/model.h +22 -20
- data/ext/wapiti/native.c +850 -839
- data/ext/wapiti/native.h +1 -1
- data/ext/wapiti/options.c +52 -20
- data/ext/wapiti/options.h +37 -30
- data/ext/wapiti/pattern.c +35 -33
- data/ext/wapiti/pattern.h +12 -11
- data/ext/wapiti/progress.c +14 -13
- data/ext/wapiti/progress.h +3 -2
- data/ext/wapiti/quark.c +14 -16
- data/ext/wapiti/quark.h +6 -5
- data/ext/wapiti/reader.c +83 -69
- data/ext/wapiti/reader.h +11 -9
- data/ext/wapiti/rprop.c +84 -43
- data/ext/wapiti/sequence.h +18 -16
- data/ext/wapiti/sgdl1.c +45 -43
- data/ext/wapiti/thread.c +19 -17
- data/ext/wapiti/thread.h +5 -4
- data/ext/wapiti/tools.c +7 -7
- data/ext/wapiti/tools.h +3 -4
- data/ext/wapiti/trainers.h +1 -1
- data/ext/wapiti/vmath.c +40 -38
- data/ext/wapiti/vmath.h +12 -11
- data/ext/wapiti/wapiti.c +159 -37
- data/ext/wapiti/wapiti.h +18 -4
- data/lib/wapiti.rb +15 -15
- data/lib/wapiti/errors.rb +15 -15
- data/lib/wapiti/model.rb +92 -84
- data/lib/wapiti/options.rb +123 -124
- data/lib/wapiti/utility.rb +14 -14
- data/lib/wapiti/version.rb +2 -2
- data/spec/spec_helper.rb +29 -9
- data/spec/wapiti/model_spec.rb +230 -194
- data/spec/wapiti/native_spec.rb +7 -8
- data/spec/wapiti/options_spec.rb +184 -174
- data/wapiti.gemspec +22 -8
- metadata +38 -42
- data/.gitignore +0 -5
data/ext/wapiti/decoder.c
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
/*
|
2
2
|
* Wapiti - A linear-chain CRF tool
|
3
3
|
*
|
4
|
-
* Copyright (c) 2009-
|
4
|
+
* Copyright (c) 2009-2013 CNRS
|
5
5
|
* All rights reserved.
|
6
6
|
*
|
7
7
|
* Redistribution and use in source and binary forms, with or without
|
@@ -25,10 +25,14 @@
|
|
25
25
|
* POSSIBILITY OF SUCH DAMAGE.
|
26
26
|
*/
|
27
27
|
|
28
|
+
#include <inttypes.h>
|
28
29
|
#include <float.h>
|
30
|
+
#include <stdint.h>
|
29
31
|
#include <stddef.h>
|
30
32
|
#include <stdlib.h>
|
31
33
|
#include <stdio.h>
|
34
|
+
#include <string.h>
|
35
|
+
#include <math.h>
|
32
36
|
|
33
37
|
#include "wapiti.h"
|
34
38
|
#include "gradient.h"
|
@@ -39,6 +43,7 @@
|
|
39
43
|
#include "thread.h"
|
40
44
|
#include "tools.h"
|
41
45
|
#include "decoder.h"
|
46
|
+
#include "vmath.h"
|
42
47
|
|
43
48
|
/******************************************************************************
|
44
49
|
* Sequence tagging
|
@@ -60,9 +65,9 @@
|
|
60
65
|
* works in log-space.
|
61
66
|
*/
|
62
67
|
static int tag_expsc(mdl_t *mdl, const seq_t *seq, double *vpsi) {
|
63
|
-
const double
|
64
|
-
const
|
65
|
-
const
|
68
|
+
const double *x = mdl->theta;
|
69
|
+
const uint32_t Y = mdl->nlbl;
|
70
|
+
const uint32_t T = seq->len;
|
66
71
|
double (*psi)[T][Y][Y] = (void *)vpsi;
|
67
72
|
// We first have to compute the Ψ_t(y',y,x_t) weights defined as
|
68
73
|
// Ψ_t(y',y,x_t) = \exp( ∑_k θ_k f_k(y',y,x_t) )
|
@@ -88,25 +93,25 @@ static int tag_expsc(mdl_t *mdl, const seq_t *seq, double *vpsi) {
|
|
88
93
|
// 2/ we add the bigrams features weights by looping over actives
|
89
94
|
// bigrams observations (we don't have to do this for t=0 since
|
90
95
|
// there is no bigrams here)
|
91
|
-
for (
|
96
|
+
for (uint32_t t = 0; t < T; t++) {
|
92
97
|
const pos_t *pos = &(seq->pos[t]);
|
93
|
-
for (
|
98
|
+
for (uint32_t y = 0; y < Y; y++) {
|
94
99
|
double sum = 0.0;
|
95
|
-
for (
|
96
|
-
const
|
100
|
+
for (uint32_t n = 0; n < pos->ucnt; n++) {
|
101
|
+
const uint64_t o = pos->uobs[n];
|
97
102
|
sum += x[mdl->uoff[o] + y];
|
98
103
|
}
|
99
|
-
for (
|
104
|
+
for (uint32_t yp = 0; yp < Y; yp++)
|
100
105
|
(*psi)[t][yp][y] = sum;
|
101
106
|
}
|
102
107
|
}
|
103
|
-
for (
|
108
|
+
for (uint32_t t = 1; t < T; t++) {
|
104
109
|
const pos_t *pos = &(seq->pos[t]);
|
105
|
-
for (
|
106
|
-
for (
|
110
|
+
for (uint32_t yp = 0, d = 0; yp < Y; yp++) {
|
111
|
+
for (uint32_t y = 0; y < Y; y++, d++) {
|
107
112
|
double sum = 0.0;
|
108
|
-
for (
|
109
|
-
const
|
113
|
+
for (uint32_t n = 0; n < pos->bcnt; n++) {
|
114
|
+
const uint64_t o = pos->bobs[n];
|
110
115
|
sum += x[mdl->boff[o] + d];
|
111
116
|
}
|
112
117
|
(*psi)[t][yp][y] += sum;
|
@@ -116,6 +121,30 @@ static int tag_expsc(mdl_t *mdl, const seq_t *seq, double *vpsi) {
|
|
116
121
|
return 0;
|
117
122
|
}
|
118
123
|
|
124
|
+
/* tag_memmsc:
|
125
|
+
* Compute the score for viterbi decoding of MEMM models. This uses the
|
126
|
+
* previous function to compute the classical scores and then normalizes them
|
127
|
+
* relative to the previous label. This normalization must be done in linear
|
128
|
+
* space, not in logarithmic space.
|
129
|
+
*/
|
130
|
+
static int tag_memmsc(mdl_t *mdl, const seq_t *seq, double *vpsi) {
|
131
|
+
const uint32_t Y = mdl->nlbl;
|
132
|
+
const uint32_t T = seq->len;
|
133
|
+
tag_expsc(mdl, seq, vpsi);
|
134
|
+
xvm_expma(vpsi, vpsi, 0.0, T * Y * Y);
|
135
|
+
double (*psi)[T][Y][Y] = (void *)vpsi;
|
136
|
+
for (uint32_t t = 0; t < T; t++) {
|
137
|
+
for (uint32_t yp = 0; yp < Y; yp++) {
|
138
|
+
double sum = 0.0;
|
139
|
+
for (uint32_t y = 0; y < Y; y++)
|
140
|
+
sum += (*psi)[t][yp][y];
|
141
|
+
for (uint32_t y = 0; y < Y; y++)
|
142
|
+
(*psi)[t][yp][y] /= sum;
|
143
|
+
}
|
144
|
+
}
|
145
|
+
return 1;
|
146
|
+
}
|
147
|
+
|
119
148
|
/* tag_postsc:
|
120
149
|
* This function computes the score lattice with posteriors. This generally results
|
121
150
|
* in a slightly better labelling and allows outputting normalized scores for the
|
@@ -123,34 +152,71 @@ static int tag_expsc(mdl_t *mdl, const seq_t *seq, double *vpsi) {
|
|
123
152
|
* a full forward backward instead of just the forward pass.
|
124
153
|
*/
|
125
154
|
static int tag_postsc(mdl_t *mdl, const seq_t *seq, double *vpsi) {
|
126
|
-
const
|
127
|
-
const
|
155
|
+
const uint32_t Y = mdl->nlbl;
|
156
|
+
const uint32_t T = seq->len;
|
128
157
|
double (*psi)[T][Y][Y] = (void *)vpsi;
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
158
|
+
grd_st_t *grd_st = grd_stnew(mdl, NULL);
|
159
|
+
grd_st->first = 0;
|
160
|
+
grd_st->last = T - 1;
|
161
|
+
grd_stcheck(grd_st, seq->len);
|
133
162
|
if (mdl->opt->sparse) {
|
134
|
-
grd_spdopsi(
|
135
|
-
grd_spfwdbwd(
|
163
|
+
grd_spdopsi(grd_st, seq);
|
164
|
+
grd_spfwdbwd(grd_st, seq);
|
136
165
|
} else {
|
137
|
-
grd_fldopsi(
|
138
|
-
grd_flfwdbwd(
|
166
|
+
grd_fldopsi(grd_st, seq);
|
167
|
+
grd_flfwdbwd(grd_st, seq);
|
139
168
|
}
|
140
|
-
double (*alpha)[T][Y] = (void *)
|
141
|
-
double (*beta )[T][Y] = (void *)
|
142
|
-
double *unorm =
|
143
|
-
for (
|
144
|
-
for (
|
169
|
+
double (*alpha)[T][Y] = (void *)grd_st->alpha;
|
170
|
+
double (*beta )[T][Y] = (void *)grd_st->beta;
|
171
|
+
double *unorm = grd_st->unorm;
|
172
|
+
for (uint32_t t = 0; t < T; t++) {
|
173
|
+
for (uint32_t y = 0; y < Y; y++) {
|
145
174
|
double e = (*alpha)[t][y] * (*beta)[t][y] * unorm[t];
|
146
|
-
for (
|
175
|
+
for (uint32_t yp = 0; yp < Y; yp++)
|
147
176
|
(*psi)[t][yp][y] = e;
|
148
177
|
}
|
149
178
|
}
|
150
|
-
|
179
|
+
grd_stfree(grd_st);
|
151
180
|
return 1;
|
152
181
|
}
|
153
182
|
|
183
|
+
/* tag_forced:
|
184
|
+
* This function applies corrections to the psi table to take into account already
|
185
|
+
* known labels. If a label is known, all arcs leading to or coming from other
|
186
|
+
* labels at this position are NULLified and will not be selected by the
|
187
|
+
* decoder.
|
188
|
+
*/
|
189
|
+
static void tag_forced(mdl_t *mdl, const seq_t *seq, double *vpsi, int op) {
|
190
|
+
const uint32_t Y = mdl->nlbl;
|
191
|
+
const uint32_t T = seq->len;
|
192
|
+
const double v = op ? 0.0 : -HUGE_VAL;
|
193
|
+
double (*psi)[T][Y][Y] = (void *)vpsi;
|
194
|
+
for (uint32_t t = 0; t < T; t++) {
|
195
|
+
const uint32_t yr = seq->pos[t].lbl;
|
196
|
+
if (yr == (uint32_t)-1)
|
197
|
+
continue;
|
198
|
+
if (t != 0)
|
199
|
+
for (uint32_t y = 0; y < Y; y++)
|
200
|
+
if (y != yr)
|
201
|
+
for (uint32_t yp = 0; yp < Y; yp++)
|
202
|
+
(*psi)[t][yp][y] = v;
|
203
|
+
if (t != T - 1)
|
204
|
+
for (uint32_t y = 0; y < Y; y++)
|
205
|
+
if (y != yr)
|
206
|
+
for (uint32_t yn = 0; yn < Y; yn++)
|
207
|
+
(*psi)[t + 1][y][yn] = v;
|
208
|
+
}
|
209
|
+
const uint32_t yr = seq->pos[0].lbl;
|
210
|
+
if (yr != (uint32_t)-1) {
|
211
|
+
for (uint32_t y = 0; y < Y; y++) {
|
212
|
+
if (yr == y)
|
213
|
+
continue;
|
214
|
+
for (uint32_t yp = 0; yp < Y; yp++)
|
215
|
+
(*psi)[0][yp][y] = v;
|
216
|
+
}
|
217
|
+
}
|
218
|
+
}
|
219
|
+
|
154
220
|
/* tag_viterbi:
|
155
221
|
* This function implements the Viterbi algorithm in order to decode the most
|
156
222
|
* probable sequence of labels according to the model. Some part of this code
|
@@ -160,22 +226,26 @@ static int tag_postsc(mdl_t *mdl, const seq_t *seq, double *vpsi) {
|
|
160
226
|
* enough stack space.
|
161
227
|
*/
|
162
228
|
void tag_viterbi(mdl_t *mdl, const seq_t *seq,
|
163
|
-
|
164
|
-
const
|
165
|
-
const
|
166
|
-
double
|
167
|
-
|
168
|
-
double
|
169
|
-
|
170
|
-
double
|
171
|
-
double
|
229
|
+
uint32_t out[], double *sc, double psc[]) {
|
230
|
+
const uint32_t Y = mdl->nlbl;
|
231
|
+
const uint32_t T = seq->len;
|
232
|
+
double *vpsi = xvm_new(T * Y * Y);
|
233
|
+
uint32_t *vback = wapiti_xmalloc(sizeof(uint32_t) * T * Y);
|
234
|
+
double (*psi) [T][Y][Y] = (void *)vpsi;
|
235
|
+
uint32_t (*back)[T][Y] = (void *)vback;
|
236
|
+
double *cur = wapiti_xmalloc(sizeof(double) * Y);
|
237
|
+
double *old = wapiti_xmalloc(sizeof(double) * Y);
|
172
238
|
// We first compute the scores for each transitions in the lattice of
|
173
239
|
// labels.
|
174
240
|
int op;
|
175
|
-
if (mdl->
|
241
|
+
if (mdl->type == 1)
|
242
|
+
op = tag_memmsc(mdl, seq, vpsi);
|
243
|
+
else if (mdl->opt->lblpost)
|
176
244
|
op = tag_postsc(mdl, seq, vpsi);
|
177
245
|
else
|
178
246
|
op = tag_expsc(mdl, seq, vpsi);
|
247
|
+
if (mdl->opt->force)
|
248
|
+
tag_forced(mdl, seq, vpsi, op);
|
179
249
|
// Now we can do the Viterbi algorithm. This is very similar to the
|
180
250
|
// forward pass
|
181
251
|
// | α_1(y) = Ψ_1(y,x_1)
|
@@ -189,15 +259,15 @@ void tag_viterbi(mdl_t *mdl, const seq_t *seq,
|
|
189
259
|
// the index of the y value selected by the max. This also means that
|
190
260
|
// we only need the current and previous value of the α vectors, not
|
191
261
|
// the full matrix.
|
192
|
-
for (
|
262
|
+
for (uint32_t y = 0; y < Y; y++)
|
193
263
|
cur[y] = (*psi)[0][0][y];
|
194
|
-
for (
|
195
|
-
for (
|
264
|
+
for (uint32_t t = 1; t < T; t++) {
|
265
|
+
for (uint32_t y = 0; y < Y; y++)
|
196
266
|
old[y] = cur[y];
|
197
|
-
for (
|
198
|
-
double
|
199
|
-
|
200
|
-
for (
|
267
|
+
for (uint32_t y = 0; y < Y; y++) {
|
268
|
+
double bst = -HUGE_VAL;
|
269
|
+
uint32_t idx = 0;
|
270
|
+
for (uint32_t yp = 0; yp < Y; yp++) {
|
201
271
|
double val = old[yp];
|
202
272
|
if (op)
|
203
273
|
val *= (*psi)[t][yp][y];
|
@@ -216,15 +286,15 @@ void tag_viterbi(mdl_t *mdl, const seq_t *seq,
|
|
216
286
|
// this we search in the last α vector the best value. Using this index
|
217
287
|
// as a starting point in the back-pointer array we finally can decode
|
218
288
|
// the best sequence.
|
219
|
-
|
220
|
-
for (
|
289
|
+
uint32_t bst = 0;
|
290
|
+
for (uint32_t y = 1; y < Y; y++)
|
221
291
|
if (cur[y] > cur[bst])
|
222
292
|
bst = y;
|
223
293
|
if (sc != NULL)
|
224
294
|
*sc = cur[bst];
|
225
|
-
for (
|
226
|
-
const
|
227
|
-
const
|
295
|
+
for (uint32_t t = T; t > 0; t--) {
|
296
|
+
const uint32_t yp = (t != 1) ? (*back)[t - 1][bst] : 0;
|
297
|
+
const uint32_t y = bst;
|
228
298
|
out[t - 1] = y;
|
229
299
|
if (psc != NULL)
|
230
300
|
psc[t - 1] = (*psi)[t - 1][yp][y];
|
@@ -233,7 +303,7 @@ void tag_viterbi(mdl_t *mdl, const seq_t *seq,
|
|
233
303
|
free(old);
|
234
304
|
free(cur);
|
235
305
|
free(vback);
|
236
|
-
|
306
|
+
xvm_free(vpsi);
|
237
307
|
}
|
238
308
|
|
239
309
|
/* tag_nbviterbi:
|
@@ -242,23 +312,27 @@ void tag_viterbi(mdl_t *mdl, const seq_t *seq,
|
|
242
312
|
* compute only the best one and will return the same sequence as the
|
243
313
|
* previous function but will be slower to do it.
|
244
314
|
*/
|
245
|
-
void tag_nbviterbi(mdl_t *mdl, const seq_t *seq,
|
246
|
-
|
247
|
-
const
|
248
|
-
const
|
249
|
-
double
|
250
|
-
|
251
|
-
double
|
252
|
-
|
253
|
-
double
|
254
|
-
double
|
315
|
+
void tag_nbviterbi(mdl_t *mdl, const seq_t *seq, uint32_t N,
|
316
|
+
uint32_t out[][N], double sc[], double psc[][N]) {
|
317
|
+
const uint32_t Y = mdl->nlbl;
|
318
|
+
const uint32_t T = seq->len;
|
319
|
+
double *vpsi = xvm_new(T * Y * Y);
|
320
|
+
uint32_t *vback = wapiti_xmalloc(sizeof(uint32_t) * T * Y * N);
|
321
|
+
double (*psi) [T][Y ][Y] = (void *)vpsi;
|
322
|
+
uint32_t (*back)[T][Y * N] = (void *)vback;
|
323
|
+
double *cur = wapiti_xmalloc(sizeof(double) * Y * N);
|
324
|
+
double *old = wapiti_xmalloc(sizeof(double) * Y * N);
|
255
325
|
// We first compute the scores for each transitions in the lattice of
|
256
326
|
// labels.
|
257
327
|
int op;
|
258
|
-
if (mdl->
|
328
|
+
if (mdl->type == 1)
|
329
|
+
op = tag_memmsc(mdl, seq, vpsi);
|
330
|
+
else if (mdl->opt->lblpost)
|
259
331
|
op = tag_postsc(mdl, seq, (double *)psi);
|
260
332
|
else
|
261
333
|
op = tag_expsc(mdl, seq, (double *)psi);
|
334
|
+
if (mdl->opt->force)
|
335
|
+
tag_forced(mdl, seq, vpsi, op);
|
262
336
|
// Here also, it's classical but we have to keep the N best paths
|
263
337
|
// leading to each nodes of the lattice instead of only the best one.
|
264
338
|
// This means that the code is less trivial and the current implementation is
|
@@ -267,19 +341,19 @@ void tag_nbviterbi(mdl_t *mdl, const seq_t *seq, size_t N,
|
|
267
341
|
// We first build the list of all incoming arcs from all paths from all
|
268
342
|
// N-best nodes and next select the N-best one. There is a lot of room
|
269
343
|
// here for later optimisations if needed.
|
270
|
-
for (
|
344
|
+
for (uint32_t y = 0, d = 0; y < Y; y++) {
|
271
345
|
cur[d++] = (*psi)[0][0][y];
|
272
|
-
for (
|
346
|
+
for (uint32_t n = 1; n < N; n++)
|
273
347
|
cur[d++] = -DBL_MAX;
|
274
348
|
}
|
275
|
-
for (
|
276
|
-
for (
|
349
|
+
for (uint32_t t = 1; t < T; t++) {
|
350
|
+
for (uint32_t d = 0; d < Y * N; d++)
|
277
351
|
old[d] = cur[d];
|
278
|
-
for (
|
352
|
+
for (uint32_t y = 0; y < Y; y++) {
|
279
353
|
// 1st, build the list of all incoming
|
280
354
|
double lst[Y * N];
|
281
|
-
for (
|
282
|
-
for (
|
355
|
+
for (uint32_t yp = 0, d = 0; yp < Y; yp++) {
|
356
|
+
for (uint32_t n = 0; n < N; n++, d++) {
|
283
357
|
lst[d] = old[d];
|
284
358
|
if (op)
|
285
359
|
lst[d] *= (*psi)[t][yp][y];
|
@@ -288,14 +362,14 @@ void tag_nbviterbi(mdl_t *mdl, const seq_t *seq, size_t N,
|
|
288
362
|
}
|
289
363
|
}
|
290
364
|
// 2nd, init the back with the N first
|
291
|
-
|
292
|
-
for (
|
365
|
+
uint32_t *bk = &(*back)[t][y * N];
|
366
|
+
for (uint32_t n = 0; n < N; n++)
|
293
367
|
bk[n] = n;
|
294
368
|
// 3rd, search the N highest values
|
295
|
-
for (
|
369
|
+
for (uint32_t i = N; i < N * Y; i++) {
|
296
370
|
// Search the smallest current value
|
297
|
-
|
298
|
-
for (
|
371
|
+
uint32_t idx = 0;
|
372
|
+
for (uint32_t n = 1; n < N; n++)
|
299
373
|
if (lst[bk[n]] < lst[bk[idx]])
|
300
374
|
idx = n;
|
301
375
|
// And replace it if needed
|
@@ -303,24 +377,24 @@ void tag_nbviterbi(mdl_t *mdl, const seq_t *seq, size_t N,
|
|
303
377
|
bk[idx] = i;
|
304
378
|
}
|
305
379
|
// 4th, get the new scores
|
306
|
-
for (
|
380
|
+
for (uint32_t n = 0; n < N; n++)
|
307
381
|
cur[y * N + n] = lst[bk[n]];
|
308
382
|
}
|
309
383
|
}
|
310
384
|
// Retrieving the best paths is similar to classical Viterbi except that
|
311
385
|
// we have to search for the N best ones and there are N times more
|
312
386
|
// possibles starts.
|
313
|
-
for (
|
314
|
-
|
315
|
-
for (
|
387
|
+
for (uint32_t n = 0; n < N; n++) {
|
388
|
+
uint32_t bst = 0;
|
389
|
+
for (uint32_t d = 1; d < Y * N; d++)
|
316
390
|
if (cur[d] > cur[bst])
|
317
391
|
bst = d;
|
318
392
|
if (sc != NULL)
|
319
393
|
sc[n] = cur[bst];
|
320
394
|
cur[bst] = -DBL_MAX;
|
321
|
-
for (
|
322
|
-
const
|
323
|
-
const
|
395
|
+
for (uint32_t t = T; t > 0; t--) {
|
396
|
+
const uint32_t yp = (t != 1) ? (*back)[t - 1][bst] / N: 0;
|
397
|
+
const uint32_t y = bst / N;
|
324
398
|
out[t - 1][n] = y;
|
325
399
|
if (psc != NULL)
|
326
400
|
psc[t - 1][n] = (*psi)[t - 1][yp][y];
|
@@ -330,7 +404,7 @@ void tag_nbviterbi(mdl_t *mdl, const seq_t *seq, size_t N,
|
|
330
404
|
free(old);
|
331
405
|
free(cur);
|
332
406
|
free(vback);
|
333
|
-
|
407
|
+
xvm_free(vpsi);
|
334
408
|
}
|
335
409
|
|
336
410
|
/* tag_label:
|
@@ -343,17 +417,17 @@ void tag_nbviterbi(mdl_t *mdl, const seq_t *seq, size_t N,
|
|
343
417
|
*/
|
344
418
|
void tag_label(mdl_t *mdl, FILE *fin, FILE *fout) {
|
345
419
|
qrk_t *lbls = mdl->reader->lbl;
|
346
|
-
const
|
347
|
-
const
|
420
|
+
const uint32_t Y = mdl->nlbl;
|
421
|
+
const uint32_t N = mdl->opt->nbest;
|
348
422
|
// We start by preparing the statistic collection to be ready if check
|
349
423
|
// option is used. The stat array holds the following for each label
|
350
424
|
// [0] # of reference with this label
|
351
425
|
// [1] # of tokens we have tagged with this label
|
352
426
|
// [2] # of match of the two preceding
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
for (
|
427
|
+
uint64_t tcnt = 0, terr = 0;
|
428
|
+
uint64_t scnt = 0, serr = 0;
|
429
|
+
uint64_t stat[3][Y];
|
430
|
+
for (uint32_t y = 0; y < Y; y++)
|
357
431
|
stat[0][y] = stat[1][y] = stat[2][y] = 0;
|
358
432
|
// Next read the input file sequence by sequence and label them, we have
|
359
433
|
// to take care of not discarding the raw input as we want to send it
|
@@ -364,24 +438,25 @@ void tag_label(mdl_t *mdl, FILE *fin, FILE *fout) {
|
|
364
438
|
raw_t *raw = rdr_readraw(mdl->reader, fin);
|
365
439
|
if (raw == NULL)
|
366
440
|
break;
|
367
|
-
seq_t *seq = rdr_raw2seq(mdl->reader, raw,
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
double
|
441
|
+
seq_t *seq = rdr_raw2seq(mdl->reader, raw,
|
442
|
+
mdl->opt->check | mdl->opt->force);
|
443
|
+
const uint32_t T = seq->len;
|
444
|
+
uint32_t *out = wapiti_xmalloc(sizeof(uint32_t) * T * N);
|
445
|
+
double *psc = wapiti_xmalloc(sizeof(double ) * T * N);
|
446
|
+
double *scs = wapiti_xmalloc(sizeof(double ) * N);
|
372
447
|
if (N == 1)
|
373
|
-
tag_viterbi(mdl, seq, (
|
448
|
+
tag_viterbi(mdl, seq, (uint32_t*)out, scs, (double*)psc);
|
374
449
|
else
|
375
450
|
tag_nbviterbi(mdl, seq, N, (void*)out, scs, (void*)psc);
|
376
451
|
// Next we output the raw sequence with an additional column for
|
377
452
|
// the predicted labels
|
378
|
-
for (
|
453
|
+
for (uint32_t n = 0; n < N; n++) {
|
379
454
|
if (mdl->opt->outsc)
|
380
455
|
fprintf(fout, "# %d %f\n", (int)n, scs[n]);
|
381
|
-
for (
|
456
|
+
for (uint32_t t = 0; t < T; t++) {
|
382
457
|
if (!mdl->opt->label)
|
383
458
|
fprintf(fout, "%s\t", raw->lines[t]);
|
384
|
-
|
459
|
+
uint32_t lbl = out[t * N + n];
|
385
460
|
const char *lblstr = qrk_id2str(lbls, lbl);
|
386
461
|
fprintf(fout, "%s", lblstr);
|
387
462
|
if (mdl->opt->outsc) {
|
@@ -394,10 +469,13 @@ void tag_label(mdl_t *mdl, FILE *fin, FILE *fout) {
|
|
394
469
|
}
|
395
470
|
fflush(fout);
|
396
471
|
// If user provided reference labels, use them to collect
|
397
|
-
// statistics about how well we have performed here.
|
472
|
+
// statistics about how well we have performed here. Labels
|
473
|
+
// unseen at training time are discarded.
|
398
474
|
if (mdl->opt->check) {
|
399
475
|
bool err = false;
|
400
|
-
for (
|
476
|
+
for (uint32_t t = 0; t < T; t++) {
|
477
|
+
if (seq->pos[t].lbl == (uint32_t)-1)
|
478
|
+
continue;
|
401
479
|
stat[0][seq->pos[t].lbl]++;
|
402
480
|
stat[1][out[t * N]]++;
|
403
481
|
if (seq->pos[t].lbl != out[t * N])
|
@@ -405,7 +483,7 @@ void tag_label(mdl_t *mdl, FILE *fin, FILE *fout) {
|
|
405
483
|
else
|
406
484
|
stat[2][out[t * N]]++;
|
407
485
|
}
|
408
|
-
tcnt +=
|
486
|
+
tcnt += T;
|
409
487
|
serr += err;
|
410
488
|
}
|
411
489
|
// Cleanup memory used for this sequence
|
@@ -418,7 +496,7 @@ void tag_label(mdl_t *mdl, FILE *fin, FILE *fout) {
|
|
418
496
|
// much sequence are labelled and if possible the current tokens
|
419
497
|
// and sequence error rates.
|
420
498
|
if (++scnt % 1000 == 0) {
|
421
|
-
info("%
|
499
|
+
info("%10"PRIu64" sequences labeled", scnt);
|
422
500
|
if (mdl->opt->check) {
|
423
501
|
const double te = (double)terr / tcnt * 100.0;
|
424
502
|
const double se = (double)serr / scnt * 100.0;
|
@@ -433,11 +511,11 @@ void tag_label(mdl_t *mdl, FILE *fin, FILE *fout) {
|
|
433
511
|
if (mdl->opt->check) {
|
434
512
|
const double te = (double)terr / tcnt * 100.0;
|
435
513
|
const double se = (double)serr / scnt * 100.0;
|
436
|
-
info(" Nb sequences : %
|
514
|
+
info(" Nb sequences : %"PRIu64"\n", scnt);
|
437
515
|
info(" Token error : %5.2f%%\n", te);
|
438
516
|
info(" Sequence error: %5.2f%%\n", se);
|
439
517
|
info("* Per label statistics\n");
|
440
|
-
for (
|
518
|
+
for (uint32_t y = 0; y < Y; y++) {
|
441
519
|
const char *lbl = qrk_id2str(lbls, y);
|
442
520
|
const double Rc = (double)stat[2][y] / stat[0][y];
|
443
521
|
const double Pr = (double)stat[2][y] / stat[1][y];
|
@@ -458,12 +536,12 @@ void tag_label(mdl_t *mdl, FILE *fin, FILE *fout) {
|
|
458
536
|
*/
|
459
537
|
typedef struct eval_s eval_t;
|
460
538
|
struct eval_s {
|
461
|
-
mdl_t
|
462
|
-
dat_t
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
539
|
+
mdl_t *mdl;
|
540
|
+
dat_t *dat;
|
541
|
+
uint64_t tcnt; // Processed tokens count
|
542
|
+
uint64_t terr; // Tokens error found
|
543
|
+
uint64_t scnt; // Processes sequences count
|
544
|
+
uint64_t serr; // Sequence error found
|
467
545
|
};
|
468
546
|
|
469
547
|
/* tag_evalsub:
|
@@ -471,7 +549,7 @@ struct eval_s {
|
|
471
549
|
* by batch and for each batch do a simple Viterbi and scan the result to find
|
472
550
|
* errors.
|
473
551
|
*/
|
474
|
-
static void tag_evalsub(job_t *job,
|
552
|
+
static void tag_evalsub(job_t *job, uint32_t id, uint32_t cnt, eval_t *eval) {
|
475
553
|
unused(id && cnt);
|
476
554
|
mdl_t *mdl = eval->mdl;
|
477
555
|
dat_t *dat = eval->dat;
|
@@ -480,22 +558,23 @@ static void tag_evalsub(job_t *job, int id, int cnt, eval_t *eval) {
|
|
480
558
|
eval->scnt = 0;
|
481
559
|
eval->serr = 0;
|
482
560
|
// We just get a job and process all the sequences in it.
|
483
|
-
|
561
|
+
uint32_t count, pos;
|
484
562
|
while (mth_getjob(job, &count, &pos)) {
|
485
|
-
for (
|
563
|
+
for (uint32_t s = pos; s < pos + count; s++) {
|
486
564
|
// Tag the sequence with the viterbi
|
487
565
|
const seq_t *seq = dat->seq[s];
|
488
|
-
const
|
489
|
-
|
566
|
+
const uint32_t T = seq->len;
|
567
|
+
uint32_t *out = wapiti_xmalloc(sizeof(uint32_t) * T);
|
490
568
|
tag_viterbi(mdl, seq, out, NULL, NULL);
|
491
569
|
// And check for eventual (probable ?) errors
|
492
570
|
bool err = false;
|
493
|
-
for (
|
571
|
+
for (uint32_t t = 0; t < T; t++)
|
494
572
|
if (seq->pos[t].lbl != out[t])
|
495
573
|
eval->terr++, err = true;
|
496
|
-
eval->tcnt +=
|
574
|
+
eval->tcnt += T;
|
497
575
|
eval->scnt += 1;
|
498
576
|
eval->serr += err;
|
577
|
+
free(out);
|
499
578
|
}
|
500
579
|
}
|
501
580
|
}
|
@@ -505,13 +584,13 @@ static void tag_evalsub(job_t *job, int id, int cnt, eval_t *eval) {
|
|
505
584
|
* training set if not available).
|
506
585
|
*/
|
507
586
|
void tag_eval(mdl_t *mdl, double *te, double *se) {
|
508
|
-
const
|
587
|
+
const uint32_t W = mdl->opt->nthread;
|
509
588
|
dat_t *dat = (mdl->devel == NULL) ? mdl->train : mdl->devel;
|
510
589
|
// First we prepare the eval state for all the workers threads, we just
|
511
590
|
// have to give them the model and dataset to use. This state will be
|
512
591
|
// used to retrieve partial result they computed.
|
513
592
|
eval_t *eval[W];
|
514
|
-
for (
|
593
|
+
for (uint32_t w = 0; w < W; w++) {
|
515
594
|
eval[w] = wapiti_xmalloc(sizeof(eval_t));
|
516
595
|
eval[w]->mdl = mdl;
|
517
596
|
eval[w]->dat = dat;
|
@@ -520,9 +599,9 @@ void tag_eval(mdl_t *mdl, double *te, double *se) {
|
|
520
599
|
// result by summing them and computing the final error rates.
|
521
600
|
mth_spawn((func_t *)tag_evalsub, W, (void *)eval, dat->nseq,
|
522
601
|
mdl->opt->jobsize);
|
523
|
-
|
524
|
-
|
525
|
-
for (
|
602
|
+
uint64_t tcnt = 0, terr = 0;
|
603
|
+
uint64_t scnt = 0, serr = 0;
|
604
|
+
for (uint32_t w = 0; w < W; w++) {
|
526
605
|
tcnt += eval[w]->tcnt;
|
527
606
|
terr += eval[w]->terr;
|
528
607
|
scnt += eval[w]->scnt;
|