wapiti 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (63) hide show
  1. data/.autotest +13 -0
  2. data/.gitignore +5 -0
  3. data/.rspec +3 -0
  4. data/Gemfile +6 -0
  5. data/LICENSE +30 -0
  6. data/README.md +153 -0
  7. data/Rakefile +33 -0
  8. data/ext/wapiti/bcd.c +392 -0
  9. data/ext/wapiti/decoder.c +535 -0
  10. data/ext/wapiti/decoder.h +46 -0
  11. data/ext/wapiti/extconf.rb +8 -0
  12. data/ext/wapiti/gradient.c +818 -0
  13. data/ext/wapiti/gradient.h +81 -0
  14. data/ext/wapiti/lbfgs.c +294 -0
  15. data/ext/wapiti/model.c +296 -0
  16. data/ext/wapiti/model.h +100 -0
  17. data/ext/wapiti/native.c +1238 -0
  18. data/ext/wapiti/native.h +15 -0
  19. data/ext/wapiti/options.c +278 -0
  20. data/ext/wapiti/options.h +91 -0
  21. data/ext/wapiti/pattern.c +395 -0
  22. data/ext/wapiti/pattern.h +56 -0
  23. data/ext/wapiti/progress.c +167 -0
  24. data/ext/wapiti/progress.h +43 -0
  25. data/ext/wapiti/quark.c +272 -0
  26. data/ext/wapiti/quark.h +46 -0
  27. data/ext/wapiti/reader.c +553 -0
  28. data/ext/wapiti/reader.h +73 -0
  29. data/ext/wapiti/rprop.c +191 -0
  30. data/ext/wapiti/sequence.h +148 -0
  31. data/ext/wapiti/sgdl1.c +218 -0
  32. data/ext/wapiti/thread.c +171 -0
  33. data/ext/wapiti/thread.h +42 -0
  34. data/ext/wapiti/tools.c +202 -0
  35. data/ext/wapiti/tools.h +54 -0
  36. data/ext/wapiti/trainers.h +39 -0
  37. data/ext/wapiti/vmath.c +372 -0
  38. data/ext/wapiti/vmath.h +51 -0
  39. data/ext/wapiti/wapiti.c +288 -0
  40. data/ext/wapiti/wapiti.h +45 -0
  41. data/lib/wapiti.rb +30 -0
  42. data/lib/wapiti/errors.rb +17 -0
  43. data/lib/wapiti/model.rb +49 -0
  44. data/lib/wapiti/options.rb +113 -0
  45. data/lib/wapiti/utility.rb +15 -0
  46. data/lib/wapiti/version.rb +3 -0
  47. data/spec/fixtures/ch.mod +18550 -0
  48. data/spec/fixtures/chpattern.txt +52 -0
  49. data/spec/fixtures/chtest.txt +1973 -0
  50. data/spec/fixtures/chtrain.txt +19995 -0
  51. data/spec/fixtures/nppattern.txt +52 -0
  52. data/spec/fixtures/nptest.txt +1973 -0
  53. data/spec/fixtures/nptrain.txt +19995 -0
  54. data/spec/fixtures/pattern.txt +14 -0
  55. data/spec/fixtures/test.txt +60000 -0
  56. data/spec/fixtures/train.txt +1200 -0
  57. data/spec/spec_helper.rb +21 -0
  58. data/spec/wapiti/model_spec.rb +173 -0
  59. data/spec/wapiti/native_spec.rb +12 -0
  60. data/spec/wapiti/options_spec.rb +175 -0
  61. data/spec/wapiti/utility_spec.rb +22 -0
  62. data/wapiti.gemspec +35 -0
  63. metadata +178 -0
@@ -0,0 +1,535 @@
1
+ /*
2
+ * Wapiti - A linear-chain CRF tool
3
+ *
4
+ * Copyright (c) 2009-2011 CNRS
5
+ * All rights reserved.
6
+ *
7
+ * Redistribution and use in source and binary forms, with or without
8
+ * modification, are permitted provided that the following conditions are met:
9
+ * * Redistributions of source code must retain the above copyright
10
+ * notice, this list of conditions and the following disclaimer.
11
+ * * Redistributions in binary form must reproduce the above copyright
12
+ * notice, this list of conditions and the following disclaimer in the
13
+ * documentation and/or other materials provided with the distribution.
14
+ *
15
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25
+ * POSSIBILITY OF SUCH DAMAGE.
26
+ */
27
+
28
+ #include <float.h>
29
+ #include <stddef.h>
30
+ #include <stdlib.h>
31
+ #include <stdio.h>
32
+
33
+ #include "wapiti.h"
34
+ #include "gradient.h"
35
+ #include "model.h"
36
+ #include "quark.h"
37
+ #include "reader.h"
38
+ #include "sequence.h"
39
+ #include "thread.h"
40
+ #include "tools.h"
41
+ #include "decoder.h"
42
+
43
+ /******************************************************************************
44
+ * Sequence tagging
45
+ *
46
+ * This module implements sequence tagging using a trained model and model
47
+ * evaluation on the development set.
48
+ *
49
+ * The viterbi can be quite intensive on the stack if you push in it long
50
+ * sequence and use large labels set. It's less a problem than in gradient
51
+ * computations but it can show up in particular cases. The fix is to call it
52
+ * through the mth_spawn function and request enough stack space, this will be
53
+ * fixed in next version.
54
+ ******************************************************************************/
55
+
56
+ /* tag_expsc:
57
+ * Compute the score lattice for classical Viterbi decoding. This is the same
58
+ * as for the first step of the gradient computation with the exception that
59
+ * we don't need to take the exponential of the scores as the Viterbi decoding
60
+ * works in log-space.
61
+ */
62
+ static int tag_expsc(mdl_t *mdl, const seq_t *seq, double *vpsi) {
63
+ const double *x = mdl->theta;
64
+ const size_t Y = mdl->nlbl;
65
+ const int T = seq->len;
66
+ double (*psi)[T][Y][Y] = (void *)vpsi;
67
+ // We first have to compute the Ψ_t(y',y,x_t) weights defined as
68
+ // Ψ_t(y',y,x_t) = \exp( ∑_k θ_k f_k(y',y,x_t) )
69
+ // So at position 't' in the sequence, for each couple (y',y) we have
70
+ // to sum weights of all features.
71
+ // This is the same than what we do for computing the gradient but, as
72
+ // the viterbi algorithm also work in the logarithmic space, we can
73
+ // remove the exponential.
74
+ //
75
+ // Only the observations present at this position will have a non-nul
76
+ // weight so we can sum only on thoses.
77
+ //
78
+ // As we use only two kind of features: unigram and bigram, we can
79
+ // rewrite this as
80
+ // ∑_k μ_k(y, x_t) f_k(y, x_t) + ∑_k λ_k(y', y, x_t) f_k(y', y, x_t)
81
+ // Where the first sum is over the unigrams features and the second is
82
+ // over bigrams ones.
83
+ //
84
+ // This allow us to compute Ψ efficiently in two steps
85
+ // 1/ we sum the unigrams features weights by looping over actives
86
+ // unigrams observations. (we compute this sum once and use it
87
+ // for each value of y')
88
+ // 2/ we add the bigrams features weights by looping over actives
89
+ // bigrams observations (we don't have to do this for t=0 since
90
+ // there is no bigrams here)
91
+ for (int t = 0; t < T; t++) {
92
+ const pos_t *pos = &(seq->pos[t]);
93
+ for (size_t y = 0; y < Y; y++) {
94
+ double sum = 0.0;
95
+ for (size_t n = 0; n < pos->ucnt; n++) {
96
+ const size_t o = pos->uobs[n];
97
+ sum += x[mdl->uoff[o] + y];
98
+ }
99
+ for (size_t yp = 0; yp < Y; yp++)
100
+ (*psi)[t][yp][y] = sum;
101
+ }
102
+ }
103
+ for (int t = 1; t < T; t++) {
104
+ const pos_t *pos = &(seq->pos[t]);
105
+ for (size_t yp = 0, d = 0; yp < Y; yp++) {
106
+ for (size_t y = 0; y < Y; y++, d++) {
107
+ double sum = 0.0;
108
+ for (size_t n = 0; n < pos->bcnt; n++) {
109
+ const size_t o = pos->bobs[n];
110
+ sum += x[mdl->boff[o] + d];
111
+ }
112
+ (*psi)[t][yp][y] += sum;
113
+ }
114
+ }
115
+ }
116
+ return 0;
117
+ }
118
+
119
+ /* tag_postsc:
120
+ * This function compute score lattice with posteriors. This generally result
121
+ * in a slightly best labelling and allow to output normalized score for the
122
+ * sequence and for each labels but this is more costly as we have to perform
123
+ * a full forward backward instead of just the forward pass.
124
+ */
125
+ static int tag_postsc(mdl_t *mdl, const seq_t *seq, double *vpsi) {
126
+ const size_t Y = mdl->nlbl;
127
+ const int T = seq->len;
128
+ double (*psi)[T][Y][Y] = (void *)vpsi;
129
+ grd_t *grd = grd_new(mdl, NULL);
130
+ grd->first = 0;
131
+ grd->last = T - 1;
132
+ grd_check(grd, seq->len);
133
+ if (mdl->opt->sparse) {
134
+ grd_spdopsi(grd, seq);
135
+ grd_spfwdbwd(grd, seq);
136
+ } else {
137
+ grd_fldopsi(grd, seq);
138
+ grd_flfwdbwd(grd, seq);
139
+ }
140
+ double (*alpha)[T][Y] = (void *)grd->alpha;
141
+ double (*beta )[T][Y] = (void *)grd->beta;
142
+ double *unorm = grd->unorm;
143
+ for (int t = 0; t < T; t++) {
144
+ for (size_t y = 0; y < Y; y++) {
145
+ double e = (*alpha)[t][y] * (*beta)[t][y] * unorm[t];
146
+ for (size_t yp = 0; yp < Y; yp++)
147
+ (*psi)[t][yp][y] = e;
148
+ }
149
+ }
150
+ grd_free(grd);
151
+ return 1;
152
+ }
153
+
154
+ /* tag_viterbi:
155
+ * This function implement the Viterbi algorithm in order to decode the most
156
+ * probable sequence of labels according to the model. Some part of this code
157
+ * is very similar to the computation of the gradient as expected.
158
+ *
159
+ * And like for the gradient, the caller is responsible to ensure there is
160
+ * enough stack space.
161
+ */
162
+ void tag_viterbi(mdl_t *mdl, const seq_t *seq,
163
+ size_t out[], double *sc, double psc[]) {
164
+ const size_t Y = mdl->nlbl;
165
+ const int T = seq->len;
166
+ double *vpsi = wapiti_xmalloc(sizeof(double) * T * Y * Y);
167
+ size_t *vback = wapiti_xmalloc(sizeof(size_t) * T * Y);
168
+ double (*psi) [T][Y][Y] = (void *)vpsi;
169
+ size_t (*back)[T][Y] = (void *)vback;
170
+ double *cur = wapiti_xmalloc(sizeof(double) * Y);
171
+ double *old = wapiti_xmalloc(sizeof(double) * Y);
172
+ // We first compute the scores for each transitions in the lattice of
173
+ // labels.
174
+ int op;
175
+ if (mdl->opt->lblpost)
176
+ op = tag_postsc(mdl, seq, vpsi);
177
+ else
178
+ op = tag_expsc(mdl, seq, vpsi);
179
+ // Now we can do the Viterbi algorithm. This is very similar to the
180
+ // forward pass
181
+ // | α_1(y) = Ψ_1(y,x_1)
182
+ // | α_t(y) = max_{y'} α_{t-1}(y') + Ψ_t(y',y,x_t)
183
+ // We just replace the sum by a max and as we do the computation in the
184
+ // logarithmic space the product become a sum. (this also mean that we
185
+ // don't have to worry about numerical problems)
186
+ //
187
+ // Next we have to walk backward over the α in order to find the best
188
+ // path. In order to do this efficiently, we keep in the 'back' array
189
+ // the indice of the y value selected by the max. This also mean that
190
+ // we only need the current and previous value of the α vectors, not
191
+ // the full matrix.
192
+ for (size_t y = 0; y < Y; y++)
193
+ cur[y] = (*psi)[0][0][y];
194
+ for (int t = 1; t < T; t++) {
195
+ for (size_t y = 0; y < Y; y++)
196
+ old[y] = cur[y];
197
+ for (size_t y = 0; y < Y; y++) {
198
+ double bst = -1.0;
199
+ int idx = 0;
200
+ for (size_t yp = 0; yp < Y; yp++) {
201
+ double val = old[yp];
202
+ if (op)
203
+ val *= (*psi)[t][yp][y];
204
+ else
205
+ val += (*psi)[t][yp][y];
206
+ if (val > bst) {
207
+ bst = val;
208
+ idx = yp;
209
+ }
210
+ }
211
+ (*back)[t][y] = idx;
212
+ cur[y] = bst;
213
+ }
214
+ }
215
+ // We can now build the sequence of labels predicted by the model. For
216
+ // this we search in the last α vector the best value. Using this index
217
+ // as a starting point in the back-pointer array we finally can decode
218
+ // the best sequence.
219
+ int bst = 0;
220
+ for (size_t y = 1; y < Y; y++)
221
+ if (cur[y] > cur[bst])
222
+ bst = y;
223
+ if (sc != NULL)
224
+ *sc = cur[bst];
225
+ for (int t = T; t > 0; t--) {
226
+ const size_t yp = (t != 1) ? (*back)[t - 1][bst] : 0;
227
+ const size_t y = bst;
228
+ out[t - 1] = y;
229
+ if (psc != NULL)
230
+ psc[t - 1] = (*psi)[t - 1][yp][y];
231
+ bst = yp;
232
+ }
233
+ free(old);
234
+ free(cur);
235
+ free(vback);
236
+ free(vpsi);
237
+ }
238
+
239
/* tag_nbviterbi:
 * This function implement the Viterbi algorithm in order to decode the N-most
 * probable sequences of labels according to the model. It can be used to
 * compute only the best one and will return the same sequence than the
 * previous function but will be slower to do it.
 *
 * Lattice states are (label, rank) pairs flattened as d = y * N + n, so the
 * score vectors hold Y * N entries and 'back' stores indices into that
 * flattened space (decoded below with division by N to recover the label).
 */
void tag_nbviterbi(mdl_t *mdl, const seq_t *seq, size_t N,
		   size_t out[][N], double sc[], double psc[][N]) {
	const size_t Y = mdl->nlbl;
	const int T = seq->len;
	double *vpsi  = wapiti_xmalloc(sizeof(double) * T * Y * Y);
	size_t *vback = wapiti_xmalloc(sizeof(size_t) * T * Y * N);
	double (*psi) [T][Y ][Y] = (void *)vpsi;
	size_t (*back)[T][Y * N] = (void *)vback;
	double *cur = wapiti_xmalloc(sizeof(double) * Y * N);
	double *old = wapiti_xmalloc(sizeof(double) * Y * N);
	// We first compute the scores for each transitions in the lattice of
	// labels. 'op' is 1 for posterior scores (multiplied along a path)
	// and 0 for log-space scores (added along a path).
	int op;
	if (mdl->opt->lblpost)
		op = tag_postsc(mdl, seq, (double *)psi);
	else
		op = tag_expsc(mdl, seq, (double *)psi);
	// Here also, it's classical but we have to keep the N best paths
	// leading to each nodes of the lattice instead of only the best one.
	// This mean that code is less trivial and the current implementation
	// is not the most efficient way to do this but it works well and is
	// good enough for the moment.
	// We first build the list of all incoming arcs from all paths from
	// all N-best nodes and next select the N-best one. There is a lot of
	// room here for later optimisations if needed.
	//
	// At t = 0 only rank 0 of each label is a real path; the other ranks
	// get a -DBL_MAX sentinel so they are never selected.
	for (size_t y = 0, d = 0; y < Y; y++) {
		cur[d++] = (*psi)[0][0][y];
		for (size_t n = 1; n < N; n++)
			cur[d++] = -DBL_MAX;
	}
	for (int t = 1; t < T; t++) {
		for (size_t d = 0; d < Y * N; d++)
			old[d] = cur[d];
		for (size_t y = 0; y < Y; y++) {
			// 1st, build the list of all incoming
			// NOTE(review): 'lst' is a VLA of Y * N doubles on
			// the stack — large label sets with big N could
			// overflow it; confirm limits with callers.
			double lst[Y * N];
			for (size_t yp = 0, d = 0; yp < Y; yp++) {
				for (size_t n = 0; n < N; n++, d++) {
					lst[d] = old[d];
					if (op)
						lst[d] *= (*psi)[t][yp][y];
					else
						lst[d] += (*psi)[t][yp][y];
				}
			}
			// 2nd, init the back with the N first
			size_t *bk = &(*back)[t][y * N];
			for (size_t n = 0; n < N; n++)
				bk[n] = n;
			// 3rd, search the N highest values
			for (size_t i = N; i < N * Y; i++) {
				// Search the smallest current value
				size_t idx = 0;
				for (size_t n = 1; n < N; n++)
					if (lst[bk[n]] < lst[bk[idx]])
						idx = n;
				// And replace it if needed
				if (lst[i] > lst[bk[idx]])
					bk[idx] = i;
			}
			// 4th, get the new scores
			for (size_t n = 0; n < N; n++)
				cur[y * N + n] = lst[bk[n]];
		}
	}
	// Retrieving the best paths is similar to classical Viterbi except
	// that we have to search for the N bet ones and there is N time more
	// possibles starts. Each recovered start is knocked out with
	// -DBL_MAX so the next iteration finds the next-best one.
	for (size_t n = 0; n < N; n++) {
		int bst = 0;
		for (size_t d = 1; d < Y * N; d++)
			if (cur[d] > cur[bst])
				bst = d;
		if (sc != NULL)
			sc[n] = cur[bst];
		cur[bst] = -DBL_MAX;
		for (int t = T; t > 0; t--) {
			// 'bst' indexes the flattened (label, rank) space, so
			// the label is bst / N.
			const size_t yp = (t != 1) ? (*back)[t - 1][bst] / N: 0;
			const size_t y = bst / N;
			out[t - 1][n] = y;
			if (psc != NULL)
				psc[t - 1][n] = (*psi)[t - 1][yp][y];
			bst = (*back)[t - 1][bst];
		}
	}
	free(old);
	free(cur);
	free(vback);
	free(vpsi);
}
335
+
336
/* tag_label:
 * Label a data file using the current model. This output an almost exact copy
 * of the input file with an additional column with the predicted label. If
 * the check option is specified, the input file must be labelled and the
 * predicted labels will be checked against the provided ones. This will
 * output error rates during the labelling and detailed statistics per label
 * at the end.
 */
void tag_label(mdl_t *mdl, FILE *fin, FILE *fout) {
	qrk_t *lbls = mdl->reader->lbl;
	const size_t Y = mdl->nlbl;
	const size_t N = mdl->opt->nbest;
	// We start by preparing the statistic collection to be ready if check
	// option is used. The stat array hold the following for each label
	//   [0] # of reference with this label
	//   [1] # of token we have taged with this label
	//   [2] # of match of the two preceding
	size_t tcnt = 0, terr = 0;
	size_t scnt = 0, serr = 0;
	size_t stat[3][Y];
	for (size_t y = 0; y < Y; y++)
		stat[0][y] = stat[1][y] = stat[2][y] = 0;
	// Next read the input file sequence by sequence and label them, we
	// have to take care of not discarding the raw input as we want to
	// send it back to the output with the additional predicted labels.
	while (!feof(fin)) {
		// So, first read an input sequence keeping the raw_t object
		// available, and label it with Viterbi.
		raw_t *raw = rdr_readraw(mdl->reader, fin);
		if (raw == NULL)
			break;
		seq_t *seq = rdr_raw2seq(mdl->reader, raw, mdl->opt->check);
		const int T = seq->len;
		// Per-sequence scratch: T * N labels and scores plus one
		// sequence score per hypothesis.
		size_t *out = wapiti_xmalloc(sizeof(size_t) * T * N);
		double *psc = wapiti_xmalloc(sizeof(double) * T * N);
		double *scs = wapiti_xmalloc(sizeof(double) * N);
		if (N == 1)
			tag_viterbi(mdl, seq, (size_t*)out, scs, (double*)psc);
		else
			tag_nbviterbi(mdl, seq, N, (void*)out, scs, (void*)psc);
		// Next we output the raw sequence with an aditional column for
		// the predicted labels
		for (size_t n = 0; n < N; n++) {
			if (mdl->opt->outsc)
				fprintf(fout, "# %d %f\n", (int)n, scs[n]);
			for (int t = 0; t < T; t++) {
				// With the label option set, only the label is
				// printed, not the original input columns.
				if (!mdl->opt->label)
					fprintf(fout, "%s\t", raw->lines[t]);
				size_t lbl = out[t * N + n];
				const char *lblstr = qrk_id2str(lbls, lbl);
				fprintf(fout, "%s", lblstr);
				if (mdl->opt->outsc) {
					fprintf(fout, "\t%s", lblstr);
					fprintf(fout, "/%f", psc[t * N + n]);
				}
				fprintf(fout, "\n");
			}
			fprintf(fout, "\n");
		}
		fflush(fout);
		// If user provided reference labels, use them to collect
		// statistics about how well we have performed here. Only the
		// best hypothesis (rank 0, index t * N) is scored.
		if (mdl->opt->check) {
			bool err = false;
			for (int t = 0; t < T; t++) {
				stat[0][seq->pos[t].lbl]++;
				stat[1][out[t * N]]++;
				if (seq->pos[t].lbl != out[t * N])
					terr++, err = true;
				else
					stat[2][out[t * N]]++;
			}
			tcnt += (size_t)T;
			serr += err;
		}
		// Cleanup memory used for this sequence
		free(scs);
		free(psc);
		free(out);
		rdr_freeseq(seq);
		rdr_freeraw(raw);
		// And report our progress, at regular interval we display how
		// much sequence are labelled and if possible the current
		// tokens and sequence error rates.
		if (++scnt % 1000 == 0) {
			info("%10zu sequences labeled", scnt);
			if (mdl->opt->check) {
				const double te = (double)terr / tcnt * 100.0;
				const double se = (double)serr / scnt * 100.0;
				info("\t%5.2f%%/%5.2f%%", te, se);
			}
			info("\n");
		}
	}
	// If user have provided reference labels, we have collected a lot of
	// statistics and we can repport global token and sequence error rate
	// as well as precision recall and f-measure for each labels.
	if (mdl->opt->check) {
		const double te = (double)terr / tcnt * 100.0;
		const double se = (double)serr / scnt * 100.0;
		info(" Nb sequences : %zu\n", scnt);
		info(" Token error : %5.2f%%\n", te);
		info(" Sequence error: %5.2f%%\n", se);
		info("* Per label statistics\n");
		for (size_t y = 0; y < Y; y++) {
			const char *lbl = qrk_id2str(lbls, y);
			// NOTE(review): if a label never occurs in the
			// reference (stat[0]) or the output (stat[1]) these
			// divisions yield NaN — confirm whether that is the
			// intended display for unseen labels.
			const double Rc = (double)stat[2][y] / stat[0][y];
			const double Pr = (double)stat[2][y] / stat[1][y];
			const double F1 = 2.0 * (Pr * Rc) / (Pr + Rc);
			info(" %-6s", lbl);
			info(" Pr=%.2f", Pr);
			info(" Rc=%.2f", Rc);
			info(" F1=%.2f\n", F1);
		}
	}
}
452
+
453
/* eval_t:
 * This a state tracker used to communicate between the main eval function and
 * its workers threads, the <mdl> and <dat> fields are used to transmit to the
 * workers informations needed to make the computation, the other fields are
 * for returning the partial results. Each worker owns one instance, so the
 * counters need no synchronization.
 */
typedef struct eval_s eval_t;
struct eval_s {
	mdl_t *mdl;   // Model used for decoding (input, shared, read-only)
	dat_t *dat;   // Dataset to evaluate (input, shared, read-only)
	size_t tcnt;  // Processed tokens count
	size_t terr;  // Tokens error found
	size_t scnt;  // Processed sequences count
	size_t serr;  // Sequence error found
};
468
+
469
+ /* tag_evalsub:
470
+ * This is where the real evaluation is done by the workers, we process data
471
+ * by batch and for each batch do a simple Viterbi and scan the result to find
472
+ * errors.
473
+ */
474
+ static void tag_evalsub(job_t *job, int id, int cnt, eval_t *eval) {
475
+ unused(id && cnt);
476
+ mdl_t *mdl = eval->mdl;
477
+ dat_t *dat = eval->dat;
478
+ eval->tcnt = 0;
479
+ eval->terr = 0;
480
+ eval->scnt = 0;
481
+ eval->serr = 0;
482
+ // We just get a job a process all the squence in it.
483
+ size_t count, pos;
484
+ while (mth_getjob(job, &count, &pos)) {
485
+ for (size_t s = pos; s < pos + count; s++) {
486
+ // Tag the sequence with the viterbi
487
+ const seq_t *seq = dat->seq[s];
488
+ const int T = seq->len;
489
+ size_t out[T];
490
+ tag_viterbi(mdl, seq, out, NULL, NULL);
491
+ // And check for eventual (probable ?) errors
492
+ bool err = false;
493
+ for (int t = 0; t < T; t++)
494
+ if (seq->pos[t].lbl != out[t])
495
+ eval->terr++, err = true;
496
+ eval->tcnt += (size_t)T;
497
+ eval->scnt += 1;
498
+ eval->serr += err;
499
+ }
500
+ }
501
+ }
502
+
503
+ /* tag_eval:
504
+ * Compute the token error rate and sequence error rate over the devel set (or
505
+ * taining set if not available).
506
+ */
507
+ void tag_eval(mdl_t *mdl, double *te, double *se) {
508
+ const size_t W = mdl->opt->nthread;
509
+ dat_t *dat = (mdl->devel == NULL) ? mdl->train : mdl->devel;
510
+ // First we prepare the eval state for all the workers threads, we just
511
+ // have to give them the model and dataset to use. This state will be
512
+ // used to retrieve partial result they computed.
513
+ eval_t *eval[W];
514
+ for (size_t w = 0; w < W; w++) {
515
+ eval[w] = wapiti_xmalloc(sizeof(eval_t));
516
+ eval[w]->mdl = mdl;
517
+ eval[w]->dat = dat;
518
+ }
519
+ // And next, we call the workers to do the job and reduce the partial
520
+ // result by summing them and computing the final error rates.
521
+ mth_spawn((func_t *)tag_evalsub, W, (void *)eval, dat->nseq,
522
+ mdl->opt->jobsize);
523
+ size_t tcnt = 0, terr = 0;
524
+ size_t scnt = 0, serr = 0;
525
+ for (size_t w = 0; w < W; w++) {
526
+ tcnt += eval[w]->tcnt;
527
+ terr += eval[w]->terr;
528
+ scnt += eval[w]->scnt;
529
+ serr += eval[w]->serr;
530
+ free(eval[w]);
531
+ }
532
+ *te = (double)terr / tcnt * 100.0;
533
+ *se = (double)serr / scnt * 100.0;
534
+ }
535
+