wapiti 0.0.5 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53) hide show
  1. checksums.yaml +7 -0
  2. data/.simplecov +3 -0
  3. data/Gemfile +25 -2
  4. data/HISTORY.md +5 -1
  5. data/LICENSE +14 -13
  6. data/README.md +9 -16
  7. data/Rakefile +38 -8
  8. data/ext/wapiti/bcd.c +126 -124
  9. data/ext/wapiti/decoder.c +203 -124
  10. data/ext/wapiti/decoder.h +6 -4
  11. data/ext/wapiti/extconf.rb +2 -2
  12. data/ext/wapiti/gradient.c +491 -320
  13. data/ext/wapiti/gradient.h +52 -34
  14. data/ext/wapiti/lbfgs.c +74 -33
  15. data/ext/wapiti/model.c +47 -37
  16. data/ext/wapiti/model.h +22 -20
  17. data/ext/wapiti/native.c +850 -839
  18. data/ext/wapiti/native.h +1 -1
  19. data/ext/wapiti/options.c +52 -20
  20. data/ext/wapiti/options.h +37 -30
  21. data/ext/wapiti/pattern.c +35 -33
  22. data/ext/wapiti/pattern.h +12 -11
  23. data/ext/wapiti/progress.c +14 -13
  24. data/ext/wapiti/progress.h +3 -2
  25. data/ext/wapiti/quark.c +14 -16
  26. data/ext/wapiti/quark.h +6 -5
  27. data/ext/wapiti/reader.c +83 -69
  28. data/ext/wapiti/reader.h +11 -9
  29. data/ext/wapiti/rprop.c +84 -43
  30. data/ext/wapiti/sequence.h +18 -16
  31. data/ext/wapiti/sgdl1.c +45 -43
  32. data/ext/wapiti/thread.c +19 -17
  33. data/ext/wapiti/thread.h +5 -4
  34. data/ext/wapiti/tools.c +7 -7
  35. data/ext/wapiti/tools.h +3 -4
  36. data/ext/wapiti/trainers.h +1 -1
  37. data/ext/wapiti/vmath.c +40 -38
  38. data/ext/wapiti/vmath.h +12 -11
  39. data/ext/wapiti/wapiti.c +159 -37
  40. data/ext/wapiti/wapiti.h +18 -4
  41. data/lib/wapiti.rb +15 -15
  42. data/lib/wapiti/errors.rb +15 -15
  43. data/lib/wapiti/model.rb +92 -84
  44. data/lib/wapiti/options.rb +123 -124
  45. data/lib/wapiti/utility.rb +14 -14
  46. data/lib/wapiti/version.rb +2 -2
  47. data/spec/spec_helper.rb +29 -9
  48. data/spec/wapiti/model_spec.rb +230 -194
  49. data/spec/wapiti/native_spec.rb +7 -8
  50. data/spec/wapiti/options_spec.rb +184 -174
  51. data/wapiti.gemspec +22 -8
  52. metadata +38 -42
  53. data/.gitignore +0 -5
@@ -1,7 +1,7 @@
1
1
  /*
2
2
  * Wapiti - A linear-chain CRF tool
3
3
  *
4
- * Copyright (c) 2009-2011 CNRS
4
+ * Copyright (c) 2009-2013 CNRS
5
5
  * All rights reserved.
6
6
  *
7
7
  * Redistribution and use in source and binary forms, with or without
@@ -25,10 +25,14 @@
25
25
  * POSSIBILITY OF SUCH DAMAGE.
26
26
  */
27
27
 
28
+ #include <inttypes.h>
28
29
  #include <float.h>
30
+ #include <stdint.h>
29
31
  #include <stddef.h>
30
32
  #include <stdlib.h>
31
33
  #include <stdio.h>
34
+ #include <string.h>
35
+ #include <math.h>
32
36
 
33
37
  #include "wapiti.h"
34
38
  #include "gradient.h"
@@ -39,6 +43,7 @@
39
43
  #include "thread.h"
40
44
  #include "tools.h"
41
45
  #include "decoder.h"
46
+ #include "vmath.h"
42
47
 
43
48
  /******************************************************************************
44
49
  * Sequence tagging
@@ -60,9 +65,9 @@
60
65
  * works in log-space.
61
66
  */
62
67
  static int tag_expsc(mdl_t *mdl, const seq_t *seq, double *vpsi) {
63
- const double *x = mdl->theta;
64
- const size_t Y = mdl->nlbl;
65
- const int T = seq->len;
68
+ const double *x = mdl->theta;
69
+ const uint32_t Y = mdl->nlbl;
70
+ const uint32_t T = seq->len;
66
71
  double (*psi)[T][Y][Y] = (void *)vpsi;
67
72
  // We first have to compute the Ψ_t(y',y,x_t) weights defined as
68
73
  // Ψ_t(y',y,x_t) = \exp( ∑_k θ_k f_k(y',y,x_t) )
@@ -88,25 +93,25 @@ static int tag_expsc(mdl_t *mdl, const seq_t *seq, double *vpsi) {
88
93
  // 2/ we add the bigram feature weights by looping over the active
89
94
  // bigram observations (we don't have to do this for t=0 since
90
95
  // there are no bigrams there)
91
- for (int t = 0; t < T; t++) {
96
+ for (uint32_t t = 0; t < T; t++) {
92
97
  const pos_t *pos = &(seq->pos[t]);
93
- for (size_t y = 0; y < Y; y++) {
98
+ for (uint32_t y = 0; y < Y; y++) {
94
99
  double sum = 0.0;
95
- for (size_t n = 0; n < pos->ucnt; n++) {
96
- const size_t o = pos->uobs[n];
100
+ for (uint32_t n = 0; n < pos->ucnt; n++) {
101
+ const uint64_t o = pos->uobs[n];
97
102
  sum += x[mdl->uoff[o] + y];
98
103
  }
99
- for (size_t yp = 0; yp < Y; yp++)
104
+ for (uint32_t yp = 0; yp < Y; yp++)
100
105
  (*psi)[t][yp][y] = sum;
101
106
  }
102
107
  }
103
- for (int t = 1; t < T; t++) {
108
+ for (uint32_t t = 1; t < T; t++) {
104
109
  const pos_t *pos = &(seq->pos[t]);
105
- for (size_t yp = 0, d = 0; yp < Y; yp++) {
106
- for (size_t y = 0; y < Y; y++, d++) {
110
+ for (uint32_t yp = 0, d = 0; yp < Y; yp++) {
111
+ for (uint32_t y = 0; y < Y; y++, d++) {
107
112
  double sum = 0.0;
108
- for (size_t n = 0; n < pos->bcnt; n++) {
109
- const size_t o = pos->bobs[n];
113
+ for (uint32_t n = 0; n < pos->bcnt; n++) {
114
+ const uint64_t o = pos->bobs[n];
110
115
  sum += x[mdl->boff[o] + d];
111
116
  }
112
117
  (*psi)[t][yp][y] += sum;
@@ -116,6 +121,30 @@ static int tag_expsc(mdl_t *mdl, const seq_t *seq, double *vpsi) {
116
121
  return 0;
117
122
  }
118
123
 
124
+ /* tag_memmsc:
125
+ * Compute the scores for Viterbi decoding of MEMM models. This uses the
126
+ * previous function to compute the classical scores and then normalizes them
127
+ * relative to the previous label. This normalization must be done in linear
128
+ * space, not in logarithmic space.
129
+ */
130
+ static int tag_memmsc(mdl_t *mdl, const seq_t *seq, double *vpsi) {
131
+ const uint32_t Y = mdl->nlbl;
132
+ const uint32_t T = seq->len;
133
+ tag_expsc(mdl, seq, vpsi);
134
+ xvm_expma(vpsi, vpsi, 0.0, T * Y * Y);
135
+ double (*psi)[T][Y][Y] = (void *)vpsi;
136
+ for (uint32_t t = 0; t < T; t++) {
137
+ for (uint32_t yp = 0; yp < Y; yp++) {
138
+ double sum = 0.0;
139
+ for (uint32_t y = 0; y < Y; y++)
140
+ sum += (*psi)[t][yp][y];
141
+ for (uint32_t y = 0; y < Y; y++)
142
+ (*psi)[t][yp][y] /= sum;
143
+ }
144
+ }
145
+ return 1;
146
+ }
147
+
119
148
  /* tag_postsc:
120
149
  * This function computes the score lattice with posteriors. This generally results
121
150
  * in a slightly better labelling and allows outputting normalized scores for the
@@ -123,34 +152,71 @@ static int tag_expsc(mdl_t *mdl, const seq_t *seq, double *vpsi) {
123
152
  * a full forward backward instead of just the forward pass.
124
153
  */
125
154
  static int tag_postsc(mdl_t *mdl, const seq_t *seq, double *vpsi) {
126
- const size_t Y = mdl->nlbl;
127
- const int T = seq->len;
155
+ const uint32_t Y = mdl->nlbl;
156
+ const uint32_t T = seq->len;
128
157
  double (*psi)[T][Y][Y] = (void *)vpsi;
129
- grd_t *grd = grd_new(mdl, NULL);
130
- grd->first = 0;
131
- grd->last = T - 1;
132
- grd_check(grd, seq->len);
158
+ grd_st_t *grd_st = grd_stnew(mdl, NULL);
159
+ grd_st->first = 0;
160
+ grd_st->last = T - 1;
161
+ grd_stcheck(grd_st, seq->len);
133
162
  if (mdl->opt->sparse) {
134
- grd_spdopsi(grd, seq);
135
- grd_spfwdbwd(grd, seq);
163
+ grd_spdopsi(grd_st, seq);
164
+ grd_spfwdbwd(grd_st, seq);
136
165
  } else {
137
- grd_fldopsi(grd, seq);
138
- grd_flfwdbwd(grd, seq);
166
+ grd_fldopsi(grd_st, seq);
167
+ grd_flfwdbwd(grd_st, seq);
139
168
  }
140
- double (*alpha)[T][Y] = (void *)grd->alpha;
141
- double (*beta )[T][Y] = (void *)grd->beta;
142
- double *unorm = grd->unorm;
143
- for (int t = 0; t < T; t++) {
144
- for (size_t y = 0; y < Y; y++) {
169
+ double (*alpha)[T][Y] = (void *)grd_st->alpha;
170
+ double (*beta )[T][Y] = (void *)grd_st->beta;
171
+ double *unorm = grd_st->unorm;
172
+ for (uint32_t t = 0; t < T; t++) {
173
+ for (uint32_t y = 0; y < Y; y++) {
145
174
  double e = (*alpha)[t][y] * (*beta)[t][y] * unorm[t];
146
- for (size_t yp = 0; yp < Y; yp++)
175
+ for (uint32_t yp = 0; yp < Y; yp++)
147
176
  (*psi)[t][yp][y] = e;
148
177
  }
149
178
  }
150
- grd_free(grd);
179
+ grd_stfree(grd_st);
151
180
  return 1;
152
181
  }
153
182
 
183
+ /* tag_forced:
184
+ * This function applies corrections to the psi table to take into account already
185
+ * known labels. If a label is known, all arcs leading to or coming from other
186
+ * labels at this position are nullified and will not be selected by the
187
+ * decoder.
188
+ */
189
+ static void tag_forced(mdl_t *mdl, const seq_t *seq, double *vpsi, int op) {
190
+ const uint32_t Y = mdl->nlbl;
191
+ const uint32_t T = seq->len;
192
+ const double v = op ? 0.0 : -HUGE_VAL;
193
+ double (*psi)[T][Y][Y] = (void *)vpsi;
194
+ for (uint32_t t = 0; t < T; t++) {
195
+ const uint32_t yr = seq->pos[t].lbl;
196
+ if (yr == (uint32_t)-1)
197
+ continue;
198
+ if (t != 0)
199
+ for (uint32_t y = 0; y < Y; y++)
200
+ if (y != yr)
201
+ for (uint32_t yp = 0; yp < Y; yp++)
202
+ (*psi)[t][yp][y] = v;
203
+ if (t != T - 1)
204
+ for (uint32_t y = 0; y < Y; y++)
205
+ if (y != yr)
206
+ for (uint32_t yn = 0; yn < Y; yn++)
207
+ (*psi)[t + 1][y][yn] = v;
208
+ }
209
+ const uint32_t yr = seq->pos[0].lbl;
210
+ if (yr != (uint32_t)-1) {
211
+ for (uint32_t y = 0; y < Y; y++) {
212
+ if (yr == y)
213
+ continue;
214
+ for (uint32_t yp = 0; yp < Y; yp++)
215
+ (*psi)[0][yp][y] = v;
216
+ }
217
+ }
218
+ }
219
+
154
220
  /* tag_viterbi:
155
221
  * This function implements the Viterbi algorithm in order to decode the most
156
222
  * probable sequence of labels according to the model. Some part of this code
@@ -160,22 +226,26 @@ static int tag_postsc(mdl_t *mdl, const seq_t *seq, double *vpsi) {
160
226
  * enough stack space.
161
227
  */
162
228
  void tag_viterbi(mdl_t *mdl, const seq_t *seq,
163
- size_t out[], double *sc, double psc[]) {
164
- const size_t Y = mdl->nlbl;
165
- const int T = seq->len;
166
- double *vpsi = wapiti_xmalloc(sizeof(double) * T * Y * Y);
167
- size_t *vback = wapiti_xmalloc(sizeof(size_t) * T * Y);
168
- double (*psi) [T][Y][Y] = (void *)vpsi;
169
- size_t (*back)[T][Y] = (void *)vback;
170
- double *cur = wapiti_xmalloc(sizeof(double) * Y);
171
- double *old = wapiti_xmalloc(sizeof(double) * Y);
229
+ uint32_t out[], double *sc, double psc[]) {
230
+ const uint32_t Y = mdl->nlbl;
231
+ const uint32_t T = seq->len;
232
+ double *vpsi = xvm_new(T * Y * Y);
233
+ uint32_t *vback = wapiti_xmalloc(sizeof(uint32_t) * T * Y);
234
+ double (*psi) [T][Y][Y] = (void *)vpsi;
235
+ uint32_t (*back)[T][Y] = (void *)vback;
236
+ double *cur = wapiti_xmalloc(sizeof(double) * Y);
237
+ double *old = wapiti_xmalloc(sizeof(double) * Y);
172
238
  // We first compute the scores for each transitions in the lattice of
173
239
  // labels.
174
240
  int op;
175
- if (mdl->opt->lblpost)
241
+ if (mdl->type == 1)
242
+ op = tag_memmsc(mdl, seq, vpsi);
243
+ else if (mdl->opt->lblpost)
176
244
  op = tag_postsc(mdl, seq, vpsi);
177
245
  else
178
246
  op = tag_expsc(mdl, seq, vpsi);
247
+ if (mdl->opt->force)
248
+ tag_forced(mdl, seq, vpsi, op);
179
249
  // Now we can do the Viterbi algorithm. This is very similar to the
180
250
  // forward pass
181
251
  // | α_1(y) = Ψ_1(y,x_1)
@@ -189,15 +259,15 @@ void tag_viterbi(mdl_t *mdl, const seq_t *seq,
189
259
  // the indice of the y value selected by the max. This also mean that
190
260
  // we only need the current and previous value of the α vectors, not
191
261
  // the full matrix.
192
- for (size_t y = 0; y < Y; y++)
262
+ for (uint32_t y = 0; y < Y; y++)
193
263
  cur[y] = (*psi)[0][0][y];
194
- for (int t = 1; t < T; t++) {
195
- for (size_t y = 0; y < Y; y++)
264
+ for (uint32_t t = 1; t < T; t++) {
265
+ for (uint32_t y = 0; y < Y; y++)
196
266
  old[y] = cur[y];
197
- for (size_t y = 0; y < Y; y++) {
198
- double bst = -1.0;
199
- int idx = 0;
200
- for (size_t yp = 0; yp < Y; yp++) {
267
+ for (uint32_t y = 0; y < Y; y++) {
268
+ double bst = -HUGE_VAL;
269
+ uint32_t idx = 0;
270
+ for (uint32_t yp = 0; yp < Y; yp++) {
201
271
  double val = old[yp];
202
272
  if (op)
203
273
  val *= (*psi)[t][yp][y];
@@ -216,15 +286,15 @@ void tag_viterbi(mdl_t *mdl, const seq_t *seq,
216
286
  // this we search in the last α vector the best value. Using this index
217
287
  // as a starting point in the back-pointer array we finally can decode
218
288
  // the best sequence.
219
- int bst = 0;
220
- for (size_t y = 1; y < Y; y++)
289
+ uint32_t bst = 0;
290
+ for (uint32_t y = 1; y < Y; y++)
221
291
  if (cur[y] > cur[bst])
222
292
  bst = y;
223
293
  if (sc != NULL)
224
294
  *sc = cur[bst];
225
- for (int t = T; t > 0; t--) {
226
- const size_t yp = (t != 1) ? (*back)[t - 1][bst] : 0;
227
- const size_t y = bst;
295
+ for (uint32_t t = T; t > 0; t--) {
296
+ const uint32_t yp = (t != 1) ? (*back)[t - 1][bst] : 0;
297
+ const uint32_t y = bst;
228
298
  out[t - 1] = y;
229
299
  if (psc != NULL)
230
300
  psc[t - 1] = (*psi)[t - 1][yp][y];
@@ -233,7 +303,7 @@ void tag_viterbi(mdl_t *mdl, const seq_t *seq,
233
303
  free(old);
234
304
  free(cur);
235
305
  free(vback);
236
- free(vpsi);
306
+ xvm_free(vpsi);
237
307
  }
238
308
 
239
309
  /* tag_nbviterbi:
@@ -242,23 +312,27 @@ void tag_viterbi(mdl_t *mdl, const seq_t *seq,
242
312
  * compute only the best one and will return the same sequence as the
243
313
  * previous function but will be slower to do so.
244
314
  */
245
- void tag_nbviterbi(mdl_t *mdl, const seq_t *seq, size_t N,
246
- size_t out[][N], double sc[], double psc[][N]) {
247
- const size_t Y = mdl->nlbl;
248
- const int T = seq->len;
249
- double *vpsi = wapiti_xmalloc(sizeof(double) * T * Y * Y);
250
- size_t *vback = wapiti_xmalloc(sizeof(size_t) * T * Y * N);
251
- double (*psi) [T][Y ][Y] = (void *)vpsi;
252
- size_t (*back)[T][Y * N] = (void *)vback;
253
- double *cur = wapiti_xmalloc(sizeof(double) * Y * N);
254
- double *old = wapiti_xmalloc(sizeof(double) * Y * N);
315
+ void tag_nbviterbi(mdl_t *mdl, const seq_t *seq, uint32_t N,
316
+ uint32_t out[][N], double sc[], double psc[][N]) {
317
+ const uint32_t Y = mdl->nlbl;
318
+ const uint32_t T = seq->len;
319
+ double *vpsi = xvm_new(T * Y * Y);
320
+ uint32_t *vback = wapiti_xmalloc(sizeof(uint32_t) * T * Y * N);
321
+ double (*psi) [T][Y ][Y] = (void *)vpsi;
322
+ uint32_t (*back)[T][Y * N] = (void *)vback;
323
+ double *cur = wapiti_xmalloc(sizeof(double) * Y * N);
324
+ double *old = wapiti_xmalloc(sizeof(double) * Y * N);
255
325
  // We first compute the scores for each transitions in the lattice of
256
326
  // labels.
257
327
  int op;
258
- if (mdl->opt->lblpost)
328
+ if (mdl->type == 1)
329
+ op = tag_memmsc(mdl, seq, vpsi);
330
+ else if (mdl->opt->lblpost)
259
331
  op = tag_postsc(mdl, seq, (double *)psi);
260
332
  else
261
333
  op = tag_expsc(mdl, seq, (double *)psi);
334
+ if (mdl->opt->force)
335
+ tag_forced(mdl, seq, vpsi, op);
262
336
  // Here also, it's classical but we have to keep the N best paths
263
337
  // leading to each node of the lattice instead of only the best one.
263
337
  // This means that the code is less trivial and the current implementation is
@@ -267,19 +341,19 @@ void tag_nbviterbi(mdl_t *mdl, const seq_t *seq, size_t N,
267
341
  // We first build the list of all incoming arcs from all paths from all
268
342
  // N-best nodes and next select the N-best one. There is a lot of room
269
343
  // here for later optimisations if needed.
270
- for (size_t y = 0, d = 0; y < Y; y++) {
344
+ for (uint32_t y = 0, d = 0; y < Y; y++) {
271
345
  cur[d++] = (*psi)[0][0][y];
272
- for (size_t n = 1; n < N; n++)
346
+ for (uint32_t n = 1; n < N; n++)
273
347
  cur[d++] = -DBL_MAX;
274
348
  }
275
- for (int t = 1; t < T; t++) {
276
- for (size_t d = 0; d < Y * N; d++)
349
+ for (uint32_t t = 1; t < T; t++) {
350
+ for (uint32_t d = 0; d < Y * N; d++)
277
351
  old[d] = cur[d];
278
- for (size_t y = 0; y < Y; y++) {
352
+ for (uint32_t y = 0; y < Y; y++) {
279
353
  // 1st, build the list of all incoming
280
354
  double lst[Y * N];
281
- for (size_t yp = 0, d = 0; yp < Y; yp++) {
282
- for (size_t n = 0; n < N; n++, d++) {
355
+ for (uint32_t yp = 0, d = 0; yp < Y; yp++) {
356
+ for (uint32_t n = 0; n < N; n++, d++) {
283
357
  lst[d] = old[d];
284
358
  if (op)
285
359
  lst[d] *= (*psi)[t][yp][y];
@@ -288,14 +362,14 @@ void tag_nbviterbi(mdl_t *mdl, const seq_t *seq, size_t N,
288
362
  }
289
363
  }
290
364
  // 2nd, init the back with the N first
291
- size_t *bk = &(*back)[t][y * N];
292
- for (size_t n = 0; n < N; n++)
365
+ uint32_t *bk = &(*back)[t][y * N];
366
+ for (uint32_t n = 0; n < N; n++)
293
367
  bk[n] = n;
294
368
  // 3rd, search the N highest values
295
- for (size_t i = N; i < N * Y; i++) {
369
+ for (uint32_t i = N; i < N * Y; i++) {
296
370
  // Search the smallest current value
297
- size_t idx = 0;
298
- for (size_t n = 1; n < N; n++)
371
+ uint32_t idx = 0;
372
+ for (uint32_t n = 1; n < N; n++)
299
373
  if (lst[bk[n]] < lst[bk[idx]])
300
374
  idx = n;
301
375
  // And replace it if needed
@@ -303,24 +377,24 @@ void tag_nbviterbi(mdl_t *mdl, const seq_t *seq, size_t N,
303
377
  bk[idx] = i;
304
378
  }
305
379
  // 4th, get the new scores
306
- for (size_t n = 0; n < N; n++)
380
+ for (uint32_t n = 0; n < N; n++)
307
381
  cur[y * N + n] = lst[bk[n]];
308
382
  }
309
383
  }
310
384
  // Retrieving the best paths is similar to classical Viterbi except that
311
385
  // we have to search for the N best ones and there are N times more
312
386
  // possible starts.
313
- for (size_t n = 0; n < N; n++) {
314
- int bst = 0;
315
- for (size_t d = 1; d < Y * N; d++)
387
+ for (uint32_t n = 0; n < N; n++) {
388
+ uint32_t bst = 0;
389
+ for (uint32_t d = 1; d < Y * N; d++)
316
390
  if (cur[d] > cur[bst])
317
391
  bst = d;
318
392
  if (sc != NULL)
319
393
  sc[n] = cur[bst];
320
394
  cur[bst] = -DBL_MAX;
321
- for (int t = T; t > 0; t--) {
322
- const size_t yp = (t != 1) ? (*back)[t - 1][bst] / N: 0;
323
- const size_t y = bst / N;
395
+ for (uint32_t t = T; t > 0; t--) {
396
+ const uint32_t yp = (t != 1) ? (*back)[t - 1][bst] / N: 0;
397
+ const uint32_t y = bst / N;
324
398
  out[t - 1][n] = y;
325
399
  if (psc != NULL)
326
400
  psc[t - 1][n] = (*psi)[t - 1][yp][y];
@@ -330,7 +404,7 @@ void tag_nbviterbi(mdl_t *mdl, const seq_t *seq, size_t N,
330
404
  free(old);
331
405
  free(cur);
332
406
  free(vback);
333
- free(vpsi);
407
+ xvm_free(vpsi);
334
408
  }
335
409
 
336
410
  /* tag_label:
@@ -343,17 +417,17 @@ void tag_nbviterbi(mdl_t *mdl, const seq_t *seq, size_t N,
343
417
  */
344
418
  void tag_label(mdl_t *mdl, FILE *fin, FILE *fout) {
345
419
  qrk_t *lbls = mdl->reader->lbl;
346
- const size_t Y = mdl->nlbl;
347
- const size_t N = mdl->opt->nbest;
420
+ const uint32_t Y = mdl->nlbl;
421
+ const uint32_t N = mdl->opt->nbest;
348
422
  // We start by preparing the statistic collection to be ready if check
349
423
  // option is used. The stat array holds the following for each label
349
423
  // [0] # of references with this label
350
424
  // [1] # of tokens we have tagged with this label
352
426
  // [2] # of match of the two preceding
353
- size_t tcnt = 0, terr = 0;
354
- size_t scnt = 0, serr = 0;
355
- size_t stat[3][Y];
356
- for (size_t y = 0; y < Y; y++)
427
+ uint64_t tcnt = 0, terr = 0;
428
+ uint64_t scnt = 0, serr = 0;
429
+ uint64_t stat[3][Y];
430
+ for (uint32_t y = 0; y < Y; y++)
357
431
  stat[0][y] = stat[1][y] = stat[2][y] = 0;
358
432
  // Next read the input file sequence by sequence and label them, we have
359
433
  // to take care of not discarding the raw input as we want to send it
@@ -364,24 +438,25 @@ void tag_label(mdl_t *mdl, FILE *fin, FILE *fout) {
364
438
  raw_t *raw = rdr_readraw(mdl->reader, fin);
365
439
  if (raw == NULL)
366
440
  break;
367
- seq_t *seq = rdr_raw2seq(mdl->reader, raw, mdl->opt->check);
368
- const int T = seq->len;
369
- size_t *out = wapiti_xmalloc(sizeof(size_t) * T * N);
370
- double *psc = wapiti_xmalloc(sizeof(double) * T * N);
371
- double *scs = wapiti_xmalloc(sizeof(double) * N);
441
+ seq_t *seq = rdr_raw2seq(mdl->reader, raw,
442
+ mdl->opt->check | mdl->opt->force);
443
+ const uint32_t T = seq->len;
444
+ uint32_t *out = wapiti_xmalloc(sizeof(uint32_t) * T * N);
445
+ double *psc = wapiti_xmalloc(sizeof(double ) * T * N);
446
+ double *scs = wapiti_xmalloc(sizeof(double ) * N);
372
447
  if (N == 1)
373
- tag_viterbi(mdl, seq, (size_t*)out, scs, (double*)psc);
448
+ tag_viterbi(mdl, seq, (uint32_t*)out, scs, (double*)psc);
374
449
  else
375
450
  tag_nbviterbi(mdl, seq, N, (void*)out, scs, (void*)psc);
376
451
  // Next we output the raw sequence with an aditional column for
377
452
  // the predicted labels
378
- for (size_t n = 0; n < N; n++) {
453
+ for (uint32_t n = 0; n < N; n++) {
379
454
  if (mdl->opt->outsc)
380
455
  fprintf(fout, "# %d %f\n", (int)n, scs[n]);
381
- for (int t = 0; t < T; t++) {
456
+ for (uint32_t t = 0; t < T; t++) {
382
457
  if (!mdl->opt->label)
383
458
  fprintf(fout, "%s\t", raw->lines[t]);
384
- size_t lbl = out[t * N + n];
459
+ uint32_t lbl = out[t * N + n];
385
460
  const char *lblstr = qrk_id2str(lbls, lbl);
386
461
  fprintf(fout, "%s", lblstr);
387
462
  if (mdl->opt->outsc) {
@@ -394,10 +469,13 @@ void tag_label(mdl_t *mdl, FILE *fin, FILE *fout) {
394
469
  }
395
470
  fflush(fout);
396
471
  // If user provided reference labels, use them to collect
397
- // statistics about how well we have performed here.
472
+ // statistics about how well we have performed here. Labels
473
+ // unseen at training time are discarded.
398
474
  if (mdl->opt->check) {
399
475
  bool err = false;
400
- for (int t = 0; t < T; t++) {
476
+ for (uint32_t t = 0; t < T; t++) {
477
+ if (seq->pos[t].lbl == (uint32_t)-1)
478
+ continue;
401
479
  stat[0][seq->pos[t].lbl]++;
402
480
  stat[1][out[t * N]]++;
403
481
  if (seq->pos[t].lbl != out[t * N])
@@ -405,7 +483,7 @@ void tag_label(mdl_t *mdl, FILE *fin, FILE *fout) {
405
483
  else
406
484
  stat[2][out[t * N]]++;
407
485
  }
408
- tcnt += (size_t)T;
486
+ tcnt += T;
409
487
  serr += err;
410
488
  }
411
489
  // Cleanup memory used for this sequence
@@ -418,7 +496,7 @@ void tag_label(mdl_t *mdl, FILE *fin, FILE *fout) {
418
496
  // much sequence are labelled and if possible the current tokens
419
497
  // and sequence error rates.
420
498
  if (++scnt % 1000 == 0) {
421
- info("%10zu sequences labeled", scnt);
499
+ info("%10"PRIu64" sequences labeled", scnt);
422
500
  if (mdl->opt->check) {
423
501
  const double te = (double)terr / tcnt * 100.0;
424
502
  const double se = (double)serr / scnt * 100.0;
@@ -433,11 +511,11 @@ void tag_label(mdl_t *mdl, FILE *fin, FILE *fout) {
433
511
  if (mdl->opt->check) {
434
512
  const double te = (double)terr / tcnt * 100.0;
435
513
  const double se = (double)serr / scnt * 100.0;
436
- info(" Nb sequences : %zu\n", scnt);
514
+ info(" Nb sequences : %"PRIu64"\n", scnt);
437
515
  info(" Token error : %5.2f%%\n", te);
438
516
  info(" Sequence error: %5.2f%%\n", se);
439
517
  info("* Per label statistics\n");
440
- for (size_t y = 0; y < Y; y++) {
518
+ for (uint32_t y = 0; y < Y; y++) {
441
519
  const char *lbl = qrk_id2str(lbls, y);
442
520
  const double Rc = (double)stat[2][y] / stat[0][y];
443
521
  const double Pr = (double)stat[2][y] / stat[1][y];
@@ -458,12 +536,12 @@ void tag_label(mdl_t *mdl, FILE *fin, FILE *fout) {
458
536
  */
459
537
  typedef struct eval_s eval_t;
460
538
  struct eval_s {
461
- mdl_t *mdl;
462
- dat_t *dat;
463
- size_t tcnt; // Processed tokens count
464
- size_t terr; // Tokens error found
465
- size_t scnt; // Processes sequences count
466
- size_t serr; // Sequence error found
539
+ mdl_t *mdl;
540
+ dat_t *dat;
541
+ uint64_t tcnt; // Processed tokens count
542
+ uint64_t terr; // Tokens error found
543
+ uint64_t scnt; // Processes sequences count
544
+ uint64_t serr; // Sequence error found
467
545
  };
468
546
 
469
547
  /* tag_evalsub:
@@ -471,7 +549,7 @@ struct eval_s {
471
549
  * by batch and for each batch do a simple Viterbi and scan the result to find
472
550
  * errors.
473
551
  */
474
- static void tag_evalsub(job_t *job, int id, int cnt, eval_t *eval) {
552
+ static void tag_evalsub(job_t *job, uint32_t id, uint32_t cnt, eval_t *eval) {
475
553
  unused(id && cnt);
476
554
  mdl_t *mdl = eval->mdl;
477
555
  dat_t *dat = eval->dat;
@@ -480,22 +558,23 @@ static void tag_evalsub(job_t *job, int id, int cnt, eval_t *eval) {
480
558
  eval->scnt = 0;
481
559
  eval->serr = 0;
482
560
  // We just get a job and process all the sequences in it.
483
- size_t count, pos;
561
+ uint32_t count, pos;
484
562
  while (mth_getjob(job, &count, &pos)) {
485
- for (size_t s = pos; s < pos + count; s++) {
563
+ for (uint32_t s = pos; s < pos + count; s++) {
486
564
  // Tag the sequence with the viterbi
487
565
  const seq_t *seq = dat->seq[s];
488
- const int T = seq->len;
489
- size_t out[T];
566
+ const uint32_t T = seq->len;
567
+ uint32_t *out = wapiti_xmalloc(sizeof(uint32_t) * T);
490
568
  tag_viterbi(mdl, seq, out, NULL, NULL);
491
569
  // And check for eventual (probable ?) errors
492
570
  bool err = false;
493
- for (int t = 0; t < T; t++)
571
+ for (uint32_t t = 0; t < T; t++)
494
572
  if (seq->pos[t].lbl != out[t])
495
573
  eval->terr++, err = true;
496
- eval->tcnt += (size_t)T;
574
+ eval->tcnt += T;
497
575
  eval->scnt += 1;
498
576
  eval->serr += err;
577
+ free(out);
499
578
  }
500
579
  }
501
580
  }
@@ -505,13 +584,13 @@ static void tag_evalsub(job_t *job, int id, int cnt, eval_t *eval) {
505
584
  * taining set if not available).
506
585
  */
507
586
  void tag_eval(mdl_t *mdl, double *te, double *se) {
508
- const size_t W = mdl->opt->nthread;
587
+ const uint32_t W = mdl->opt->nthread;
509
588
  dat_t *dat = (mdl->devel == NULL) ? mdl->train : mdl->devel;
510
589
  // First we prepare the eval state for all the workers threads, we just
511
590
  // have to give them the model and dataset to use. This state will be
512
591
  // used to retrieve partial result they computed.
513
592
  eval_t *eval[W];
514
- for (size_t w = 0; w < W; w++) {
593
+ for (uint32_t w = 0; w < W; w++) {
515
594
  eval[w] = wapiti_xmalloc(sizeof(eval_t));
516
595
  eval[w]->mdl = mdl;
517
596
  eval[w]->dat = dat;
@@ -520,9 +599,9 @@ void tag_eval(mdl_t *mdl, double *te, double *se) {
520
599
  // result by summing them and computing the final error rates.
521
600
  mth_spawn((func_t *)tag_evalsub, W, (void *)eval, dat->nseq,
522
601
  mdl->opt->jobsize);
523
- size_t tcnt = 0, terr = 0;
524
- size_t scnt = 0, serr = 0;
525
- for (size_t w = 0; w < W; w++) {
602
+ uint64_t tcnt = 0, terr = 0;
603
+ uint64_t scnt = 0, serr = 0;
604
+ for (uint32_t w = 0; w < W; w++) {
526
605
  tcnt += eval[w]->tcnt;
527
606
  terr += eval[w]->terr;
528
607
  scnt += eval[w]->scnt;