wapiti 0.0.5 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. checksums.yaml +7 -0
  2. data/.simplecov +3 -0
  3. data/Gemfile +25 -2
  4. data/HISTORY.md +5 -1
  5. data/LICENSE +14 -13
  6. data/README.md +9 -16
  7. data/Rakefile +38 -8
  8. data/ext/wapiti/bcd.c +126 -124
  9. data/ext/wapiti/decoder.c +203 -124
  10. data/ext/wapiti/decoder.h +6 -4
  11. data/ext/wapiti/extconf.rb +2 -2
  12. data/ext/wapiti/gradient.c +491 -320
  13. data/ext/wapiti/gradient.h +52 -34
  14. data/ext/wapiti/lbfgs.c +74 -33
  15. data/ext/wapiti/model.c +47 -37
  16. data/ext/wapiti/model.h +22 -20
  17. data/ext/wapiti/native.c +850 -839
  18. data/ext/wapiti/native.h +1 -1
  19. data/ext/wapiti/options.c +52 -20
  20. data/ext/wapiti/options.h +37 -30
  21. data/ext/wapiti/pattern.c +35 -33
  22. data/ext/wapiti/pattern.h +12 -11
  23. data/ext/wapiti/progress.c +14 -13
  24. data/ext/wapiti/progress.h +3 -2
  25. data/ext/wapiti/quark.c +14 -16
  26. data/ext/wapiti/quark.h +6 -5
  27. data/ext/wapiti/reader.c +83 -69
  28. data/ext/wapiti/reader.h +11 -9
  29. data/ext/wapiti/rprop.c +84 -43
  30. data/ext/wapiti/sequence.h +18 -16
  31. data/ext/wapiti/sgdl1.c +45 -43
  32. data/ext/wapiti/thread.c +19 -17
  33. data/ext/wapiti/thread.h +5 -4
  34. data/ext/wapiti/tools.c +7 -7
  35. data/ext/wapiti/tools.h +3 -4
  36. data/ext/wapiti/trainers.h +1 -1
  37. data/ext/wapiti/vmath.c +40 -38
  38. data/ext/wapiti/vmath.h +12 -11
  39. data/ext/wapiti/wapiti.c +159 -37
  40. data/ext/wapiti/wapiti.h +18 -4
  41. data/lib/wapiti.rb +15 -15
  42. data/lib/wapiti/errors.rb +15 -15
  43. data/lib/wapiti/model.rb +92 -84
  44. data/lib/wapiti/options.rb +123 -124
  45. data/lib/wapiti/utility.rb +14 -14
  46. data/lib/wapiti/version.rb +2 -2
  47. data/spec/spec_helper.rb +29 -9
  48. data/spec/wapiti/model_spec.rb +230 -194
  49. data/spec/wapiti/native_spec.rb +7 -8
  50. data/spec/wapiti/options_spec.rb +184 -174
  51. data/wapiti.gemspec +22 -8
  52. metadata +38 -42
  53. data/.gitignore +0 -5
@@ -1,7 +1,7 @@
1
1
  /*
2
2
  * Wapiti - A linear-chain CRF tool
3
3
  *
4
- * Copyright (c) 2009-2011 CNRS
4
+ * Copyright (c) 2009-2013 CNRS
5
5
  * All rights reserved.
6
6
  *
7
7
  * Redistribution and use in source and binary forms, with or without
@@ -25,10 +25,14 @@
25
25
  * POSSIBILITY OF SUCH DAMAGE.
26
26
  */
27
27
 
28
+ #include <inttypes.h>
28
29
  #include <float.h>
30
+ #include <stdint.h>
29
31
  #include <stddef.h>
30
32
  #include <stdlib.h>
31
33
  #include <stdio.h>
34
+ #include <string.h>
35
+ #include <math.h>
32
36
 
33
37
  #include "wapiti.h"
34
38
  #include "gradient.h"
@@ -39,6 +43,7 @@
39
43
  #include "thread.h"
40
44
  #include "tools.h"
41
45
  #include "decoder.h"
46
+ #include "vmath.h"
42
47
 
43
48
  /******************************************************************************
44
49
  * Sequence tagging
@@ -60,9 +65,9 @@
60
65
  * works in log-space.
61
66
  */
62
67
  static int tag_expsc(mdl_t *mdl, const seq_t *seq, double *vpsi) {
63
- const double *x = mdl->theta;
64
- const size_t Y = mdl->nlbl;
65
- const int T = seq->len;
68
+ const double *x = mdl->theta;
69
+ const uint32_t Y = mdl->nlbl;
70
+ const uint32_t T = seq->len;
66
71
  double (*psi)[T][Y][Y] = (void *)vpsi;
67
72
  // We first have to compute the Ψ_t(y',y,x_t) weights defined as
68
73
  // Ψ_t(y',y,x_t) = \exp( ∑_k θ_k f_k(y',y,x_t) )
@@ -88,25 +93,25 @@ static int tag_expsc(mdl_t *mdl, const seq_t *seq, double *vpsi) {
88
93
  // 2/ we add the bigrams features weights by looping over actives
89
94
  // bigrams observations (we don't have to do this for t=0 since
90
95
  // there is no bigrams here)
91
- for (int t = 0; t < T; t++) {
96
+ for (uint32_t t = 0; t < T; t++) {
92
97
  const pos_t *pos = &(seq->pos[t]);
93
- for (size_t y = 0; y < Y; y++) {
98
+ for (uint32_t y = 0; y < Y; y++) {
94
99
  double sum = 0.0;
95
- for (size_t n = 0; n < pos->ucnt; n++) {
96
- const size_t o = pos->uobs[n];
100
+ for (uint32_t n = 0; n < pos->ucnt; n++) {
101
+ const uint64_t o = pos->uobs[n];
97
102
  sum += x[mdl->uoff[o] + y];
98
103
  }
99
- for (size_t yp = 0; yp < Y; yp++)
104
+ for (uint32_t yp = 0; yp < Y; yp++)
100
105
  (*psi)[t][yp][y] = sum;
101
106
  }
102
107
  }
103
- for (int t = 1; t < T; t++) {
108
+ for (uint32_t t = 1; t < T; t++) {
104
109
  const pos_t *pos = &(seq->pos[t]);
105
- for (size_t yp = 0, d = 0; yp < Y; yp++) {
106
- for (size_t y = 0; y < Y; y++, d++) {
110
+ for (uint32_t yp = 0, d = 0; yp < Y; yp++) {
111
+ for (uint32_t y = 0; y < Y; y++, d++) {
107
112
  double sum = 0.0;
108
- for (size_t n = 0; n < pos->bcnt; n++) {
109
- const size_t o = pos->bobs[n];
113
+ for (uint32_t n = 0; n < pos->bcnt; n++) {
114
+ const uint64_t o = pos->bobs[n];
110
115
  sum += x[mdl->boff[o] + d];
111
116
  }
112
117
  (*psi)[t][yp][y] += sum;
@@ -116,6 +121,30 @@ static int tag_expsc(mdl_t *mdl, const seq_t *seq, double *vpsi) {
116
121
  return 0;
117
122
  }
118
123
 
124
+ /* tag_memmsc:
125
+ * Compute the score for viterbi decoding of MEMM models. This uses the
126
+ * previous function to compute the classical scores and then normalizes them
127
+ * relative to the previous label. This normalization must be done in linear
128
+ * space, not in logarithmic space.
129
+ */
130
+ static int tag_memmsc(mdl_t *mdl, const seq_t *seq, double *vpsi) {
131
+ const uint32_t Y = mdl->nlbl;
132
+ const uint32_t T = seq->len;
133
+ tag_expsc(mdl, seq, vpsi);
134
+ xvm_expma(vpsi, vpsi, 0.0, T * Y * Y);
135
+ double (*psi)[T][Y][Y] = (void *)vpsi;
136
+ for (uint32_t t = 0; t < T; t++) {
137
+ for (uint32_t yp = 0; yp < Y; yp++) {
138
+ double sum = 0.0;
139
+ for (uint32_t y = 0; y < Y; y++)
140
+ sum += (*psi)[t][yp][y];
141
+ for (uint32_t y = 0; y < Y; y++)
142
+ (*psi)[t][yp][y] /= sum;
143
+ }
144
+ }
145
+ return 1;
146
+ }
147
+
119
148
  /* tag_postsc:
120
149
  * This function computes the score lattice with posteriors. This generally results
121
150
  * in a slightly better labelling and allows outputting normalized scores for the
@@ -123,34 +152,71 @@ static int tag_expsc(mdl_t *mdl, const seq_t *seq, double *vpsi) {
123
152
  * a full forward backward instead of just the forward pass.
124
153
  */
125
154
  static int tag_postsc(mdl_t *mdl, const seq_t *seq, double *vpsi) {
126
- const size_t Y = mdl->nlbl;
127
- const int T = seq->len;
155
+ const uint32_t Y = mdl->nlbl;
156
+ const uint32_t T = seq->len;
128
157
  double (*psi)[T][Y][Y] = (void *)vpsi;
129
- grd_t *grd = grd_new(mdl, NULL);
130
- grd->first = 0;
131
- grd->last = T - 1;
132
- grd_check(grd, seq->len);
158
+ grd_st_t *grd_st = grd_stnew(mdl, NULL);
159
+ grd_st->first = 0;
160
+ grd_st->last = T - 1;
161
+ grd_stcheck(grd_st, seq->len);
133
162
  if (mdl->opt->sparse) {
134
- grd_spdopsi(grd, seq);
135
- grd_spfwdbwd(grd, seq);
163
+ grd_spdopsi(grd_st, seq);
164
+ grd_spfwdbwd(grd_st, seq);
136
165
  } else {
137
- grd_fldopsi(grd, seq);
138
- grd_flfwdbwd(grd, seq);
166
+ grd_fldopsi(grd_st, seq);
167
+ grd_flfwdbwd(grd_st, seq);
139
168
  }
140
- double (*alpha)[T][Y] = (void *)grd->alpha;
141
- double (*beta )[T][Y] = (void *)grd->beta;
142
- double *unorm = grd->unorm;
143
- for (int t = 0; t < T; t++) {
144
- for (size_t y = 0; y < Y; y++) {
169
+ double (*alpha)[T][Y] = (void *)grd_st->alpha;
170
+ double (*beta )[T][Y] = (void *)grd_st->beta;
171
+ double *unorm = grd_st->unorm;
172
+ for (uint32_t t = 0; t < T; t++) {
173
+ for (uint32_t y = 0; y < Y; y++) {
145
174
  double e = (*alpha)[t][y] * (*beta)[t][y] * unorm[t];
146
- for (size_t yp = 0; yp < Y; yp++)
175
+ for (uint32_t yp = 0; yp < Y; yp++)
147
176
  (*psi)[t][yp][y] = e;
148
177
  }
149
178
  }
150
- grd_free(grd);
179
+ grd_stfree(grd_st);
151
180
  return 1;
152
181
  }
153
182
 
183
+ /* tag_forced:
184
+ * This function applies corrections to the psi table to take account of already
185
+ * known labels. If a label is known, all arcs leading to or coming from other
186
+ * labels at this position are NULLified and will not be selected by the
187
+ * decoder.
188
+ */
189
+ static void tag_forced(mdl_t *mdl, const seq_t *seq, double *vpsi, int op) {
190
+ const uint32_t Y = mdl->nlbl;
191
+ const uint32_t T = seq->len;
192
+ const double v = op ? 0.0 : -HUGE_VAL;
193
+ double (*psi)[T][Y][Y] = (void *)vpsi;
194
+ for (uint32_t t = 0; t < T; t++) {
195
+ const uint32_t yr = seq->pos[t].lbl;
196
+ if (yr == (uint32_t)-1)
197
+ continue;
198
+ if (t != 0)
199
+ for (uint32_t y = 0; y < Y; y++)
200
+ if (y != yr)
201
+ for (uint32_t yp = 0; yp < Y; yp++)
202
+ (*psi)[t][yp][y] = v;
203
+ if (t != T - 1)
204
+ for (uint32_t y = 0; y < Y; y++)
205
+ if (y != yr)
206
+ for (uint32_t yn = 0; yn < Y; yn++)
207
+ (*psi)[t + 1][y][yn] = v;
208
+ }
209
+ const uint32_t yr = seq->pos[0].lbl;
210
+ if (yr != (uint32_t)-1) {
211
+ for (uint32_t y = 0; y < Y; y++) {
212
+ if (yr == y)
213
+ continue;
214
+ for (uint32_t yp = 0; yp < Y; yp++)
215
+ (*psi)[0][yp][y] = v;
216
+ }
217
+ }
218
+ }
219
+
154
220
  /* tag_viterbi:
155
221
  * This function implements the Viterbi algorithm in order to decode the most
156
222
  * probable sequence of labels according to the model. Some part of this code
@@ -160,22 +226,26 @@ static int tag_postsc(mdl_t *mdl, const seq_t *seq, double *vpsi) {
160
226
  * enough stack space.
161
227
  */
162
228
  void tag_viterbi(mdl_t *mdl, const seq_t *seq,
163
- size_t out[], double *sc, double psc[]) {
164
- const size_t Y = mdl->nlbl;
165
- const int T = seq->len;
166
- double *vpsi = wapiti_xmalloc(sizeof(double) * T * Y * Y);
167
- size_t *vback = wapiti_xmalloc(sizeof(size_t) * T * Y);
168
- double (*psi) [T][Y][Y] = (void *)vpsi;
169
- size_t (*back)[T][Y] = (void *)vback;
170
- double *cur = wapiti_xmalloc(sizeof(double) * Y);
171
- double *old = wapiti_xmalloc(sizeof(double) * Y);
229
+ uint32_t out[], double *sc, double psc[]) {
230
+ const uint32_t Y = mdl->nlbl;
231
+ const uint32_t T = seq->len;
232
+ double *vpsi = xvm_new(T * Y * Y);
233
+ uint32_t *vback = wapiti_xmalloc(sizeof(uint32_t) * T * Y);
234
+ double (*psi) [T][Y][Y] = (void *)vpsi;
235
+ uint32_t (*back)[T][Y] = (void *)vback;
236
+ double *cur = wapiti_xmalloc(sizeof(double) * Y);
237
+ double *old = wapiti_xmalloc(sizeof(double) * Y);
172
238
  // We first compute the scores for each transitions in the lattice of
173
239
  // labels.
174
240
  int op;
175
- if (mdl->opt->lblpost)
241
+ if (mdl->type == 1)
242
+ op = tag_memmsc(mdl, seq, vpsi);
243
+ else if (mdl->opt->lblpost)
176
244
  op = tag_postsc(mdl, seq, vpsi);
177
245
  else
178
246
  op = tag_expsc(mdl, seq, vpsi);
247
+ if (mdl->opt->force)
248
+ tag_forced(mdl, seq, vpsi, op);
179
249
  // Now we can do the Viterbi algorithm. This is very similar to the
180
250
  // forward pass
181
251
  // | α_1(y) = Ψ_1(y,x_1)
@@ -189,15 +259,15 @@ void tag_viterbi(mdl_t *mdl, const seq_t *seq,
189
259
  // the indice of the y value selected by the max. This also mean that
190
260
  // we only need the current and previous value of the α vectors, not
191
261
  // the full matrix.
192
- for (size_t y = 0; y < Y; y++)
262
+ for (uint32_t y = 0; y < Y; y++)
193
263
  cur[y] = (*psi)[0][0][y];
194
- for (int t = 1; t < T; t++) {
195
- for (size_t y = 0; y < Y; y++)
264
+ for (uint32_t t = 1; t < T; t++) {
265
+ for (uint32_t y = 0; y < Y; y++)
196
266
  old[y] = cur[y];
197
- for (size_t y = 0; y < Y; y++) {
198
- double bst = -1.0;
199
- int idx = 0;
200
- for (size_t yp = 0; yp < Y; yp++) {
267
+ for (uint32_t y = 0; y < Y; y++) {
268
+ double bst = -HUGE_VAL;
269
+ uint32_t idx = 0;
270
+ for (uint32_t yp = 0; yp < Y; yp++) {
201
271
  double val = old[yp];
202
272
  if (op)
203
273
  val *= (*psi)[t][yp][y];
@@ -216,15 +286,15 @@ void tag_viterbi(mdl_t *mdl, const seq_t *seq,
216
286
  // this we search in the last α vector the best value. Using this index
217
287
  // as a starting point in the back-pointer array we finally can decode
218
288
  // the best sequence.
219
- int bst = 0;
220
- for (size_t y = 1; y < Y; y++)
289
+ uint32_t bst = 0;
290
+ for (uint32_t y = 1; y < Y; y++)
221
291
  if (cur[y] > cur[bst])
222
292
  bst = y;
223
293
  if (sc != NULL)
224
294
  *sc = cur[bst];
225
- for (int t = T; t > 0; t--) {
226
- const size_t yp = (t != 1) ? (*back)[t - 1][bst] : 0;
227
- const size_t y = bst;
295
+ for (uint32_t t = T; t > 0; t--) {
296
+ const uint32_t yp = (t != 1) ? (*back)[t - 1][bst] : 0;
297
+ const uint32_t y = bst;
228
298
  out[t - 1] = y;
229
299
  if (psc != NULL)
230
300
  psc[t - 1] = (*psi)[t - 1][yp][y];
@@ -233,7 +303,7 @@ void tag_viterbi(mdl_t *mdl, const seq_t *seq,
233
303
  free(old);
234
304
  free(cur);
235
305
  free(vback);
236
- free(vpsi);
306
+ xvm_free(vpsi);
237
307
  }
238
308
 
239
309
  /* tag_nbviterbi:
@@ -242,23 +312,27 @@ void tag_viterbi(mdl_t *mdl, const seq_t *seq,
242
312
  * compute only the best one and will return the same sequence than the
243
313
  * previous function but will be slower to do it.
244
314
  */
245
- void tag_nbviterbi(mdl_t *mdl, const seq_t *seq, size_t N,
246
- size_t out[][N], double sc[], double psc[][N]) {
247
- const size_t Y = mdl->nlbl;
248
- const int T = seq->len;
249
- double *vpsi = wapiti_xmalloc(sizeof(double) * T * Y * Y);
250
- size_t *vback = wapiti_xmalloc(sizeof(size_t) * T * Y * N);
251
- double (*psi) [T][Y ][Y] = (void *)vpsi;
252
- size_t (*back)[T][Y * N] = (void *)vback;
253
- double *cur = wapiti_xmalloc(sizeof(double) * Y * N);
254
- double *old = wapiti_xmalloc(sizeof(double) * Y * N);
315
+ void tag_nbviterbi(mdl_t *mdl, const seq_t *seq, uint32_t N,
316
+ uint32_t out[][N], double sc[], double psc[][N]) {
317
+ const uint32_t Y = mdl->nlbl;
318
+ const uint32_t T = seq->len;
319
+ double *vpsi = xvm_new(T * Y * Y);
320
+ uint32_t *vback = wapiti_xmalloc(sizeof(uint32_t) * T * Y * N);
321
+ double (*psi) [T][Y ][Y] = (void *)vpsi;
322
+ uint32_t (*back)[T][Y * N] = (void *)vback;
323
+ double *cur = wapiti_xmalloc(sizeof(double) * Y * N);
324
+ double *old = wapiti_xmalloc(sizeof(double) * Y * N);
255
325
  // We first compute the scores for each transitions in the lattice of
256
326
  // labels.
257
327
  int op;
258
- if (mdl->opt->lblpost)
328
+ if (mdl->type == 1)
329
+ op = tag_memmsc(mdl, seq, vpsi);
330
+ else if (mdl->opt->lblpost)
259
331
  op = tag_postsc(mdl, seq, (double *)psi);
260
332
  else
261
333
  op = tag_expsc(mdl, seq, (double *)psi);
334
+ if (mdl->opt->force)
335
+ tag_forced(mdl, seq, vpsi, op);
262
336
  // Here also, it's classical but we have to keep the N best paths
263
337
  // leading to each nodes of the lattice instead of only the best one.
264
338
  // This mean that code is less trivial and the current implementation is
@@ -267,19 +341,19 @@ void tag_nbviterbi(mdl_t *mdl, const seq_t *seq, size_t N,
267
341
  // We first build the list of all incoming arcs from all paths from all
268
342
  // N-best nodes and next select the N-best one. There is a lot of room
269
343
  // here for later optimisations if needed.
270
- for (size_t y = 0, d = 0; y < Y; y++) {
344
+ for (uint32_t y = 0, d = 0; y < Y; y++) {
271
345
  cur[d++] = (*psi)[0][0][y];
272
- for (size_t n = 1; n < N; n++)
346
+ for (uint32_t n = 1; n < N; n++)
273
347
  cur[d++] = -DBL_MAX;
274
348
  }
275
- for (int t = 1; t < T; t++) {
276
- for (size_t d = 0; d < Y * N; d++)
349
+ for (uint32_t t = 1; t < T; t++) {
350
+ for (uint32_t d = 0; d < Y * N; d++)
277
351
  old[d] = cur[d];
278
- for (size_t y = 0; y < Y; y++) {
352
+ for (uint32_t y = 0; y < Y; y++) {
279
353
  // 1st, build the list of all incoming
280
354
  double lst[Y * N];
281
- for (size_t yp = 0, d = 0; yp < Y; yp++) {
282
- for (size_t n = 0; n < N; n++, d++) {
355
+ for (uint32_t yp = 0, d = 0; yp < Y; yp++) {
356
+ for (uint32_t n = 0; n < N; n++, d++) {
283
357
  lst[d] = old[d];
284
358
  if (op)
285
359
  lst[d] *= (*psi)[t][yp][y];
@@ -288,14 +362,14 @@ void tag_nbviterbi(mdl_t *mdl, const seq_t *seq, size_t N,
288
362
  }
289
363
  }
290
364
  // 2nd, init the back with the N first
291
- size_t *bk = &(*back)[t][y * N];
292
- for (size_t n = 0; n < N; n++)
365
+ uint32_t *bk = &(*back)[t][y * N];
366
+ for (uint32_t n = 0; n < N; n++)
293
367
  bk[n] = n;
294
368
  // 3rd, search the N highest values
295
- for (size_t i = N; i < N * Y; i++) {
369
+ for (uint32_t i = N; i < N * Y; i++) {
296
370
  // Search the smallest current value
297
- size_t idx = 0;
298
- for (size_t n = 1; n < N; n++)
371
+ uint32_t idx = 0;
372
+ for (uint32_t n = 1; n < N; n++)
299
373
  if (lst[bk[n]] < lst[bk[idx]])
300
374
  idx = n;
301
375
  // And replace it if needed
@@ -303,24 +377,24 @@ void tag_nbviterbi(mdl_t *mdl, const seq_t *seq, size_t N,
303
377
  bk[idx] = i;
304
378
  }
305
379
  // 4th, get the new scores
306
- for (size_t n = 0; n < N; n++)
380
+ for (uint32_t n = 0; n < N; n++)
307
381
  cur[y * N + n] = lst[bk[n]];
308
382
  }
309
383
  }
310
384
  // Retrieving the best paths is similar to classical Viterbi except that
311
385
  // we have to search for the N bet ones and there is N time more
312
386
  // possibles starts.
313
- for (size_t n = 0; n < N; n++) {
314
- int bst = 0;
315
- for (size_t d = 1; d < Y * N; d++)
387
+ for (uint32_t n = 0; n < N; n++) {
388
+ uint32_t bst = 0;
389
+ for (uint32_t d = 1; d < Y * N; d++)
316
390
  if (cur[d] > cur[bst])
317
391
  bst = d;
318
392
  if (sc != NULL)
319
393
  sc[n] = cur[bst];
320
394
  cur[bst] = -DBL_MAX;
321
- for (int t = T; t > 0; t--) {
322
- const size_t yp = (t != 1) ? (*back)[t - 1][bst] / N: 0;
323
- const size_t y = bst / N;
395
+ for (uint32_t t = T; t > 0; t--) {
396
+ const uint32_t yp = (t != 1) ? (*back)[t - 1][bst] / N: 0;
397
+ const uint32_t y = bst / N;
324
398
  out[t - 1][n] = y;
325
399
  if (psc != NULL)
326
400
  psc[t - 1][n] = (*psi)[t - 1][yp][y];
@@ -330,7 +404,7 @@ void tag_nbviterbi(mdl_t *mdl, const seq_t *seq, size_t N,
330
404
  free(old);
331
405
  free(cur);
332
406
  free(vback);
333
- free(vpsi);
407
+ xvm_free(vpsi);
334
408
  }
335
409
 
336
410
  /* tag_label:
@@ -343,17 +417,17 @@ void tag_nbviterbi(mdl_t *mdl, const seq_t *seq, size_t N,
343
417
  */
344
418
  void tag_label(mdl_t *mdl, FILE *fin, FILE *fout) {
345
419
  qrk_t *lbls = mdl->reader->lbl;
346
- const size_t Y = mdl->nlbl;
347
- const size_t N = mdl->opt->nbest;
420
+ const uint32_t Y = mdl->nlbl;
421
+ const uint32_t N = mdl->opt->nbest;
348
422
  // We start by preparing the statistic collection to be ready if check
349
423
  // option is used. The stat array hold the following for each label
350
424
  // [0] # of reference with this label
351
425
  // [1] # of token we have taged with this label
352
426
  // [2] # of match of the two preceding
353
- size_t tcnt = 0, terr = 0;
354
- size_t scnt = 0, serr = 0;
355
- size_t stat[3][Y];
356
- for (size_t y = 0; y < Y; y++)
427
+ uint64_t tcnt = 0, terr = 0;
428
+ uint64_t scnt = 0, serr = 0;
429
+ uint64_t stat[3][Y];
430
+ for (uint32_t y = 0; y < Y; y++)
357
431
  stat[0][y] = stat[1][y] = stat[2][y] = 0;
358
432
  // Next read the input file sequence by sequence and label them, we have
359
433
  // to take care of not discarding the raw input as we want to send it
@@ -364,24 +438,25 @@ void tag_label(mdl_t *mdl, FILE *fin, FILE *fout) {
364
438
  raw_t *raw = rdr_readraw(mdl->reader, fin);
365
439
  if (raw == NULL)
366
440
  break;
367
- seq_t *seq = rdr_raw2seq(mdl->reader, raw, mdl->opt->check);
368
- const int T = seq->len;
369
- size_t *out = wapiti_xmalloc(sizeof(size_t) * T * N);
370
- double *psc = wapiti_xmalloc(sizeof(double) * T * N);
371
- double *scs = wapiti_xmalloc(sizeof(double) * N);
441
+ seq_t *seq = rdr_raw2seq(mdl->reader, raw,
442
+ mdl->opt->check | mdl->opt->force);
443
+ const uint32_t T = seq->len;
444
+ uint32_t *out = wapiti_xmalloc(sizeof(uint32_t) * T * N);
445
+ double *psc = wapiti_xmalloc(sizeof(double ) * T * N);
446
+ double *scs = wapiti_xmalloc(sizeof(double ) * N);
372
447
  if (N == 1)
373
- tag_viterbi(mdl, seq, (size_t*)out, scs, (double*)psc);
448
+ tag_viterbi(mdl, seq, (uint32_t*)out, scs, (double*)psc);
374
449
  else
375
450
  tag_nbviterbi(mdl, seq, N, (void*)out, scs, (void*)psc);
376
451
  // Next we output the raw sequence with an aditional column for
377
452
  // the predicted labels
378
- for (size_t n = 0; n < N; n++) {
453
+ for (uint32_t n = 0; n < N; n++) {
379
454
  if (mdl->opt->outsc)
380
455
  fprintf(fout, "# %d %f\n", (int)n, scs[n]);
381
- for (int t = 0; t < T; t++) {
456
+ for (uint32_t t = 0; t < T; t++) {
382
457
  if (!mdl->opt->label)
383
458
  fprintf(fout, "%s\t", raw->lines[t]);
384
- size_t lbl = out[t * N + n];
459
+ uint32_t lbl = out[t * N + n];
385
460
  const char *lblstr = qrk_id2str(lbls, lbl);
386
461
  fprintf(fout, "%s", lblstr);
387
462
  if (mdl->opt->outsc) {
@@ -394,10 +469,13 @@ void tag_label(mdl_t *mdl, FILE *fin, FILE *fout) {
394
469
  }
395
470
  fflush(fout);
396
471
  // If user provided reference labels, use them to collect
397
- // statistics about how well we have performed here.
472
+ // statistics about how well we have performed here. Labels
473
+ // unseen at training time are discarded.
398
474
  if (mdl->opt->check) {
399
475
  bool err = false;
400
- for (int t = 0; t < T; t++) {
476
+ for (uint32_t t = 0; t < T; t++) {
477
+ if (seq->pos[t].lbl == (uint32_t)-1)
478
+ continue;
401
479
  stat[0][seq->pos[t].lbl]++;
402
480
  stat[1][out[t * N]]++;
403
481
  if (seq->pos[t].lbl != out[t * N])
@@ -405,7 +483,7 @@ void tag_label(mdl_t *mdl, FILE *fin, FILE *fout) {
405
483
  else
406
484
  stat[2][out[t * N]]++;
407
485
  }
408
- tcnt += (size_t)T;
486
+ tcnt += T;
409
487
  serr += err;
410
488
  }
411
489
  // Cleanup memory used for this sequence
@@ -418,7 +496,7 @@ void tag_label(mdl_t *mdl, FILE *fin, FILE *fout) {
418
496
  // much sequence are labelled and if possible the current tokens
419
497
  // and sequence error rates.
420
498
  if (++scnt % 1000 == 0) {
421
- info("%10zu sequences labeled", scnt);
499
+ info("%10"PRIu64" sequences labeled", scnt);
422
500
  if (mdl->opt->check) {
423
501
  const double te = (double)terr / tcnt * 100.0;
424
502
  const double se = (double)serr / scnt * 100.0;
@@ -433,11 +511,11 @@ void tag_label(mdl_t *mdl, FILE *fin, FILE *fout) {
433
511
  if (mdl->opt->check) {
434
512
  const double te = (double)terr / tcnt * 100.0;
435
513
  const double se = (double)serr / scnt * 100.0;
436
- info(" Nb sequences : %zu\n", scnt);
514
+ info(" Nb sequences : %"PRIu64"\n", scnt);
437
515
  info(" Token error : %5.2f%%\n", te);
438
516
  info(" Sequence error: %5.2f%%\n", se);
439
517
  info("* Per label statistics\n");
440
- for (size_t y = 0; y < Y; y++) {
518
+ for (uint32_t y = 0; y < Y; y++) {
441
519
  const char *lbl = qrk_id2str(lbls, y);
442
520
  const double Rc = (double)stat[2][y] / stat[0][y];
443
521
  const double Pr = (double)stat[2][y] / stat[1][y];
@@ -458,12 +536,12 @@ void tag_label(mdl_t *mdl, FILE *fin, FILE *fout) {
458
536
  */
459
537
  typedef struct eval_s eval_t;
460
538
  struct eval_s {
461
- mdl_t *mdl;
462
- dat_t *dat;
463
- size_t tcnt; // Processed tokens count
464
- size_t terr; // Tokens error found
465
- size_t scnt; // Processes sequences count
466
- size_t serr; // Sequence error found
539
+ mdl_t *mdl;
540
+ dat_t *dat;
541
+ uint64_t tcnt; // Processed tokens count
542
+ uint64_t terr; // Tokens error found
543
+ uint64_t scnt; // Processes sequences count
544
+ uint64_t serr; // Sequence error found
467
545
  };
468
546
 
469
547
  /* tag_evalsub:
@@ -471,7 +549,7 @@ struct eval_s {
471
549
  * by batch and for each batch do a simple Viterbi and scan the result to find
472
550
  * errors.
473
551
  */
474
- static void tag_evalsub(job_t *job, int id, int cnt, eval_t *eval) {
552
+ static void tag_evalsub(job_t *job, uint32_t id, uint32_t cnt, eval_t *eval) {
475
553
  unused(id && cnt);
476
554
  mdl_t *mdl = eval->mdl;
477
555
  dat_t *dat = eval->dat;
@@ -480,22 +558,23 @@ static void tag_evalsub(job_t *job, int id, int cnt, eval_t *eval) {
480
558
  eval->scnt = 0;
481
559
  eval->serr = 0;
482
560
  // We just get a job a process all the squence in it.
483
- size_t count, pos;
561
+ uint32_t count, pos;
484
562
  while (mth_getjob(job, &count, &pos)) {
485
- for (size_t s = pos; s < pos + count; s++) {
563
+ for (uint32_t s = pos; s < pos + count; s++) {
486
564
  // Tag the sequence with the viterbi
487
565
  const seq_t *seq = dat->seq[s];
488
- const int T = seq->len;
489
- size_t out[T];
566
+ const uint32_t T = seq->len;
567
+ uint32_t *out = wapiti_xmalloc(sizeof(uint32_t) * T);
490
568
  tag_viterbi(mdl, seq, out, NULL, NULL);
491
569
  // And check for eventual (probable ?) errors
492
570
  bool err = false;
493
- for (int t = 0; t < T; t++)
571
+ for (uint32_t t = 0; t < T; t++)
494
572
  if (seq->pos[t].lbl != out[t])
495
573
  eval->terr++, err = true;
496
- eval->tcnt += (size_t)T;
574
+ eval->tcnt += T;
497
575
  eval->scnt += 1;
498
576
  eval->serr += err;
577
+ free(out);
499
578
  }
500
579
  }
501
580
  }
@@ -505,13 +584,13 @@ static void tag_evalsub(job_t *job, int id, int cnt, eval_t *eval) {
505
584
  * taining set if not available).
506
585
  */
507
586
  void tag_eval(mdl_t *mdl, double *te, double *se) {
508
- const size_t W = mdl->opt->nthread;
587
+ const uint32_t W = mdl->opt->nthread;
509
588
  dat_t *dat = (mdl->devel == NULL) ? mdl->train : mdl->devel;
510
589
  // First we prepare the eval state for all the workers threads, we just
511
590
  // have to give them the model and dataset to use. This state will be
512
591
  // used to retrieve partial result they computed.
513
592
  eval_t *eval[W];
514
- for (size_t w = 0; w < W; w++) {
593
+ for (uint32_t w = 0; w < W; w++) {
515
594
  eval[w] = wapiti_xmalloc(sizeof(eval_t));
516
595
  eval[w]->mdl = mdl;
517
596
  eval[w]->dat = dat;
@@ -520,9 +599,9 @@ void tag_eval(mdl_t *mdl, double *te, double *se) {
520
599
  // result by summing them and computing the final error rates.
521
600
  mth_spawn((func_t *)tag_evalsub, W, (void *)eval, dat->nseq,
522
601
  mdl->opt->jobsize);
523
- size_t tcnt = 0, terr = 0;
524
- size_t scnt = 0, serr = 0;
525
- for (size_t w = 0; w < W; w++) {
602
+ uint64_t tcnt = 0, terr = 0;
603
+ uint64_t scnt = 0, serr = 0;
604
+ for (uint32_t w = 0; w < W; w++) {
526
605
  tcnt += eval[w]->tcnt;
527
606
  terr += eval[w]->terr;
528
607
  scnt += eval[w]->scnt;