wapiti 0.0.5 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. checksums.yaml +7 -0
  2. data/.simplecov +3 -0
  3. data/Gemfile +25 -2
  4. data/HISTORY.md +5 -1
  5. data/LICENSE +14 -13
  6. data/README.md +9 -16
  7. data/Rakefile +38 -8
  8. data/ext/wapiti/bcd.c +126 -124
  9. data/ext/wapiti/decoder.c +203 -124
  10. data/ext/wapiti/decoder.h +6 -4
  11. data/ext/wapiti/extconf.rb +2 -2
  12. data/ext/wapiti/gradient.c +491 -320
  13. data/ext/wapiti/gradient.h +52 -34
  14. data/ext/wapiti/lbfgs.c +74 -33
  15. data/ext/wapiti/model.c +47 -37
  16. data/ext/wapiti/model.h +22 -20
  17. data/ext/wapiti/native.c +850 -839
  18. data/ext/wapiti/native.h +1 -1
  19. data/ext/wapiti/options.c +52 -20
  20. data/ext/wapiti/options.h +37 -30
  21. data/ext/wapiti/pattern.c +35 -33
  22. data/ext/wapiti/pattern.h +12 -11
  23. data/ext/wapiti/progress.c +14 -13
  24. data/ext/wapiti/progress.h +3 -2
  25. data/ext/wapiti/quark.c +14 -16
  26. data/ext/wapiti/quark.h +6 -5
  27. data/ext/wapiti/reader.c +83 -69
  28. data/ext/wapiti/reader.h +11 -9
  29. data/ext/wapiti/rprop.c +84 -43
  30. data/ext/wapiti/sequence.h +18 -16
  31. data/ext/wapiti/sgdl1.c +45 -43
  32. data/ext/wapiti/thread.c +19 -17
  33. data/ext/wapiti/thread.h +5 -4
  34. data/ext/wapiti/tools.c +7 -7
  35. data/ext/wapiti/tools.h +3 -4
  36. data/ext/wapiti/trainers.h +1 -1
  37. data/ext/wapiti/vmath.c +40 -38
  38. data/ext/wapiti/vmath.h +12 -11
  39. data/ext/wapiti/wapiti.c +159 -37
  40. data/ext/wapiti/wapiti.h +18 -4
  41. data/lib/wapiti.rb +15 -15
  42. data/lib/wapiti/errors.rb +15 -15
  43. data/lib/wapiti/model.rb +92 -84
  44. data/lib/wapiti/options.rb +123 -124
  45. data/lib/wapiti/utility.rb +14 -14
  46. data/lib/wapiti/version.rb +2 -2
  47. data/spec/spec_helper.rb +29 -9
  48. data/spec/wapiti/model_spec.rb +230 -194
  49. data/spec/wapiti/native_spec.rb +7 -8
  50. data/spec/wapiti/options_spec.rb +184 -174
  51. data/wapiti.gemspec +22 -8
  52. metadata +38 -42
  53. data/.gitignore +0 -5
@@ -1,7 +1,7 @@
1
1
  /*
2
2
  * Wapiti - A linear-chain CRF tool
3
3
  *
4
- * Copyright (c) 2009-2011 CNRS
4
+ * Copyright (c) 2009-2013 CNRS
5
5
  * All rights reserved.
6
6
  *
7
7
  * Redistribution and use in source and binary forms, with or without
@@ -29,6 +29,7 @@
29
29
  #define decoder_h
30
30
 
31
31
  #include <stddef.h>
32
+ #include <stdint.h>
32
33
  #include <stdio.h>
33
34
 
34
35
  #include "wapiti.h"
@@ -36,11 +37,12 @@
36
37
  #include "sequence.h"
37
38
 
38
39
  void tag_viterbi(mdl_t *mdl, const seq_t *seq,
39
- size_t out[], double *sc, double psc[]);
40
- void tag_nbviterbi(mdl_t *mdl, const seq_t *seq, size_t N,
41
- size_t out[][N], double sc[], double psc[][N]);
40
+ uint32_t out[], double *sc, double psc[]);
41
+ void tag_nbviterbi(mdl_t *mdl, const seq_t *seq, uint32_t N,
42
+ uint32_t out[][N], double sc[], double psc[][N]);
42
43
 
43
44
  void tag_label(mdl_t *mdl, FILE *fin, FILE *fout);
44
45
  void tag_eval(mdl_t *mdl, double *te, double *se);
46
+
45
47
  #endif
46
48
 
@@ -1,8 +1,8 @@
1
1
  require 'mkmf'
2
2
 
3
- $CFLAGS << %q{ -std=c99 -O3 -Wall -ggdb }
3
+ $CFLAGS << %q{ -std=c99 -W -Wall -Wno-declaration-after-statement -O3 }
4
4
 
5
5
  have_library('pthread')
6
6
  have_library('m')
7
7
 
8
- create_makefile('wapiti/native')
8
+ create_makefile('wapiti/native')
@@ -1,7 +1,7 @@
1
1
  /*
2
2
  * Wapiti - A linear-chain CRF tool
3
3
  *
4
- * Copyright (c) 2009-2011 CNRS
4
+ * Copyright (c) 2009-2013 CNRS
5
5
  * All rights reserved.
6
6
  *
7
7
  * Redistribution and use in source and binary forms, with or without
@@ -26,8 +26,9 @@
26
26
  */
27
27
  #include <math.h>
28
28
  #include <stddef.h>
29
- #include <stdlib.h>
29
+ #include <stdint.h>
30
30
  #include <stdio.h>
31
+ #include <stdlib.h>
31
32
  #include <string.h>
32
33
 
33
34
  #include "wapiti.h"
@@ -40,41 +41,71 @@
40
41
  #include "thread.h"
41
42
  #include "vmath.h"
42
43
 
44
+ /* atm_inc:
45
+ * Atomically increment the value pointed to by [value] by [inc]. If ATM_ANSI is
46
+ * defined this is NOT atomic at all, so the caller has to deal with this.
47
+ */
48
+ #ifdef ATM_ANSI
49
+ static inline
50
+ void atm_inc(double *value, double inc) {
51
+ *value += inc;
52
+ }
53
+ #else
54
+ static inline
55
+ void atm_inc(volatile double *value, double inc) {
56
+ while (1) {
57
+ volatile union {
58
+ double d;
59
+ uint64_t u;
60
+ } old, new;
61
+ old.d = *value;
62
+ new.d = old.d + inc;
63
+ uint64_t *ptr = (uint64_t *)value;
64
+ if (__sync_bool_compare_and_swap(ptr, old.u, new.u))
65
+ break;
66
+ }
67
+ }
68
+ #endif
69
+
43
70
  /******************************************************************************
44
- * Maxent optimized gradient computation
71
+ * Maxent gradient computation
45
72
  *
46
- * Maxent or maximum entropy models are a specific case of CRF where the
47
- * output graph is reduced to a single node. In this specific case, the
48
- * computation of the gradient can be simplified a lot as it is done in this
49
- * part of the code.
73
+ * Maxent or maximum entropy models are multi class logistic regression (see
74
+ * [1]). They can be viewed as a special class of CRF models where there
75
+ * are no dependencies between the output labels. This means that the
76
+ * normalization is local to each node and can be done a lot more efficiently
77
+ * as we do not have to perform the forward backward procedure.
50
78
  *
51
- * This code will be used to compute gradient for sequences of length one and
52
- * without actives bigrams features. All other case are handled by the next
53
- * section.
79
+ * This code is used both when the maxent type of model is used and in other
80
+ * modes if the sequence length is one or if there are no bigram features.
81
+ *
82
+ * [1] A maximum entropy approach to natural language processing, A. Berger
83
+ * and S. Della Pietra and V. Della Pietra, Computational Linguistics,
84
+ * (22-1), March 1996.
54
85
  ******************************************************************************/
55
- void grd_dosingle(grd_t *grd, const seq_t *seq) {
56
- const mdl_t *mdl = grd->mdl;
57
- const double *x = mdl->theta;
58
- const int T = seq->len;
59
- const size_t Y = mdl->nlbl;
60
- double *psi = grd->psi;
61
- double *g = grd->g;
62
- for (int t = 0; t < T; t++) {
86
+ void grd_domaxent(grd_st_t *grd_st, const seq_t *seq) {
87
+ const mdl_t *mdl = grd_st->mdl;
88
+ const double *x = mdl->theta;
89
+ const uint32_t T = seq->len;
90
+ const uint32_t Y = mdl->nlbl;
91
+ double *psi = grd_st->psi;
92
+ double *g = grd_st->g;
93
+ for (uint32_t t = 0; t < T; t++) {
63
94
  const pos_t *pos = &(seq->pos[t]);
64
95
  // We first compute for each Y the sum of weights of all
65
96
  // features actives in the sample:
66
97
  // Ψ(y,x^i) = \exp( ∑_k θ_k f_k(y,x^i) )
67
98
  // Z_θ(x^i) = ∑_y Ψ(y,x^i)
68
99
  double Z = 0.0;
69
- for (size_t y = 0; y < Y; y++)
100
+ for (uint32_t y = 0; y < Y; y++)
70
101
  psi[y] = 0.0;
71
- for (size_t n = 0; n < pos->ucnt; n++) {
102
+ for (uint32_t n = 0; n < pos->ucnt; n++) {
72
103
  const double *wgh = x + mdl->uoff[pos->uobs[n]];
73
- for (size_t y = 0; y < Y; y++)
104
+ for (uint32_t y = 0; y < Y; y++)
74
105
  psi[y] += wgh[y];
75
106
  }
76
107
  double lloss = psi[pos->lbl];
77
- for (size_t y = 0; y < Y; y++) {
108
+ for (uint32_t y = 0; y < Y; y++) {
78
109
  psi[y] = (psi[y] == 0.0) ? 1.0 : exp(psi[y]);
79
110
  Z += psi[y];
80
111
  }
@@ -85,22 +116,111 @@ void grd_dosingle(grd_t *grd, const seq_t *seq) {
85
116
  // E_{q_θ}(x,y) - E_{p}(x,y)
86
117
  // and we can compute the expectation over the model with:
87
118
  // E_{q_θ}(x,y) = f_k(y,x^i) * ψ(y,x) / Z_θ(x)
88
- for (size_t y = 0; y < Y; y++)
119
+ for (uint32_t y = 0; y < Y; y++)
89
120
  psi[y] /= Z;
90
- for (size_t n = 0; n < pos->ucnt; n++) {
121
+ for (uint32_t n = 0; n < pos->ucnt; n++) {
91
122
  double *grd = g + mdl->uoff[pos->uobs[n]];
92
- for (size_t y = 0; y < Y; y++)
93
- grd[y] += psi[y];
94
- grd[pos->lbl] -= 1.0;
123
+ for (uint32_t y = 0; y < Y; y++)
124
+ atm_inc(grd + y, psi[y]);
125
+ atm_inc(grd + pos->lbl, -1.0);
95
126
  }
96
127
  // And finally the log-likelihood with:
97
128
  // L_θ(x^i,y^i) = log(Z_θ(x^i)) - log(ψ(y^i,x^i))
98
- grd->lloss += log(Z) - lloss;
129
+ grd_st->lloss += log(Z) - lloss;
99
130
  }
100
131
  }
101
132
 
102
133
  /******************************************************************************
103
- * Single sequence gradient computation
134
+ * Maximum entropy markov model gradient computation
135
+ *
136
+ * Maximum entropy markov models are similar to linear-chains CRFs but with
137
+ * local normalization instead of global normalization (see [2]). This change
138
+ * makes the computation a lot simpler, as at training time the gradient
139
+ * can be computed similarly to the maxent case with the previous output
140
+ * label observed.
141
+ *
142
+ * This means that for bigram features we only have to consider the reference
143
+ * label at previous position instead of all possible labels, so we don't have
144
+ * to perform the forward backward. Bigram features are handled in the same
145
+ * way as unigram features.
146
+ *
147
+ * [2] Maximum Entropy Markov Models for Information Extraction and
148
+ * Segmentation, A. McCallum and D. Freitag and F. Pereira, 2000,
149
+ * Proceedings of ICML 2000 , 591–598. Stanford, California.
150
+ ******************************************************************************/
151
+ void grd_domemm(grd_st_t *grd_st, const seq_t *seq) {
152
+ const mdl_t *mdl = grd_st->mdl;
153
+ const double *x = mdl->theta;
154
+ const uint32_t T = seq->len;
155
+ const uint32_t Y = mdl->nlbl;
156
+ double *psi = grd_st->psi;
157
+ double *g = grd_st->g;
158
+ for (uint32_t t = 0; t < T; t++) {
159
+ const pos_t *pos = &(seq->pos[t]);
160
+ // We first compute for each Y the sum of weights of all
161
+ // features active in the sample:
162
+ // Ψ(y,x^i) = \exp( ∑_k θ_k f_k(y_t-1, y,x^i) )
163
+ // Z_θ(x^i) = ∑_y Ψ(y,x^i)
164
+ // Bigram features rely on the gold label at previous position
165
+ // for the markov dependency unlike in CRFs.
166
+ double Z = 0.0;
167
+ for (uint32_t y = 0; y < Y; y++)
168
+ psi[y] = 0.0;
169
+ for (uint32_t n = 0; n < pos->ucnt; n++) {
170
+ const double *wgh = x + mdl->uoff[pos->uobs[n]];
171
+ for (uint32_t y = 0; y < Y; y++)
172
+ psi[y] += wgh[y];
173
+ }
174
+ if (t != 0) {
175
+ const uint32_t yp = seq->pos[t - 1].lbl;
176
+ const uint32_t d = yp * Y;
177
+ for (uint32_t y = 0; y < Y; y++) {
178
+ double sum = 0.0;
179
+ for (uint32_t n = 0; n < pos->bcnt; n++) {
180
+ const uint64_t o = pos->bobs[n];
181
+ sum += x[mdl->boff[o] + d + y];
182
+ }
183
+ psi[y] += sum;
184
+ }
185
+ }
186
+ double lloss = psi[pos->lbl];
187
+ for (uint32_t y = 0; y < Y; y++) {
188
+ psi[y] = (psi[y] == 0.0) ? 1.0 : exp(psi[y]);
189
+ Z += psi[y];
190
+ }
191
+ // Now, we can compute the gradient update, for each active
192
+ // feature in the sample the update is the expectation over the
193
+ // current model minus the expectation over the observed
194
+ // distribution:
195
+ // E_{q_θ}(x,y) - E_{p}(x,y)
196
+ // and we can compute the expectation over the model with:
197
+ // E_{q_θ}(x,y) = f_k(y_t-1, y,x^i) * ψ(y,x) / Z_θ(x)
198
+ for (uint32_t y = 0; y < Y; y++)
199
+ psi[y] /= Z;
200
+ for (uint32_t n = 0; n < pos->ucnt; n++) {
201
+ double *grd = g + mdl->uoff[pos->uobs[n]];
202
+ for (uint32_t y = 0; y < Y; y++)
203
+ atm_inc(grd + y, psi[y]);
204
+ atm_inc(grd + pos->lbl, -1.0);
205
+ }
206
+ if (t != 0) {
207
+ const uint32_t yp = seq->pos[t - 1].lbl;
208
+ const uint32_t d = yp * Y;
209
+ for (uint32_t n = 0; n < pos->bcnt; n++) {
210
+ double *grd = g + mdl->boff[pos->bobs[n]] + d;
211
+ for (uint32_t y = 0; y < Y; y++)
212
+ atm_inc(grd + y, psi[y]);
213
+ atm_inc(grd + pos->lbl, -1.0);
214
+ }
215
+ }
216
+ // And finally the log-likelihood with:
217
+ // L_θ(x^i,y^i) = log(Z_θ(x^i)) - log(ψ(y^i,x^i))
218
+ grd_st->lloss += log(Z) - lloss;
219
+ }
220
+ }
221
+
222
+ /******************************************************************************
223
+ * Linear-chain CRF gradient computation
104
224
  *
105
225
  * This section is responsible for computing the gradient of the
106
226
  * log-likelihood function to optimize over a single sequence.
@@ -140,80 +260,6 @@ void grd_dosingle(grd_t *grd, const seq_t *seq) {
140
260
  * the worst case use as less as possible memory.
141
261
  ******************************************************************************/
142
262
 
143
- /* grd_check:
144
- * Check that enough memory is allocated in the gradient object so that the
145
- * linear-chain codepath can be computed for a sequence of the given length.
146
- */
147
- void grd_check(grd_t *grd, int len) {
148
- // Check if user ask for clearing the state tracker or if he requested a
149
- // bigger tracker. In this case we have to free the previous allocated
150
- // memory.
151
- if (len == 0 || (len > grd->len && grd->len != 0)) {
152
- if (grd->mdl->opt->sparse) {
153
- xvm_free(grd->psiuni); grd->psiuni = NULL;
154
- free(grd->psiyp); grd->psiyp = NULL;
155
- free(grd->psiidx); grd->psiidx = NULL;
156
- free(grd->psioff); grd->psioff = NULL;
157
- }
158
- xvm_free(grd->psi); grd->psi = NULL;
159
- xvm_free(grd->alpha); grd->alpha = NULL;
160
- xvm_free(grd->beta); grd->beta = NULL;
161
- xvm_free(grd->unorm); grd->unorm = NULL;
162
- xvm_free(grd->bnorm); grd->bnorm = NULL;
163
- xvm_free(grd->scale); grd->scale = NULL;
164
- grd->len = 0;
165
- }
166
- if (len == 0 || len <= grd->len)
167
- return;
168
- // If we are here, we have to allocate a new state. This is simple, we
169
- // just have to take care of the special case for sparse mode.
170
- const size_t Y = grd->mdl->nlbl;
171
- const int T = len;
172
- grd->psi = xvm_new(T * Y * Y);
173
- grd->alpha = xvm_new(T * Y);
174
- grd->beta = xvm_new(T * Y);
175
- grd->scale = xvm_new(T);
176
- grd->unorm = xvm_new(T);
177
- grd->bnorm = xvm_new(T);
178
- if (grd->mdl->opt->sparse) {
179
- grd->psiuni = xvm_new(T * Y);
180
- grd->psiyp = wapiti_xmalloc(sizeof(size_t) * T * Y * Y);
181
- grd->psiidx = wapiti_xmalloc(sizeof(size_t) * T * Y);
182
- grd->psioff = wapiti_xmalloc(sizeof(size_t) * T);
183
- }
184
- grd->len = len;
185
- }
186
-
187
- /* grd_new:
188
- * Allocation memory for gradient computation state. This allocate memory for
189
- * the longest sequence present in the data set.
190
- */
191
- grd_t *grd_new(mdl_t *mdl, double *g) {
192
- grd_t *grd = wapiti_xmalloc(sizeof(grd_t));
193
- grd->mdl = mdl;
194
- grd->len = 0;
195
- grd->g = g;
196
- grd->psi = NULL;
197
- grd->psiuni = NULL;
198
- grd->psiyp = NULL;
199
- grd->psiidx = NULL;
200
- grd->psioff = NULL;
201
- grd->alpha = NULL;
202
- grd->beta = NULL;
203
- grd->unorm = NULL;
204
- grd->bnorm = NULL;
205
- grd->scale = NULL;
206
- return grd;
207
- }
208
-
209
- /* grd_free:
210
- * Free all memory used by gradient computation.
211
- */
212
- void grd_free(grd_t *grd) {
213
- grd_check(grd, 0);
214
- free(grd);
215
- }
216
-
217
263
  /* grd_fldopsi:
218
264
  * We first have to compute the Ψ_t(y',y,x) weights defined as
219
265
  * Ψ_t(y',y,x) = \exp( ∑_k θ_k f_k(y',y,x_t) )
@@ -235,38 +281,38 @@ void grd_free(grd_t *grd) {
235
281
  * 3/ we take the component-wise exponential of the resulting matrix
236
282
  * (this can be done efficiently with vector maths)
237
283
  */
238
- void grd_fldopsi(grd_t *grd, const seq_t *seq) {
239
- const mdl_t *mdl = grd->mdl;
240
- const double *x = mdl->theta;
241
- const size_t Y = mdl->nlbl;
242
- const int T = seq->len;
243
- double (*psi)[T][Y][Y] = (void *)grd->psi;
244
- for (int t = 0; t < T; t++) {
284
+ void grd_fldopsi(grd_st_t *grd_st, const seq_t *seq) {
285
+ const mdl_t *mdl = grd_st->mdl;
286
+ const double *x = mdl->theta;
287
+ const uint32_t Y = mdl->nlbl;
288
+ const uint32_t T = seq->len;
289
+ double (*psi)[T][Y][Y] = (void *)grd_st->psi;
290
+ for (uint32_t t = 0; t < T; t++) {
245
291
  const pos_t *pos = &(seq->pos[t]);
246
- for (size_t y = 0; y < Y; y++) {
292
+ for (uint32_t y = 0; y < Y; y++) {
247
293
  double sum = 0.0;
248
- for (size_t n = 0; n < pos->ucnt; n++) {
249
- const size_t o = pos->uobs[n];
294
+ for (uint32_t n = 0; n < pos->ucnt; n++) {
295
+ const uint64_t o = pos->uobs[n];
250
296
  sum += x[mdl->uoff[o] + y];
251
297
  }
252
- for (size_t yp = 0; yp < Y; yp++)
298
+ for (uint32_t yp = 0; yp < Y; yp++)
253
299
  (*psi)[t][yp][y] = sum;
254
300
  }
255
301
  }
256
- for (int t = 1; t < T; t++) {
302
+ for (uint32_t t = 1; t < T; t++) {
257
303
  const pos_t *pos = &(seq->pos[t]);
258
- for (size_t yp = 0, d = 0; yp < Y; yp++) {
259
- for (size_t y = 0; y < Y; y++, d++) {
304
+ for (uint32_t yp = 0, d = 0; yp < Y; yp++) {
305
+ for (uint32_t y = 0; y < Y; y++, d++) {
260
306
  double sum = 0.0;
261
- for (size_t n = 0; n < pos->bcnt; n++) {
262
- const size_t o = pos->bobs[n];
307
+ for (uint32_t n = 0; n < pos->bcnt; n++) {
308
+ const uint64_t o = pos->bobs[n];
263
309
  sum += x[mdl->boff[o] + d];
264
310
  }
265
311
  (*psi)[t][yp][y] += sum;
266
312
  }
267
313
  }
268
314
  }
269
- xvm_expma((double *)psi, (double *)psi, 0.0, (size_t)T * Y * Y);
315
+ xvm_expma((double *)psi, (double *)psi, 0.0, (uint64_t)T * Y * Y);
270
316
  }
271
317
 
272
318
  /* grd_spdopsi:
@@ -290,36 +336,36 @@ void grd_fldopsi(grd_t *grd, const seq_t *seq) {
290
336
  * one. (here also this can be done efficiently with vector
291
337
  * maths)
292
338
  */
293
- void grd_spdopsi(grd_t *grd, const seq_t *seq) {
294
- const mdl_t *mdl = grd->mdl;
295
- const double *x = mdl->theta;
296
- const size_t Y = mdl->nlbl;
297
- const int T = seq->len;
298
- double (*psiuni)[T][Y] = (void *)grd->psiuni;
299
- double *psival = grd->psi;
300
- size_t *psiyp = grd->psiyp;
301
- size_t (*psiidx)[T][Y] = (void *)grd->psiidx;
302
- size_t *psioff = grd->psioff;
303
- for (int t = 0; t < T; t++) {
339
+ void grd_spdopsi(grd_st_t *grd_st, const seq_t *seq) {
340
+ const mdl_t *mdl = grd_st->mdl;
341
+ const double *x = mdl->theta;
342
+ const uint32_t Y = mdl->nlbl;
343
+ const uint32_t T = seq->len;
344
+ double (*psiuni)[T][Y] = (void *)grd_st->psiuni;
345
+ double *psival = grd_st->psi;
346
+ uint32_t *psiyp = grd_st->psiyp;
347
+ uint32_t (*psiidx)[T][Y] = (void *)grd_st->psiidx;
348
+ uint32_t *psioff = grd_st->psioff;
349
+ for (uint32_t t = 0; t < T; t++) {
304
350
  const pos_t *pos = &(seq->pos[t]);
305
- for (size_t y = 0; y < Y; y++) {
351
+ for (uint32_t y = 0; y < Y; y++) {
306
352
  double sum = 0.0;
307
- for (size_t n = 0; n < pos->ucnt; n++) {
308
- const size_t o = pos->uobs[n];
353
+ for (uint32_t n = 0; n < pos->ucnt; n++) {
354
+ const uint64_t o = pos->uobs[n];
309
355
  sum += x[mdl->uoff[o] + y];
310
356
  }
311
357
  (*psiuni)[t][y] = sum;
312
358
  }
313
359
  }
314
- size_t off = 0;
315
- for (int t = 1; t < T; t++) {
360
+ uint32_t off = 0;
361
+ for (uint32_t t = 1; t < T; t++) {
316
362
  const pos_t *pos = &(seq->pos[t]);
317
363
  psioff[t] = off;
318
- for (size_t y = 0, nnz = 0; y < Y; y++) {
319
- for (size_t yp = 0; yp < Y; yp++) {
364
+ for (uint32_t y = 0, nnz = 0; y < Y; y++) {
365
+ for (uint32_t yp = 0; yp < Y; yp++) {
320
366
  double sum = 0.0;
321
- for (size_t n = 0; n < pos->bcnt; n++) {
322
- const size_t o = pos->bobs[n];
367
+ for (uint32_t n = 0; n < pos->bcnt; n++) {
368
+ const uint64_t o = pos->bobs[n];
323
369
  sum += x[mdl->boff[o] + yp * Y + y];
324
370
  }
325
371
  if (sum == 0.0)
@@ -331,7 +377,7 @@ void grd_spdopsi(grd_t *grd, const seq_t *seq) {
331
377
  (*psiidx)[t][y] = nnz;
332
378
  }
333
379
  }
334
- xvm_expma((double *)psiuni, (double *)psiuni, 0.0, (size_t)T * Y);
380
+ xvm_expma((double *)psiuni, (double *)psiuni, 0.0, (uint64_t)T * Y);
335
381
  xvm_expma((double *)psival, (double *)psival, 1.0, off);
336
382
  }
337
383
 
@@ -356,42 +402,42 @@ void grd_spdopsi(grd_t *grd, const seq_t *seq) {
356
402
  * with α-scale_t the scaling factor used for the α vector at position t
357
403
  * in the forward recursion.
358
404
  */
359
- void grd_flfwdbwd(grd_t *grd, const seq_t *seq) {
360
- const mdl_t *mdl = grd->mdl;
361
- const size_t Y = mdl->nlbl;
362
- const int T = seq->len;
363
- const double (*psi)[T][Y][Y] = (void *)grd->psi;
364
- double (*alpha)[T][Y] = (void *)grd->alpha;
365
- double (*beta )[T][Y] = (void *)grd->beta;
366
- double *scale = grd->scale;
367
- double *unorm = grd->unorm;
368
- double *bnorm = grd->bnorm;
369
- for (size_t y = 0; y < Y; y++)
405
+ void grd_flfwdbwd(grd_st_t *grd_st, const seq_t *seq) {
406
+ const mdl_t *mdl = grd_st->mdl;
407
+ const uint64_t Y = mdl->nlbl;
408
+ const uint32_t T = seq->len;
409
+ const double (*psi)[T][Y][Y] = (void *)grd_st->psi;
410
+ double (*alpha)[T][Y] = (void *)grd_st->alpha;
411
+ double (*beta )[T][Y] = (void *)grd_st->beta;
412
+ double *scale = grd_st->scale;
413
+ double *unorm = grd_st->unorm;
414
+ double *bnorm = grd_st->bnorm;
415
+ for (uint32_t y = 0; y < Y; y++)
370
416
  (*alpha)[0][y] = (*psi)[0][0][y];
371
417
  scale[0] = xvm_unit((*alpha)[0], (*alpha)[0], Y);
372
- for (int t = 1; t < grd->last + 1; t++) {
373
- for (size_t y = 0; y < Y; y++) {
418
+ for (uint32_t t = 1; t < grd_st->last + 1; t++) {
419
+ for (uint32_t y = 0; y < Y; y++) {
374
420
  double sum = 0.0;
375
- for (size_t yp = 0; yp < Y; yp++)
421
+ for (uint32_t yp = 0; yp < Y; yp++)
376
422
  sum += (*alpha)[t - 1][yp] * (*psi)[t][yp][y];
377
423
  (*alpha)[t][y] = sum;
378
424
  }
379
425
  scale[t] = xvm_unit((*alpha)[t], (*alpha)[t], Y);
380
426
  }
381
- for (size_t yp = 0; yp < Y; yp++)
427
+ for (uint32_t yp = 0; yp < Y; yp++)
382
428
  (*beta)[T - 1][yp] = 1.0 / Y;
383
- for (int t = T - 1; t > grd->first; t--) {
384
- for (size_t yp = 0; yp < Y; yp++) {
429
+ for (uint32_t t = T - 1; t > grd_st->first; t--) {
430
+ for (uint32_t yp = 0; yp < Y; yp++) {
385
431
  double sum = 0.0;
386
- for (size_t y = 0; y < Y; y++)
432
+ for (uint32_t y = 0; y < Y; y++)
387
433
  sum += (*beta)[t][y] * (*psi)[t][yp][y];
388
434
  (*beta)[t - 1][yp] = sum;
389
435
  }
390
436
  xvm_unit((*beta)[t - 1], (*beta)[t - 1], Y);
391
437
  }
392
- for (int t = 0; t < T; t++) {
438
+ for (uint32_t t = 0; t < T; t++) {
393
439
  double z = 0.0;
394
- for (size_t y = 0; y < Y; y++)
440
+ for (uint32_t y = 0; y < Y; y++)
395
441
  z += (*alpha)[t][y] * (*beta)[t][y];
396
442
  unorm[t] = 1.0 / z;
397
443
  bnorm[t] = scale[t] / z;
@@ -416,67 +462,67 @@ void grd_flfwdbwd(grd_t *grd, const seq_t *seq) {
416
462
  * And here also we reduce the number of multiplications if the matrix is
417
463
  * really sparse.
418
464
  */
419
- void grd_spfwdbwd(grd_t *grd, const seq_t *seq) {
420
- const mdl_t *mdl = grd->mdl;
421
- const size_t Y = mdl->nlbl;
422
- const int T = seq->len;
423
- const double (*psiuni)[T][Y] = (void *)grd->psiuni;
424
- const double *psival = grd->psi;
425
- const size_t *psiyp = grd->psiyp;
426
- const size_t (*psiidx)[T][Y] = (void *)grd->psiidx;
427
- const size_t *psioff = grd->psioff;
428
- double (*alpha)[T][Y] = (void *)grd->alpha;
429
- double (*beta )[T][Y] = (void *)grd->beta;
430
- double *scale = grd->scale;
431
- double *unorm = grd->unorm;
432
- double *bnorm = grd->bnorm;
433
- for (size_t y = 0; y < Y; y++)
465
+ void grd_spfwdbwd(grd_st_t *grd_st, const seq_t *seq) {
466
+ const mdl_t *mdl = grd_st->mdl;
467
+ const uint32_t Y = mdl->nlbl;
468
+ const uint32_t T = seq->len;
469
+ const double (*psiuni)[T][Y] = (void *)grd_st->psiuni;
470
+ const double *psival = grd_st->psi;
471
+ const uint32_t *psiyp = grd_st->psiyp;
472
+ const uint32_t (*psiidx)[T][Y] = (void *)grd_st->psiidx;
473
+ const uint32_t *psioff = grd_st->psioff;
474
+ double (*alpha)[T][Y] = (void *)grd_st->alpha;
475
+ double (*beta )[T][Y] = (void *)grd_st->beta;
476
+ double *scale = grd_st->scale;
477
+ double *unorm = grd_st->unorm;
478
+ double *bnorm = grd_st->bnorm;
479
+ for (uint32_t y = 0; y < Y; y++)
434
480
  (*alpha)[0][y] = (*psiuni)[0][y];
435
481
  scale[0] = xvm_unit((*alpha)[0], (*alpha)[0], Y);
436
- for (int t = 1; t < grd->last + 1; t++) {
437
- for (size_t y = 0; y < Y; y++)
482
+ for (uint32_t t = 1; t < grd_st->last + 1; t++) {
483
+ for (uint32_t y = 0; y < Y; y++)
438
484
  (*alpha)[t][y] = 1.0;
439
- const size_t off = psioff[t];
440
- for (size_t n = 0, y = 0; n < (*psiidx)[t][Y - 1]; ) {
485
+ const uint32_t off = psioff[t];
486
+ for (uint32_t n = 0, y = 0; n < (*psiidx)[t][Y - 1]; ) {
441
487
  while (n >= (*psiidx)[t][y])
442
488
  y++;
443
489
  while (n < (*psiidx)[t][y]) {
444
- const size_t yp = psiyp [off + n];
445
- const double v = psival[off + n];
490
+ const uint32_t yp = psiyp [off + n];
491
+ const double v = psival[off + n];
446
492
  (*alpha)[t][y] += (*alpha)[t - 1][yp] * v;
447
493
  n++;
448
494
  }
449
495
  }
450
- for (size_t y = 0; y < Y; y++)
496
+ for (uint32_t y = 0; y < Y; y++)
451
497
  (*alpha)[t][y] *= (*psiuni)[t][y];
452
498
  scale[t] = xvm_unit((*alpha)[t], (*alpha)[t], Y);
453
499
  }
454
- for (size_t yp = 0; yp < Y; yp++)
500
+ for (uint32_t yp = 0; yp < Y; yp++)
455
501
  (*beta)[T - 1][yp] = 1.0 / Y;
456
- for (int t = T - 1; t > grd->first; t--) {
502
+ for (uint32_t t = T - 1; t > grd_st->first; t--) {
457
503
  double sum = 0.0, tmp[Y];
458
- for (size_t y = 0; y < Y; y++) {
504
+ for (uint32_t y = 0; y < Y; y++) {
459
505
  tmp[y] = (*beta)[t][y] * (*psiuni)[t][y];
460
506
  sum += tmp[y];
461
507
  }
462
- for (size_t y = 0; y < Y; y++)
508
+ for (uint32_t y = 0; y < Y; y++)
463
509
  (*beta)[t - 1][y] = sum;
464
- const size_t off = psioff[t];
465
- for (size_t n = 0, y = 0; n < (*psiidx)[t][Y - 1]; ) {
510
+ const uint32_t off = psioff[t];
511
+ for (uint32_t n = 0, y = 0; n < (*psiidx)[t][Y - 1]; ) {
466
512
  while (n >= (*psiidx)[t][y])
467
513
  y++;
468
514
  while (n < (*psiidx)[t][y]) {
469
- const size_t yp = psiyp [off + n];
470
- const double v = psival[off + n];
515
+ const uint32_t yp = psiyp [off + n];
516
+ const double v = psival[off + n];
471
517
  (*beta)[t - 1][yp] += v * tmp[y];
472
518
  n++;
473
519
  }
474
520
  }
475
521
  xvm_unit((*beta)[t - 1], (*beta)[t - 1], Y);
476
522
  }
477
- for (int t = 0; t < T; t++) {
523
+ for (uint32_t t = 0; t < T; t++) {
478
524
  double z = 0.0;
479
- for (size_t y = 0; y < Y; y++)
525
+ for (uint32_t y = 0; y < Y; y++)
480
526
  z += (*alpha)[t][y] * (*beta)[t][y];
481
527
  unorm[t] = 1.0 / z;
482
528
  bnorm[t] = scale[t] / z;
@@ -509,35 +555,35 @@ void grd_spfwdbwd(grd_t *grd, const seq_t *seq) {
509
555
  * vector but just adding the contribution of this sequence. This allows us to
510
556
  * easily compute the gradient over more than one sequence.
511
557
  */
512
- void grd_flupgrad(grd_t *grd, const seq_t *seq) {
513
- const mdl_t *mdl = grd->mdl;
514
- const size_t Y = mdl->nlbl;
515
- const int T = seq->len;
516
- const double (*psi )[T][Y][Y] = (void *)grd->psi;
517
- const double (*alpha)[T][Y] = (void *)grd->alpha;
518
- const double (*beta )[T][Y] = (void *)grd->beta;
519
- const double *unorm = grd->unorm;
520
- const double *bnorm = grd->bnorm;
521
- double *g = grd->g;
522
- for (int t = 0; t < T; t++) {
558
+ void grd_flupgrad(grd_st_t *grd_st, const seq_t *seq) {
559
+ const mdl_t *mdl = grd_st->mdl;
560
+ const uint32_t Y = mdl->nlbl;
561
+ const uint32_t T = seq->len;
562
+ const double (*psi )[T][Y][Y] = (void *)grd_st->psi;
563
+ const double (*alpha)[T][Y] = (void *)grd_st->alpha;
564
+ const double (*beta )[T][Y] = (void *)grd_st->beta;
565
+ const double *unorm = grd_st->unorm;
566
+ const double *bnorm = grd_st->bnorm;
567
+ double *g = grd_st->g;
568
+ for (uint32_t t = 0; t < T; t++) {
523
569
  const pos_t *pos = &(seq->pos[t]);
524
- for (size_t y = 0; y < Y; y++) {
570
+ for (uint32_t y = 0; y < Y; y++) {
525
571
  double e = (*alpha)[t][y] * (*beta)[t][y] * unorm[t];
526
- for (size_t n = 0; n < pos->ucnt; n++) {
527
- const size_t o = pos->uobs[n];
528
- g[mdl->uoff[o] + y] += e;
572
+ for (uint32_t n = 0; n < pos->ucnt; n++) {
573
+ const uint64_t o = pos->uobs[n];
574
+ atm_inc(g + mdl->uoff[o] + y, e);
529
575
  }
530
576
  }
531
577
  }
532
- for (int t = 1; t < T; t++) {
578
+ for (uint32_t t = 1; t < T; t++) {
533
579
  const pos_t *pos = &(seq->pos[t]);
534
- for (size_t yp = 0, d = 0; yp < Y; yp++) {
535
- for (size_t y = 0; y < Y; y++, d++) {
580
+ for (uint32_t yp = 0, d = 0; yp < Y; yp++) {
581
+ for (uint32_t y = 0; y < Y; y++, d++) {
536
582
  double e = (*alpha)[t - 1][yp] * (*beta)[t][y]
537
583
  * (*psi)[t][yp][y] * bnorm[t];
538
- for (size_t n = 0; n < pos->bcnt; n++) {
539
- const size_t o = pos->bobs[n];
540
- g[mdl->boff[o] + d] += e;
584
+ for (uint32_t n = 0; n < pos->bcnt; n++) {
585
+ const uint64_t o = pos->bobs[n];
586
+ atm_inc(g + mdl->boff[o] + d, e);
541
587
  }
542
588
  }
543
589
  }
@@ -552,55 +598,55 @@ void grd_flupgrad(grd_t *grd, const seq_t *seq) {
552
598
  * matrix. We first fill it with the unigram component and next multiply it
553
599
  * with the bigram one.
554
600
  */
555
- void grd_spupgrad(grd_t *grd, const seq_t *seq) {
556
- const mdl_t *mdl = grd->mdl;
557
- const size_t Y = mdl->nlbl;
558
- const int T = seq->len;
559
- const double (*psiuni)[T][Y] = (void *)grd->psiuni;
560
- const double *psival = grd->psi;
561
- const size_t *psiyp = grd->psiyp;
562
- const size_t (*psiidx)[T][Y] = (void *)grd->psiidx;
563
- const size_t *psioff = grd->psioff;
564
- const double (*alpha)[T][Y] = (void *)grd->alpha;
565
- const double (*beta )[T][Y] = (void *)grd->beta;
566
- const double *unorm = grd->unorm;
567
- const double *bnorm = grd->bnorm;
568
- double *g = grd->g;
569
- for (int t = 0; t < T; t++) {
601
+ void grd_spupgrad(grd_st_t *grd_st, const seq_t *seq) {
602
+ const mdl_t *mdl = grd_st->mdl;
603
+ const uint32_t Y = mdl->nlbl;
604
+ const uint32_t T = seq->len;
605
+ const double (*psiuni)[T][Y] = (void *)grd_st->psiuni;
606
+ const double *psival = grd_st->psi;
607
+ const uint32_t *psiyp = grd_st->psiyp;
608
+ const uint32_t (*psiidx)[T][Y] = (void *)grd_st->psiidx;
609
+ const uint32_t *psioff = grd_st->psioff;
610
+ const double (*alpha)[T][Y] = (void *)grd_st->alpha;
611
+ const double (*beta )[T][Y] = (void *)grd_st->beta;
612
+ const double *unorm = grd_st->unorm;
613
+ const double *bnorm = grd_st->bnorm;
614
+ double *g = grd_st->g;
615
+ for (uint32_t t = 0; t < T; t++) {
570
616
  const pos_t *pos = &(seq->pos[t]);
571
- for (size_t y = 0; y < Y; y++) {
617
+ for (uint32_t y = 0; y < Y; y++) {
572
618
  double e = (*alpha)[t][y] * (*beta)[t][y] * unorm[t];
573
- for (size_t n = 0; n < pos->ucnt; n++) {
574
- const size_t o = pos->uobs[n];
575
- g[mdl->uoff[o] + y] += e;
619
+ for (uint32_t n = 0; n < pos->ucnt; n++) {
620
+ const uint64_t o = pos->uobs[n];
621
+ atm_inc(g + mdl->uoff[o] + y, e);
576
622
  }
577
623
  }
578
624
  }
579
- for (int t = 1; t < T; t++) {
625
+ for (uint32_t t = 1; t < T; t++) {
580
626
  const pos_t *pos = &(seq->pos[t]);
581
627
  // We build the expectation matrix
582
628
  double e[Y][Y];
583
- for (size_t yp = 0; yp < Y; yp++)
584
- for (size_t y = 0; y < Y; y++)
629
+ for (uint32_t yp = 0; yp < Y; yp++)
630
+ for (uint32_t y = 0; y < Y; y++)
585
631
  e[yp][y] = (*alpha)[t - 1][yp] * (*beta)[t][y]
586
632
  * (*psiuni)[t][y] * bnorm[t];
587
- const size_t off = psioff[t];
588
- for (size_t n = 0, y = 0; n < (*psiidx)[t][Y - 1]; ) {
633
+ const uint32_t off = psioff[t];
634
+ for (uint32_t n = 0, y = 0; n < (*psiidx)[t][Y - 1]; ) {
589
635
  while (n >= (*psiidx)[t][y])
590
636
  y++;
591
637
  while (n < (*psiidx)[t][y]) {
592
- const size_t yp = psiyp [off + n];
593
- const double v = psival[off + n];
638
+ const uint32_t yp = psiyp [off + n];
639
+ const double v = psival[off + n];
594
640
  e[yp][y] += e[yp][y] * v;
595
641
  n++;
596
642
  }
597
643
  }
598
644
  // Add the expectation over the model distribution
599
- for (size_t yp = 0, d = 0; yp < Y; yp++) {
600
- for (size_t y = 0; y < Y; y++, d++) {
601
- for (size_t n = 0; n < pos->bcnt; n++) {
602
- const size_t o = pos->bobs[n];
603
- g[mdl->boff[o] + d] += e[yp][y];
645
+ for (uint32_t yp = 0, d = 0; yp < Y; yp++) {
646
+ for (uint32_t y = 0; y < Y; y++, d++) {
647
+ for (uint32_t n = 0; n < pos->bcnt; n++) {
648
+ const uint64_t o = pos->bobs[n];
649
+ atm_inc(g + mdl->boff[o] + d, e[yp][y]);
604
650
  }
605
651
  }
606
652
  }
@@ -612,24 +658,24 @@ void grd_spupgrad(grd_t *grd, const seq_t *seq) {
612
658
  * distribution. This is the second step of the gradient computation shared
613
659
  * by the non-sparse and sparse version.
614
660
  */
615
- void grd_subemp(grd_t *grd, const seq_t *seq) {
616
- const mdl_t *mdl = grd->mdl;
617
- const size_t Y = mdl->nlbl;
618
- const int T = seq->len;
619
- double *g = grd->g;
620
- for (int t = 0; t < T; t++) {
661
+ void grd_subemp(grd_st_t *grd_st, const seq_t *seq) {
662
+ const mdl_t *mdl = grd_st->mdl;
663
+ const uint32_t Y = mdl->nlbl;
664
+ const uint32_t T = seq->len;
665
+ double *g = grd_st->g;
666
+ for (uint32_t t = 0; t < T; t++) {
621
667
  const pos_t *pos = &(seq->pos[t]);
622
- const size_t y = seq->pos[t].lbl;
623
- for (size_t n = 0; n < pos->ucnt; n++)
624
- g[mdl->uoff[pos->uobs[n]] + y] -= 1.0;
668
+ const uint32_t y = seq->pos[t].lbl;
669
+ for (uint32_t n = 0; n < pos->ucnt; n++)
670
+ atm_inc(g + mdl->uoff[pos->uobs[n]] + y, -1.0);
625
671
  }
626
- for (int t = 1; t < T; t++) {
672
+ for (uint32_t t = 1; t < T; t++) {
627
673
  const pos_t *pos = &(seq->pos[t]);
628
- const size_t yp = seq->pos[t - 1].lbl;
629
- const size_t y = seq->pos[t ].lbl;
630
- const size_t d = yp * Y + y;
631
- for (size_t n = 0; n < pos->bcnt; n++)
632
- g[mdl->boff[pos->bobs[n]] + d] -= 1.0;
674
+ const uint32_t yp = seq->pos[t - 1].lbl;
675
+ const uint32_t y = seq->pos[t ].lbl;
676
+ const uint32_t d = yp * Y + y;
677
+ for (uint32_t n = 0; n < pos->bcnt; n++)
678
+ atm_inc(g + mdl->boff[pos->bobs[n]] + d, -1.0);
633
679
  }
634
680
  }
635
681
 
@@ -655,38 +701,38 @@ void grd_subemp(grd_t *grd, const seq_t *seq) {
655
701
  * weights will be non-zero only for observations present in the sequence, we
656
702
  * sum only over these ones.
657
703
  */
658
- void grd_logloss(grd_t *grd, const seq_t *seq) {
659
- const mdl_t *mdl = grd->mdl;
660
- const double *x = mdl->theta;
661
- const size_t Y = mdl->nlbl;
662
- const int T = seq->len;
663
- const double (*alpha)[T][Y] = (void *)grd->alpha;
664
- const double *scale = grd->scale;
704
+ void grd_logloss(grd_st_t *grd_st, const seq_t *seq) {
705
+ const mdl_t *mdl = grd_st->mdl;
706
+ const double *x = mdl->theta;
707
+ const uint32_t Y = mdl->nlbl;
708
+ const uint32_t T = seq->len;
709
+ const double (*alpha)[T][Y] = (void *)grd_st->alpha;
710
+ const double *scale = grd_st->scale;
665
711
  double logz = 0.0;
666
- for (size_t y = 0; y < Y; y++)
712
+ for (uint32_t y = 0; y < Y; y++)
667
713
  logz += (*alpha)[T - 1][y];
668
714
  logz = log(logz);
669
- for (int t = 0; t < T; t++)
715
+ for (uint32_t t = 0; t < T; t++)
670
716
  logz -= log(scale[t]);
671
717
  double lloss = logz;
672
- for (int t = 0; t < T; t++) {
718
+ for (uint32_t t = 0; t < T; t++) {
673
719
  const pos_t *pos = &(seq->pos[t]);
674
- const size_t y = seq->pos[t].lbl;
675
- for (size_t n = 0; n < pos->ucnt; n++)
720
+ const uint32_t y = seq->pos[t].lbl;
721
+ for (uint32_t n = 0; n < pos->ucnt; n++)
676
722
  lloss -= x[mdl->uoff[pos->uobs[n]] + y];
677
723
  }
678
- for (int t = 1; t < T; t++) {
724
+ for (uint32_t t = 1; t < T; t++) {
679
725
  const pos_t *pos = &(seq->pos[t]);
680
- const size_t yp = seq->pos[t - 1].lbl;
681
- const size_t y = seq->pos[t ].lbl;
682
- const size_t d = yp * Y + y;
683
- for (size_t n = 0; n < pos->bcnt; n++)
726
+ const uint32_t yp = seq->pos[t - 1].lbl;
727
+ const uint32_t y = seq->pos[t ].lbl;
728
+ const uint32_t d = yp * Y + y;
729
+ for (uint32_t n = 0; n < pos->bcnt; n++)
684
730
  lloss -= x[mdl->boff[pos->bobs[n]] + d];
685
731
  }
686
- grd->lloss += lloss;
732
+ grd_st->lloss += lloss;
687
733
  }
688
734
 
689
- /* grd_doseq:
735
+ /* grd_docrf:
690
736
  * This function computes the gradient and value of the negative log-likelihood
691
737
  * of the model over a single training sequence.
692
738
  *
@@ -694,21 +740,21 @@ void grd_logloss(grd_t *grd, const seq_t *seq) {
694
740
  * just accumulate the values for the given sequence in it. This allows us to
695
741
  * easily compute the gradient over a set of sequences.
696
742
  */
697
- void grd_doseq(grd_t *grd, const seq_t *seq) {
698
- const mdl_t *mdl = grd->mdl;
699
- grd->first = 0;
700
- grd->last = seq->len - 1;
743
+ void grd_docrf(grd_st_t *grd_st, const seq_t *seq) {
744
+ const mdl_t *mdl = grd_st->mdl;
745
+ grd_st->first = 0;
746
+ grd_st->last = seq->len - 1;
701
747
  if (!mdl->opt->sparse) {
702
- grd_fldopsi(grd, seq);
703
- grd_flfwdbwd(grd, seq);
704
- grd_flupgrad(grd, seq);
748
+ grd_fldopsi(grd_st, seq);
749
+ grd_flfwdbwd(grd_st, seq);
750
+ grd_flupgrad(grd_st, seq);
705
751
  } else {
706
- grd_spdopsi(grd, seq);
707
- grd_spfwdbwd(grd, seq);
708
- grd_spupgrad(grd, seq);
752
+ grd_spdopsi(grd_st, seq);
753
+ grd_spfwdbwd(grd_st, seq);
754
+ grd_spupgrad(grd_st, seq);
709
755
  }
710
- grd_subemp(grd, seq);
711
- grd_logloss(grd, seq);
756
+ grd_subemp(grd_st, seq);
757
+ grd_logloss(grd_st, seq);
712
758
  }
713
759
 
714
760
  /******************************************************************************
@@ -731,16 +777,130 @@ void grd_doseq(grd_t *grd, const seq_t *seq) {
731
777
  * cores, or to more thread than you have memory to hold vectors.
732
778
  ******************************************************************************/
733
779
 
780
+ /* grd_stcheck:
781
+ * Check that enough memory is allocated in the gradient object so that the
782
+ * linear-chain codepath can be computed for a sequence of the given length.
783
+ */
784
+ void grd_stcheck(grd_st_t *grd_st, uint32_t len) {
785
+ // Check if user ask for clearing the state tracker or if he requested a
786
+ // bigger tracker. In this case we have to free the previously allocated
787
+ // memory.
788
+ if (len == 0 || (len > grd_st->len && grd_st->len != 0)) {
789
+ if (grd_st->mdl->opt->sparse) {
790
+ xvm_free(grd_st->psiuni); grd_st->psiuni = NULL;
791
+ free(grd_st->psiyp); grd_st->psiyp = NULL;
792
+ free(grd_st->psiidx); grd_st->psiidx = NULL;
793
+ free(grd_st->psioff); grd_st->psioff = NULL;
794
+ }
795
+ xvm_free(grd_st->psi); grd_st->psi = NULL;
796
+ xvm_free(grd_st->alpha); grd_st->alpha = NULL;
797
+ xvm_free(grd_st->beta); grd_st->beta = NULL;
798
+ xvm_free(grd_st->unorm); grd_st->unorm = NULL;
799
+ xvm_free(grd_st->bnorm); grd_st->bnorm = NULL;
800
+ xvm_free(grd_st->scale); grd_st->scale = NULL;
801
+ grd_st->len = 0;
802
+ }
803
+ if (len == 0 || len <= grd_st->len)
804
+ return;
805
+ // If we are here, we have to allocate a new state. This is simple, we
806
+ // just have to take care of the special case for sparse mode.
807
+ const uint32_t Y = grd_st->mdl->nlbl;
808
+ const uint32_t T = len;
809
+ grd_st->psi = xvm_new(T * Y * Y);
810
+ grd_st->alpha = xvm_new(T * Y);
811
+ grd_st->beta = xvm_new(T * Y);
812
+ grd_st->scale = xvm_new(T);
813
+ grd_st->unorm = xvm_new(T);
814
+ grd_st->bnorm = xvm_new(T);
815
+ if (grd_st->mdl->opt->sparse) {
816
+ grd_st->psiuni = xvm_new(T * Y);
817
+ grd_st->psiyp = wapiti_xmalloc(sizeof(uint32_t) * T * Y * Y);
818
+ grd_st->psiidx = wapiti_xmalloc(sizeof(uint32_t) * T * Y);
819
+ grd_st->psioff = wapiti_xmalloc(sizeof(uint32_t) * T);
820
+ }
821
+ grd_st->len = len;
822
+ }
823
+
824
+ /* grd_stnew:
825
+ * Allocate memory for gradient computation state. This allocates memory for
826
+ * the longest sequence present in the data set.
827
+ */
828
+ grd_st_t *grd_stnew(mdl_t *mdl, double *g) {
829
+ grd_st_t *grd_st = wapiti_xmalloc(sizeof(grd_st_t));
830
+ grd_st->mdl = mdl;
831
+ grd_st->len = 0;
832
+ grd_st->g = g;
833
+ grd_st->psi = NULL;
834
+ grd_st->psiuni = NULL;
835
+ grd_st->psiyp = NULL;
836
+ grd_st->psiidx = NULL;
837
+ grd_st->psioff = NULL;
838
+ grd_st->alpha = NULL;
839
+ grd_st->beta = NULL;
840
+ grd_st->unorm = NULL;
841
+ grd_st->bnorm = NULL;
842
+ grd_st->scale = NULL;
843
+ return grd_st;
844
+ }
845
+
846
+ /* grd_stfree:
847
+ * Free all memory used by gradient computation.
848
+ */
849
+ void grd_stfree(grd_st_t *grd_st) {
850
+ grd_stcheck(grd_st, 0);
851
+ free(grd_st);
852
+ }
853
+
734
854
  /* grd_dospl:
735
855
  * Compute the gradient of a single sample choosing between the maxent
736
856
  * optimised codepath and the classical one depending on the sample.
737
857
  */
738
- void grd_dospl(grd_t *grd, const seq_t *seq) {
739
- grd_check(grd, seq->len);
740
- if (seq->len == 1 || grd->mdl->reader->nbi == 0)
741
- grd_dosingle(grd, seq);
858
+ void grd_dospl(grd_st_t *grd_st, const seq_t *seq) {
859
+ grd_stcheck(grd_st, seq->len);
860
+ rdr_t *rdr = grd_st->mdl->reader;
861
+ if (seq->len == 1 || (rdr->npats != 0 && rdr->nbi == 0))
862
+ grd_domaxent(grd_st, seq);
863
+ else if (grd_st->mdl->type == 0)
864
+ grd_domaxent(grd_st, seq);
865
+ else if (grd_st->mdl->type == 1)
866
+ grd_domemm(grd_st, seq);
742
867
  else
743
- grd_doseq(grd, seq);
868
+ grd_docrf(grd_st, seq);
869
+ }
870
+
871
+ /* grd_new:
872
+ * Allocate a new parallel gradient computer. Return a grd_t object that can
873
+ * compute gradient over the full data set and store it in the vector <g>.
874
+ */
875
+ grd_t *grd_new(mdl_t *mdl, double *g) {
876
+ const uint32_t W = mdl->opt->nthread;
877
+ grd_t *grd = wapiti_xmalloc(sizeof(grd_t));
878
+ grd->mdl = mdl;
879
+ grd->grd_st = wapiti_xmalloc(sizeof(grd_st_t *) * W);
880
+ #ifdef ATM_ANSI
881
+ grd->grd_st[0] = grd_stnew(mdl, g);
882
+ for (uint32_t w = 1; w < W; w++)
883
+ grd->grd_st[w] = grd_stnew(mdl, xvm_new(mdl->nftr));
884
+ #else
885
+ for (uint32_t w = 0; w < W; w++)
886
+ grd->grd_st[w] = grd_stnew(mdl, g);
887
+ #endif
888
+ return grd;
889
+ }
890
+
891
+ /* grd_free:
892
+ * Free all memory allocated for the given gradient computer object.
893
+ */
894
+ void grd_free(grd_t *grd) {
895
+ const uint32_t W = grd->mdl->opt->nthread;
896
+ #ifdef ATM_ANSI
897
+ for (uint32_t w = 1; w < W; w++)
898
+ xvm_free(grd->grd_st[w]->g);
899
+ #endif
900
+ for (uint32_t w = 0; w < W; w++)
901
+ grd_stfree(grd->grd_st[w]);
902
+ free(grd->grd_st);
903
+ free(grd);
744
904
  }
745
905
 
746
906
  /* grd_worker:
@@ -748,22 +908,25 @@ void grd_dospl(grd_t *grd, const seq_t *seq) {
748
908
  * training set. It is meant to be called by the thread spawner in order to
749
909
  * compute the gradient over the full training set.
750
910
  */
751
- static void grd_worker(job_t *job, int id, int cnt, grd_t *grd) {
911
+ static
912
+ void grd_worker(job_t *job, uint32_t id, uint32_t cnt, grd_st_t *grd_st) {
752
913
  unused(id && cnt);
753
- mdl_t *mdl = grd->mdl;
914
+ mdl_t *mdl = grd_st->mdl;
754
915
  const dat_t *dat = mdl->train;
755
- const size_t F = mdl->nftr;
756
916
  // We first clean up the gradient and value as our parent doesn't do it (it
757
917
  // is better to do this also in parallel)
758
- grd->lloss = 0.0;
759
- for (size_t f = 0; f < F; f++)
760
- grd->g[f] = 0.0;
918
+ grd_st->lloss = 0.0;
919
+ #ifdef ATM_ANSI
920
+ const uint64_t F = mdl->nftr;
921
+ for (uint64_t f = 0; f < F; f++)
922
+ grd_st->g[f] = 0.0;
923
+ #endif
761
924
  // Now all is ready, we can process our sequences and accumulate the
762
925
  // gradient and inverse log-likelihood
763
- size_t count, pos;
926
+ uint32_t count, pos;
764
927
  while (mth_getjob(job, &count, &pos)) {
765
- for (size_t s = pos; !uit_stop && s < pos + count; s++)
766
- grd_dospl(grd, dat->seq[s]);
928
+ for (uint32_t s = pos; !uit_stop && s < pos + count; s++)
929
+ grd_dospl(grd_st, dat->seq[s]);
767
930
  if (uit_stop)
768
931
  break;
769
932
  }
@@ -775,30 +938,38 @@ static void grd_worker(job_t *job, int id, int cnt, grd_t *grd) {
775
938
  * the fact that the gradient over the full training set is just the sum of
776
939
  * the gradient of each sequence.
777
940
  */
778
- double grd_gradient(mdl_t *mdl, double *g, grd_t *grds[]) {
779
- const double *x = mdl->theta;
780
- const size_t F = mdl->nftr;
781
- const size_t W = mdl->opt->nthread;
941
+ double grd_gradient(grd_t *grd) {
942
+ mdl_t *mdl = grd->mdl;
943
+ const double *x = mdl->theta;
944
+ const uint64_t F = mdl->nftr;
945
+ const uint32_t W = mdl->opt->nthread;
946
+ double *g = grd->grd_st[0]->g;
947
+ #ifndef ATM_ANSI
948
+ for (uint64_t f = 0; f < F; f++)
949
+ g[f] = 0.0;
950
+ #endif
782
951
  // All is ready to compute the gradient, we spawn the threads of
783
952
  // workers, each one working on a part of the data. As the gradient and
784
953
  // log-likelihood are additive, computing the final values will be
785
954
  // trivial.
786
- mth_spawn((func_t *)grd_worker, W, (void **)grds, mdl->train->nseq,
787
- mdl->opt->jobsize);
955
+ mth_spawn((func_t *)grd_worker, W, (void **)grd->grd_st,
956
+ mdl->train->nseq, mdl->opt->jobsize);
788
957
  if (uit_stop)
789
958
  return -1.0;
790
959
  // All computations are done, it just remain to add all the gradients
791
- // and inverse log-likelihood from all the workers.
792
- double fx = grds[0]->lloss;
793
- for (size_t w = 1; w < W; w++) {
794
- for (size_t f = 0; f < F; f++)
795
- g[f] += grds[w]->g[f];
796
- fx += grds[w]->lloss;
797
- }
960
+ // and negative log-likelihood from all the workers.
961
+ double fx = grd->grd_st[0]->lloss;
962
+ for (uint32_t w = 1; w < W; w++)
963
+ fx += grd->grd_st[w]->lloss;
964
+ #ifdef ATM_ANSI
965
+ for (uint32_t w = 1; w < W; w++)
966
+ for (uint64_t f = 0; f < F; f++)
967
+ g[f] += grd->grd_st[w]->g[f];
968
+ #endif
798
969
  // If needed we clip the gradient: setting to 0.0 all coordinates where
799
970
  // the function is 0.0.
800
971
  if (mdl->opt->lbfgs.clip == true)
801
- for (size_t f = 0; f < F; f++)
972
+ for (uint64_t f = 0; f < F; f++)
802
973
  if (x[f] == 0.0)
803
974
  g[f] = 0.0;
804
975
  // Now we can apply the elastic-net penalty. Depending on the values of
@@ -806,7 +977,7 @@ double grd_gradient(mdl_t *mdl, double *g, grd_t *grds[]) {
806
977
  const double rho1 = mdl->opt->rho1;
807
978
  const double rho2 = mdl->opt->rho2;
808
979
  double nl1 = 0.0, nl2 = 0.0;
809
- for (size_t f = 0; f < F; f++) {
980
+ for (uint64_t f = 0; f < F; f++) {
810
981
  const double v = x[f];
811
982
  g[f] += rho2 * v;
812
983
  nl1 += fabs(v);