wapiti 0.0.5 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53) hide show
  1. checksums.yaml +7 -0
  2. data/.simplecov +3 -0
  3. data/Gemfile +25 -2
  4. data/HISTORY.md +5 -1
  5. data/LICENSE +14 -13
  6. data/README.md +9 -16
  7. data/Rakefile +38 -8
  8. data/ext/wapiti/bcd.c +126 -124
  9. data/ext/wapiti/decoder.c +203 -124
  10. data/ext/wapiti/decoder.h +6 -4
  11. data/ext/wapiti/extconf.rb +2 -2
  12. data/ext/wapiti/gradient.c +491 -320
  13. data/ext/wapiti/gradient.h +52 -34
  14. data/ext/wapiti/lbfgs.c +74 -33
  15. data/ext/wapiti/model.c +47 -37
  16. data/ext/wapiti/model.h +22 -20
  17. data/ext/wapiti/native.c +850 -839
  18. data/ext/wapiti/native.h +1 -1
  19. data/ext/wapiti/options.c +52 -20
  20. data/ext/wapiti/options.h +37 -30
  21. data/ext/wapiti/pattern.c +35 -33
  22. data/ext/wapiti/pattern.h +12 -11
  23. data/ext/wapiti/progress.c +14 -13
  24. data/ext/wapiti/progress.h +3 -2
  25. data/ext/wapiti/quark.c +14 -16
  26. data/ext/wapiti/quark.h +6 -5
  27. data/ext/wapiti/reader.c +83 -69
  28. data/ext/wapiti/reader.h +11 -9
  29. data/ext/wapiti/rprop.c +84 -43
  30. data/ext/wapiti/sequence.h +18 -16
  31. data/ext/wapiti/sgdl1.c +45 -43
  32. data/ext/wapiti/thread.c +19 -17
  33. data/ext/wapiti/thread.h +5 -4
  34. data/ext/wapiti/tools.c +7 -7
  35. data/ext/wapiti/tools.h +3 -4
  36. data/ext/wapiti/trainers.h +1 -1
  37. data/ext/wapiti/vmath.c +40 -38
  38. data/ext/wapiti/vmath.h +12 -11
  39. data/ext/wapiti/wapiti.c +159 -37
  40. data/ext/wapiti/wapiti.h +18 -4
  41. data/lib/wapiti.rb +15 -15
  42. data/lib/wapiti/errors.rb +15 -15
  43. data/lib/wapiti/model.rb +92 -84
  44. data/lib/wapiti/options.rb +123 -124
  45. data/lib/wapiti/utility.rb +14 -14
  46. data/lib/wapiti/version.rb +2 -2
  47. data/spec/spec_helper.rb +29 -9
  48. data/spec/wapiti/model_spec.rb +230 -194
  49. data/spec/wapiti/native_spec.rb +7 -8
  50. data/spec/wapiti/options_spec.rb +184 -174
  51. data/wapiti.gemspec +22 -8
  52. metadata +38 -42
  53. data/.gitignore +0 -5
@@ -1,7 +1,7 @@
1
1
  /*
2
2
  * Wapiti - A linear-chain CRF tool
3
3
  *
4
- * Copyright (c) 2009-2011 CNRS
4
+ * Copyright (c) 2009-2013 CNRS
5
5
  * All rights reserved.
6
6
  *
7
7
  * Redistribution and use in source and binary forms, with or without
@@ -29,6 +29,7 @@
29
29
  #define decoder_h
30
30
 
31
31
  #include <stddef.h>
32
+ #include <stdint.h>
32
33
  #include <stdio.h>
33
34
 
34
35
  #include "wapiti.h"
@@ -36,11 +37,12 @@
36
37
  #include "sequence.h"
37
38
 
38
39
  void tag_viterbi(mdl_t *mdl, const seq_t *seq,
39
- size_t out[], double *sc, double psc[]);
40
- void tag_nbviterbi(mdl_t *mdl, const seq_t *seq, size_t N,
41
- size_t out[][N], double sc[], double psc[][N]);
40
+ uint32_t out[], double *sc, double psc[]);
41
+ void tag_nbviterbi(mdl_t *mdl, const seq_t *seq, uint32_t N,
42
+ uint32_t out[][N], double sc[], double psc[][N]);
42
43
 
43
44
  void tag_label(mdl_t *mdl, FILE *fin, FILE *fout);
44
45
  void tag_eval(mdl_t *mdl, double *te, double *se);
46
+
45
47
  #endif
46
48
 
@@ -1,8 +1,8 @@
1
1
  require 'mkmf'
2
2
 
3
- $CFLAGS << %q{ -std=c99 -O3 -Wall -ggdb }
3
+ $CFLAGS << %q{ -std=c99 -W -Wall -Wno-declaration-after-statement -O3 }
4
4
 
5
5
  have_library('pthread')
6
6
  have_library('m')
7
7
 
8
- create_makefile('wapiti/native')
8
+ create_makefile('wapiti/native')
@@ -1,7 +1,7 @@
1
1
  /*
2
2
  * Wapiti - A linear-chain CRF tool
3
3
  *
4
- * Copyright (c) 2009-2011 CNRS
4
+ * Copyright (c) 2009-2013 CNRS
5
5
  * All rights reserved.
6
6
  *
7
7
  * Redistribution and use in source and binary forms, with or without
@@ -26,8 +26,9 @@
26
26
  */
27
27
  #include <math.h>
28
28
  #include <stddef.h>
29
- #include <stdlib.h>
29
+ #include <stdint.h>
30
30
  #include <stdio.h>
31
+ #include <stdlib.h>
31
32
  #include <string.h>
32
33
 
33
34
  #include "wapiti.h"
@@ -40,41 +41,71 @@
40
41
  #include "thread.h"
41
42
  #include "vmath.h"
42
43
 
44
+ /* atm_inc:
45
+ * Atomically increment the value pointed to by [value] by [inc]. If ATM_ANSI is
46
+ * defined this is NOT atomic at all, so the caller must deal with this.
47
+ */
48
+ #ifdef ATM_ANSI
49
+ static inline
50
+ void atm_inc(double *value, double inc) {
51
+ *value += inc;
52
+ }
53
+ #else
54
+ static inline
55
+ void atm_inc(volatile double *value, double inc) {
56
+ while (1) {
57
+ volatile union {
58
+ double d;
59
+ uint64_t u;
60
+ } old, new;
61
+ old.d = *value;
62
+ new.d = old.d + inc;
63
+ uint64_t *ptr = (uint64_t *)value;
64
+ if (__sync_bool_compare_and_swap(ptr, old.u, new.u))
65
+ break;
66
+ }
67
+ }
68
+ #endif
69
+
43
70
  /******************************************************************************
44
- * Maxent optimized gradient computation
71
+ * Maxent gradient computation
45
72
  *
46
- * Maxent or maximum entropy models are a specific case of CRF where the
47
- * output graph is reduced to a single node. In this specific case, the
48
- * computation of the gradient can be simplified a lot as it is done in this
49
- * part of the code.
73
+ * Maxent or maximum entropy models are multi-class logistic regression (see
74
+ * [1]). They can be viewed as a special class of CRF models where there
75
+ * are no dependencies between the output labels. This means that the
76
+ * normalization is local to each node and can be done a lot more efficiently,
77
+ * as we do not have to perform the forward-backward procedure.
50
78
  *
51
- * This code will be used to compute gradient for sequences of length one and
52
- * without actives bigrams features. All other case are handled by the next
53
- * section.
79
+ * This code is used both when the maxent type of model is used and in other
80
+ * modes if the sequence length is one or if there are no bigram features.
81
+ *
82
+ * [1] A maximum entropy approach to natural language processing, A. Berger
83
+ * and S. Della Pietra and V. Della Pietra, Computational Linguistics,
84
+ * (22-1), March 1996.
54
85
  ******************************************************************************/
55
- void grd_dosingle(grd_t *grd, const seq_t *seq) {
56
- const mdl_t *mdl = grd->mdl;
57
- const double *x = mdl->theta;
58
- const int T = seq->len;
59
- const size_t Y = mdl->nlbl;
60
- double *psi = grd->psi;
61
- double *g = grd->g;
62
- for (int t = 0; t < T; t++) {
86
+ void grd_domaxent(grd_st_t *grd_st, const seq_t *seq) {
87
+ const mdl_t *mdl = grd_st->mdl;
88
+ const double *x = mdl->theta;
89
+ const uint32_t T = seq->len;
90
+ const uint32_t Y = mdl->nlbl;
91
+ double *psi = grd_st->psi;
92
+ double *g = grd_st->g;
93
+ for (uint32_t t = 0; t < T; t++) {
63
94
  const pos_t *pos = &(seq->pos[t]);
64
95
  // We first compute for each Y the sum of weights of all
65
96
  // active features in the sample:
66
97
  // Ψ(y,x^i) = \exp( ∑_k θ_k f_k(y,x^i) )
67
98
  // Z_θ(x^i) = ∑_y Ψ(y,x^i)
68
99
  double Z = 0.0;
69
- for (size_t y = 0; y < Y; y++)
100
+ for (uint32_t y = 0; y < Y; y++)
70
101
  psi[y] = 0.0;
71
- for (size_t n = 0; n < pos->ucnt; n++) {
102
+ for (uint32_t n = 0; n < pos->ucnt; n++) {
72
103
  const double *wgh = x + mdl->uoff[pos->uobs[n]];
73
- for (size_t y = 0; y < Y; y++)
104
+ for (uint32_t y = 0; y < Y; y++)
74
105
  psi[y] += wgh[y];
75
106
  }
76
107
  double lloss = psi[pos->lbl];
77
- for (size_t y = 0; y < Y; y++) {
108
+ for (uint32_t y = 0; y < Y; y++) {
78
109
  psi[y] = (psi[y] == 0.0) ? 1.0 : exp(psi[y]);
79
110
  Z += psi[y];
80
111
  }
@@ -85,22 +116,111 @@ void grd_dosingle(grd_t *grd, const seq_t *seq) {
85
116
  // E_{q_θ}(x,y) - E_{p}(x,y)
86
117
  // and we can compute the expectation over the model with:
87
118
  // E_{q_θ}(x,y) = f_k(y,x^i) * ψ(y,x) / Z_θ(x)
88
- for (size_t y = 0; y < Y; y++)
119
+ for (uint32_t y = 0; y < Y; y++)
89
120
  psi[y] /= Z;
90
- for (size_t n = 0; n < pos->ucnt; n++) {
121
+ for (uint32_t n = 0; n < pos->ucnt; n++) {
91
122
  double *grd = g + mdl->uoff[pos->uobs[n]];
92
- for (size_t y = 0; y < Y; y++)
93
- grd[y] += psi[y];
94
- grd[pos->lbl] -= 1.0;
123
+ for (uint32_t y = 0; y < Y; y++)
124
+ atm_inc(grd + y, psi[y]);
125
+ atm_inc(grd + pos->lbl, -1.0);
95
126
  }
96
127
  // And finally the log-likelihood with:
97
128
  // L_θ(x^i,y^i) = log(Z_θ(x^i)) - log(ψ(y^i,x^i))
98
- grd->lloss += log(Z) - lloss;
129
+ grd_st->lloss += log(Z) - lloss;
99
130
  }
100
131
  }
101
132
 
102
133
  /******************************************************************************
103
- * Single sequence gradient computation
134
+ * Maximum entropy markov model gradient computation
135
+ *
136
+ * Maximum entropy Markov models are similar to linear-chain CRFs but with
137
+ * local normalization instead of global normalization (see [2]). This change
138
+ * makes the computation a lot simpler, as at training time the gradient
139
+ * can be computed similarly to the maxent case with the previous output
140
+ * label observed.
141
+ *
142
+ * This means that for bigram features we only have to consider the reference
143
+ * label at the previous position instead of all possible labels, so we don't
144
+ * have to perform the forward-backward procedure. Bigram features are handled
145
+ * in the same way as unigram features.
146
+ *
147
+ * [2] Maximum Entropy Markov Models for Information Extraction and
148
+ * Segmentation, A. McCallum and D. Freitag and F. Pereira, 2000,
149
+ * Proceedings of ICML 2000 , 591–598. Stanford, California.
150
+ ******************************************************************************/
151
+ void grd_domemm(grd_st_t *grd_st, const seq_t *seq) {
152
+ const mdl_t *mdl = grd_st->mdl;
153
+ const double *x = mdl->theta;
154
+ const uint32_t T = seq->len;
155
+ const uint32_t Y = mdl->nlbl;
156
+ double *psi = grd_st->psi;
157
+ double *g = grd_st->g;
158
+ for (uint32_t t = 0; t < T; t++) {
159
+ const pos_t *pos = &(seq->pos[t]);
160
+ // We first compute for each Y the sum of weights of all
161
+ // active features in the sample:
162
+ // Ψ(y,x^i) = \exp( ∑_k θ_k f_k(y_t-1, y,x^i) )
163
+ // Z_θ(x^i) = ∑_y Ψ(y,x^i)
164
+ // Bigram features rely on the gold label at previous position
165
+ // for the markov dependency unlike in CRFs.
166
+ double Z = 0.0;
167
+ for (uint32_t y = 0; y < Y; y++)
168
+ psi[y] = 0.0;
169
+ for (uint32_t n = 0; n < pos->ucnt; n++) {
170
+ const double *wgh = x + mdl->uoff[pos->uobs[n]];
171
+ for (uint32_t y = 0; y < Y; y++)
172
+ psi[y] += wgh[y];
173
+ }
174
+ if (t != 0) {
175
+ const uint32_t yp = seq->pos[t - 1].lbl;
176
+ const uint32_t d = yp * Y;
177
+ for (uint32_t y = 0; y < Y; y++) {
178
+ double sum = 0.0;
179
+ for (uint32_t n = 0; n < pos->bcnt; n++) {
180
+ const uint64_t o = pos->bobs[n];
181
+ sum += x[mdl->boff[o] + d + y];
182
+ }
183
+ psi[y] += sum;
184
+ }
185
+ }
186
+ double lloss = psi[pos->lbl];
187
+ for (uint32_t y = 0; y < Y; y++) {
188
+ psi[y] = (psi[y] == 0.0) ? 1.0 : exp(psi[y]);
189
+ Z += psi[y];
190
+ }
191
+ // Now, we can compute the gradient update, for each active
192
+ // feature in the sample the update is the expectation over the
193
+ // current model minus the expectation over the observed
194
+ // distribution:
195
+ // E_{q_θ}(x,y) - E_{p}(x,y)
196
+ // and we can compute the expectation over the model with:
197
+ // E_{q_θ}(x,y) = f_k(y_t-1, y,x^i) * ψ(y,x) / Z_θ(x)
198
+ for (uint32_t y = 0; y < Y; y++)
199
+ psi[y] /= Z;
200
+ for (uint32_t n = 0; n < pos->ucnt; n++) {
201
+ double *grd = g + mdl->uoff[pos->uobs[n]];
202
+ for (uint32_t y = 0; y < Y; y++)
203
+ atm_inc(grd + y, psi[y]);
204
+ atm_inc(grd + pos->lbl, -1.0);
205
+ }
206
+ if (t != 0) {
207
+ const uint32_t yp = seq->pos[t - 1].lbl;
208
+ const uint32_t d = yp * Y;
209
+ for (uint32_t n = 0; n < pos->bcnt; n++) {
210
+ double *grd = g + mdl->boff[pos->bobs[n]] + d;
211
+ for (uint32_t y = 0; y < Y; y++)
212
+ atm_inc(grd + y, psi[y]);
213
+ atm_inc(grd + pos->lbl, -1.0);
214
+ }
215
+ }
216
+ // And finally the log-likelihood with:
217
+ // L_θ(x^i,y^i) = log(Z_θ(x^i)) - log(ψ(y^i,x^i))
218
+ grd_st->lloss += log(Z) - lloss;
219
+ }
220
+ }
221
+
222
+ /******************************************************************************
223
+ * Linear-chain CRF gradient computation
104
224
  *
105
225
  * This section is responsible for computing the gradient of the
106
226
  * log-likelihood function to optimize over a single sequence.
@@ -140,80 +260,6 @@ void grd_dosingle(grd_t *grd, const seq_t *seq) {
140
260
  * the worst case use as less as possible memory.
141
261
  ******************************************************************************/
142
262
 
143
- /* grd_check:
144
- * Check that enough memory is allocated in the gradient object so that the
145
- * linear-chain codepath can be computed for a sequence of the given length.
146
- */
147
- void grd_check(grd_t *grd, int len) {
148
- // Check if user ask for clearing the state tracker or if he requested a
149
- // bigger tracker. In this case we have to free the previous allocated
150
- // memory.
151
- if (len == 0 || (len > grd->len && grd->len != 0)) {
152
- if (grd->mdl->opt->sparse) {
153
- xvm_free(grd->psiuni); grd->psiuni = NULL;
154
- free(grd->psiyp); grd->psiyp = NULL;
155
- free(grd->psiidx); grd->psiidx = NULL;
156
- free(grd->psioff); grd->psioff = NULL;
157
- }
158
- xvm_free(grd->psi); grd->psi = NULL;
159
- xvm_free(grd->alpha); grd->alpha = NULL;
160
- xvm_free(grd->beta); grd->beta = NULL;
161
- xvm_free(grd->unorm); grd->unorm = NULL;
162
- xvm_free(grd->bnorm); grd->bnorm = NULL;
163
- xvm_free(grd->scale); grd->scale = NULL;
164
- grd->len = 0;
165
- }
166
- if (len == 0 || len <= grd->len)
167
- return;
168
- // If we are here, we have to allocate a new state. This is simple, we
169
- // just have to take care of the special case for sparse mode.
170
- const size_t Y = grd->mdl->nlbl;
171
- const int T = len;
172
- grd->psi = xvm_new(T * Y * Y);
173
- grd->alpha = xvm_new(T * Y);
174
- grd->beta = xvm_new(T * Y);
175
- grd->scale = xvm_new(T);
176
- grd->unorm = xvm_new(T);
177
- grd->bnorm = xvm_new(T);
178
- if (grd->mdl->opt->sparse) {
179
- grd->psiuni = xvm_new(T * Y);
180
- grd->psiyp = wapiti_xmalloc(sizeof(size_t) * T * Y * Y);
181
- grd->psiidx = wapiti_xmalloc(sizeof(size_t) * T * Y);
182
- grd->psioff = wapiti_xmalloc(sizeof(size_t) * T);
183
- }
184
- grd->len = len;
185
- }
186
-
187
- /* grd_new:
188
- * Allocation memory for gradient computation state. This allocate memory for
189
- * the longest sequence present in the data set.
190
- */
191
- grd_t *grd_new(mdl_t *mdl, double *g) {
192
- grd_t *grd = wapiti_xmalloc(sizeof(grd_t));
193
- grd->mdl = mdl;
194
- grd->len = 0;
195
- grd->g = g;
196
- grd->psi = NULL;
197
- grd->psiuni = NULL;
198
- grd->psiyp = NULL;
199
- grd->psiidx = NULL;
200
- grd->psioff = NULL;
201
- grd->alpha = NULL;
202
- grd->beta = NULL;
203
- grd->unorm = NULL;
204
- grd->bnorm = NULL;
205
- grd->scale = NULL;
206
- return grd;
207
- }
208
-
209
- /* grd_free:
210
- * Free all memory used by gradient computation.
211
- */
212
- void grd_free(grd_t *grd) {
213
- grd_check(grd, 0);
214
- free(grd);
215
- }
216
-
217
263
  /* grd_fldopsi:
218
264
  * We first have to compute the Ψ_t(y',y,x) weights defined as
219
265
  * Ψ_t(y',y,x) = \exp( ∑_k θ_k f_k(y',y,x_t) )
@@ -235,38 +281,38 @@ void grd_free(grd_t *grd) {
235
281
  * 3/ we take the component-wise exponential of the resulting matrix
236
282
  * (this can be done efficiently with vector maths)
237
283
  */
238
- void grd_fldopsi(grd_t *grd, const seq_t *seq) {
239
- const mdl_t *mdl = grd->mdl;
240
- const double *x = mdl->theta;
241
- const size_t Y = mdl->nlbl;
242
- const int T = seq->len;
243
- double (*psi)[T][Y][Y] = (void *)grd->psi;
244
- for (int t = 0; t < T; t++) {
284
+ void grd_fldopsi(grd_st_t *grd_st, const seq_t *seq) {
285
+ const mdl_t *mdl = grd_st->mdl;
286
+ const double *x = mdl->theta;
287
+ const uint32_t Y = mdl->nlbl;
288
+ const uint32_t T = seq->len;
289
+ double (*psi)[T][Y][Y] = (void *)grd_st->psi;
290
+ for (uint32_t t = 0; t < T; t++) {
245
291
  const pos_t *pos = &(seq->pos[t]);
246
- for (size_t y = 0; y < Y; y++) {
292
+ for (uint32_t y = 0; y < Y; y++) {
247
293
  double sum = 0.0;
248
- for (size_t n = 0; n < pos->ucnt; n++) {
249
- const size_t o = pos->uobs[n];
294
+ for (uint32_t n = 0; n < pos->ucnt; n++) {
295
+ const uint64_t o = pos->uobs[n];
250
296
  sum += x[mdl->uoff[o] + y];
251
297
  }
252
- for (size_t yp = 0; yp < Y; yp++)
298
+ for (uint32_t yp = 0; yp < Y; yp++)
253
299
  (*psi)[t][yp][y] = sum;
254
300
  }
255
301
  }
256
- for (int t = 1; t < T; t++) {
302
+ for (uint32_t t = 1; t < T; t++) {
257
303
  const pos_t *pos = &(seq->pos[t]);
258
- for (size_t yp = 0, d = 0; yp < Y; yp++) {
259
- for (size_t y = 0; y < Y; y++, d++) {
304
+ for (uint32_t yp = 0, d = 0; yp < Y; yp++) {
305
+ for (uint32_t y = 0; y < Y; y++, d++) {
260
306
  double sum = 0.0;
261
- for (size_t n = 0; n < pos->bcnt; n++) {
262
- const size_t o = pos->bobs[n];
307
+ for (uint32_t n = 0; n < pos->bcnt; n++) {
308
+ const uint64_t o = pos->bobs[n];
263
309
  sum += x[mdl->boff[o] + d];
264
310
  }
265
311
  (*psi)[t][yp][y] += sum;
266
312
  }
267
313
  }
268
314
  }
269
- xvm_expma((double *)psi, (double *)psi, 0.0, (size_t)T * Y * Y);
315
+ xvm_expma((double *)psi, (double *)psi, 0.0, (uint64_t)T * Y * Y);
270
316
  }
271
317
 
272
318
  /* grd_spdopsi:
@@ -290,36 +336,36 @@ void grd_fldopsi(grd_t *grd, const seq_t *seq) {
290
336
  * one. (here also this can be done efficiently with vector
291
337
  * maths)
292
338
  */
293
- void grd_spdopsi(grd_t *grd, const seq_t *seq) {
294
- const mdl_t *mdl = grd->mdl;
295
- const double *x = mdl->theta;
296
- const size_t Y = mdl->nlbl;
297
- const int T = seq->len;
298
- double (*psiuni)[T][Y] = (void *)grd->psiuni;
299
- double *psival = grd->psi;
300
- size_t *psiyp = grd->psiyp;
301
- size_t (*psiidx)[T][Y] = (void *)grd->psiidx;
302
- size_t *psioff = grd->psioff;
303
- for (int t = 0; t < T; t++) {
339
+ void grd_spdopsi(grd_st_t *grd_st, const seq_t *seq) {
340
+ const mdl_t *mdl = grd_st->mdl;
341
+ const double *x = mdl->theta;
342
+ const uint32_t Y = mdl->nlbl;
343
+ const uint32_t T = seq->len;
344
+ double (*psiuni)[T][Y] = (void *)grd_st->psiuni;
345
+ double *psival = grd_st->psi;
346
+ uint32_t *psiyp = grd_st->psiyp;
347
+ uint32_t (*psiidx)[T][Y] = (void *)grd_st->psiidx;
348
+ uint32_t *psioff = grd_st->psioff;
349
+ for (uint32_t t = 0; t < T; t++) {
304
350
  const pos_t *pos = &(seq->pos[t]);
305
- for (size_t y = 0; y < Y; y++) {
351
+ for (uint32_t y = 0; y < Y; y++) {
306
352
  double sum = 0.0;
307
- for (size_t n = 0; n < pos->ucnt; n++) {
308
- const size_t o = pos->uobs[n];
353
+ for (uint32_t n = 0; n < pos->ucnt; n++) {
354
+ const uint64_t o = pos->uobs[n];
309
355
  sum += x[mdl->uoff[o] + y];
310
356
  }
311
357
  (*psiuni)[t][y] = sum;
312
358
  }
313
359
  }
314
- size_t off = 0;
315
- for (int t = 1; t < T; t++) {
360
+ uint32_t off = 0;
361
+ for (uint32_t t = 1; t < T; t++) {
316
362
  const pos_t *pos = &(seq->pos[t]);
317
363
  psioff[t] = off;
318
- for (size_t y = 0, nnz = 0; y < Y; y++) {
319
- for (size_t yp = 0; yp < Y; yp++) {
364
+ for (uint32_t y = 0, nnz = 0; y < Y; y++) {
365
+ for (uint32_t yp = 0; yp < Y; yp++) {
320
366
  double sum = 0.0;
321
- for (size_t n = 0; n < pos->bcnt; n++) {
322
- const size_t o = pos->bobs[n];
367
+ for (uint32_t n = 0; n < pos->bcnt; n++) {
368
+ const uint64_t o = pos->bobs[n];
323
369
  sum += x[mdl->boff[o] + yp * Y + y];
324
370
  }
325
371
  if (sum == 0.0)
@@ -331,7 +377,7 @@ void grd_spdopsi(grd_t *grd, const seq_t *seq) {
331
377
  (*psiidx)[t][y] = nnz;
332
378
  }
333
379
  }
334
- xvm_expma((double *)psiuni, (double *)psiuni, 0.0, (size_t)T * Y);
380
+ xvm_expma((double *)psiuni, (double *)psiuni, 0.0, (uint64_t)T * Y);
335
381
  xvm_expma((double *)psival, (double *)psival, 1.0, off);
336
382
  }
337
383
 
@@ -356,42 +402,42 @@ void grd_spdopsi(grd_t *grd, const seq_t *seq) {
356
402
  * with α-scale_t the scaling factor used for the α vector at position t
357
403
  * in the forward recursion.
358
404
  */
359
- void grd_flfwdbwd(grd_t *grd, const seq_t *seq) {
360
- const mdl_t *mdl = grd->mdl;
361
- const size_t Y = mdl->nlbl;
362
- const int T = seq->len;
363
- const double (*psi)[T][Y][Y] = (void *)grd->psi;
364
- double (*alpha)[T][Y] = (void *)grd->alpha;
365
- double (*beta )[T][Y] = (void *)grd->beta;
366
- double *scale = grd->scale;
367
- double *unorm = grd->unorm;
368
- double *bnorm = grd->bnorm;
369
- for (size_t y = 0; y < Y; y++)
405
+ void grd_flfwdbwd(grd_st_t *grd_st, const seq_t *seq) {
406
+ const mdl_t *mdl = grd_st->mdl;
407
+ const uint64_t Y = mdl->nlbl;
408
+ const uint32_t T = seq->len;
409
+ const double (*psi)[T][Y][Y] = (void *)grd_st->psi;
410
+ double (*alpha)[T][Y] = (void *)grd_st->alpha;
411
+ double (*beta )[T][Y] = (void *)grd_st->beta;
412
+ double *scale = grd_st->scale;
413
+ double *unorm = grd_st->unorm;
414
+ double *bnorm = grd_st->bnorm;
415
+ for (uint32_t y = 0; y < Y; y++)
370
416
  (*alpha)[0][y] = (*psi)[0][0][y];
371
417
  scale[0] = xvm_unit((*alpha)[0], (*alpha)[0], Y);
372
- for (int t = 1; t < grd->last + 1; t++) {
373
- for (size_t y = 0; y < Y; y++) {
418
+ for (uint32_t t = 1; t < grd_st->last + 1; t++) {
419
+ for (uint32_t y = 0; y < Y; y++) {
374
420
  double sum = 0.0;
375
- for (size_t yp = 0; yp < Y; yp++)
421
+ for (uint32_t yp = 0; yp < Y; yp++)
376
422
  sum += (*alpha)[t - 1][yp] * (*psi)[t][yp][y];
377
423
  (*alpha)[t][y] = sum;
378
424
  }
379
425
  scale[t] = xvm_unit((*alpha)[t], (*alpha)[t], Y);
380
426
  }
381
- for (size_t yp = 0; yp < Y; yp++)
427
+ for (uint32_t yp = 0; yp < Y; yp++)
382
428
  (*beta)[T - 1][yp] = 1.0 / Y;
383
- for (int t = T - 1; t > grd->first; t--) {
384
- for (size_t yp = 0; yp < Y; yp++) {
429
+ for (uint32_t t = T - 1; t > grd_st->first; t--) {
430
+ for (uint32_t yp = 0; yp < Y; yp++) {
385
431
  double sum = 0.0;
386
- for (size_t y = 0; y < Y; y++)
432
+ for (uint32_t y = 0; y < Y; y++)
387
433
  sum += (*beta)[t][y] * (*psi)[t][yp][y];
388
434
  (*beta)[t - 1][yp] = sum;
389
435
  }
390
436
  xvm_unit((*beta)[t - 1], (*beta)[t - 1], Y);
391
437
  }
392
- for (int t = 0; t < T; t++) {
438
+ for (uint32_t t = 0; t < T; t++) {
393
439
  double z = 0.0;
394
- for (size_t y = 0; y < Y; y++)
440
+ for (uint32_t y = 0; y < Y; y++)
395
441
  z += (*alpha)[t][y] * (*beta)[t][y];
396
442
  unorm[t] = 1.0 / z;
397
443
  bnorm[t] = scale[t] / z;
@@ -416,67 +462,67 @@ void grd_flfwdbwd(grd_t *grd, const seq_t *seq) {
416
462
  * And here also we reduce the number of multiplication if the matrix is
417
463
  * really sparse.
418
464
  */
419
- void grd_spfwdbwd(grd_t *grd, const seq_t *seq) {
420
- const mdl_t *mdl = grd->mdl;
421
- const size_t Y = mdl->nlbl;
422
- const int T = seq->len;
423
- const double (*psiuni)[T][Y] = (void *)grd->psiuni;
424
- const double *psival = grd->psi;
425
- const size_t *psiyp = grd->psiyp;
426
- const size_t (*psiidx)[T][Y] = (void *)grd->psiidx;
427
- const size_t *psioff = grd->psioff;
428
- double (*alpha)[T][Y] = (void *)grd->alpha;
429
- double (*beta )[T][Y] = (void *)grd->beta;
430
- double *scale = grd->scale;
431
- double *unorm = grd->unorm;
432
- double *bnorm = grd->bnorm;
433
- for (size_t y = 0; y < Y; y++)
465
+ void grd_spfwdbwd(grd_st_t *grd_st, const seq_t *seq) {
466
+ const mdl_t *mdl = grd_st->mdl;
467
+ const uint32_t Y = mdl->nlbl;
468
+ const uint32_t T = seq->len;
469
+ const double (*psiuni)[T][Y] = (void *)grd_st->psiuni;
470
+ const double *psival = grd_st->psi;
471
+ const uint32_t *psiyp = grd_st->psiyp;
472
+ const uint32_t (*psiidx)[T][Y] = (void *)grd_st->psiidx;
473
+ const uint32_t *psioff = grd_st->psioff;
474
+ double (*alpha)[T][Y] = (void *)grd_st->alpha;
475
+ double (*beta )[T][Y] = (void *)grd_st->beta;
476
+ double *scale = grd_st->scale;
477
+ double *unorm = grd_st->unorm;
478
+ double *bnorm = grd_st->bnorm;
479
+ for (uint32_t y = 0; y < Y; y++)
434
480
  (*alpha)[0][y] = (*psiuni)[0][y];
435
481
  scale[0] = xvm_unit((*alpha)[0], (*alpha)[0], Y);
436
- for (int t = 1; t < grd->last + 1; t++) {
437
- for (size_t y = 0; y < Y; y++)
482
+ for (uint32_t t = 1; t < grd_st->last + 1; t++) {
483
+ for (uint32_t y = 0; y < Y; y++)
438
484
  (*alpha)[t][y] = 1.0;
439
- const size_t off = psioff[t];
440
- for (size_t n = 0, y = 0; n < (*psiidx)[t][Y - 1]; ) {
485
+ const uint32_t off = psioff[t];
486
+ for (uint32_t n = 0, y = 0; n < (*psiidx)[t][Y - 1]; ) {
441
487
  while (n >= (*psiidx)[t][y])
442
488
  y++;
443
489
  while (n < (*psiidx)[t][y]) {
444
- const size_t yp = psiyp [off + n];
445
- const double v = psival[off + n];
490
+ const uint32_t yp = psiyp [off + n];
491
+ const double v = psival[off + n];
446
492
  (*alpha)[t][y] += (*alpha)[t - 1][yp] * v;
447
493
  n++;
448
494
  }
449
495
  }
450
- for (size_t y = 0; y < Y; y++)
496
+ for (uint32_t y = 0; y < Y; y++)
451
497
  (*alpha)[t][y] *= (*psiuni)[t][y];
452
498
  scale[t] = xvm_unit((*alpha)[t], (*alpha)[t], Y);
453
499
  }
454
- for (size_t yp = 0; yp < Y; yp++)
500
+ for (uint32_t yp = 0; yp < Y; yp++)
455
501
  (*beta)[T - 1][yp] = 1.0 / Y;
456
- for (int t = T - 1; t > grd->first; t--) {
502
+ for (uint32_t t = T - 1; t > grd_st->first; t--) {
457
503
  double sum = 0.0, tmp[Y];
458
- for (size_t y = 0; y < Y; y++) {
504
+ for (uint32_t y = 0; y < Y; y++) {
459
505
  tmp[y] = (*beta)[t][y] * (*psiuni)[t][y];
460
506
  sum += tmp[y];
461
507
  }
462
- for (size_t y = 0; y < Y; y++)
508
+ for (uint32_t y = 0; y < Y; y++)
463
509
  (*beta)[t - 1][y] = sum;
464
- const size_t off = psioff[t];
465
- for (size_t n = 0, y = 0; n < (*psiidx)[t][Y - 1]; ) {
510
+ const uint32_t off = psioff[t];
511
+ for (uint32_t n = 0, y = 0; n < (*psiidx)[t][Y - 1]; ) {
466
512
  while (n >= (*psiidx)[t][y])
467
513
  y++;
468
514
  while (n < (*psiidx)[t][y]) {
469
- const size_t yp = psiyp [off + n];
470
- const double v = psival[off + n];
515
+ const uint32_t yp = psiyp [off + n];
516
+ const double v = psival[off + n];
471
517
  (*beta)[t - 1][yp] += v * tmp[y];
472
518
  n++;
473
519
  }
474
520
  }
475
521
  xvm_unit((*beta)[t - 1], (*beta)[t - 1], Y);
476
522
  }
477
- for (int t = 0; t < T; t++) {
523
+ for (uint32_t t = 0; t < T; t++) {
478
524
  double z = 0.0;
479
- for (size_t y = 0; y < Y; y++)
525
+ for (uint32_t y = 0; y < Y; y++)
480
526
  z += (*alpha)[t][y] * (*beta)[t][y];
481
527
  unorm[t] = 1.0 / z;
482
528
  bnorm[t] = scale[t] / z;
@@ -509,35 +555,35 @@ void grd_spfwdbwd(grd_t *grd, const seq_t *seq) {
509
555
  * vector but just adding the contribution of this sequence. This allows us to
510
556
  * easily compute the gradient over more than one sequence.
511
557
  */
512
- void grd_flupgrad(grd_t *grd, const seq_t *seq) {
513
- const mdl_t *mdl = grd->mdl;
514
- const size_t Y = mdl->nlbl;
515
- const int T = seq->len;
516
- const double (*psi )[T][Y][Y] = (void *)grd->psi;
517
- const double (*alpha)[T][Y] = (void *)grd->alpha;
518
- const double (*beta )[T][Y] = (void *)grd->beta;
519
- const double *unorm = grd->unorm;
520
- const double *bnorm = grd->bnorm;
521
- double *g = grd->g;
522
- for (int t = 0; t < T; t++) {
558
+ void grd_flupgrad(grd_st_t *grd_st, const seq_t *seq) {
559
+ const mdl_t *mdl = grd_st->mdl;
560
+ const uint32_t Y = mdl->nlbl;
561
+ const uint32_t T = seq->len;
562
+ const double (*psi )[T][Y][Y] = (void *)grd_st->psi;
563
+ const double (*alpha)[T][Y] = (void *)grd_st->alpha;
564
+ const double (*beta )[T][Y] = (void *)grd_st->beta;
565
+ const double *unorm = grd_st->unorm;
566
+ const double *bnorm = grd_st->bnorm;
567
+ double *g = grd_st->g;
568
+ for (uint32_t t = 0; t < T; t++) {
523
569
  const pos_t *pos = &(seq->pos[t]);
524
- for (size_t y = 0; y < Y; y++) {
570
+ for (uint32_t y = 0; y < Y; y++) {
525
571
  double e = (*alpha)[t][y] * (*beta)[t][y] * unorm[t];
526
- for (size_t n = 0; n < pos->ucnt; n++) {
527
- const size_t o = pos->uobs[n];
528
- g[mdl->uoff[o] + y] += e;
572
+ for (uint32_t n = 0; n < pos->ucnt; n++) {
573
+ const uint64_t o = pos->uobs[n];
574
+ atm_inc(g + mdl->uoff[o] + y, e);
529
575
  }
530
576
  }
531
577
  }
532
- for (int t = 1; t < T; t++) {
578
+ for (uint32_t t = 1; t < T; t++) {
533
579
  const pos_t *pos = &(seq->pos[t]);
534
- for (size_t yp = 0, d = 0; yp < Y; yp++) {
535
- for (size_t y = 0; y < Y; y++, d++) {
580
+ for (uint32_t yp = 0, d = 0; yp < Y; yp++) {
581
+ for (uint32_t y = 0; y < Y; y++, d++) {
536
582
  double e = (*alpha)[t - 1][yp] * (*beta)[t][y]
537
583
  * (*psi)[t][yp][y] * bnorm[t];
538
- for (size_t n = 0; n < pos->bcnt; n++) {
539
- const size_t o = pos->bobs[n];
540
- g[mdl->boff[o] + d] += e;
584
+ for (uint32_t n = 0; n < pos->bcnt; n++) {
585
+ const uint64_t o = pos->bobs[n];
586
+ atm_inc(g + mdl->boff[o] + d, e);
541
587
  }
542
588
  }
543
589
  }
@@ -552,55 +598,55 @@ void grd_flupgrad(grd_t *grd, const seq_t *seq) {
552
598
  * matrix. We first fill it with the unigram component and next multiply it
553
599
  * with the bigram one.
554
600
  */
555
- void grd_spupgrad(grd_t *grd, const seq_t *seq) {
556
- const mdl_t *mdl = grd->mdl;
557
- const size_t Y = mdl->nlbl;
558
- const int T = seq->len;
559
- const double (*psiuni)[T][Y] = (void *)grd->psiuni;
560
- const double *psival = grd->psi;
561
- const size_t *psiyp = grd->psiyp;
562
- const size_t (*psiidx)[T][Y] = (void *)grd->psiidx;
563
- const size_t *psioff = grd->psioff;
564
- const double (*alpha)[T][Y] = (void *)grd->alpha;
565
- const double (*beta )[T][Y] = (void *)grd->beta;
566
- const double *unorm = grd->unorm;
567
- const double *bnorm = grd->bnorm;
568
- double *g = grd->g;
569
- for (int t = 0; t < T; t++) {
601
+ void grd_spupgrad(grd_st_t *grd_st, const seq_t *seq) {
602
+ const mdl_t *mdl = grd_st->mdl;
603
+ const uint32_t Y = mdl->nlbl;
604
+ const uint32_t T = seq->len;
605
+ const double (*psiuni)[T][Y] = (void *)grd_st->psiuni;
606
+ const double *psival = grd_st->psi;
607
+ const uint32_t *psiyp = grd_st->psiyp;
608
+ const uint32_t (*psiidx)[T][Y] = (void *)grd_st->psiidx;
609
+ const uint32_t *psioff = grd_st->psioff;
610
+ const double (*alpha)[T][Y] = (void *)grd_st->alpha;
611
+ const double (*beta )[T][Y] = (void *)grd_st->beta;
612
+ const double *unorm = grd_st->unorm;
613
+ const double *bnorm = grd_st->bnorm;
614
+ double *g = grd_st->g;
615
+ for (uint32_t t = 0; t < T; t++) {
570
616
  const pos_t *pos = &(seq->pos[t]);
571
- for (size_t y = 0; y < Y; y++) {
617
+ for (uint32_t y = 0; y < Y; y++) {
572
618
  double e = (*alpha)[t][y] * (*beta)[t][y] * unorm[t];
573
- for (size_t n = 0; n < pos->ucnt; n++) {
574
- const size_t o = pos->uobs[n];
575
- g[mdl->uoff[o] + y] += e;
619
+ for (uint32_t n = 0; n < pos->ucnt; n++) {
620
+ const uint64_t o = pos->uobs[n];
621
+ atm_inc(g + mdl->uoff[o] + y, e);
576
622
  }
577
623
  }
578
624
  }
579
- for (int t = 1; t < T; t++) {
625
+ for (uint32_t t = 1; t < T; t++) {
580
626
  const pos_t *pos = &(seq->pos[t]);
581
627
  // We build the expectation matrix
582
628
  double e[Y][Y];
583
- for (size_t yp = 0; yp < Y; yp++)
584
- for (size_t y = 0; y < Y; y++)
629
+ for (uint32_t yp = 0; yp < Y; yp++)
630
+ for (uint32_t y = 0; y < Y; y++)
585
631
  e[yp][y] = (*alpha)[t - 1][yp] * (*beta)[t][y]
586
632
  * (*psiuni)[t][y] * bnorm[t];
587
- const size_t off = psioff[t];
588
- for (size_t n = 0, y = 0; n < (*psiidx)[t][Y - 1]; ) {
633
+ const uint32_t off = psioff[t];
634
+ for (uint32_t n = 0, y = 0; n < (*psiidx)[t][Y - 1]; ) {
589
635
  while (n >= (*psiidx)[t][y])
590
636
  y++;
591
637
  while (n < (*psiidx)[t][y]) {
592
- const size_t yp = psiyp [off + n];
593
- const double v = psival[off + n];
638
+ const uint32_t yp = psiyp [off + n];
639
+ const double v = psival[off + n];
594
640
  e[yp][y] += e[yp][y] * v;
595
641
  n++;
596
642
  }
597
643
  }
598
644
  // Add the expectation over the model distribution
599
- for (size_t yp = 0, d = 0; yp < Y; yp++) {
600
- for (size_t y = 0; y < Y; y++, d++) {
601
- for (size_t n = 0; n < pos->bcnt; n++) {
602
- const size_t o = pos->bobs[n];
603
- g[mdl->boff[o] + d] += e[yp][y];
645
+ for (uint32_t yp = 0, d = 0; yp < Y; yp++) {
646
+ for (uint32_t y = 0; y < Y; y++, d++) {
647
+ for (uint32_t n = 0; n < pos->bcnt; n++) {
648
+ const uint64_t o = pos->bobs[n];
649
+ atm_inc(g + mdl->boff[o] + d, e[yp][y]);
604
650
  }
605
651
  }
606
652
  }
@@ -612,24 +658,24 @@ void grd_spupgrad(grd_t *grd, const seq_t *seq) {
612
658
  * distribution. This is the second step of the gradient computation shared
613
659
  * by the non-sparse and sparse version.
614
660
  */
615
- void grd_subemp(grd_t *grd, const seq_t *seq) {
616
- const mdl_t *mdl = grd->mdl;
617
- const size_t Y = mdl->nlbl;
618
- const int T = seq->len;
619
- double *g = grd->g;
620
- for (int t = 0; t < T; t++) {
661
+ void grd_subemp(grd_st_t *grd_st, const seq_t *seq) {
662
+ const mdl_t *mdl = grd_st->mdl;
663
+ const uint32_t Y = mdl->nlbl;
664
+ const uint32_t T = seq->len;
665
+ double *g = grd_st->g;
666
+ for (uint32_t t = 0; t < T; t++) {
621
667
  const pos_t *pos = &(seq->pos[t]);
622
- const size_t y = seq->pos[t].lbl;
623
- for (size_t n = 0; n < pos->ucnt; n++)
624
- g[mdl->uoff[pos->uobs[n]] + y] -= 1.0;
668
+ const uint32_t y = seq->pos[t].lbl;
669
+ for (uint32_t n = 0; n < pos->ucnt; n++)
670
+ atm_inc(g + mdl->uoff[pos->uobs[n]] + y, -1.0);
625
671
  }
626
- for (int t = 1; t < T; t++) {
672
+ for (uint32_t t = 1; t < T; t++) {
627
673
  const pos_t *pos = &(seq->pos[t]);
628
- const size_t yp = seq->pos[t - 1].lbl;
629
- const size_t y = seq->pos[t ].lbl;
630
- const size_t d = yp * Y + y;
631
- for (size_t n = 0; n < pos->bcnt; n++)
632
- g[mdl->boff[pos->bobs[n]] + d] -= 1.0;
674
+ const uint32_t yp = seq->pos[t - 1].lbl;
675
+ const uint32_t y = seq->pos[t ].lbl;
676
+ const uint32_t d = yp * Y + y;
677
+ for (uint32_t n = 0; n < pos->bcnt; n++)
678
+ atm_inc(g + mdl->boff[pos->bobs[n]] + d, -1.0);
633
679
  }
634
680
  }
635
681
 
@@ -655,38 +701,38 @@ void grd_subemp(grd_t *grd, const seq_t *seq) {
655
701
  * weights will be non-nul only for observations present in the sequence, we
656
702
  * sum only over these ones.
657
703
  */
658
- void grd_logloss(grd_t *grd, const seq_t *seq) {
659
- const mdl_t *mdl = grd->mdl;
660
- const double *x = mdl->theta;
661
- const size_t Y = mdl->nlbl;
662
- const int T = seq->len;
663
- const double (*alpha)[T][Y] = (void *)grd->alpha;
664
- const double *scale = grd->scale;
704
+ void grd_logloss(grd_st_t *grd_st, const seq_t *seq) {
705
+ const mdl_t *mdl = grd_st->mdl;
706
+ const double *x = mdl->theta;
707
+ const uint32_t Y = mdl->nlbl;
708
+ const uint32_t T = seq->len;
709
+ const double (*alpha)[T][Y] = (void *)grd_st->alpha;
710
+ const double *scale = grd_st->scale;
665
711
  double logz = 0.0;
666
- for (size_t y = 0; y < Y; y++)
712
+ for (uint32_t y = 0; y < Y; y++)
667
713
  logz += (*alpha)[T - 1][y];
668
714
  logz = log(logz);
669
- for (int t = 0; t < T; t++)
715
+ for (uint32_t t = 0; t < T; t++)
670
716
  logz -= log(scale[t]);
671
717
  double lloss = logz;
672
- for (int t = 0; t < T; t++) {
718
+ for (uint32_t t = 0; t < T; t++) {
673
719
  const pos_t *pos = &(seq->pos[t]);
674
- const size_t y = seq->pos[t].lbl;
675
- for (size_t n = 0; n < pos->ucnt; n++)
720
+ const uint32_t y = seq->pos[t].lbl;
721
+ for (uint32_t n = 0; n < pos->ucnt; n++)
676
722
  lloss -= x[mdl->uoff[pos->uobs[n]] + y];
677
723
  }
678
- for (int t = 1; t < T; t++) {
724
+ for (uint32_t t = 1; t < T; t++) {
679
725
  const pos_t *pos = &(seq->pos[t]);
680
- const size_t yp = seq->pos[t - 1].lbl;
681
- const size_t y = seq->pos[t ].lbl;
682
- const size_t d = yp * Y + y;
683
- for (size_t n = 0; n < pos->bcnt; n++)
726
+ const uint32_t yp = seq->pos[t - 1].lbl;
727
+ const uint32_t y = seq->pos[t ].lbl;
728
+ const uint32_t d = yp * Y + y;
729
+ for (uint32_t n = 0; n < pos->bcnt; n++)
684
730
  lloss -= x[mdl->boff[pos->bobs[n]] + d];
685
731
  }
686
- grd->lloss += lloss;
732
+ grd_st->lloss += lloss;
687
733
  }
688
734
 
689
- /* grd_doseq:
735
+ /* grd_docrf:
690
736
  * This function compute the gradient and value of the negative log-likelihood
691
737
  * of the model over a single training sequence.
692
738
  *
@@ -694,21 +740,21 @@ void grd_logloss(grd_t *grd, const seq_t *seq) {
694
740
  * just accumulate the values for the given sequence in it. This allow to
695
741
  * easily compute the gradient over a set of sequences.
696
742
  */
697
- void grd_doseq(grd_t *grd, const seq_t *seq) {
698
- const mdl_t *mdl = grd->mdl;
699
- grd->first = 0;
700
- grd->last = seq->len - 1;
743
+ void grd_docrf(grd_st_t *grd_st, const seq_t *seq) {
744
+ const mdl_t *mdl = grd_st->mdl;
745
+ grd_st->first = 0;
746
+ grd_st->last = seq->len - 1;
701
747
  if (!mdl->opt->sparse) {
702
- grd_fldopsi(grd, seq);
703
- grd_flfwdbwd(grd, seq);
704
- grd_flupgrad(grd, seq);
748
+ grd_fldopsi(grd_st, seq);
749
+ grd_flfwdbwd(grd_st, seq);
750
+ grd_flupgrad(grd_st, seq);
705
751
  } else {
706
- grd_spdopsi(grd, seq);
707
- grd_spfwdbwd(grd, seq);
708
- grd_spupgrad(grd, seq);
752
+ grd_spdopsi(grd_st, seq);
753
+ grd_spfwdbwd(grd_st, seq);
754
+ grd_spupgrad(grd_st, seq);
709
755
  }
710
- grd_subemp(grd, seq);
711
- grd_logloss(grd, seq);
756
+ grd_subemp(grd_st, seq);
757
+ grd_logloss(grd_st, seq);
712
758
  }
713
759
 
714
760
  /******************************************************************************
@@ -731,16 +777,130 @@ void grd_doseq(grd_t *grd, const seq_t *seq) {
731
777
  * cores, or to more thread than you have memory to hold vectors.
732
778
  ******************************************************************************/
733
779
 
780
+ /* grd_stcheck:
781
+ * Check that enough memory is allocated in the gradient object so that the
782
+ * linear-chain codepath can be computed for a sequence of the given length.
783
+ */
784
+ void grd_stcheck(grd_st_t *grd_st, uint32_t len) {
785
+ // Check if user ask for clearing the state tracker or if he requested a
786
+ // bigger tracker. In this case we have to free the previous allocated
787
+ // memory.
788
+ if (len == 0 || (len > grd_st->len && grd_st->len != 0)) {
789
+ if (grd_st->mdl->opt->sparse) {
790
+ xvm_free(grd_st->psiuni); grd_st->psiuni = NULL;
791
+ free(grd_st->psiyp); grd_st->psiyp = NULL;
792
+ free(grd_st->psiidx); grd_st->psiidx = NULL;
793
+ free(grd_st->psioff); grd_st->psioff = NULL;
794
+ }
795
+ xvm_free(grd_st->psi); grd_st->psi = NULL;
796
+ xvm_free(grd_st->alpha); grd_st->alpha = NULL;
797
+ xvm_free(grd_st->beta); grd_st->beta = NULL;
798
+ xvm_free(grd_st->unorm); grd_st->unorm = NULL;
799
+ xvm_free(grd_st->bnorm); grd_st->bnorm = NULL;
800
+ xvm_free(grd_st->scale); grd_st->scale = NULL;
801
+ grd_st->len = 0;
802
+ }
803
+ if (len == 0 || len <= grd_st->len)
804
+ return;
805
+ // If we are here, we have to allocate a new state. This is simple, we
806
+ // just have to take care of the special case for sparse mode.
807
+ const uint32_t Y = grd_st->mdl->nlbl;
808
+ const uint32_t T = len;
809
+ grd_st->psi = xvm_new(T * Y * Y);
810
+ grd_st->alpha = xvm_new(T * Y);
811
+ grd_st->beta = xvm_new(T * Y);
812
+ grd_st->scale = xvm_new(T);
813
+ grd_st->unorm = xvm_new(T);
814
+ grd_st->bnorm = xvm_new(T);
815
+ if (grd_st->mdl->opt->sparse) {
816
+ grd_st->psiuni = xvm_new(T * Y);
817
+ grd_st->psiyp = wapiti_xmalloc(sizeof(uint32_t) * T * Y * Y);
818
+ grd_st->psiidx = wapiti_xmalloc(sizeof(uint32_t) * T * Y);
819
+ grd_st->psioff = wapiti_xmalloc(sizeof(uint32_t) * T);
820
+ }
821
+ grd_st->len = len;
822
+ }
823
+
824
+ /* grd_stnew:
825
+ * Allocation memory for gradient computation state. This allocate memory for
826
+ * the longest sequence present in the data set.
827
+ */
828
+ grd_st_t *grd_stnew(mdl_t *mdl, double *g) {
829
+ grd_st_t *grd_st = wapiti_xmalloc(sizeof(grd_st_t));
830
+ grd_st->mdl = mdl;
831
+ grd_st->len = 0;
832
+ grd_st->g = g;
833
+ grd_st->psi = NULL;
834
+ grd_st->psiuni = NULL;
835
+ grd_st->psiyp = NULL;
836
+ grd_st->psiidx = NULL;
837
+ grd_st->psioff = NULL;
838
+ grd_st->alpha = NULL;
839
+ grd_st->beta = NULL;
840
+ grd_st->unorm = NULL;
841
+ grd_st->bnorm = NULL;
842
+ grd_st->scale = NULL;
843
+ return grd_st;
844
+ }
845
+
846
+ /* grd_stfree:
847
+ * Free all memory used by gradient computation.
848
+ */
849
+ void grd_stfree(grd_st_t *grd_st) {
850
+ grd_stcheck(grd_st, 0);
851
+ free(grd_st);
852
+ }
853
+
734
854
  /* grd_dospl:
735
855
  * Compute the gradient of a single sample choosing between the maxent
736
856
  * optimised codepath and classical one depending of the sample.
737
857
  */
738
- void grd_dospl(grd_t *grd, const seq_t *seq) {
739
- grd_check(grd, seq->len);
740
- if (seq->len == 1 || grd->mdl->reader->nbi == 0)
741
- grd_dosingle(grd, seq);
858
+ void grd_dospl(grd_st_t *grd_st, const seq_t *seq) {
859
+ grd_stcheck(grd_st, seq->len);
860
+ rdr_t *rdr = grd_st->mdl->reader;
861
+ if (seq->len == 1 || (rdr->npats != 0 && rdr->nbi == 0))
862
+ grd_domaxent(grd_st, seq);
863
+ else if (grd_st->mdl->type == 0)
864
+ grd_domaxent(grd_st, seq);
865
+ else if (grd_st->mdl->type == 1)
866
+ grd_domemm(grd_st, seq);
742
867
  else
743
- grd_doseq(grd, seq);
868
+ grd_docrf(grd_st, seq);
869
+ }
870
+
871
+ /* grd_new:
872
+ * Allocate a new parallel gradient computer. Return a grd_t object who can
873
+ * compute gradient over the full data set and store it in the vector <g>.
874
+ */
875
+ grd_t *grd_new(mdl_t *mdl, double *g) {
876
+ const uint32_t W = mdl->opt->nthread;
877
+ grd_t *grd = wapiti_xmalloc(sizeof(grd_t));
878
+ grd->mdl = mdl;
879
+ grd->grd_st = wapiti_xmalloc(sizeof(grd_st_t *) * W);
880
+ #ifdef ATM_ANSI
881
+ grd->grd_st[0] = grd_stnew(mdl, g);
882
+ for (uint32_t w = 1; w < W; w++)
883
+ grd->grd_st[w] = grd_stnew(mdl, xvm_new(mdl->nftr));
884
+ #else
885
+ for (uint32_t w = 0; w < W; w++)
886
+ grd->grd_st[w] = grd_stnew(mdl, g);
887
+ #endif
888
+ return grd;
889
+ }
890
+
891
+ /* grd_free:
892
+ * Free all memory allocated for the given gradient computer object.
893
+ */
894
+ void grd_free(grd_t *grd) {
895
+ const uint32_t W = grd->mdl->opt->nthread;
896
+ #ifdef ATM_ANSI
897
+ for (uint32_t w = 1; w < W; w++)
898
+ xvm_free(grd->grd_st[w]->g);
899
+ #endif
900
+ for (uint32_t w = 0; w < W; w++)
901
+ grd_stfree(grd->grd_st[w]);
902
+ free(grd->grd_st);
903
+ free(grd);
744
904
  }
745
905
 
746
906
  /* grd_worker:
@@ -748,22 +908,25 @@ void grd_dospl(grd_t *grd, const seq_t *seq) {
748
908
  * training set. It is mean to be called by the thread spawner in order to
749
909
  * compute the gradient over the full training set.
750
910
  */
751
- static void grd_worker(job_t *job, int id, int cnt, grd_t *grd) {
911
+ static
912
+ void grd_worker(job_t *job, uint32_t id, uint32_t cnt, grd_st_t *grd_st) {
752
913
  unused(id && cnt);
753
- mdl_t *mdl = grd->mdl;
914
+ mdl_t *mdl = grd_st->mdl;
754
915
  const dat_t *dat = mdl->train;
755
- const size_t F = mdl->nftr;
756
916
  // We first cleanup the gradient and value as our parent don't do it (it
757
917
  // is better to do this also in parallel)
758
- grd->lloss = 0.0;
759
- for (size_t f = 0; f < F; f++)
760
- grd->g[f] = 0.0;
918
+ grd_st->lloss = 0.0;
919
+ #ifdef ATM_ANSI
920
+ const uint64_t F = mdl->nftr;
921
+ for (uint64_t f = 0; f < F; f++)
922
+ grd_st->g[f] = 0.0;
923
+ #endif
761
924
  // Now all is ready, we can process our sequences and accumulate the
762
925
  // gradient and inverse log-likelihood
763
- size_t count, pos;
926
+ uint32_t count, pos;
764
927
  while (mth_getjob(job, &count, &pos)) {
765
- for (size_t s = pos; !uit_stop && s < pos + count; s++)
766
- grd_dospl(grd, dat->seq[s]);
928
+ for (uint32_t s = pos; !uit_stop && s < pos + count; s++)
929
+ grd_dospl(grd_st, dat->seq[s]);
767
930
  if (uit_stop)
768
931
  break;
769
932
  }
@@ -775,30 +938,38 @@ static void grd_worker(job_t *job, int id, int cnt, grd_t *grd) {
775
938
  * the fact that the gradient over the full training set is just the sum of
776
939
  * the gradient of each sequence.
777
940
  */
778
- double grd_gradient(mdl_t *mdl, double *g, grd_t *grds[]) {
779
- const double *x = mdl->theta;
780
- const size_t F = mdl->nftr;
781
- const size_t W = mdl->opt->nthread;
941
+ double grd_gradient(grd_t *grd) {
942
+ mdl_t *mdl = grd->mdl;
943
+ const double *x = mdl->theta;
944
+ const uint64_t F = mdl->nftr;
945
+ const uint32_t W = mdl->opt->nthread;
946
+ double *g = grd->grd_st[0]->g;
947
+ #ifndef ATM_ANSI
948
+ for (uint64_t f = 0; f < F; f++)
949
+ g[f] = 0.0;
950
+ #endif
782
951
  // All is ready to compute the gradient, we spawn the threads of
783
952
  // workers, each one working on a part of the data. As the gradient and
784
953
  // log-likelihood are additive, computing the final values will be
785
954
  // trivial.
786
- mth_spawn((func_t *)grd_worker, W, (void **)grds, mdl->train->nseq,
787
- mdl->opt->jobsize);
955
+ mth_spawn((func_t *)grd_worker, W, (void **)grd->grd_st,
956
+ mdl->train->nseq, mdl->opt->jobsize);
788
957
  if (uit_stop)
789
958
  return -1.0;
790
959
  // All computations are done, it just remain to add all the gradients
791
- // and inverse log-likelihood from all the workers.
792
- double fx = grds[0]->lloss;
793
- for (size_t w = 1; w < W; w++) {
794
- for (size_t f = 0; f < F; f++)
795
- g[f] += grds[w]->g[f];
796
- fx += grds[w]->lloss;
797
- }
960
+ // and negative log-likelihood from all the workers.
961
+ double fx = grd->grd_st[0]->lloss;
962
+ for (uint32_t w = 1; w < W; w++)
963
+ fx += grd->grd_st[w]->lloss;
964
+ #ifdef ATM_ANSI
965
+ for (uint32_t w = 1; w < W; w++)
966
+ for (uint64_t f = 0; f < F; f++)
967
+ g[f] += grd->grd_st[w]->g[f];
968
+ #endif
798
969
  // If needed we clip the gradient: setting to 0.0 all coordinates where
799
970
  // the function is 0.0.
800
971
  if (mdl->opt->lbfgs.clip == true)
801
- for (size_t f = 0; f < F; f++)
972
+ for (uint64_t f = 0; f < F; f++)
802
973
  if (x[f] == 0.0)
803
974
  g[f] = 0.0;
804
975
  // Now we can apply the elastic-net penalty. Depending of the values of
@@ -806,7 +977,7 @@ double grd_gradient(mdl_t *mdl, double *g, grd_t *grds[]) {
806
977
  const double rho1 = mdl->opt->rho1;
807
978
  const double rho2 = mdl->opt->rho2;
808
979
  double nl1 = 0.0, nl2 = 0.0;
809
- for (size_t f = 0; f < F; f++) {
980
+ for (uint64_t f = 0; f < F; f++) {
810
981
  const double v = x[f];
811
982
  g[f] += rho2 * v;
812
983
  nl1 += fabs(v);