wapiti 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. data/.autotest +13 -0
  2. data/.gitignore +5 -0
  3. data/.rspec +3 -0
  4. data/Gemfile +6 -0
  5. data/LICENSE +30 -0
  6. data/README.md +153 -0
  7. data/Rakefile +33 -0
  8. data/ext/wapiti/bcd.c +392 -0
  9. data/ext/wapiti/decoder.c +535 -0
  10. data/ext/wapiti/decoder.h +46 -0
  11. data/ext/wapiti/extconf.rb +8 -0
  12. data/ext/wapiti/gradient.c +818 -0
  13. data/ext/wapiti/gradient.h +81 -0
  14. data/ext/wapiti/lbfgs.c +294 -0
  15. data/ext/wapiti/model.c +296 -0
  16. data/ext/wapiti/model.h +100 -0
  17. data/ext/wapiti/native.c +1238 -0
  18. data/ext/wapiti/native.h +15 -0
  19. data/ext/wapiti/options.c +278 -0
  20. data/ext/wapiti/options.h +91 -0
  21. data/ext/wapiti/pattern.c +395 -0
  22. data/ext/wapiti/pattern.h +56 -0
  23. data/ext/wapiti/progress.c +167 -0
  24. data/ext/wapiti/progress.h +43 -0
  25. data/ext/wapiti/quark.c +272 -0
  26. data/ext/wapiti/quark.h +46 -0
  27. data/ext/wapiti/reader.c +553 -0
  28. data/ext/wapiti/reader.h +73 -0
  29. data/ext/wapiti/rprop.c +191 -0
  30. data/ext/wapiti/sequence.h +148 -0
  31. data/ext/wapiti/sgdl1.c +218 -0
  32. data/ext/wapiti/thread.c +171 -0
  33. data/ext/wapiti/thread.h +42 -0
  34. data/ext/wapiti/tools.c +202 -0
  35. data/ext/wapiti/tools.h +54 -0
  36. data/ext/wapiti/trainers.h +39 -0
  37. data/ext/wapiti/vmath.c +372 -0
  38. data/ext/wapiti/vmath.h +51 -0
  39. data/ext/wapiti/wapiti.c +288 -0
  40. data/ext/wapiti/wapiti.h +45 -0
  41. data/lib/wapiti.rb +30 -0
  42. data/lib/wapiti/errors.rb +17 -0
  43. data/lib/wapiti/model.rb +49 -0
  44. data/lib/wapiti/options.rb +113 -0
  45. data/lib/wapiti/utility.rb +15 -0
  46. data/lib/wapiti/version.rb +3 -0
  47. data/spec/fixtures/ch.mod +18550 -0
  48. data/spec/fixtures/chpattern.txt +52 -0
  49. data/spec/fixtures/chtest.txt +1973 -0
  50. data/spec/fixtures/chtrain.txt +19995 -0
  51. data/spec/fixtures/nppattern.txt +52 -0
  52. data/spec/fixtures/nptest.txt +1973 -0
  53. data/spec/fixtures/nptrain.txt +19995 -0
  54. data/spec/fixtures/pattern.txt +14 -0
  55. data/spec/fixtures/test.txt +60000 -0
  56. data/spec/fixtures/train.txt +1200 -0
  57. data/spec/spec_helper.rb +21 -0
  58. data/spec/wapiti/model_spec.rb +173 -0
  59. data/spec/wapiti/native_spec.rb +12 -0
  60. data/spec/wapiti/options_spec.rb +175 -0
  61. data/spec/wapiti/utility_spec.rb +22 -0
  62. data/wapiti.gemspec +35 -0
  63. metadata +178 -0
data/ext/wapiti/decoder.h
@@ -0,0 +1,46 @@
+ /*
+  * Wapiti - A linear-chain CRF tool
+  *
+  * Copyright (c) 2009-2011 CNRS
+  * All rights reserved.
+  *
+  * Redistribution and use in source and binary forms, with or without
+  * modification, are permitted provided that the following conditions are met:
+  *     * Redistributions of source code must retain the above copyright
+  *       notice, this list of conditions and the following disclaimer.
+  *     * Redistributions in binary form must reproduce the above copyright
+  *       notice, this list of conditions and the following disclaimer in the
+  *       documentation and/or other materials provided with the distribution.
+  *
+  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  * POSSIBILITY OF SUCH DAMAGE.
+  */
+
+ #ifndef decoder_h
+ #define decoder_h
+
+ #include <stddef.h>
+ #include <stdio.h>
+
+ #include "wapiti.h"
+ #include "model.h"
+ #include "sequence.h"
+
+ void tag_viterbi(mdl_t *mdl, const seq_t *seq,
+                  size_t out[], double *sc, double psc[]);
+ void tag_nbviterbi(mdl_t *mdl, const seq_t *seq, size_t N,
+                    size_t out[][N], double sc[], double psc[][N]);
+
+ void tag_label(mdl_t *mdl, FILE *fin, FILE *fout);
+ void tag_eval(mdl_t *mdl, double *te, double *se);
+ #endif
+
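
The tag_viterbi and tag_nbviterbi prototypes above use C99 variable-length array parameters: the size_t N argument fixes the row width of the out[][N] and psc[][N] arrays that follow it. A minimal standalone sketch of that idiom, compiled with -std=c99 (illustrative only, not Wapiti code; fill_table and its toy data are made up):

    #include <stdio.h>
    #include <stdlib.h>

    /* Mirror the shape of the out[][N] parameter of tag_nbviterbi:
     * N, passed before the array, gives the row width of the 2-D table. */
    static void fill_table(size_t len, size_t N, size_t out[][N]) {
        for (size_t t = 0; t < len; t++)
            for (size_t n = 0; n < N; n++)
                out[t][n] = t * N + n;
    }

    int main(void) {
        const size_t len = 3, N = 2;
        size_t (*out)[N] = malloc(sizeof(size_t[len][N]));
        if (out == NULL)
            return EXIT_FAILURE;
        fill_table(len, N, out);
        printf("out[2][1] = %zu\n", out[2][1]);  /* prints 5 */
        free(out);
        return EXIT_SUCCESS;
    }
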
data/ext/wapiti/extconf.rb
@@ -0,0 +1,8 @@
+ require 'mkmf'
+
+ $CFLAGS << %q{ -std=c99 -O3 -Wall }
+
+ have_library('pthread')
+ have_library('m')
+
+ create_makefile('wapiti/native')
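
For context (not part of the diff): an mkmf script like this is normally invoked as `ruby extconf.rb`, which probes for the pthread and m libraries and writes a Makefile, after which `make` compiles and links the wapiti/native extension; RubyGems drives these steps automatically when the gem is installed. The -std=c99 flag is needed for the variable-length array parameters and declarations used throughout the C sources.
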
data/ext/wapiti/gradient.c
@@ -0,0 +1,818 @@
+ /*
+  * Wapiti - A linear-chain CRF tool
+  *
+  * Copyright (c) 2009-2011 CNRS
+  * All rights reserved.
+  *
+  * Redistribution and use in source and binary forms, with or without
+  * modification, are permitted provided that the following conditions are met:
+  *     * Redistributions of source code must retain the above copyright
+  *       notice, this list of conditions and the following disclaimer.
+  *     * Redistributions in binary form must reproduce the above copyright
+  *       notice, this list of conditions and the following disclaimer in the
+  *       documentation and/or other materials provided with the distribution.
+  *
+  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  * POSSIBILITY OF SUCH DAMAGE.
+  */
+ #include <math.h>
+ #include <stddef.h>
+ #include <stdlib.h>
+ #include <stdio.h>
+ #include <string.h>
+
+ #include "wapiti.h"
+ #include "gradient.h"
+ #include "model.h"
+ #include "options.h"
+ #include "progress.h"
+ #include "sequence.h"
+ #include "tools.h"
+ #include "thread.h"
+ #include "vmath.h"
+
+ /******************************************************************************
+  * Maxent optimized gradient computation
+  *
+  * Maxent or maximum entropy models are a specific case of CRF where the
+  * output graph is reduced to a single node. In this specific case, the
+  * computation of the gradient can be simplified a lot, as is done in this
+  * part of the code.
+  *
+  * This code will be used to compute the gradient for sequences of length one
+  * and without active bigram features. All other cases are handled by the
+  * next section.
+  ******************************************************************************/
+ void grd_dosingle(grd_t *grd, const seq_t *seq) {
+     const mdl_t *mdl = grd->mdl;
+     const double *x = mdl->theta;
+     const int T = seq->len;
+     const size_t Y = mdl->nlbl;
+     double *psi = grd->psi;
+     double *g = grd->g;
+     for (int t = 0; t < T; t++) {
+         const pos_t *pos = &(seq->pos[t]);
+         // We first compute for each Y the sum of the weights of all
+         // features active in the sample:
+         //     Ψ(y,x^i)  = \exp( ∑_k θ_k f_k(y,x^i) )
+         //     Z_θ(x^i) = ∑_y Ψ(y,x^i)
+         double Z = 0.0;
+         for (size_t y = 0; y < Y; y++)
+             psi[y] = 0.0;
+         for (size_t n = 0; n < pos->ucnt; n++) {
+             const double *wgh = x + mdl->uoff[pos->uobs[n]];
+             for (size_t y = 0; y < Y; y++)
+                 psi[y] += wgh[y];
+         }
+         double lloss = psi[pos->lbl];
+         for (size_t y = 0; y < Y; y++) {
+             psi[y] = (psi[y] == 0.0) ? 1.0 : exp(psi[y]);
+             Z += psi[y];
+         }
+         // Now we can compute the gradient update: for each active
+         // feature in the sample, the update is the expectation over the
+         // current model minus the expectation over the observed
+         // distribution:
+         //     E_{q_θ}(x,y) - E_{p}(x,y)
+         // and we can compute the expectation over the model with:
+         //     E_{q_θ}(x,y) = f_k(y,x^i) * ψ(y,x) / Z_θ(x)
+         for (size_t y = 0; y < Y; y++)
+             psi[y] /= Z;
+         for (size_t n = 0; n < pos->ucnt; n++) {
+             double *grd = g + mdl->uoff[pos->uobs[n]];
+             for (size_t y = 0; y < Y; y++)
+                 grd[y] += psi[y];
+             grd[pos->lbl] -= 1.0;
+         }
+         // And finally the log-likelihood with:
+         //     L_θ(x^i,y^i) = log(Z_θ(x^i)) - log(ψ(y^i,x^i))
+         grd->lloss += log(Z) - lloss;
+     }
+ }
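
Restating the maxent case implemented above in conventional notation (an editorial summary of the formulas already given in the comments, not additional package code):

\[
P_\theta(y \mid x) = \frac{\Psi(y,x)}{Z_\theta(x)}, \qquad
\Psi(y,x) = \exp\Big(\sum_k \theta_k f_k(y,x)\Big), \qquad
Z_\theta(x) = \sum_{y} \Psi(y,x),
\]

so one observed pair (x^i, y^i) contributes

\[
-L_\theta(x^i,y^i) = \log Z_\theta(x^i) - \log \Psi(y^i,x^i), \qquad
\frac{\partial(-L)}{\partial \theta_k} = \sum_y P_\theta(y \mid x^i)\,f_k(y,x^i) - f_k(y^i,x^i),
\]

which is exactly the psi[y] / Z accumulation followed by the -1.0 update on the observed label in grd_dosingle.
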
+
+ /******************************************************************************
+  * Single sequence gradient computation
+  *
+  * This section is responsible for computing the gradient of the
+  * log-likelihood function to optimize over a single sequence.
+  *
+  * There are two versions of this code, one using dense matrices and one using
+  * sparse matrices. The sparse version uses the fact that, for L1-regularized
+  * trainers, the bigram scores will be very sparse, so there is a way to
+  * reduce the amount of computation needed in the forward-backward at the
+  * price of a more complex implementation. Because using a sparse matrix has a
+  * cost, this implementation is slower on L2-regularized models and on lightly
+  * L1-regularized models, which is why there is also a classical dense version
+  * of the algorithm, used for example by the L-BFGS trainer.
+  *
+  * The sparse matrix implementation is a bit tricky because we need to store
+  * all values in sequence in order to use the vector exponential, which also
+  * gives a large performance improvement on vector-capable machines.
+  * We need four arrays noted <val>, <off>, <idx>, and <yp>. For each position
+  * t, the <off>[t] value indicates where the non-zero values for t start in
+  * <val>. The other arrays give the y and yp indices of these values. The
+  * easiest one to retrieve is yp: the yp index for the value at
+  * <val>[<off>[t] + n] is stored at the same position in <yp>.
+  * The y are more difficult: the values for index y are stored with n between
+  * <idx>[y-1] and <idx>[y]. It may seem ineffective, but the matrix is walked
+  * the other way around: we go through the idx array, and for each y we get
+  * the yp and values, so in practice it is very efficient.
+  *
+  * This can seem overly complex, but we have to keep in mind that Y is
+  * generally very small and any sparse matrix has overhead, so we have to
+  * reduce it to the minimum in order to get a real improvement. Dedicated
+  * libraries are optimized for bigger matrices where the overhead is less of
+  * a problem.
+  * Another problem here is cache size. The optimization process will spend
+  * most of its time in this function, so it has to be well optimized, and we
+  * already need a lot of memory for other data, so we have to be careful here
+  * if we don't want to flush the cache all the time. Sparse matrices require
+  * less memory than dense ones only if we know in advance the number of
+  * non-zero entries, which is not the case here, so we have to use a scheme
+  * which in the worst case uses as little memory as possible.
+  ******************************************************************************/
+
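
As a rough standalone illustration of the <val>/<off>/<idx>/<yp> layout described above (a toy sketch with made-up data, not package code), here is how the non-zero entries of one position of the sparse Ψ matrix are walked; the same double-while loop appears below in grd_spfwdbwd and grd_spupgrad:

    #include <stddef.h>
    #include <stdio.h>

    int main(void) {
        /* Toy position t with Y = 3 labels and three stored values of
         * (Psi_t(yp,y) - 1), grouped by target label y as grd_spdopsi does:
         *   y=0: (yp=1) 0.5    y=1: none    y=2: (yp=0) 0.25, (yp=2) 0.75 */
        const size_t Y = 3;
        const size_t off0   = 0;               /* off[t]: start of this position in val      */
        const size_t idx0[] = {1, 1, 3};       /* idx[t][y]: cumulative count up to label y  */
        const size_t yp0[]  = {1, 0, 2};       /* yp[off[t] + n]: source label of each value */
        const double val0[] = {0.5, 0.25, 0.75};
        for (size_t n = 0, y = 0; n < idx0[Y - 1]; ) {
            while (n >= idx0[y])               /* advance y until n falls inside its range   */
                y++;
            while (n < idx0[y]) {              /* all stored values whose target label is y  */
                printf("Psi[t][%zu][%zu] - 1 = %g\n", yp0[off0 + n], y, val0[off0 + n]);
                n++;
            }
        }
        return 0;
    }
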
+ /* grd_check:
+  * Check that enough memory is allocated in the gradient object so that the
+  * linear-chain codepath can be computed for a sequence of the given length.
+  */
+ void grd_check(grd_t *grd, int len) {
+     // Check if the user asked for clearing the state tracker or requested a
+     // bigger tracker. In this case we have to free the previously allocated
+     // memory.
+     if (len == 0 || (len > grd->len && grd->len != 0)) {
+         if (grd->mdl->opt->sparse) {
+             xvm_free(grd->psiuni); grd->psiuni = NULL;
+             free(grd->psiyp);      grd->psiyp  = NULL;
+             free(grd->psiidx);     grd->psiidx = NULL;
+             free(grd->psioff);     grd->psioff = NULL;
+         }
+         xvm_free(grd->psi);   grd->psi   = NULL;
+         xvm_free(grd->alpha); grd->alpha = NULL;
+         xvm_free(grd->beta);  grd->beta  = NULL;
+         xvm_free(grd->unorm); grd->unorm = NULL;
+         xvm_free(grd->bnorm); grd->bnorm = NULL;
+         xvm_free(grd->scale); grd->scale = NULL;
+         grd->len = 0;
+     }
+     if (len == 0 || len <= grd->len)
+         return;
+     // If we are here, we have to allocate a new state. This is simple, we
+     // just have to take care of the special case for sparse mode.
+     const size_t Y = grd->mdl->nlbl;
+     const int T = len;
+     grd->psi   = xvm_new(T * Y * Y);
+     grd->alpha = xvm_new(T * Y);
+     grd->beta  = xvm_new(T * Y);
+     grd->scale = xvm_new(T);
+     grd->unorm = xvm_new(T);
+     grd->bnorm = xvm_new(T);
+     if (grd->mdl->opt->sparse) {
+         grd->psiuni = xvm_new(T * Y);
+         grd->psiyp  = wapiti_xmalloc(sizeof(size_t) * T * Y * Y);
+         grd->psiidx = wapiti_xmalloc(sizeof(size_t) * T * Y);
+         grd->psioff = wapiti_xmalloc(sizeof(size_t) * T);
+     }
+     grd->len = len;
+ }
+
+ /* grd_new:
+  * Allocate memory for the gradient computation state. This allocates memory
+  * for the longest sequence present in the data set.
+  */
+ grd_t *grd_new(mdl_t *mdl, double *g) {
+     grd_t *grd = wapiti_xmalloc(sizeof(grd_t));
+     grd->mdl    = mdl;
+     grd->len    = 0;
+     grd->g      = g;
+     grd->psi    = NULL;
+     grd->psiuni = NULL;
+     grd->psiyp  = NULL;
+     grd->psiidx = NULL;
+     grd->psioff = NULL;
+     grd->alpha  = NULL;
+     grd->beta   = NULL;
+     grd->unorm  = NULL;
+     grd->bnorm  = NULL;
+     grd->scale  = NULL;
+     return grd;
+ }
+
+ /* grd_free:
+  * Free all memory used by the gradient computation.
+  */
+ void grd_free(grd_t *grd) {
+     grd_check(grd, 0);
+     free(grd);
+ }
+
+ /* grd_fldopsi:
+  * We first have to compute the Ψ_t(y',y,x) weights defined as
+  *     Ψ_t(y',y,x) = \exp( ∑_k θ_k f_k(y',y,x_t) )
+  * So at position 't' in the sequence, for each couple (y',y) we have to sum
+  * the weights of all features. Only the observations present at this position
+  * will have a non-zero weight, so we can sum over those only. As we use only
+  * two kinds of features, unigrams and bigrams, we can rewrite this as
+  *     \exp (   ∑_k μ_k(y, x_t)     f_k(y, x_t)
+  *            + ∑_k λ_k(y', y, x_t) f_k(y', y, x_t) )
+  * where the first sum is over the unigram features and the second is over the
+  * bigram ones.
+  * This allows us to compute Ψ efficiently in three steps
+  *   1/ we sum the unigram feature weights by looping over the active
+  *        unigram observations (we compute this sum once and use it
+  *        for each value of y')
+  *   2/ we add the bigram feature weights by looping over the active
+  *        bigram observations (we don't have to do this for t=0 since
+  *        there are no bigrams there)
+  *   3/ we take the component-wise exponential of the resulting matrix
+  *        (this can be done efficiently with vector maths)
+  */
+ void grd_fldopsi(grd_t *grd, const seq_t *seq) {
+     const mdl_t *mdl = grd->mdl;
+     const double *x = mdl->theta;
+     const size_t Y = mdl->nlbl;
+     const int T = seq->len;
+     double (*psi)[T][Y][Y] = (void *)grd->psi;
+     for (int t = 0; t < T; t++) {
+         const pos_t *pos = &(seq->pos[t]);
+         for (size_t y = 0; y < Y; y++) {
+             double sum = 0.0;
+             for (size_t n = 0; n < pos->ucnt; n++) {
+                 const size_t o = pos->uobs[n];
+                 sum += x[mdl->uoff[o] + y];
+             }
+             for (size_t yp = 0; yp < Y; yp++)
+                 (*psi)[t][yp][y] = sum;
+         }
+     }
+     for (int t = 1; t < T; t++) {
+         const pos_t *pos = &(seq->pos[t]);
+         for (size_t yp = 0, d = 0; yp < Y; yp++) {
+             for (size_t y = 0; y < Y; y++, d++) {
+                 double sum = 0.0;
+                 for (size_t n = 0; n < pos->bcnt; n++) {
+                     const size_t o = pos->bobs[n];
+                     sum += x[mdl->boff[o] + d];
+                 }
+                 (*psi)[t][yp][y] += sum;
+             }
+         }
+     }
+     xvm_expma((double *)psi, (double *)psi, 0.0, (size_t)T * Y * Y);
+ }
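
In terms of the parameter layout this indexing relies on (each unigram observation o owning a block of Y weights starting at uoff[o], and each bigram observation a block of Y*Y weights starting at boff[o], with d = y'*Y + y), the dense matrix produced above is, in conventional notation,

\[
\Psi_t(y',y,x) \;=\; \exp\Big(\sum_{o \in U_t} \theta_{\mathrm{uoff}[o] + y} \;+\; \sum_{o \in B_t} \theta_{\mathrm{boff}[o] + y'Y + y}\Big),
\]

where U_t and B_t are the unigram and bigram observations active at position t; B_t is empty at the first position, hence the bigram loop starting at t = 1 in the code.
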
+
+ /* grd_spdopsi:
+  * For the sparse version, we keep the two sums separate so we will have a
+  * separate Ψ_t(y,x) and Ψ_t(y',y,x). The first one defines a vector for
+  * unigrams at each position, and the second one a matrix for bigrams. This is
+  * where the trick is, as we will store Ψ_t(y',y,x) - 1. If the sum is zero,
+  * its exponential will be 1.0 and so we have to store 0.0. As most of the
+  * sums are expected to be zero, the resulting matrix will be very sparse and
+  * we will save computation in the forward-backward.
+  *
+  * So we compute Ψ differently here
+  *   1/ we sum the unigram feature weights by looping over the active
+  *        unigram observations and store them in |psiuni|.
+  *   2/ we sum the bigram feature weights by looping over the active
+  *        bigram observations (we don't have to do this for t=0 since
+  *        there are no bigrams there) and we store the non-zero ones
+  *        in the sparse matrix.
+  *   3/ we take the component-wise exponential of the unigram vectors,
+  *        and the component-wise exponential of the sparse matrix minus
+  *        one (here also this can be done efficiently with vector
+  *        maths)
+  */
+ void grd_spdopsi(grd_t *grd, const seq_t *seq) {
+     const mdl_t *mdl = grd->mdl;
+     const double *x = mdl->theta;
+     const size_t Y = mdl->nlbl;
+     const int T = seq->len;
+     double (*psiuni)[T][Y] = (void *)grd->psiuni;
+     double *psival = grd->psi;
+     size_t *psiyp  = grd->psiyp;
+     size_t (*psiidx)[T][Y] = (void *)grd->psiidx;
+     size_t *psioff = grd->psioff;
+     for (int t = 0; t < T; t++) {
+         const pos_t *pos = &(seq->pos[t]);
+         for (size_t y = 0; y < Y; y++) {
+             double sum = 0.0;
+             for (size_t n = 0; n < pos->ucnt; n++) {
+                 const size_t o = pos->uobs[n];
+                 sum += x[mdl->uoff[o] + y];
+             }
+             (*psiuni)[t][y] = sum;
+         }
+     }
+     size_t off = 0;
+     for (int t = 1; t < T; t++) {
+         const pos_t *pos = &(seq->pos[t]);
+         psioff[t] = off;
+         for (size_t y = 0, nnz = 0; y < Y; y++) {
+             for (size_t yp = 0; yp < Y; yp++) {
+                 double sum = 0.0;
+                 for (size_t n = 0; n < pos->bcnt; n++) {
+                     const size_t o = pos->bobs[n];
+                     sum += x[mdl->boff[o] + yp * Y + y];
+                 }
+                 if (sum == 0.0)
+                     continue;
+                 psiyp [off] = yp;
+                 psival[off] = sum;
+                 nnz++, off++;
+             }
+             (*psiidx)[t][y] = nnz;
+         }
+     }
+     xvm_expma((double *)psiuni, (double *)psiuni, 0.0, (size_t)T * Y);
+     xvm_expma((double *)psival, (double *)psival, 1.0, off);
+ }
+
+ /* grd_flfwdbwd:
+  * Now, we go to the forward-backward algorithm. As this part of the code
+  * relies on a lot of recursive sums and products of exponentials, we have to
+  * take care of numerical problems.
+  * First the forward recursion
+  *     | α_1(y) = Ψ_1(y,x)
+  *     | α_t(y) = ∑_{y'} α_{t-1}(y') * Ψ_t(y',y,x)
+  * Next comes the backward recursion, which is very similar
+  *     | β_T(y') = 1
+  *     | β_t(y') = ∑_y β_{t+1}(y) * Ψ_{t+1}(y',y,x)
+  * The numerical problems can appear here. To solve them we scale the α_t and
+  * β_t vectors so they sum to 1, but we have to keep the scaling coefficients
+  * as we will need them later.
+  * Now we have to compute the normalization factor. But, due to the scaling
+  * performed during the forward-backward recursions, we have to compute it at
+  * each position, and separately for unigrams and bigrams, using
+  *     for unigrams: Z_θ(t) = ∑_y α_t(y) β_t(y)
+  *     for bigrams:  Z_θ(t) = ∑_y α_t(y) β_t(y) / α-scale_t
+  * with α-scale_t the scaling factor used for the α vector at position t
+  * in the forward recursion.
+  */
+ void grd_flfwdbwd(grd_t *grd, const seq_t *seq) {
+     const mdl_t *mdl = grd->mdl;
+     const size_t Y = mdl->nlbl;
+     const int T = seq->len;
+     const double (*psi)[T][Y][Y] = (void *)grd->psi;
+     double (*alpha)[T][Y] = (void *)grd->alpha;
+     double (*beta )[T][Y] = (void *)grd->beta;
+     double *scale = grd->scale;
+     double *unorm = grd->unorm;
+     double *bnorm = grd->bnorm;
+     for (size_t y = 0; y < Y; y++)
+         (*alpha)[0][y] = (*psi)[0][0][y];
+     scale[0] = xvm_unit((*alpha)[0], (*alpha)[0], Y);
+     for (int t = 1; t < grd->last + 1; t++) {
+         for (size_t y = 0; y < Y; y++) {
+             double sum = 0.0;
+             for (size_t yp = 0; yp < Y; yp++)
+                 sum += (*alpha)[t - 1][yp] * (*psi)[t][yp][y];
+             (*alpha)[t][y] = sum;
+         }
+         scale[t] = xvm_unit((*alpha)[t], (*alpha)[t], Y);
+     }
+     for (size_t yp = 0; yp < Y; yp++)
+         (*beta)[T - 1][yp] = 1.0 / Y;
+     for (int t = T - 1; t > grd->first; t--) {
+         for (size_t yp = 0; yp < Y; yp++) {
+             double sum = 0.0;
+             for (size_t y = 0; y < Y; y++)
+                 sum += (*beta)[t][y] * (*psi)[t][yp][y];
+             (*beta)[t - 1][yp] = sum;
+         }
+         xvm_unit((*beta)[t - 1], (*beta)[t - 1], Y);
+     }
+     for (int t = 0; t < T; t++) {
+         double z = 0.0;
+         for (size_t y = 0; y < Y; y++)
+             z += (*alpha)[t][y] * (*beta)[t][y];
+         unorm[t] = 1.0 / z;
+         bnorm[t] = scale[t] / z;
+     }
+ }
+
+ /* grd_spfwdbwd:
+  * And the sparse version, which is a bit more complicated but follows the
+  * same general path. First the forward recursion
+  *     | α_1(y) = Ψ_1(y,x)
+  *     | α_t(y) = Ψ_t(y,x) * (   ∑_{y'} α_{t-1}(y')
+  *                             + ∑_{y'} α_{t-1}(y') * (Ψ_t(y',y,x) - 1) )
+  * The inner part contains two sums: the first one will be 1.0 as we scale the
+  * α vectors, and the second is a sparse matrix multiplication which needs
+  * fewer than |Y|x|Y| multiplications if the matrix is really sparse, so we
+  * gain here.
+  * Next comes the backward recursion, which is very similar
+  *     | β_T(y') = 1
+  *     | β_t(y') = ∑_y v_{t+1}(y) + ∑_y v_{t+1}(y) * (Ψ_{t+1}(y',y,x) - 1)
+  * with
+  *     v_{t+1}(y) = β_{t+1}(y) * Ψ_{t+1}(y,x)
+  * And here also we reduce the number of multiplications if the matrix is
+  * really sparse.
+  */
+ void grd_spfwdbwd(grd_t *grd, const seq_t *seq) {
+     const mdl_t *mdl = grd->mdl;
+     const size_t Y = mdl->nlbl;
+     const int T = seq->len;
+     const double (*psiuni)[T][Y] = (void *)grd->psiuni;
+     const double *psival = grd->psi;
+     const size_t *psiyp  = grd->psiyp;
+     const size_t (*psiidx)[T][Y] = (void *)grd->psiidx;
+     const size_t *psioff = grd->psioff;
+     double (*alpha)[T][Y] = (void *)grd->alpha;
+     double (*beta )[T][Y] = (void *)grd->beta;
+     double *scale = grd->scale;
+     double *unorm = grd->unorm;
+     double *bnorm = grd->bnorm;
+     for (size_t y = 0; y < Y; y++)
+         (*alpha)[0][y] = (*psiuni)[0][y];
+     scale[0] = xvm_unit((*alpha)[0], (*alpha)[0], Y);
+     for (int t = 1; t < grd->last + 1; t++) {
+         for (size_t y = 0; y < Y; y++)
+             (*alpha)[t][y] = 1.0;
+         const size_t off = psioff[t];
+         for (size_t n = 0, y = 0; n < (*psiidx)[t][Y - 1]; ) {
+             while (n >= (*psiidx)[t][y])
+                 y++;
+             while (n < (*psiidx)[t][y]) {
+                 const size_t yp = psiyp [off + n];
+                 const double v  = psival[off + n];
+                 (*alpha)[t][y] += (*alpha)[t - 1][yp] * v;
+                 n++;
+             }
+         }
+         for (size_t y = 0; y < Y; y++)
+             (*alpha)[t][y] *= (*psiuni)[t][y];
+         scale[t] = xvm_unit((*alpha)[t], (*alpha)[t], Y);
+     }
+     for (size_t yp = 0; yp < Y; yp++)
+         (*beta)[T - 1][yp] = 1.0 / Y;
+     for (int t = T - 1; t > grd->first; t--) {
+         double sum = 0.0, tmp[Y];
+         for (size_t y = 0; y < Y; y++) {
+             tmp[y] = (*beta)[t][y] * (*psiuni)[t][y];
+             sum += tmp[y];
+         }
+         for (size_t y = 0; y < Y; y++)
+             (*beta)[t - 1][y] = sum;
+         const size_t off = psioff[t];
+         for (size_t n = 0, y = 0; n < (*psiidx)[t][Y - 1]; ) {
+             while (n >= (*psiidx)[t][y])
+                 y++;
+             while (n < (*psiidx)[t][y]) {
+                 const size_t yp = psiyp [off + n];
+                 const double v  = psival[off + n];
+                 (*beta)[t - 1][yp] += v * tmp[y];
+                 n++;
+             }
+         }
+         xvm_unit((*beta)[t - 1], (*beta)[t - 1], Y);
+     }
+     for (int t = 0; t < T; t++) {
+         double z = 0.0;
+         for (size_t y = 0; y < Y; y++)
+             z += (*alpha)[t][y] * (*beta)[t][y];
+         unorm[t] = 1.0 / z;
+         bnorm[t] = scale[t] / z;
+     }
+ }
+
+ /* grd_flupgrad:
+  * Now we have all we need to compute the gradient of the negative log-
+  * likelihood
+  *     ∂-L(θ)
+  *     ------ =   ∑_t ∑_{(y',y)} f_k(y',y,x_t) p_θ(y_{t-1}=y',y_t=y|x)
+  *      ∂θ_k    - ∑_t f_k(y_{t-1},y_t,x_t)
+  *
+  * The first term is the expectation of f_k under the model distribution and
+  * the second one is the expectation of f_k under the empirical distribution.
+  *
+  * The second one is very simple to compute, as we just have to sum over the
+  * active observations in the sequence; this is done by grd_subemp.
+  * The first one is trickier, as it involves computing the probability p_θ.
+  * This is where we use all the previous computations. Again, we separate the
+  * computations for unigrams and bigrams here.
+  *
+  * These probabilities are given by
+  *     p_θ(y_t=y|x)            = α_t(y) β_t(y) / Z_θ
+  *     p_θ(y_{t-1}=y',y_t=y|x) = α_{t-1}(y') Ψ_t(y',y,x) β_t(y) / Z_θ
+  * but we have to remember that, since we have scaled the α and β, we have to
+  * use the local normalization constants.
+  *
+  * We must also take care not to clear previous values of the gradient vector
+  * but just to add the contribution of this sequence. This allows us to easily
+  * compute the gradient over more than one sequence.
+  */
+ void grd_flupgrad(grd_t *grd, const seq_t *seq) {
+     const mdl_t *mdl = grd->mdl;
+     const size_t Y = mdl->nlbl;
+     const int T = seq->len;
+     const double (*psi  )[T][Y][Y] = (void *)grd->psi;
+     const double (*alpha)[T][Y]    = (void *)grd->alpha;
+     const double (*beta )[T][Y]    = (void *)grd->beta;
+     const double *unorm = grd->unorm;
+     const double *bnorm = grd->bnorm;
+     double *g = grd->g;
+     for (int t = 0; t < T; t++) {
+         const pos_t *pos = &(seq->pos[t]);
+         for (size_t y = 0; y < Y; y++) {
+             double e = (*alpha)[t][y] * (*beta)[t][y] * unorm[t];
+             for (size_t n = 0; n < pos->ucnt; n++) {
+                 const size_t o = pos->uobs[n];
+                 g[mdl->uoff[o] + y] += e;
+             }
+         }
+     }
+     for (int t = 1; t < T; t++) {
+         const pos_t *pos = &(seq->pos[t]);
+         for (size_t yp = 0, d = 0; yp < Y; yp++) {
+             for (size_t y = 0; y < Y; y++, d++) {
+                 double e = (*alpha)[t - 1][yp] * (*beta)[t][y]
+                          * (*psi)[t][yp][y] * bnorm[t];
+                 for (size_t n = 0; n < pos->bcnt; n++) {
+                     const size_t o = pos->bobs[n];
+                     g[mdl->boff[o] + d] += e;
+                 }
+             }
+         }
+     }
+ }
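
Putting the pieces above together (an editorial restatement; α̂ and β̂ denote the scaled vectors actually stored in alpha and beta), the marginals accumulated into the gradient by grd_flupgrad are

\[
p_\theta(y_t = y \mid x) = \hat\alpha_t(y)\,\hat\beta_t(y)\,\mathrm{unorm}[t],
\qquad
p_\theta(y_{t-1} = y', y_t = y \mid x) = \hat\alpha_{t-1}(y')\,\Psi_t(y',y,x)\,\hat\beta_t(y)\,\mathrm{bnorm}[t],
\]

with unorm[t] = 1 / ∑_y α̂_t(y) β̂_t(y) and bnorm[t] = scale[t] · unorm[t], exactly as computed at the end of the two forward-backward functions.
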
+
+ /* grd_spupgrad:
+  * The sparse matrix makes things a bit more complicated here, as we cannot
+  * directly multiply with the original Ψ_t(y',y,x) because we have split it in
+  * two components and the second one is sparse, so we need a somewhat complex
+  * workaround to fix that. We have to explicitly build the expectation matrix.
+  * We first fill it with the unigram component and next multiply it with the
+  * bigram one.
+  */
+ void grd_spupgrad(grd_t *grd, const seq_t *seq) {
+     const mdl_t *mdl = grd->mdl;
+     const size_t Y = mdl->nlbl;
+     const int T = seq->len;
+     const double (*psiuni)[T][Y] = (void *)grd->psiuni;
+     const double *psival = grd->psi;
+     const size_t *psiyp  = grd->psiyp;
+     const size_t (*psiidx)[T][Y] = (void *)grd->psiidx;
+     const size_t *psioff = grd->psioff;
+     const double (*alpha)[T][Y] = (void *)grd->alpha;
+     const double (*beta )[T][Y] = (void *)grd->beta;
+     const double *unorm = grd->unorm;
+     const double *bnorm = grd->bnorm;
+     double *g = grd->g;
+     for (int t = 0; t < T; t++) {
+         const pos_t *pos = &(seq->pos[t]);
+         for (size_t y = 0; y < Y; y++) {
+             double e = (*alpha)[t][y] * (*beta)[t][y] * unorm[t];
+             for (size_t n = 0; n < pos->ucnt; n++) {
+                 const size_t o = pos->uobs[n];
+                 g[mdl->uoff[o] + y] += e;
+             }
+         }
+     }
+     for (int t = 1; t < T; t++) {
+         const pos_t *pos = &(seq->pos[t]);
+         // We build the expectation matrix
+         double e[Y][Y];
+         for (size_t yp = 0; yp < Y; yp++)
+             for (size_t y = 0; y < Y; y++)
+                 e[yp][y] = (*alpha)[t - 1][yp] * (*beta)[t][y]
+                          * (*psiuni)[t][y] * bnorm[t];
+         const size_t off = psioff[t];
+         for (size_t n = 0, y = 0; n < (*psiidx)[t][Y - 1]; ) {
+             while (n >= (*psiidx)[t][y])
+                 y++;
+             while (n < (*psiidx)[t][y]) {
+                 const size_t yp = psiyp [off + n];
+                 const double v  = psival[off + n];
+                 e[yp][y] += e[yp][y] * v;
+                 n++;
+             }
+         }
+         // Add the expectation over the model distribution
+         for (size_t yp = 0, d = 0; yp < Y; yp++) {
+             for (size_t y = 0; y < Y; y++, d++) {
+                 for (size_t n = 0; n < pos->bcnt; n++) {
+                     const size_t o = pos->bobs[n];
+                     g[mdl->boff[o] + d] += e[yp][y];
+                 }
+             }
+         }
+     }
+ }
+
+ /* grd_subemp:
+  * Subtract from the gradient the expectation over the empirical
+  * distribution. This is the second step of the gradient computation, shared
+  * by the non-sparse and sparse versions.
+  */
+ void grd_subemp(grd_t *grd, const seq_t *seq) {
+     const mdl_t *mdl = grd->mdl;
+     const size_t Y = mdl->nlbl;
+     const int T = seq->len;
+     double *g = grd->g;
+     for (int t = 0; t < T; t++) {
+         const pos_t *pos = &(seq->pos[t]);
+         const size_t y = seq->pos[t].lbl;
+         for (size_t n = 0; n < pos->ucnt; n++)
+             g[mdl->uoff[pos->uobs[n]] + y] -= 1.0;
+     }
+     for (int t = 1; t < T; t++) {
+         const pos_t *pos = &(seq->pos[t]);
+         const size_t yp = seq->pos[t - 1].lbl;
+         const size_t y  = seq->pos[t    ].lbl;
+         const size_t d  = yp * Y + y;
+         for (size_t n = 0; n < pos->bcnt; n++)
+             g[mdl->boff[pos->bobs[n]] + d] -= 1.0;
+     }
+ }
+
+ /* grd_logloss:
+  * And the final touch, the computation of the negative log-likelihood
+  *     -L(θ) = log(Z_θ) - ∑_t ∑_k θ_k f_k(y_{t-1}, y_t, x_t)
+  *
+  * The numerical problems show up again here, as we cannot compute Z_θ
+  * directly for the same reason we did the scaling. Fortunately, there is a
+  * way to directly compute its logarithm
+  *     log(Z_θ) = log( ∑_y α_t(y) β_t(y) )
+  *              - ∑_{i=1..t} log(α-scale_i)
+  *              - ∑_{i=t..T} log(β-scale_i)
+  * for any value of t.
+  *
+  * So we can compute it at any position in the sequence, but the last one is
+  * easier as the values of β_T(y) and β-scale_T are constant and cancel out.
+  * This is why we have kept just the α-scale_t values.
+  *
+  * Now we have the first term of -L(θ); we still have to subtract the second
+  * one. As we did for the computation of Ψ, we separate the sum over K into
+  * two sums, one for unigrams and one for bigrams. And, as here also the
+  * weights will be non-zero only for observations present in the sequence, we
+  * sum only over those.
+  */
+ void grd_logloss(grd_t *grd, const seq_t *seq) {
+     const mdl_t *mdl = grd->mdl;
+     const double *x = mdl->theta;
+     const size_t Y = mdl->nlbl;
+     const int T = seq->len;
+     const double (*alpha)[T][Y] = (void *)grd->alpha;
+     const double *scale = grd->scale;
+     double logz = 0.0;
+     for (size_t y = 0; y < Y; y++)
+         logz += (*alpha)[T - 1][y];
+     logz = log(logz);
+     for (int t = 0; t < T; t++)
+         logz -= log(scale[t]);
+     double lloss = logz;
+     for (int t = 0; t < T; t++) {
+         const pos_t *pos = &(seq->pos[t]);
+         const size_t y = seq->pos[t].lbl;
+         for (size_t n = 0; n < pos->ucnt; n++)
+             lloss -= x[mdl->uoff[pos->uobs[n]] + y];
+     }
+     for (int t = 1; t < T; t++) {
+         const pos_t *pos = &(seq->pos[t]);
+         const size_t yp = seq->pos[t - 1].lbl;
+         const size_t y  = seq->pos[t    ].lbl;
+         const size_t d  = yp * Y + y;
+         for (size_t n = 0; n < pos->bcnt; n++)
+             lloss -= x[mdl->boff[pos->bobs[n]] + d];
+     }
+     grd->lloss += lloss;
+ }
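
In other words (restating the comment above at the last position, where the constant β terms cancel as it notes), the quantity the code computes is

\[
\log Z_\theta \;=\; \log\Big(\sum_y \hat\alpha_T(y)\Big) \;-\; \sum_{t=1}^{T} \log(\alpha\text{-scale}_t),
\]

from which the weights of the active unigram and bigram features along the observed label path are then subtracted to obtain the sequence's contribution to -L(θ).
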
+
+ /* grd_doseq:
+  * This function computes the gradient and the value of the negative
+  * log-likelihood of the model over a single training sequence.
+  *
+  * This function will not clear the gradient before computation, but instead
+  * just accumulates the values for the given sequence into it. This allows us
+  * to easily compute the gradient over a set of sequences.
+  */
+ void grd_doseq(grd_t *grd, const seq_t *seq) {
+     const mdl_t *mdl = grd->mdl;
+     grd->first = 0;
+     grd->last  = seq->len - 1;
+     if (!mdl->opt->sparse) {
+         grd_fldopsi(grd, seq);
+         grd_flfwdbwd(grd, seq);
+         grd_flupgrad(grd, seq);
+     } else {
+         grd_spdopsi(grd, seq);
+         grd_spfwdbwd(grd, seq);
+         grd_spupgrad(grd, seq);
+     }
+     grd_subemp(grd, seq);
+     grd_logloss(grd, seq);
+ }
+
+ /******************************************************************************
+  * Dataset gradient computation
+  *
+  * This section is responsible for computing the gradient of the
+  * log-likelihood function to optimize over the full training set.
+  *
+  * The gradient computation is multi-threaded: you first have to call the
+  * function 'grd_setup' to prepare the pool of workers, and next you can use
+  * 'grd_gradient' to ask for the full gradient as many times as you want. Each
+  * time, the gradient is computed over the full training set, using the
+  * current value of the parameters and applying the regularization. If needed,
+  * the pseudo-gradient can also be computed. When you are done, you have to
+  * call 'grd_cleanup' to free the allocated memory.
+  *
+  * This requires an additional vector of size <nftr> per thread after the
+  * first, so it can take a lot of memory to compute big models on many
+  * threads. It is strongly discouraged to ask for more threads than you have
+  * cores, or for more threads than you have memory to hold the vectors for.
+  ******************************************************************************/
+
+ /* grd_dospl:
+  * Compute the gradient of a single sample, choosing between the
+  * maxent-optimized codepath and the classical one depending on the sample.
+  */
+ void grd_dospl(grd_t *grd, const seq_t *seq) {
+     grd_check(grd, seq->len);
+     if (seq->len == 1 || grd->mdl->reader->nbi == 0)
+         grd_dosingle(grd, seq);
+     else
+         grd_doseq(grd, seq);
+ }
+
+ /* grd_worker:
+  * This is a simple function that computes the gradient over a subset of the
+  * training set. It is meant to be called by the thread spawner in order to
+  * compute the gradient over the full training set.
+  */
+ static void grd_worker(job_t *job, int id, int cnt, grd_t *grd) {
+     unused(id && cnt);
+     mdl_t *mdl = grd->mdl;
+     const dat_t *dat = mdl->train;
+     const size_t F = mdl->nftr;
+     // We first clean up the gradient and value as our parent doesn't do it
+     // (it is better to do this in parallel as well)
+     grd->lloss = 0.0;
+     for (size_t f = 0; f < F; f++)
+         grd->g[f] = 0.0;
+     // Now all is ready, we can process our sequences and accumulate the
+     // gradient and inverse log-likelihood
+     size_t count, pos;
+     while (mth_getjob(job, &count, &pos)) {
+         for (size_t s = pos; !uit_stop && s < pos + count; s++)
+             grd_dospl(grd, dat->seq[s]);
+         if (uit_stop)
+             break;
+     }
+ }
+
+ /* grd_gradient:
+  * Compute the gradient and value of the negative log-likelihood of the model
+  * at the current point. The computation is done in parallel, taking advantage
+  * of the fact that the gradient over the full training set is just the sum of
+  * the gradients of each sequence.
+  */
+ double grd_gradient(mdl_t *mdl, double *g, grd_t *grds[]) {
+     const double *x = mdl->theta;
+     const size_t F = mdl->nftr;
+     const size_t W = mdl->opt->nthread;
+     // All is ready to compute the gradient, we spawn the threads of
+     // workers, each one working on a part of the data. As the gradient and
+     // log-likelihood are additive, computing the final values will be
+     // trivial.
+     mth_spawn((func_t *)grd_worker, W, (void **)grds, mdl->train->nseq,
+               mdl->opt->jobsize);
+     if (uit_stop)
+         return -1.0;
+     // All computations are done, it just remains to add the gradients and
+     // inverse log-likelihoods from all the workers.
+     double fx = grds[0]->lloss;
+     for (size_t w = 1; w < W; w++) {
+         for (size_t f = 0; f < F; f++)
+             g[f] += grds[w]->g[f];
+         fx += grds[w]->lloss;
+     }
+     // If needed we clip the gradient: setting to 0.0 all coordinates where
+     // the parameter value is 0.0.
+     if (mdl->opt->lbfgs.clip == true)
+         for (size_t f = 0; f < F; f++)
+             if (x[f] == 0.0)
+                 g[f] = 0.0;
+     // Now we can apply the elastic-net penalty. Depending on the values of
+     // rho1 and rho2, this can in fact be a classical L1 or L2 penalty.
+     const double rho1 = mdl->opt->rho1;
+     const double rho2 = mdl->opt->rho2;
+     double nl1 = 0.0, nl2 = 0.0;
+     for (size_t f = 0; f < F; f++) {
+         const double v = x[f];
+         g[f] += rho2 * v;
+         nl1  += fabs(v);
+         nl2  += v * v;
+     }
+     fx += nl1 * rho1 + nl2 * rho2 / 2.0;
+     return fx;
+ }
+
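
For reference, the value returned by grd_gradient above is the elastic-net-penalized objective

\[
f(\theta) \;=\; -L(\theta) \;+\; \rho_1 \sum_f |\theta_f| \;+\; \frac{\rho_2}{2} \sum_f \theta_f^2,
\]

while the gradient vector g only receives the smooth ρ2·θ_f term of the penalty in the final loop; the non-smooth ρ1 term is not added here and is presumably handled by the individual trainers.
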