wapiti 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. data/.autotest +13 -0
  2. data/.gitignore +5 -0
  3. data/.rspec +3 -0
  4. data/Gemfile +6 -0
  5. data/LICENSE +30 -0
  6. data/README.md +153 -0
  7. data/Rakefile +33 -0
  8. data/ext/wapiti/bcd.c +392 -0
  9. data/ext/wapiti/decoder.c +535 -0
  10. data/ext/wapiti/decoder.h +46 -0
  11. data/ext/wapiti/extconf.rb +8 -0
  12. data/ext/wapiti/gradient.c +818 -0
  13. data/ext/wapiti/gradient.h +81 -0
  14. data/ext/wapiti/lbfgs.c +294 -0
  15. data/ext/wapiti/model.c +296 -0
  16. data/ext/wapiti/model.h +100 -0
  17. data/ext/wapiti/native.c +1238 -0
  18. data/ext/wapiti/native.h +15 -0
  19. data/ext/wapiti/options.c +278 -0
  20. data/ext/wapiti/options.h +91 -0
  21. data/ext/wapiti/pattern.c +395 -0
  22. data/ext/wapiti/pattern.h +56 -0
  23. data/ext/wapiti/progress.c +167 -0
  24. data/ext/wapiti/progress.h +43 -0
  25. data/ext/wapiti/quark.c +272 -0
  26. data/ext/wapiti/quark.h +46 -0
  27. data/ext/wapiti/reader.c +553 -0
  28. data/ext/wapiti/reader.h +73 -0
  29. data/ext/wapiti/rprop.c +191 -0
  30. data/ext/wapiti/sequence.h +148 -0
  31. data/ext/wapiti/sgdl1.c +218 -0
  32. data/ext/wapiti/thread.c +171 -0
  33. data/ext/wapiti/thread.h +42 -0
  34. data/ext/wapiti/tools.c +202 -0
  35. data/ext/wapiti/tools.h +54 -0
  36. data/ext/wapiti/trainers.h +39 -0
  37. data/ext/wapiti/vmath.c +372 -0
  38. data/ext/wapiti/vmath.h +51 -0
  39. data/ext/wapiti/wapiti.c +288 -0
  40. data/ext/wapiti/wapiti.h +45 -0
  41. data/lib/wapiti.rb +30 -0
  42. data/lib/wapiti/errors.rb +17 -0
  43. data/lib/wapiti/model.rb +49 -0
  44. data/lib/wapiti/options.rb +113 -0
  45. data/lib/wapiti/utility.rb +15 -0
  46. data/lib/wapiti/version.rb +3 -0
  47. data/spec/fixtures/ch.mod +18550 -0
  48. data/spec/fixtures/chpattern.txt +52 -0
  49. data/spec/fixtures/chtest.txt +1973 -0
  50. data/spec/fixtures/chtrain.txt +19995 -0
  51. data/spec/fixtures/nppattern.txt +52 -0
  52. data/spec/fixtures/nptest.txt +1973 -0
  53. data/spec/fixtures/nptrain.txt +19995 -0
  54. data/spec/fixtures/pattern.txt +14 -0
  55. data/spec/fixtures/test.txt +60000 -0
  56. data/spec/fixtures/train.txt +1200 -0
  57. data/spec/spec_helper.rb +21 -0
  58. data/spec/wapiti/model_spec.rb +173 -0
  59. data/spec/wapiti/native_spec.rb +12 -0
  60. data/spec/wapiti/options_spec.rb +175 -0
  61. data/spec/wapiti/utility_spec.rb +22 -0
  62. data/wapiti.gemspec +35 -0
  63. metadata +178 -0
data/ext/wapiti/gradient.h
@@ -0,0 +1,81 @@
+ /*
+  * Wapiti - A linear-chain CRF tool
+  *
+  * Copyright (c) 2009-2011 CNRS
+  * All rights reserved.
+  *
+  * Redistribution and use in source and binary forms, with or without
+  * modification, are permitted provided that the following conditions are met:
+  *     * Redistributions of source code must retain the above copyright
+  *       notice, this list of conditions and the following disclaimer.
+  *     * Redistributions in binary form must reproduce the above copyright
+  *       notice, this list of conditions and the following disclaimer in the
+  *       documentation and/or other materials provided with the distribution.
+  *
+  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  * POSSIBILITY OF SUCH DAMAGE.
+  */
+
+ #ifndef gradient_h
+ #define gradient_h
+
+ #include "wapiti.h"
+ #include "model.h"
+ #include "sequence.h"
+
+ /* grd_t:
+  *   State tracker for the gradient computation. Computing the gradient takes
+  *   several steps that exchange a lot of intermediate values; all these
+  *   temporaries are stored in this object.
+  *   A tracker can be used to compute sequences of length <len> at most;
+  *   before using it you must call grd_check to ensure that the tracker is
+  *   big enough for your sequence.
+  */
+ typedef struct grd_s grd_t;
+ struct grd_s {
+     mdl_t  *mdl;
+     int     len;      // =T        max length of sequences
+     double *g;        // [F]       vector where to put gradient updates
+     double  lloss;    //           loss value for the sequence
+     double *psi;      // [T][Y][Y] the transition scores
+     double *psiuni;   // [T][Y]    | Same as psi in sparse format
+     size_t *psiyp;    // [T][Y][Y] |
+     size_t *psiidx;   // [T][Y]    |
+     size_t *psioff;   // [T]
+     double *alpha;    // [T][Y]    forward scores
+     double *beta;     // [T][Y]    backward scores
+     double *scale;    // [T]       scaling factors of forward scores
+     double *unorm;    // [T]       normalization factors for unigrams
+     double *bnorm;    // [T]       normalization factors for bigrams
+     int     first;    //           first position where the gradient is needed
+     int     last;     //           last position where the gradient is needed
+ };
+
+ grd_t *grd_new(mdl_t *mdl, double *g);
+ void grd_free(grd_t *grd);
+ void grd_check(grd_t *grd, int len);
+
+ void grd_fldopsi(grd_t *grd, const seq_t *seq);
+ void grd_flfwdbwd(grd_t *grd, const seq_t *seq);
+ void grd_flupgrad(grd_t *grd, const seq_t *seq);
+
+ void grd_spdopsi(grd_t *grd, const seq_t *seq);
+ void grd_spfwdbwd(grd_t *grd, const seq_t *seq);
+ void grd_spupgrad(grd_t *grd, const seq_t *seq);
+
+ void grd_logloss(grd_t *grd, const seq_t *seq);
+
+ void grd_dospl(grd_t *grd, const seq_t *seq);
+ double grd_gradient(mdl_t *mdl, double *g, grd_t *grds[]);
+
+ #endif
+
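Taken together, the declarations above pin down the life cycle of the gradient API: bind a tracker to a gradient vector with grd_new, grow its buffers with grd_check, then let grd_gradient accumulate the full gradient and return the objective value. Here is a minimal single-threaded sketch of that flow, assuming `mdl` is an already synchronized model and `max_len` is the length of the longest training sequence (both placeholders introduced for this example); xvm_new and xvm_free come from vmath.h:

    // Sketch only: one tracker, no threading, no error handling.
    double *g = xvm_new(mdl->nftr);          // one gradient slot per feature
    grd_t *grd = grd_new(mdl, g);            // tracker writes its updates into g
    grd_check(grd, max_len);                 // ensure buffers fit the longest sequence
    grd_t *grds[1] = { grd };
    double fx = grd_gradient(mdl, g, grds);  // objective value; gradient now in g
    grd_free(grd);
    xvm_free(g);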
data/ext/wapiti/lbfgs.c
@@ -0,0 +1,294 @@
+ /*
+  * Wapiti - A linear-chain CRF tool
+  *
+  * Copyright (c) 2009-2011 CNRS
+  * All rights reserved.
+  *
+  * Redistribution and use in source and binary forms, with or without
+  * modification, are permitted provided that the following conditions are met:
+  *     * Redistributions of source code must retain the above copyright
+  *       notice, this list of conditions and the following disclaimer.
+  *     * Redistributions in binary form must reproduce the above copyright
+  *       notice, this list of conditions and the following disclaimer in the
+  *       documentation and/or other materials provided with the distribution.
+  *
+  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  * POSSIBILITY OF SUCH DAMAGE.
+  */
+ #include <math.h>
+ #include <stdbool.h>
+ #include <stddef.h>
+ #include <stdlib.h>
+ #include <string.h>
+
+ #include "wapiti.h"
+ #include "gradient.h"
+ #include "model.h"
+ #include "options.h"
+ #include "progress.h"
+ #include "tools.h"
+ #include "thread.h"
+ #include "vmath.h"
+
+ /******************************************************************************
+  * Quasi-Newton optimizer
+  *
+  *   This section implements the quasi-Newton optimizer. We use the L-BFGS
+  *   algorithm described by Liu and Nocedal in [1] and [2]. If an l1-norm
+  *   penalty must be applied we fall back on the OWL-QN variant described in
+  *   [3] by Galen Andrew and Jianfeng Gao, which allows L-BFGS to be used
+  *   with functions that are not differentiable at 0.0.
+  *
+  *   [1] Updating quasi-Newton matrices with limited storage, Jorge Nocedal,
+  *       in Mathematics of Computation, vol. 35(151) 773-782, July 1980.
+  *   [2] On the limited memory BFGS method for large scale optimization,
+  *       Dong C. Liu and Jorge Nocedal, in Mathematical Programming,
+  *       vol. 45(1) 503-528, January 1989.
+  *   [3] Scalable Training of L1-Regularized Log-Linear Models, Galen Andrew
+  *       and Jianfeng Gao, in Proceedings of the 24th International
+  *       Conference on Machine Learning (ICML), Corvallis, OR, 2007.
+  ******************************************************************************/
+
+ void trn_lbfgs(mdl_t *mdl) {
+     const size_t F  = mdl->nftr;
+     const int    K  = mdl->opt->maxiter;
+     const int    C  = mdl->opt->objwin;
+     const int    M  = mdl->opt->lbfgs.histsz;
+     const size_t W  = mdl->opt->nthread;
+     const bool   l1 = mdl->opt->rho1 != 0.0;
+     double *x, *xp; // Current and previous value of the variables
+     double *g, *gp; // Current and previous value of the gradient
+     double *pg;     // The pseudo-gradient (only for owl-qn)
+     double *d;      // The search direction
+     double *s[M];   // History value s_k = Δ(x,px)
+     double *y[M];   // History value y_k = Δ(g,pg)
+     double  p[M];   // ρ_k
+     double  fh[C];  // f(x) history
+     grd_t  *grds[W];
+     // Initialization: We have to allocate these vectors on the heap;
+     // requesting this much memory on the stack would hurt performance too
+     // much and would be refused by the system on non-trivial models.
+     x  = mdl->theta;
+     xp = xvm_new(F); g = xvm_new(F);
+     gp = xvm_new(F); d = xvm_new(F);
+     for (int m = 0; m < M; m++) {
+         s[m] = xvm_new(F);
+         y[m] = xvm_new(F);
+     }
+     pg = l1 ? xvm_new(F) : NULL;
+     grds[0] = grd_new(mdl, g);
+     for (size_t w = 1; w < W; w++)
+         grds[w] = grd_new(mdl, xvm_new(F));
+     // Minimization: This is the heart of the function. (a big heart...) We
+     // will iterate until one of these conditions is reached:
+     //   - the maximum iteration count is reached
+     //   - we have converged (up to numerical precision)
+     //   - the report function returns false
+     //   - an error happens somewhere
+     double fx = grd_gradient(mdl, g, grds);
+     for (int k = 0; !uit_stop && k < K; k++) {
+         // We first compute the pseudo-gradient of f for owl-qn. It is
+         // defined in [3, pp 335(4)]
+         //              | ∂_i^- f(x)   if ∂_i^- f(x) > 0
+         //   ◇_i f(x) = | ∂_i^+ f(x)   if ∂_i^+ f(x) < 0
+         //              | 0            otherwise
+         // with
+         //   ∂_i^± f(x) = ∂/∂x_i l(x) + | Cσ(x_i)  if x_i ≠ 0
+         //                              | ±C       if x_i = 0
+         if (l1) {
+             const double rho1 = mdl->opt->rho1;
+             for (unsigned f = 0; f < F; f++) {
+                 if (x[f] < 0.0)
+                     pg[f] = g[f] - rho1;
+                 else if (x[f] > 0.0)
+                     pg[f] = g[f] + rho1;
+                 else if (g[f] < -rho1)
+                     pg[f] = g[f] + rho1;
+                 else if (g[f] > rho1)
+                     pg[f] = g[f] - rho1;
+                 else
+                     pg[f] = 0.0;
+             }
+         }
+         // 1st step: We compute the search direction. We search in the
+         // direction that minimizes the second order approximation given by
+         // the Taylor series, which gives
+         //   d_k = - H_k^{-1} g_k
+         // But computing the inverse of the hessian is intractable, so
+         // l-bfgs only approximates it; the exact computation is well
+         // described in [1, pp 779].
+         // The only special thing for owl-qn here is to use the pseudo-
+         // gradient instead of the true one.
+         xvm_neg(d, l1 ? pg : g, F);
+         if (k != 0) {
+             const int km  = k % M;
+             const int bnd = (k <= M) ? k : M;
+             double alpha[M], beta;
+             //   α_i = ρ_j s_j^T q_{i+1}
+             //   q_i = q_{i+1} - α_i y_i
+             for (int i = bnd; i > 0; i--) {
+                 const int j = (k - i + M + 1) % M;
+                 alpha[i - 1] = p[j] * xvm_dot(s[j], d, F);
+                 xvm_axpy(d, -alpha[i - 1], y[j], d, F);
+             }
+             //   r_0 = H_0 q_0
+             // Scaling is described in [2, pp 515]
+             //   for k = 0: H_0 = I
+             //   for k > 0: H_0 = I * y_k^T s_k / ||y_k||²
+             //                  = I * 1 / ρ_k ||y_k||²
+             const double y2 = xvm_dot(y[km], y[km], F);
+             const double v  = 1.0 / (p[km] * y2);
+             for (size_t f = 0; f < F; f++)
+                 d[f] *= v;
+             //   β_j     = ρ_j y_j^T r_i
+             //   r_{i+1} = r_i + s_j (α_i - β_i)
+             for (int i = 0; i < bnd; i++) {
+                 const int j = (k - i + M) % M;
+                 beta = p[j] * xvm_dot(y[j], d, F);
+                 xvm_axpy(d, alpha[i] - beta, s[j], d, F);
+             }
+         }
+         // For owl-qn, we must remain in the same orthant as the
+         // pseudo-gradient, so we have to constrain the search direction
+         // as described in [3, pp 35(3)]
+         //   d^k = π(d^k ; v^k)
+         //       = π(d^k ; -◇f(x^k))
+         if (l1)
+             for (size_t f = 0; f < F; f++)
+                 if (d[f] * pg[f] >= 0.0)
+                     d[f] = 0.0;
+         // 2nd step: we perform a linesearch in the computed direction; we
+         // search a step value that satisfies the constraints using a
+         // backtracking algorithm. More elaborate algorithms can perform
+         // better in the general case, but for CRF training, backtracking
+         // is very efficient and simple to implement.
+         // For quasi-Newton, the natural step is 1.0, so we start with this
+         // one and reduce it only if it fails, with an exception for the
+         // first iteration where a better guess can be made.
+         // We have to keep track of the current point and gradient as we
+         // will need to compute the delta between those and the found point,
+         // and perhaps need to restore them if the linesearch fails.
+         memcpy(xp, x, sizeof(double) * F);
+         memcpy(gp, g, sizeof(double) * F);
+         double sc  = (k == 0) ? 0.1 : 0.5;
+         double stp = (k == 0) ? 1.0 / xvm_norm(d, F) : 1.0;
+         double gd  = l1 ? 0.0 : xvm_dot(g, d, F); // gd = g_k^T d_k
+         double fi  = fx;
+         bool err = false;
+         for (int ls = 1; !uit_stop; ls++, stp *= sc) {
+             // We compute the new point using the current step and search
+             // direction
+             xvm_axpy(x, stp, d, xp, F);
+             // For owl-qn, we have to project back the point in the current
+             // orthant [3, pp 35]
+             //   x^{k+1} = π(x^k + αp^k ; ξ)
+             if (l1) {
+                 for (size_t f = 0; f < F; f++) {
+                     double or = xp[f];
+                     if (or == 0.0)
+                         or = -pg[f];
+                     if (x[f] * or <= 0.0)
+                         x[f] = 0.0;
+                 }
+             }
+             // And we ask for the value of the objective function and its
+             // gradient.
+             fx = grd_gradient(mdl, g, grds);
+             // Now we check if the step satisfies the conditions. For
+             // l-bfgs, we check the classical sufficient decrease and
+             // curvature conditions known as the Wolfe conditions [2, pp 506]
+             //   f(x_k + α_k d_k) ≤ f(x_k) + β' α_k g_k^T d_k
+             //   g(x_k + α_k d_k)^T d_k ≥ β g_k^T d_k
+             //
+             // And for owl-qn we check a variant of the Armijo rule
+             // described in [3, pp 36]
+             //   f(π(x^k+αp^k;ξ)) ≤ f(x^k) - γv^T[π(x^k+αp^k;ξ)-x^k]
+             if (!l1) {
+                 if (fx > fi + stp * gd * 1e-4)
+                     sc = 0.5;
+                 else if (xvm_dot(g, d, F) < gd * 0.9)
+                     sc = 2.1;
+                 else
+                     break;
+             } else {
+                 double vp = 0.0;
+                 for (size_t f = 0; f < F; f++)
+                     vp += (x[f] - xp[f]) * d[f];
+                 if (fx < fi + vp * 1e-4)
+                     break;
+             }
+             // If we reach the maximum number of linesearch steps without
+             // finding a good one, we just fail.
+             if (ls == mdl->opt->lbfgs.maxls) {
+                 warning("maximum linesearch reached");
+                 err = true;
+                 break;
+             }
+         }
+         // If the linesearch failed or the user interrupted training, we
+         // return to the last valid point and stop the training. The model
+         // is probably not fully optimized but we let the user decide what
+         // to do with it.
+         if (err || uit_stop) {
+             memcpy(x, xp, sizeof(double) * F);
+             break;
+         }
+         if (uit_progress(mdl, k + 1, fx) == false)
+             break;
+         // 3rd step: we update the history used for approximating the
+         // inverse of the hessian
+         //   s_k = x_{k+1} - x_k
+         //   y_k = g_{k+1} - g_k
+         //   ρ_k = 1 / y_k^T s_k
+         const int kn = (k + 1) % M;
+         xvm_sub(s[kn], x, xp, F);
+         xvm_sub(y[kn], g, gp, F);
+         p[kn] = 1.0 / xvm_dot(y[kn], s[kn], F);
+         // And last, we check for convergence. The convergence check is
+         // quite simple [2, pp 508]
+         //   ||g|| / max(1, ||x||) ≤ ε
+         // with ε small enough so that we stop when numerical precision is
+         // reached. For owl-qn we just have to check against the pseudo-
+         // gradient instead of the true one.
+         const double xn = xvm_norm(x, F);
+         const double gn = xvm_norm(l1 ? pg : g, F);
+         if (gn / max(xn, 1.0) <= 1e-5)
+             break;
+         if (k + 1 == K)
+             break;
+         // The second stopping criterion tested is a check for improvement
+         // of the function value over the past C iterations. When this falls
+         // under an epsilon, we also stop the minimization.
+         fh[k % C] = fx;
+         double dlt = 1.0;
+         if (k >= C) {
+             const double of = fh[(k + 1) % C];
+             dlt = fabs(of - fx) / of;
+             if (dlt < mdl->opt->stopeps)
+                 break;
+         }
+     }
+     // Cleanup: We free all the vectors we have allocated.
+     xvm_free(xp); xvm_free(g);
+     xvm_free(gp); xvm_free(d);
+     for (int m = 0; m < M; m++) {
+         xvm_free(s[m]);
+         xvm_free(y[m]);
+     }
+     if (l1)
+         xvm_free(pg);
+     for (size_t w = 1; w < W; w++)
+         xvm_free(grds[w]->g);
+     for (size_t w = 0; w < W; w++)
+         grd_free(grds[w]);
+ }
+
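The two-loop recursion in the "1st step" above is the core of L-BFGS and is easy to lose inside the ring-buffer indexing. Because the recursion is linear in its input, feeding it the negated gradient directly yields the descent direction d = -H_k^{-1} g. The following is a self-contained sketch of the same recursion over plain arrays, written independently of Wapiti's xvm_* helpers (it assumes m ≥ 1 stored pairs of length n, ordered oldest first, each satisfying the curvature condition y_i^T s_i > 0):

    #include <stddef.h>

    /* Dot product helper standing in for xvm_dot. */
    static double dot(const double *a, const double *b, size_t n) {
        double r = 0.0;
        for (size_t i = 0; i < n; i++)
            r += a[i] * b[i];
        return r;
    }

    /* Two-loop recursion [1]: given the last m update pairs
     * s[i] = x_{i+1} - x_i and y[i] = g_{i+1} - g_i (oldest first), turn the
     * gradient g into the search direction d ≈ -H^{-1} g. */
    static void lbfgs_direction(double *d, const double *g,
                                double *const *s, double *const *y,
                                int m, size_t n) {
        double alpha[m], rho[m];
        for (size_t f = 0; f < n; f++)       // start from the negated gradient
            d[f] = -g[f];
        for (int i = m - 1; i >= 0; i--) {   // first loop: newest to oldest
            rho[i] = 1.0 / dot(y[i], s[i], n);
            alpha[i] = rho[i] * dot(s[i], d, n);
            for (size_t f = 0; f < n; f++)
                d[f] -= alpha[i] * y[i][f];
        }
        // Initial guess H_0 = γI with γ = y^T s / ||y||², see [2, pp 515]
        const double gamma = dot(y[m-1], s[m-1], n) / dot(y[m-1], y[m-1], n);
        for (size_t f = 0; f < n; f++)
            d[f] *= gamma;
        for (int i = 0; i < m; i++) {        // second loop: oldest to newest
            const double beta = rho[i] * dot(y[i], d, n);
            for (size_t f = 0; f < n; f++)
                d[f] += (alpha[i] - beta) * s[i][f];
        }
    }

trn_lbfgs does exactly this, except that it keeps the pairs in a rotating history (the `(k - i + M + 1) % M` indexing) instead of shifting them, and negates first with xvm_neg.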
data/ext/wapiti/model.c
@@ -0,0 +1,296 @@
+ /*
+  * Wapiti - A linear-chain CRF tool
+  *
+  * Copyright (c) 2009-2011 CNRS
+  * All rights reserved.
+  *
+  * Redistribution and use in source and binary forms, with or without
+  * modification, are permitted provided that the following conditions are met:
+  *     * Redistributions of source code must retain the above copyright
+  *       notice, this list of conditions and the following disclaimer.
+  *     * Redistributions in binary form must reproduce the above copyright
+  *       notice, this list of conditions and the following disclaimer in the
+  *       documentation and/or other materials provided with the distribution.
+  *
+  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  * POSSIBILITY OF SUCH DAMAGE.
+  */
+ #include <stdbool.h>
+ #include <stddef.h>
+ #include <stdlib.h>
+ #include <stdio.h>
+ #include <string.h>
+
+ #include "wapiti.h"
+ #include "model.h"
+ #include "options.h"
+ #include "quark.h"
+ #include "reader.h"
+ #include "tools.h"
+ #include "vmath.h"
+
+ /*******************************************************************************
+  * Linear chain CRF model
+  *
+  *   There are three concepts that must be well understood here: the labels,
+  *   the observations, and the features. The labels are the values predicted
+  *   by the model at each point of the sequence, denoted by Y. The
+  *   observations are the values, at each point of the sequence, given to
+  *   the model in order to predict the label, denoted by O. A feature is a
+  *   test on both labels and observations, denoted by F. In a linear-chain
+  *   CRF there are two kinds of features:
+  *     - unigram features, which test the observation at the current point
+  *       and the label at the current point;
+  *     - bigram features, which test the observation at the current point
+  *       and two labels: the current one and the previous one.
+  *   So for each observation, there are Y possible unigram features and Y*Y
+  *   possible bigram features. The kind of features used by the model for a
+  *   given observation depends on the pattern that generated it.
+  ******************************************************************************/
+
+ /* mdl_new:
+  *   Allocate a new empty model object linked with the given reader. The
+  *   model has to be synchronized before training or labelling starts. If
+  *   you do not provide a reader (because it will be loaded from a file, for
+  *   example) you must be sure to set one in the model before any attempt to
+  *   synchronize it.
+  */
+ mdl_t *mdl_new(rdr_t *rdr) {
+     mdl_t *mdl = wapiti_xmalloc(sizeof(mdl_t));
+     mdl->nlbl = mdl->nobs = mdl->nftr = 0;
+     mdl->kind = NULL;
+     mdl->uoff = mdl->boff = NULL;
+     mdl->theta = NULL;
+     mdl->train = mdl->devel = NULL;
+     mdl->reader = rdr;
+     mdl->werr = NULL;
+     mdl->total = 0.0;
+     return mdl;
+ }
+
+ /* mdl_free:
+  *   Free all memory used by a model object, including the reader and the
+  *   datasets loaded in the model.
+  */
+ void mdl_free(mdl_t *mdl) {
+     free(mdl->kind);
+     free(mdl->uoff);
+     free(mdl->boff);
+     if (mdl->theta != NULL)
+         xvm_free(mdl->theta);
+     if (mdl->train != NULL)
+         rdr_freedat(mdl->train);
+     if (mdl->devel != NULL)
+         rdr_freedat(mdl->devel);
+     if (mdl->reader != NULL)
+         rdr_free(mdl->reader);
+     if (mdl->werr != NULL)
+         free(mdl->werr);
+     free(mdl);
+ }
+
+ /* mdl_sync:
+  *   Synchronize the model with its reader. As the model is just a
+  *   placeholder for feature weights and interned sequences, it knows very
+  *   little about the labels and observations; all the information is kept
+  *   in the reader. A sync will get the label and observation counts as well
+  *   as the observation kinds from the reader and build the internal
+  *   structures representing the model.
+  *
+  *   If the model was already synchronized before, there is an existing
+  *   model incompatible with the new one to be created. In this case there
+  *   are two possibilities:
+  *     - If only new observations were added, the weights of the old ones
+  *       remain valid and are kept, as they form a probably good starting
+  *       point for training the new model; the new observations get a
+  *       weight of 0;
+  *     - If new labels were added, the old weights are truly meaningless,
+  *       so we have to fully discard them and build a new empty model.
+  *   In any case, you must never change existing labels or observations; if
+  *   this happens, you need to create a new model and destroy this one.
+  *
+  *   After synchronization, the label and observation databases are locked
+  *   to prevent new ones from being created. You must unlock them explicitly
+  *   if needed. This reduces the risk of mistakes.
+  */
+ void mdl_sync(mdl_t *mdl) {
+     const size_t Y = qrk_count(mdl->reader->lbl);
+     const size_t O = qrk_count(mdl->reader->obs);
+     // If the model is already synchronized, do nothing and just return
+     if (mdl->nlbl == Y && mdl->nobs == O)
+         return;
+     if (Y == 0 || O == 0)
+         fatal("cannot synchronize an empty model");
+     // If new labels were added, we have to discard the whole model. In this
+     // case we also display a warning as this is probably not expected by
+     // the user. If only new observations were added, we will try to expand
+     // the model.
+     size_t oldF = mdl->nftr;
+     size_t oldO = mdl->nobs;
+     if (mdl->nlbl != Y && mdl->nlbl != 0) {
+         warning("labels count changed, discarding the model");
+         free(mdl->kind); mdl->kind = NULL;
+         free(mdl->uoff); mdl->uoff = NULL;
+         free(mdl->boff); mdl->boff = NULL;
+         if (mdl->theta != NULL) {
+             xvm_free(mdl->theta);
+             mdl->theta = NULL;
+         }
+         oldF = oldO = 0;
+     }
+     mdl->nlbl = Y;
+     mdl->nobs = O;
+     // Allocate the observation datastructures. If the model is empty or
+     // discarded, new ones are created, else the old ones are expanded.
+     char   *kind = wapiti_xrealloc(mdl->kind, sizeof(char)   * O);
+     size_t *uoff = wapiti_xrealloc(mdl->uoff, sizeof(size_t) * O);
+     size_t *boff = wapiti_xrealloc(mdl->boff, sizeof(size_t) * O);
+     mdl->kind = kind;
+     mdl->uoff = uoff;
+     mdl->boff = boff;
+     // Now we can set up the features. For each new observation we fill the
+     // kind and offset arrays and count the total number of features as well.
+     size_t F = oldF;
+     for (size_t o = oldO; o < O; o++) {
+         const char *obs = qrk_id2str(mdl->reader->obs, o);
+         switch (obs[0]) {
+             case 'u': kind[o] = 1; break;
+             case 'b': kind[o] = 2; break;
+             case '*': kind[o] = 3; break;
+         }
+         if (kind[o] & 1)
+             uoff[o] = F, F += Y;
+         if (kind[o] & 2)
+             boff[o] = F, F += Y * Y;
+     }
+     mdl->nftr = F;
+     // We can finally grow the feature weights vector itself. We set all the
+     // new features to 0.0 but don't touch the old ones.
+     // This is a bit tricky as an aligned malloc cannot simply be grown, so
+     // we have to allocate a new vector and copy the old values ourselves.
+     if (oldF != 0) {
+         double *new = xvm_new(F);
+         for (size_t f = 0; f < oldF; f++)
+             new[f] = mdl->theta[f];
+         xvm_free(mdl->theta);
+         mdl->theta = new;
+     } else {
+         mdl->theta = xvm_new(F);
+     }
+     for (size_t f = oldF; f < F; f++)
+         mdl->theta[f] = 0.0;
+     // And lock the databases
+     qrk_lock(mdl->reader->lbl, true);
+     qrk_lock(mdl->reader->obs, true);
+ }
+
+ /* mdl_compact:
+  *   Compact the given model by removing from it all observations that lead
+  *   to zero active features. On models trained with l1 regularization this
+  *   can lead to a drastic model size reduction, and so to faster loading,
+  *   training, and labeling.
+  */
+ void mdl_compact(mdl_t *mdl) {
+     const size_t Y = mdl->nlbl;
+     // We first build the new observation list with only the observations
+     // that lead to at least one active feature. At the same time we build
+     // the translation table which maps the new observation indexes to the
+     // old ones.
+     info(" - Scan the model\n");
+     qrk_t *old_obs = mdl->reader->obs;
+     qrk_t *new_obs = qrk_new();
+     size_t *trans = wapiti_xmalloc(sizeof(size_t) * mdl->nobs);
+     for (size_t oldo = 0; oldo < mdl->nobs; oldo++) {
+         bool active = false;
+         if (mdl->kind[oldo] & 1)
+             for (size_t y = 0; y < Y; y++)
+                 if (mdl->theta[mdl->uoff[oldo] + y] != 0.0)
+                     active = true;
+         if (mdl->kind[oldo] & 2)
+             for (size_t d = 0; d < Y * Y; d++)
+                 if (mdl->theta[mdl->boff[oldo] + d] != 0.0)
+                     active = true;
+         if (!active)
+             continue;
+         const char *str = qrk_id2str(old_obs, oldo);
+         const size_t newo = qrk_str2id(new_obs, str);
+         trans[newo] = oldo;
+     }
+     mdl->reader->obs = new_obs;
+     // Now we save the old model's feature information and build a new one
+     // corresponding to the compacted model.
+     size_t *old_uoff  = mdl->uoff;  mdl->uoff  = NULL;
+     size_t *old_boff  = mdl->boff;  mdl->boff  = NULL;
+     double *old_theta = mdl->theta; mdl->theta = NULL;
+     free(mdl->kind);
+     mdl->kind = NULL;
+     mdl->nlbl = mdl->nobs = mdl->nftr = 0;
+     mdl_sync(mdl);
+     // The model is now ready, so we copy into it the feature weights from
+     // the old model for the observations we have kept.
+     info(" - Compact it\n");
+     for (size_t newo = 0; newo < mdl->nobs; newo++) {
+         const size_t oldo = trans[newo];
+         if (mdl->kind[newo] & 1) {
+             double *src = old_theta + old_uoff[oldo];
+             double *dst = mdl->theta + mdl->uoff[newo];
+             for (size_t y = 0; y < Y; y++)
+                 dst[y] = src[y];
+         }
+         if (mdl->kind[newo] & 2) {
+             double *src = old_theta + old_boff[oldo];
+             double *dst = mdl->theta + mdl->boff[newo];
+             for (size_t d = 0; d < Y * Y; d++)
+                 dst[d] = src[d];
+         }
+     }
+     // And cleanup
+     free(trans);
+     qrk_free(old_obs);
+     free(old_uoff);
+     free(old_boff);
+     xvm_free(old_theta);
+ }
+
+ /* mdl_save:
+  *   Save a model to be restored later in a platform-independent way.
+  */
+ void mdl_save(mdl_t *mdl, FILE *file) {
+     size_t nact = 0;
+     for (size_t f = 0; f < mdl->nftr; f++)
+         if (mdl->theta[f] != 0.0)
+             nact++;
+     fprintf(file, "#mdl#%zu\n", nact);
+     rdr_save(mdl->reader, file);
+     for (size_t f = 0; f < mdl->nftr; f++)
+         if (mdl->theta[f] != 0.0)
+             fprintf(file, "%zu=%la\n", f, mdl->theta[f]);
+ }
+
+ /* mdl_load:
+  *   Read back a previously saved model to continue training or start
+  *   labeling. The returned model is synced and the quarks are locked. You
+  *   must give this function an empty model fresh from mdl_new.
+  */
+ void mdl_load(mdl_t *mdl, FILE *file) {
+     const char *err = "invalid model format";
+     size_t nact = 0;
+     if (fscanf(file, "#mdl#%zu\n", &nact) != 1)
+         fatal(err);
+     rdr_load(mdl->reader, file);
+     mdl_sync(mdl);
+     for (size_t i = 0; i < nact; i++) {
+         size_t f;
+         double v;
+         if (fscanf(file, "%zu=%la\n", &f, &v) != 2)
+             fatal(err);
+         mdl->theta[f] = v;
+     }
+ }
+
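Note that mdl_save writes only the non-zero weights, which is exactly why mdl_compact pays off: after compaction every remaining observation carries at least one active feature. The resulting file is plain text: a header with the active-feature count, the reader data written by rdr_save, then one index=value line per active feature, with the value in C99 hexadecimal float notation (the %la format), which round-trips doubles exactly. A purely illustrative fragment, with made-up indexes and values and the reader section elided:

    #mdl#3
    ...reader data written by rdr_save...
    0=0x1.5bf0a8b145769p-1
    42=-0x1.0p-3
    1337=0x1.8p+0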