wapiti 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.autotest +13 -0
- data/.gitignore +5 -0
- data/.rspec +3 -0
- data/Gemfile +6 -0
- data/LICENSE +30 -0
- data/README.md +153 -0
- data/Rakefile +33 -0
- data/ext/wapiti/bcd.c +392 -0
- data/ext/wapiti/decoder.c +535 -0
- data/ext/wapiti/decoder.h +46 -0
- data/ext/wapiti/extconf.rb +8 -0
- data/ext/wapiti/gradient.c +818 -0
- data/ext/wapiti/gradient.h +81 -0
- data/ext/wapiti/lbfgs.c +294 -0
- data/ext/wapiti/model.c +296 -0
- data/ext/wapiti/model.h +100 -0
- data/ext/wapiti/native.c +1238 -0
- data/ext/wapiti/native.h +15 -0
- data/ext/wapiti/options.c +278 -0
- data/ext/wapiti/options.h +91 -0
- data/ext/wapiti/pattern.c +395 -0
- data/ext/wapiti/pattern.h +56 -0
- data/ext/wapiti/progress.c +167 -0
- data/ext/wapiti/progress.h +43 -0
- data/ext/wapiti/quark.c +272 -0
- data/ext/wapiti/quark.h +46 -0
- data/ext/wapiti/reader.c +553 -0
- data/ext/wapiti/reader.h +73 -0
- data/ext/wapiti/rprop.c +191 -0
- data/ext/wapiti/sequence.h +148 -0
- data/ext/wapiti/sgdl1.c +218 -0
- data/ext/wapiti/thread.c +171 -0
- data/ext/wapiti/thread.h +42 -0
- data/ext/wapiti/tools.c +202 -0
- data/ext/wapiti/tools.h +54 -0
- data/ext/wapiti/trainers.h +39 -0
- data/ext/wapiti/vmath.c +372 -0
- data/ext/wapiti/vmath.h +51 -0
- data/ext/wapiti/wapiti.c +288 -0
- data/ext/wapiti/wapiti.h +45 -0
- data/lib/wapiti.rb +30 -0
- data/lib/wapiti/errors.rb +17 -0
- data/lib/wapiti/model.rb +49 -0
- data/lib/wapiti/options.rb +113 -0
- data/lib/wapiti/utility.rb +15 -0
- data/lib/wapiti/version.rb +3 -0
- data/spec/fixtures/ch.mod +18550 -0
- data/spec/fixtures/chpattern.txt +52 -0
- data/spec/fixtures/chtest.txt +1973 -0
- data/spec/fixtures/chtrain.txt +19995 -0
- data/spec/fixtures/nppattern.txt +52 -0
- data/spec/fixtures/nptest.txt +1973 -0
- data/spec/fixtures/nptrain.txt +19995 -0
- data/spec/fixtures/pattern.txt +14 -0
- data/spec/fixtures/test.txt +60000 -0
- data/spec/fixtures/train.txt +1200 -0
- data/spec/spec_helper.rb +21 -0
- data/spec/wapiti/model_spec.rb +173 -0
- data/spec/wapiti/native_spec.rb +12 -0
- data/spec/wapiti/options_spec.rb +175 -0
- data/spec/wapiti/utility_spec.rb +22 -0
- data/wapiti.gemspec +35 -0
- metadata +178 -0
@@ -0,0 +1,81 @@
|
|
1
|
+
/*
|
2
|
+
* Wapiti - A linear-chain CRF tool
|
3
|
+
*
|
4
|
+
* Copyright (c) 2009-2011 CNRS
|
5
|
+
* All rights reserved.
|
6
|
+
*
|
7
|
+
* Redistribution and use in source and binary forms, with or without
|
8
|
+
* modification, are permitted provided that the following conditions are met:
|
9
|
+
* * Redistributions of source code must retain the above copyright
|
10
|
+
* notice, this list of conditions and the following disclaimer.
|
11
|
+
* * Redistributions in binary form must reproduce the above copyright
|
12
|
+
* notice, this list of conditions and the following disclaimer in the
|
13
|
+
* documentation and/or other materials provided with the distribution.
|
14
|
+
*
|
15
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
16
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
18
|
+
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
19
|
+
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
20
|
+
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
21
|
+
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
22
|
+
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
23
|
+
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
24
|
+
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
25
|
+
* POSSIBILITY OF SUCH DAMAGE.
|
26
|
+
*/
|
27
|
+
|
28
|
+
#ifndef gradient_h
|
29
|
+
#define gradient_h
|
30
|
+
|
31
|
+
#include "wapiti.h"
|
32
|
+
#include "model.h"
|
33
|
+
#include "sequence.h"
|
34
|
+
|
35
|
+
/* grd_t:
|
36
|
+
* State tracker for the gradient computation. To compute the gradient we need
|
37
|
+
* to perform several steps and communicate between them a lot of intermediate
|
38
|
+
* values, all these temporary are store in this object.
|
39
|
+
* A tracker can be used to compute sequence of length <len> at most, before
|
40
|
+
* using it you must call grd_check to ensure that the tracker is big enough
|
41
|
+
* for your sequence.
|
42
|
+
*/
|
43
|
+
typedef struct grd_s grd_t;
|
44
|
+
struct grd_s {
|
45
|
+
mdl_t *mdl;
|
46
|
+
int len; // =T max length of sequence
|
47
|
+
double *g; // [F] vector where to put gradient updates
|
48
|
+
double lloss; // loss value for the sequence
|
49
|
+
double *psi; // [T][Y][Y] the transitions scores
|
50
|
+
double *psiuni; // [T][Y] | Same as psi in sparse format
|
51
|
+
size_t *psiyp; // [T][Y][Y] |
|
52
|
+
size_t *psiidx; // [T][Y] |
|
53
|
+
size_t *psioff; // [T]
|
54
|
+
double *alpha; // [T][Y] forward scores
|
55
|
+
double *beta; // [T][Y] backward scores
|
56
|
+
double *scale; // [T] scaling factors of forward scores
|
57
|
+
double *unorm; // [T] normalization factors for unigrams
|
58
|
+
double *bnorm; // [T] normalization factors for bigrams
|
59
|
+
int first; // first position where gradient is needed
|
60
|
+
int last; // last position where gradient is needed
|
61
|
+
};
|
62
|
+
|
63
|
+
grd_t *grd_new(mdl_t *mdl, double *g);
|
64
|
+
void grd_free(grd_t *grd);
|
65
|
+
void grd_check(grd_t *grd, int len);
|
66
|
+
|
67
|
+
void grd_fldopsi(grd_t *grd, const seq_t *seq);
|
68
|
+
void grd_flfwdbwd(grd_t *grd, const seq_t *seq);
|
69
|
+
void grd_flupgrad(grd_t *grd, const seq_t *seq);
|
70
|
+
|
71
|
+
void grd_spdopsi(grd_t *grd, const seq_t *seq);
|
72
|
+
void grd_spfwdbwd(grd_t *grd, const seq_t *seq);
|
73
|
+
void grd_spupgrad(grd_t *grd, const seq_t *seq);
|
74
|
+
|
75
|
+
void grd_logloss(grd_t *grd, const seq_t *seq);
|
76
|
+
|
77
|
+
void grd_dospl(grd_t *grd, const seq_t *seq);
|
78
|
+
double grd_gradient(mdl_t *mdl, double *g, grd_t *grds[]);
|
79
|
+
|
80
|
+
#endif
|
81
|
+
|
data/ext/wapiti/lbfgs.c
ADDED
@@ -0,0 +1,294 @@
|
|
1
|
+
/*
|
2
|
+
* Wapiti - A linear-chain CRF tool
|
3
|
+
*
|
4
|
+
* Copyright (c) 2009-2011 CNRS
|
5
|
+
* All rights reserved.
|
6
|
+
*
|
7
|
+
* Redistribution and use in source and binary forms, with or without
|
8
|
+
* modification, are permitted provided that the following conditions are met:
|
9
|
+
* * Redistributions of source code must retain the above copyright
|
10
|
+
* notice, this list of conditions and the following disclaimer.
|
11
|
+
* * Redistributions in binary form must reproduce the above copyright
|
12
|
+
* notice, this list of conditions and the following disclaimer in the
|
13
|
+
* documentation and/or other materials provided with the distribution.
|
14
|
+
*
|
15
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
16
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
18
|
+
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
19
|
+
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
20
|
+
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
21
|
+
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
22
|
+
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
23
|
+
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
24
|
+
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
25
|
+
* POSSIBILITY OF SUCH DAMAGE.
|
26
|
+
*/
|
27
|
+
#include <math.h>
|
28
|
+
#include <stdbool.h>
|
29
|
+
#include <stddef.h>
|
30
|
+
#include <stdlib.h>
|
31
|
+
#include <string.h>
|
32
|
+
|
33
|
+
#include "wapiti.h"
|
34
|
+
#include "gradient.h"
|
35
|
+
#include "model.h"
|
36
|
+
#include "options.h"
|
37
|
+
#include "progress.h"
|
38
|
+
#include "tools.h"
|
39
|
+
#include "thread.h"
|
40
|
+
#include "vmath.h"
|
41
|
+
|
42
|
+
/******************************************************************************
|
43
|
+
* Quasi-Newton optimizer
|
44
|
+
*
|
45
|
+
* This section implements the quasi-Newton optimizer. We use the L-BFGS
|
46
|
+
* algorithm described by Liu and Nocedal in [1] and [2]. If an l1-norm must
|
47
|
+
* be applied we fall back on the OWL-QN variant described in [3] by Galen and
|
48
|
+
* Jianfeng, which allows using L-BFGS for functions not differentiable at 0.0.
|
49
|
+
*
|
50
|
+
* [1] Updating quasi-Newton matrices with limited storage, Jorge Nocedal, in
|
51
|
+
* Mathematics of Computation, vol. 35(151) 773-782, July 1980.
|
52
|
+
* [2] On the limited memory BFGS method for large scale optimization, Dong C.
|
53
|
+
* Liu and Jorge Nocedal, in Mathematical Programming, vol. 45(1) 503-528,
|
54
|
+
* January 1989.
|
55
|
+
* [3] Scalable Training of L1-Regularized Log-Linear Models, Andrew Galen and
|
56
|
+
* Gao Jianfeng, in Proceedings of the 24th International Conference on
|
57
|
+
* Machine Learning (ICML), Corvallis, OR, 2007.
|
58
|
+
******************************************************************************/
|
59
|
+
|
60
|
+
/* trn_lbfgs:
 *   Optimize the weights of the model (mdl->theta) in place with the L-BFGS
 *   quasi-Newton method, or with its OWL-QN variant when an l1 penalty is
 *   requested (mdl->opt->rho1 != 0.0).
 *
 *   Iterations stop when one of these happens:
 *     - opt->maxiter iterations have been done;
 *     - the gradient norm test ||g|| / max(1, ||x||) <= 1e-5 passes;
 *     - the relative improvement of the objective over the last opt->objwin
 *       iterations drops below opt->stopeps;
 *     - uit_progress returns false;
 *     - the linesearch fails or the user interrupts training (uit_stop); in
 *       these two cases the last valid point is restored in mdl->theta.
 */
void trn_lbfgs(mdl_t *mdl) {
	const size_t F  = mdl->nftr;
	const int    K  = mdl->opt->maxiter;
	const int    C  = mdl->opt->objwin;
	const int    M  = mdl->opt->lbfgs.histsz;
	const size_t W  = mdl->opt->nthread;
	const bool   l1 = mdl->opt->rho1 != 0.0;
	// NOTE(review): M, C and W size the stack arrays below and M and C are
	// used as modulo divisors; they are assumed to be validated as > 0 when
	// the options are parsed -- confirm.
	double *x, *xp; // Current and previous value of the variables
	double *g, *gp; // Current and previous value of the gradient
	double *pg;     // The pseudo-gradient (only for owl-qn)
	double *d;      // The search direction
	double *s[M];   // History value s_k = Δ(x,px)
	double *y[M];   // History value y_k = Δ(g,pg)
	double  p[M];   // ρ_k
	double  fh[C];  // f(x) history
	grd_t  *grds[W];
	// Initialization: Here, we have to allocate memory on the heap as we
	// cannot request so much memory on the stack as this will have a too
	// big impact on performance and will be refused by the system on non-
	// trivial models.
	x  = mdl->theta;
	xp = xvm_new(F); g = xvm_new(F);
	gp = xvm_new(F); d = xvm_new(F);
	for (int m = 0; m < M; m++) {
		s[m] = xvm_new(F);
		y[m] = xvm_new(F);
	}
	pg = l1 ? xvm_new(F) : NULL;
	// The first tracker writes directly in <g>, the others get a private
	// vector which is released in the cleanup section.
	grds[0] = grd_new(mdl, g);
	for (size_t w = 1; w < W; w++)
		grds[w] = grd_new(mdl, xvm_new(F));
	// Minimization: This is the heart of the function. (a big heart...) We
	// will perform iterations until one these conditions is reached
	//   - the maximum iteration count is reached
	//   - we have converged (upto numerical precision)
	//   - the report function return false
	//   - an error happen somewhere
	double fx = grd_gradient(mdl, g, grds);
	for (int k = 0; !uit_stop && k < K; k++) {
		// We first compute the pseudo-gradient of f for owl-qn. It is
		// defined in [3, pp 335(4)]
		//              | ∂_i^- f(x)   if ∂_i^- f(x) > 0
		//   ◇_i f(x) = | ∂_i^+ f(x)   if ∂_i^+ f(x) < 0
		//              | 0            otherwise
		// with
		//   ∂_i^± f(x) = ∂/∂x_i l(x) + | Cσ(x_i)  if x_i ≠ 0
		//                              | ±C       if x_i = 0
		if (l1) {
			const double rho1 = mdl->opt->rho1;
			// size_t index: F is a size_t, like in every other loop
			for (size_t f = 0; f < F; f++) {
				if (x[f] < 0.0)
					pg[f] = g[f] - rho1;
				else if (x[f] > 0.0)
					pg[f] = g[f] + rho1;
				else if (g[f] < -rho1)
					pg[f] = g[f] + rho1;
				else if (g[f] > rho1)
					pg[f] = g[f] - rho1;
				else
					pg[f] = 0.0;
			}
		}
		// 1st step: We compute the search direction. We search in the
		// direction who minimize the second order approximation given
		// by the Taylor series which give
		//   d_k = - H_k^{-1} g_k
		// But computing the inverse of the hessian is intractable so
		// the l-bfgs only approximate it's diagonal. The exact
		// computation is well described in [1, pp 779].
		// The only special thing for owl-qn here is to use the pseudo
		// gradient instead of the true one.
		xvm_neg(d, l1 ? pg : g, F);
		if (k != 0) {
			const int km  = k % M;
			const int bnd = (k <= M) ? k : M;
			double alpha[M], beta;
			// α_i = ρ_j s_j^T q_{i+1}
			// q_i = q_{i+1} - α_i y_i
			for (int i = bnd; i > 0; i--) {
				const int j = (k - i + M + 1) % M;
				alpha[i - 1] = p[j] * xvm_dot(s[j], d, F);
				xvm_axpy(d, -alpha[i - 1], y[j], d, F);
			}
			// r_0 = H_0 q_0
			//     Scaling is described in [2, pp 515]
			//     for k = 0: H_0 = I
			//     for k > 0: H_0 = I * y_k^T s_k / ||y_k||²
			//                    = I * 1 / ρ_k ||y_k||²
			const double y2 = xvm_dot(y[km], y[km], F);
			const double v  = 1.0 / (p[km] * y2);
			for (size_t f = 0; f < F; f++)
				d[f] *= v;
			// β_j     = ρ_j y_j^T r_i
			// r_{i+1} = r_i + s_j (α_i - β_i)
			for (int i = 0; i < bnd; i++) {
				const int j = (k - i + M) % M;
				beta = p[j] * xvm_dot(y[j], d, F);
				xvm_axpy(d, alpha[i] - beta, s[j], d, F);
			}
		}
		// For owl-qn, we must remain in the same orthant than the
		// pseudo-gradient, so we have to constrain the search
		// direction as described in [3, pp 35(3)]
		//   d^k = π(d^k ; v^k)
		//       = π(d^k ; -◇f(x^k))
		if (l1)
			for (size_t f = 0; f < F; f++)
				if (d[f] * pg[f] >= 0.0)
					d[f] = 0.0;
		// 2nd step: we perform a linesearch in the computed direction,
		// we search a step value that satisfy the constrains using a
		// backtracking algorithm. Much elaborated algorithm can perform
		// better in the general case, but for CRF training, bactracking
		// is very efficient and simple to implement.
		// For quasi-Newton, the natural step is 1.0 so we start with
		// this one and reduce it only if it fail with an exception for
		// the first step where a better guess can be done.
		// We have to keep track of the current point and gradient as we
		// will need to compute the delta between those and the found
		// point, and perhaps need to restore them if linesearch fail.
		memcpy(xp, x, sizeof(double) * F);
		memcpy(gp, g, sizeof(double) * F);
		double sc  = (k == 0) ? 0.1 : 0.5;
		double stp = 1.0;
		if (k == 0) {
			// The first step is scaled by 1/||d||. A null direction
			// would make this a division by zero and the infinite
			// step would then turn the whole point into NaN, so in
			// this case we keep the natural step.
			const double dn = xvm_norm(d, F);
			if (dn > 0.0)
				stp = 1.0 / dn;
		}
		double gd  = l1 ? 0.0 : xvm_dot(g, d, F); // gd = g_k^T d_k
		double fi  = fx;
		bool   err = false;
		for (int ls = 1; !uit_stop; ls++, stp *= sc) {
			// We compute the new point using the current step and
			// search direction
			xvm_axpy(x, stp, d, xp, F);
			// For owl-qn, we have to project back the point in the
			// current orthant [3, pp 35]
			//   x^{k+1} = π(x^k + αp^k ; ξ)
			if (l1) {
				for (size_t f = 0; f < F; f++) {
					// Orthant of the coordinate: sign of the
					// previous value or, if it was null, of
					// the opposite of the pseudo-gradient.
					double orth = xp[f];
					if (orth == 0.0)
						orth = -pg[f];
					if (x[f] * orth <= 0.0)
						x[f] = 0.0;
				}
			}
			// And we ask for the value of the objective function
			// and its gradient.
			fx = grd_gradient(mdl, g, grds);
			// Now we check if the step satisfy the conditions. For
			// l-bfgs, we check the classical decrease and curvature
			// known as the Wolfe conditions [2, pp 506]
			//   f(x_k + α_k d_k) ≤ f(x_k) + β' α_k g_k^T d_k
			//   g(x_k + α_k d_k)^T d_k ≥ β g_k^T d_k
			//
			// And for owl-qn we check a variant of the Armijo rule
			// described in [3, pp 36]
			//   f(π(x^k+αp^k;ξ)) ≤ f(x^k) - γv^T[π(x^k+αp^k;ξ)-x^k]
			if (!l1) {
				if (fx > fi + stp * gd * 1e-4)
					sc = 0.5;
				else if (xvm_dot(g, d, F) < gd * 0.9)
					sc = 2.1;
				else
					break;
			} else {
				double vp = 0.0;
				for (size_t f = 0; f < F; f++)
					vp += (x[f] - xp[f]) * d[f];
				if (fx < fi + vp * 1e-4)
					break;
			}
			// If we reach the maximum number of linesearsh steps
			// without finding a good one, we just fail.
			if (ls == mdl->opt->lbfgs.maxls) {
				warning("maximum linesearch reached");
				err = true;
				break;
			}
		}
		// If linesearch failed or user interupted training, we return
		// to the last valid point and stop the training. The model is
		// probably not fully optimized but we let the user decide what
		// to do with it.
		if (err || uit_stop) {
			memcpy(x, xp, sizeof(double) * F);
			break;
		}
		if (uit_progress(mdl, k + 1, fx) == false)
			break;
		// 3rd step: we update the history used for approximating the
		// inverse of the diagonal of the hessian
		//   s_k = x_{k+1} - x_k
		//   y_k = g_{k+1} - g_k
		//   ρ_k = 1 / y_k^T s_k
		const int kn = (k + 1) % M;
		xvm_sub(s[kn], x, xp, F);
		xvm_sub(y[kn], g, gp, F);
		p[kn] = 1.0 / xvm_dot(y[kn], s[kn], F);
		// And last, we check for convergence. The convergence check is
		// quite simple [2, pp 508]
		//   ||g|| / max(1, ||x||) ≤ ε
		// with ε small enough so we stop when numerical precision is
		// reached. For owl-qn we just have to check against the pseudo-
		// gradient instead of the true one.
		const double xn = xvm_norm(x, F);
		const double gn = xvm_norm(l1 ? pg : g, F);
		if (gn / max(xn, 1.0) <= 1e-5)
			break;
		if (k + 1 == K)
			break;
		// Second stopping criterion tested is a check for improvement
		// of the function value over the past C iterations (the objwin
		// option). When this come under an epsilon, we also stop the
		// minimization.
		fh[k % C] = fx;
		double dlt = 1.0;
		if (k >= C) {
			const double of = fh[(k + 1) % C];
			dlt = fabs(of - fx) / of;
			if (dlt < mdl->opt->stopeps)
				break;
		}
	}
	// Cleanup: We free all the vectors we have allocated. The gradient
	// vectors of the extra trackers are released here before the trackers
	// themselves; the one of the first tracker is <g>.
	xvm_free(xp); xvm_free(g);
	xvm_free(gp); xvm_free(d);
	for (int m = 0; m < M; m++) {
		xvm_free(s[m]);
		xvm_free(y[m]);
	}
	if (l1)
		xvm_free(pg);
	for (size_t w = 1; w < W; w++)
		xvm_free(grds[w]->g);
	for (size_t w = 0; w < W; w++)
		grd_free(grds[w]);
}
|
294
|
+
|
data/ext/wapiti/model.c
ADDED
@@ -0,0 +1,296 @@
|
|
1
|
+
/*
|
2
|
+
* Wapiti - A linear-chain CRF tool
|
3
|
+
*
|
4
|
+
* Copyright (c) 2009-2011 CNRS
|
5
|
+
* All rights reserved.
|
6
|
+
*
|
7
|
+
* Redistribution and use in source and binary forms, with or without
|
8
|
+
* modification, are permitted provided that the following conditions are met:
|
9
|
+
* * Redistributions of source code must retain the above copyright
|
10
|
+
* notice, this list of conditions and the following disclaimer.
|
11
|
+
* * Redistributions in binary form must reproduce the above copyright
|
12
|
+
* notice, this list of conditions and the following disclaimer in the
|
13
|
+
* documentation and/or other materials provided with the distribution.
|
14
|
+
*
|
15
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
16
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
18
|
+
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
19
|
+
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
20
|
+
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
21
|
+
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
22
|
+
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
23
|
+
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
24
|
+
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
25
|
+
* POSSIBILITY OF SUCH DAMAGE.
|
26
|
+
*/
|
27
|
+
#include <stdbool.h>
|
28
|
+
#include <stddef.h>
|
29
|
+
#include <stdlib.h>
|
30
|
+
#include <stdio.h>
|
31
|
+
#include <string.h>
|
32
|
+
|
33
|
+
#include "wapiti.h"
|
34
|
+
#include "model.h"
|
35
|
+
#include "options.h"
|
36
|
+
#include "quark.h"
|
37
|
+
#include "reader.h"
|
38
|
+
#include "tools.h"
|
39
|
+
#include "vmath.h"
|
40
|
+
|
41
|
+
/*******************************************************************************
|
42
|
+
* Linear chain CRF model
|
43
|
+
*
|
44
|
+
* There are three concepts that must be well understood here: the labels,
|
45
|
+
* observations, and features. The labels are the values predicted by the
|
46
|
+
* model at each point of the sequence and denoted by Y. The observations are
|
47
|
+
* the values, at each point of the sequence, given to the model in order to
|
48
|
+
* predict the label and denoted by O. A feature is a test on both labels and
|
49
|
+
* observations, denoted by F. In linear chain CRF there are two kinds of
|
50
|
+
* features :
|
51
|
+
* - unigram feature who represent a test on the observations at the current
|
52
|
+
* point and the label at current point.
|
53
|
+
* - bigram feature who represent a test on the observation at the current
|
54
|
+
* point and two labels : the current one and the previous one.
|
55
|
+
* So for each observation, there are Y possible unigram features and Y*Y possible
|
56
|
+
* bigram features. The kind of features used by the model for a given
|
57
|
+
* observation depends on the pattern that generated it.
|
58
|
+
******************************************************************************/
|
59
|
+
|
60
|
+
/* mdl_new:
|
61
|
+
* Allocate a new empty model object linked with the given reader. The model
|
62
|
+
* have to be synchronized before starting training or labelling. If you not
|
63
|
+
* provide a reader (as it will loaded from file for example) you must be sure
|
64
|
+
* to set one in the model before any attempts to synchronize it.
|
65
|
+
*/
|
66
|
+
mdl_t *mdl_new(rdr_t *rdr) {
|
67
|
+
mdl_t *mdl = wapiti_xmalloc(sizeof(mdl_t));
|
68
|
+
mdl->nlbl = mdl->nobs = mdl->nftr = 0;
|
69
|
+
mdl->kind = NULL;
|
70
|
+
mdl->uoff = mdl->boff = NULL;
|
71
|
+
mdl->theta = NULL;
|
72
|
+
mdl->train = mdl->devel = NULL;
|
73
|
+
mdl->reader = rdr;
|
74
|
+
mdl->werr = NULL;
|
75
|
+
mdl->total = 0.0;
|
76
|
+
return mdl;
|
77
|
+
}
|
78
|
+
|
79
|
+
/* mdl_free:
|
80
|
+
* Free all memory used by a model object inculding the reader and datasets
|
81
|
+
* loaded in the model.
|
82
|
+
*/
|
83
|
+
void mdl_free(mdl_t *mdl) {
|
84
|
+
free(mdl->kind);
|
85
|
+
free(mdl->uoff);
|
86
|
+
free(mdl->boff);
|
87
|
+
if (mdl->theta != NULL)
|
88
|
+
xvm_free(mdl->theta);
|
89
|
+
if (mdl->train != NULL)
|
90
|
+
rdr_freedat(mdl->train);
|
91
|
+
if (mdl->devel != NULL)
|
92
|
+
rdr_freedat(mdl->devel);
|
93
|
+
if (mdl->reader != NULL)
|
94
|
+
rdr_free(mdl->reader);
|
95
|
+
if (mdl->werr != NULL)
|
96
|
+
free(mdl->werr);
|
97
|
+
free(mdl);
|
98
|
+
}
|
99
|
+
|
100
|
+
/* mdl_sync:
|
101
|
+
* Synchronize the model with its reader. As the model is just a placeholder
|
102
|
+
* for features weights and interned sequences, it know very few about the
|
103
|
+
* labels and observations, all the informations are kept in the reader. A
|
104
|
+
* sync will get the labels and observations count as well as the observation
|
105
|
+
* kind from the reader and build internal structures representing the model.
|
106
|
+
*
|
107
|
+
* If the model was already synchronized before, there is an existing model
|
108
|
+
* incompatible with the new one to be created. In this case there is two
|
109
|
+
* possibility :
|
110
|
+
* - If only new observations was added, the weights of the old ones remain
|
111
|
+
* valid and are kept as they form a probably good starting point for
|
112
|
+
* training the new model, the new observation get a 0 weight ;
|
113
|
+
* - If new labels was added, the old model are trully meaningless so we
|
114
|
+
* have to fully discard them and build a new empty model.
|
115
|
+
* In any case, you must never change existing labels or observations, if this
|
116
|
+
* happen, you need to create a new model and destroy this one.
|
117
|
+
*
|
118
|
+
* After synchronization, the labels and observations databases are locked to
|
119
|
+
* prevent new one to be created. You must unlock them explicitly if needed.
|
120
|
+
* This reduce the risk of mistakes.
|
121
|
+
*/
|
122
|
+
void mdl_sync(mdl_t *mdl) {
|
123
|
+
const size_t Y = qrk_count(mdl->reader->lbl);
|
124
|
+
const size_t O = qrk_count(mdl->reader->obs);
|
125
|
+
// If model is already synchronized, do nothing and just return
|
126
|
+
if (mdl->nlbl == Y && mdl->nobs == O)
|
127
|
+
return;
|
128
|
+
if (Y == 0 || O == 0)
|
129
|
+
fatal("cannot synchronize an empty model");
|
130
|
+
// If new labels was added, we have to discard all the model. In this
|
131
|
+
// case we also display a warning as this is probably not expected by
|
132
|
+
// the user. If only new observations was added, we will try to expand
|
133
|
+
// the model.
|
134
|
+
size_t oldF = mdl->nftr;
|
135
|
+
size_t oldO = mdl->nobs;
|
136
|
+
if (mdl->nlbl != Y && mdl->nlbl != 0) {
|
137
|
+
warning("labels count changed, discarding the model");
|
138
|
+
free(mdl->kind); mdl->kind = NULL;
|
139
|
+
free(mdl->uoff); mdl->uoff = NULL;
|
140
|
+
free(mdl->boff); mdl->boff = NULL;
|
141
|
+
if (mdl->theta != NULL) {
|
142
|
+
xvm_free(mdl->theta);
|
143
|
+
mdl->theta = NULL;
|
144
|
+
}
|
145
|
+
oldF = oldO = 0;
|
146
|
+
}
|
147
|
+
mdl->nlbl = Y;
|
148
|
+
mdl->nobs = O;
|
149
|
+
// Allocate the observations datastructure. If the model is empty or
|
150
|
+
// discarded, a new one iscreated, else the old one is expanded.
|
151
|
+
char *kind = wapiti_xrealloc(mdl->kind, sizeof(char ) * O);
|
152
|
+
size_t *uoff = wapiti_xrealloc(mdl->uoff, sizeof(size_t) * O);
|
153
|
+
size_t *boff = wapiti_xrealloc(mdl->boff, sizeof(size_t) * O);
|
154
|
+
mdl->kind = kind;
|
155
|
+
mdl->uoff = uoff;
|
156
|
+
mdl->boff = boff;
|
157
|
+
// Now, we can setup the features. For each new observations we fill the
|
158
|
+
// kind and offsets arrays and count total number of features as well.
|
159
|
+
size_t F = oldF;
|
160
|
+
for (size_t o = oldO; o < O; o++) {
|
161
|
+
const char *obs = qrk_id2str(mdl->reader->obs, o);
|
162
|
+
switch (obs[0]) {
|
163
|
+
case 'u': kind[o] = 1; break;
|
164
|
+
case 'b': kind[o] = 2; break;
|
165
|
+
case '*': kind[o] = 3; break;
|
166
|
+
}
|
167
|
+
if (kind[o] & 1)
|
168
|
+
uoff[o] = F, F += Y;
|
169
|
+
if (kind[o] & 2)
|
170
|
+
boff[o] = F, F += Y * Y;
|
171
|
+
}
|
172
|
+
mdl->nftr = F;
|
173
|
+
// We can finally grow the features weights vector itself. We set all
|
174
|
+
// the new features to 0.0 but don't touch the old ones.
|
175
|
+
// This is a bit tricky as aligned malloc cannot be simply grown so we
|
176
|
+
// have to allocate a new vector and copy old values ourself.
|
177
|
+
if (oldF != 0) {
|
178
|
+
double *new = xvm_new(F);
|
179
|
+
for (size_t f = 0; f < oldF; f++)
|
180
|
+
new[f] = mdl->theta[f];
|
181
|
+
xvm_free(mdl->theta);
|
182
|
+
mdl->theta = new;
|
183
|
+
} else {
|
184
|
+
mdl->theta = xvm_new(F);
|
185
|
+
}
|
186
|
+
for (size_t f = oldF; f < F; f++)
|
187
|
+
mdl->theta[f] = 0.0;
|
188
|
+
// And lock the databases
|
189
|
+
qrk_lock(mdl->reader->lbl, true);
|
190
|
+
qrk_lock(mdl->reader->obs, true);
|
191
|
+
}
|
192
|
+
|
193
|
+
/* mdl_compact:
|
194
|
+
* Comapct the given model by removing from it all observation who lead to
|
195
|
+
* zero actives features. On model trained with l1 regularization this can
|
196
|
+
* lead to a drastic model size reduction and so to faster loading, training
|
197
|
+
* and labeling.
|
198
|
+
*/
|
199
|
+
void mdl_compact(mdl_t *mdl) {
|
200
|
+
const size_t Y = mdl->nlbl;
|
201
|
+
// We first build the new observation list with only observations which
|
202
|
+
// lead to at least one active feature. At the same time we build the
|
203
|
+
// translation table which map the new observations index to the old
|
204
|
+
// ones.
|
205
|
+
info(" - Scan the model\n");
|
206
|
+
qrk_t *old_obs = mdl->reader->obs;
|
207
|
+
qrk_t *new_obs = qrk_new();
|
208
|
+
size_t *trans = wapiti_xmalloc(sizeof(size_t) * mdl->nobs);
|
209
|
+
for (size_t oldo = 0; oldo < mdl->nobs; oldo++) {
|
210
|
+
bool active = false;
|
211
|
+
if (mdl->kind[oldo] & 1)
|
212
|
+
for (size_t y = 0; y < Y; y++)
|
213
|
+
if (mdl->theta[mdl->uoff[oldo] + y] != 0.0)
|
214
|
+
active = true;
|
215
|
+
if (mdl->kind[oldo] & 2)
|
216
|
+
for (size_t d = 0; d < Y * Y; d++)
|
217
|
+
if (mdl->theta[mdl->boff[oldo] + d] != 0.0)
|
218
|
+
active = true;
|
219
|
+
if (!active)
|
220
|
+
continue;
|
221
|
+
const char *str = qrk_id2str(old_obs, oldo);
|
222
|
+
const size_t newo = qrk_str2id(new_obs, str);
|
223
|
+
trans[newo] = oldo;
|
224
|
+
}
|
225
|
+
mdl->reader->obs = new_obs;
|
226
|
+
// Now we save the old model features informations and build a new one
|
227
|
+
// corresponding to the compacted model.
|
228
|
+
size_t *old_uoff = mdl->uoff; mdl->uoff = NULL;
|
229
|
+
size_t *old_boff = mdl->boff; mdl->boff = NULL;
|
230
|
+
double *old_theta = mdl->theta; mdl->theta = NULL;
|
231
|
+
free(mdl->kind);
|
232
|
+
mdl->kind = NULL;
|
233
|
+
mdl->nlbl = mdl->nobs = mdl->nftr = 0;
|
234
|
+
mdl_sync(mdl);
|
235
|
+
// The model is now ready, so we copy in it the features weights from
|
236
|
+
// the old model for observations we have kept.
|
237
|
+
info(" - Compact it\n");
|
238
|
+
for (size_t newo = 0; newo < mdl->nobs; newo++) {
|
239
|
+
const size_t oldo = trans[newo];
|
240
|
+
if (mdl->kind[newo] & 1) {
|
241
|
+
double *src = old_theta + old_uoff[oldo];
|
242
|
+
double *dst = mdl->theta + mdl->uoff[newo];
|
243
|
+
for (size_t y = 0; y < Y; y++)
|
244
|
+
dst[y] = src[y];
|
245
|
+
}
|
246
|
+
if (mdl->kind[newo] & 2) {
|
247
|
+
double *src = old_theta + old_boff[oldo];
|
248
|
+
double *dst = mdl->theta + mdl->boff[newo];
|
249
|
+
for (size_t d = 0; d < Y * Y; d++)
|
250
|
+
dst[d] = src[d];
|
251
|
+
}
|
252
|
+
}
|
253
|
+
// And cleanup
|
254
|
+
free(trans);
|
255
|
+
qrk_free(old_obs);
|
256
|
+
free(old_uoff);
|
257
|
+
free(old_boff);
|
258
|
+
xvm_free(old_theta);
|
259
|
+
}
|
260
|
+
|
261
|
+
/* mdl_save:
|
262
|
+
* Save a model to be restored later in a platform independant way.
|
263
|
+
*/
|
264
|
+
void mdl_save(mdl_t *mdl, FILE *file) {
|
265
|
+
size_t nact = 0;
|
266
|
+
for (size_t f = 0; f < mdl->nftr; f++)
|
267
|
+
if (mdl->theta[f] != 0.0)
|
268
|
+
nact++;
|
269
|
+
fprintf(file, "#mdl#%zu\n", nact);
|
270
|
+
rdr_save(mdl->reader, file);
|
271
|
+
for (size_t f = 0; f < mdl->nftr; f++)
|
272
|
+
if (mdl->theta[f] != 0.0)
|
273
|
+
fprintf(file, "%zu=%la\n", f, mdl->theta[f]);
|
274
|
+
}
|
275
|
+
|
276
|
+
/* mdl_load:
|
277
|
+
* Read back a previously saved model to continue training or start labeling.
|
278
|
+
* The returned model is synced and the quarks are locked. You must give to
|
279
|
+
* this function an empty model fresh from mdl_new.
|
280
|
+
*/
|
281
|
+
void mdl_load(mdl_t *mdl, FILE *file) {
|
282
|
+
const char *err = "invalid model format";
|
283
|
+
size_t nact = 0;
|
284
|
+
if (fscanf(file, "#mdl#%zu\n", &nact) != 1)
|
285
|
+
fatal(err);
|
286
|
+
rdr_load(mdl->reader, file);
|
287
|
+
mdl_sync(mdl);
|
288
|
+
for (size_t i = 0; i < nact; i++) {
|
289
|
+
size_t f;
|
290
|
+
double v;
|
291
|
+
if (fscanf(file, "%zu=%la\n", &f, &v) != 2)
|
292
|
+
fatal(err);
|
293
|
+
mdl->theta[f] = v;
|
294
|
+
}
|
295
|
+
}
|
296
|
+
|