chaine-3.13.1-cp312-cp312-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of chaine might be problematic.
- chaine/__init__.py +2 -0
- chaine/_core/crf.cpp +19854 -0
- chaine/_core/crf.cpython-312-darwin.so +0 -0
- chaine/_core/crf.pyx +271 -0
- chaine/_core/crfsuite/COPYING +27 -0
- chaine/_core/crfsuite/README +183 -0
- chaine/_core/crfsuite/include/crfsuite.h +1077 -0
- chaine/_core/crfsuite/include/crfsuite.hpp +649 -0
- chaine/_core/crfsuite/include/crfsuite_api.hpp +406 -0
- chaine/_core/crfsuite/include/os.h +65 -0
- chaine/_core/crfsuite/lib/cqdb/COPYING +28 -0
- chaine/_core/crfsuite/lib/cqdb/include/cqdb.h +518 -0
- chaine/_core/crfsuite/lib/cqdb/src/cqdb.c +639 -0
- chaine/_core/crfsuite/lib/cqdb/src/lookup3.c +1271 -0
- chaine/_core/crfsuite/lib/cqdb/src/main.c +184 -0
- chaine/_core/crfsuite/lib/crf/src/crf1d.h +354 -0
- chaine/_core/crfsuite/lib/crf/src/crf1d_context.c +788 -0
- chaine/_core/crfsuite/lib/crf/src/crf1d_encode.c +1020 -0
- chaine/_core/crfsuite/lib/crf/src/crf1d_feature.c +382 -0
- chaine/_core/crfsuite/lib/crf/src/crf1d_model.c +1085 -0
- chaine/_core/crfsuite/lib/crf/src/crf1d_tag.c +582 -0
- chaine/_core/crfsuite/lib/crf/src/crfsuite.c +500 -0
- chaine/_core/crfsuite/lib/crf/src/crfsuite_internal.h +233 -0
- chaine/_core/crfsuite/lib/crf/src/crfsuite_train.c +302 -0
- chaine/_core/crfsuite/lib/crf/src/dataset.c +115 -0
- chaine/_core/crfsuite/lib/crf/src/dictionary.c +127 -0
- chaine/_core/crfsuite/lib/crf/src/holdout.c +83 -0
- chaine/_core/crfsuite/lib/crf/src/json.c +1497 -0
- chaine/_core/crfsuite/lib/crf/src/json.h +120 -0
- chaine/_core/crfsuite/lib/crf/src/logging.c +85 -0
- chaine/_core/crfsuite/lib/crf/src/logging.h +49 -0
- chaine/_core/crfsuite/lib/crf/src/params.c +370 -0
- chaine/_core/crfsuite/lib/crf/src/params.h +84 -0
- chaine/_core/crfsuite/lib/crf/src/quark.c +180 -0
- chaine/_core/crfsuite/lib/crf/src/quark.h +46 -0
- chaine/_core/crfsuite/lib/crf/src/rumavl.c +1178 -0
- chaine/_core/crfsuite/lib/crf/src/rumavl.h +144 -0
- chaine/_core/crfsuite/lib/crf/src/train_arow.c +409 -0
- chaine/_core/crfsuite/lib/crf/src/train_averaged_perceptron.c +237 -0
- chaine/_core/crfsuite/lib/crf/src/train_l2sgd.c +491 -0
- chaine/_core/crfsuite/lib/crf/src/train_lbfgs.c +323 -0
- chaine/_core/crfsuite/lib/crf/src/train_passive_aggressive.c +442 -0
- chaine/_core/crfsuite/lib/crf/src/vecmath.h +360 -0
- chaine/_core/crfsuite/swig/crfsuite.cpp +1 -0
- chaine/_core/crfsuite_api.pxd +67 -0
- chaine/_core/liblbfgs/COPYING +22 -0
- chaine/_core/liblbfgs/README +71 -0
- chaine/_core/liblbfgs/include/lbfgs.h +745 -0
- chaine/_core/liblbfgs/lib/arithmetic_ansi.h +142 -0
- chaine/_core/liblbfgs/lib/arithmetic_sse_double.h +303 -0
- chaine/_core/liblbfgs/lib/arithmetic_sse_float.h +312 -0
- chaine/_core/liblbfgs/lib/lbfgs.c +1531 -0
- chaine/_core/tagger_wrapper.hpp +58 -0
- chaine/_core/trainer_wrapper.cpp +32 -0
- chaine/_core/trainer_wrapper.hpp +26 -0
- chaine/crf.py +505 -0
- chaine/logging.py +214 -0
- chaine/optimization/__init__.py +10 -0
- chaine/optimization/metrics.py +129 -0
- chaine/optimization/spaces.py +394 -0
- chaine/optimization/trial.py +103 -0
- chaine/optimization/utils.py +119 -0
- chaine/training.py +184 -0
- chaine/typing.py +18 -0
- chaine/validation.py +43 -0
- chaine-3.13.1.dist-info/METADATA +348 -0
- chaine-3.13.1.dist-info/RECORD +68 -0
- chaine-3.13.1.dist-info/WHEEL +5 -0
chaine/_core/crfsuite/lib/crf/src/train_l2sgd.c
@@ -0,0 +1,491 @@
+/*
+ * Online training with L2-regularized Stochastic Gradient Descent (SGD).
+ *
+ * Copyright (c) 2007-2010, Naoaki Okazaki
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the names of the authors nor the names of its contributors
+ *       may be used to endorse or promote products derived from this
+ *       software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* $Id$ */
+
+/*
+    SGD for L2-regularized MAP estimation.
+
+    The iterative algorithm is inspired by Pegasos:
+
+    Shai Shalev-Shwartz, Yoram Singer, and Nathan Srebro.
+    Pegasos: Primal Estimated sub-GrAdient SOlver for SVM.
+    In Proc. of ICML 2007, pp 807-814, 2007.
+
+    The calibration strategy is inspired by the implementation of sgd:
+    http://leon.bottou.org/projects/sgd
+    written by Léon Bottou.
+
+    The objective function to minimize is:
+
+        f(w) = (lambda/2) * ||w||^2 + (1/N) * \sum_i^N log P^i(y|x)
+        lambda = 2 * C / N
+
+    The original version of the Pegasos algorithm.
+
+    0) Initialization
+        t = t0
+        k = [the batch size]
+    1) Computing the learning rate (eta).
+        eta = 1 / (lambda * t)
+    2) Updating feature weights.
+        w = (1 - eta * lambda) w - (eta / k) \sum_i (oexp - mexp)
+    3) Projecting feature weights within an L2-ball.
+        w = min{1, (1/sqrt(lambda))/||w||} * w
+    4) Goto 1 until convergence.
+
+    This implementation omit the step 3) because it makes the source code
+    tricky (in order to maintain L2-norm of feature weights at any time) and
+    because the project step does not have a strong impact to the quality of
+    solution.
+
+    A naive implementation requires O(K) computations for steps 2,
+    where K is the total number of features. This code implements the procedure
+    in an efficient way:
+
+    0) Initialization
+        decay = 1
+    1) Computing various factors
+        eta = 1 / (lambda * t)
+        decay *= (1 - eta * lambda)
+        gain = (eta / k) / decay
+    2) Updating feature weights
+        Updating feature weights from observation expectation:
+            delta = gain * (1.0) * f(x,y)
+            w += delta
+        Updating feature weights from model expectation:
+            delta = gain * (-P(y|x)) * f(x,y)
+            w += delta
+    4) Goto 1 until convergence.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif /*HAVE_CONFIG_H*/
+
+#include <os.h>
+
+#include <float.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <math.h>
+
+#include <crfsuite.h>
+#include "crfsuite_internal.h"
+
+#include "logging.h"
+#include "params.h"
+#include "crf1d.h"
+#include "vecmath.h"
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+typedef struct
+{
+    floatval_t c2;
+    floatval_t lambda;
+    floatval_t t0;
+    int max_iterations;
+    int period;
+    floatval_t delta;
+    floatval_t calibration_eta;
+    floatval_t calibration_rate;
+    int calibration_samples;
+    int calibration_candidates;
+    int calibration_max_trials;
+} training_option_t;
+
+static int l2sgd(
+    encoder_t *gm,
+    dataset_t *trainset,
+    dataset_t *testset,
+    floatval_t *w,
+    logging_t *lg,
+    const int N,
+    const floatval_t t0,
+    const floatval_t lambda,
+    const int num_epochs,
+    int calibration,
+    int period,
+    const floatval_t epsilon,
+    floatval_t *ptr_loss)
+{
+    int i, epoch, ret = 0;
+    floatval_t t = 0;
+    floatval_t loss = 0, sum_loss = 0;
+    floatval_t best_sum_loss = DBL_MAX;
+    floatval_t eta, gain, decay = 1.;
+    floatval_t improvement = 0.;
+    floatval_t norm2 = 0.;
+    floatval_t *pf = NULL;
+    floatval_t *best_w = NULL;
+    clock_t clk_prev, clk_begin = clock();
+    const int K = gm->num_features;
+
+    if (!calibration)
+    {
+        pf = (floatval_t *)malloc(sizeof(floatval_t) * period);
+        best_w = (floatval_t *)calloc(K, sizeof(floatval_t));
+        if (pf == NULL || best_w == NULL)
+        {
+            ret = CRFSUITEERR_OUTOFMEMORY;
+            goto error_exit;
+        }
+    }
+
+    /* Initialize the feature weights. */
+    vecset(w, 0, K);
+
+    /* Loop for epochs. */
+    for (epoch = 1; epoch <= num_epochs; ++epoch)
+    {
+        clk_prev = clock();
+
+        if (!calibration)
+        {
+            /* Shuffle the training instances. */
+            dataset_shuffle(trainset);
+        }
+
+        /* Loop for instances. */
+        sum_loss = 0.;
+        for (i = 0; i < N; ++i)
+        {
+            const crfsuite_instance_t *inst = dataset_get(trainset, i);
+
+            /* Update various factors. */
+            eta = 1 / (lambda * (t0 + t));
+            decay *= (1.0 - eta * lambda);
+            gain = eta / decay;
+
+            /* Compute the loss and gradients for the instance. */
+            gm->set_weights(gm, w, decay);
+            gm->set_instance(gm, inst);
+            gm->objective_and_gradients(gm, &loss, w, gain, inst->weight);
+
+            sum_loss += loss;
+            ++t;
+        }
+
+        /* Terminate when the loss is abnormal (NaN, -Inf, +Inf). */
+        if (!isfinite(loss))
+        {
+            logging(lg, "Loss is abnormal");
+            ret = CRFSUITEERR_OVERFLOW;
+            sum_loss = loss;
+            goto error_exit;
+        }
+
+        /* Scale the feature weights. */
+        vecscale(w, decay, K);
+        decay = 1.;
+
+        /* Include the L2 norm of feature weights to the objective. */
+        /* The factor N is necessary because lambda = 2 * C / N. */
+        norm2 = vecdot(w, w, K);
+        sum_loss += 0.5 * lambda * norm2 * N;
+
+        /* One epoch finished. */
+        if (!calibration)
+        {
+            /* Check if the current epoch is the best. */
+            if (sum_loss < best_sum_loss)
+            {
+                /* Store the feature weights to best_w. */
+                best_sum_loss = sum_loss;
+                veccopy(best_w, w, K);
+            }
+
+            /* We don't test the stopping criterion while period < epoch. */
+            if (period < epoch)
+            {
+                improvement = (pf[(epoch - 1) % period] - sum_loss) / sum_loss;
+            }
+            else
+            {
+                improvement = epsilon;
+            }
+
+            /* Store the current value of the objective function. */
+            pf[(epoch - 1) % period] = sum_loss;
+
+            logging(lg, "Epoch %d, learning rate: %f, training loss: %f", epoch, eta, sum_loss);
+
+            /* Check for the stopping criterion. */
+            if (improvement < epsilon)
+            {
+                ret = 0;
+                break;
+            }
+        }
+    }
+
+    /* Output the optimization result. */
+    if (!calibration)
+    {
+        if (ret == 0)
+        {
+            if (epoch < num_epochs)
+            {
+                logging(lg, "Loss has converged, terminating training");
+            }
+            else
+            {
+                logging(lg, "Reached maximum number of iterations, terminating training");
+            }
+        }
+    }
+
+    /* Restore the best weights. */
+    if (best_w != NULL)
+    {
+        sum_loss = best_sum_loss;
+        veccopy(w, best_w, K);
+    }
+
+error_exit:
+    free(best_w);
+    free(pf);
+    if (ptr_loss != NULL)
+    {
+        *ptr_loss = sum_loss;
+    }
+    return ret;
+}
+
+static floatval_t
+l2sgd_calibration(
+    encoder_t *gm,
+    dataset_t *ds,
+    floatval_t *w,
+    logging_t *lg,
+    const training_option_t *opt)
+{
+    int i;
+    int dec = 0, ok, trials = 1;
+    int num = opt->calibration_candidates;
+    clock_t clk_begin = clock();
+    floatval_t loss = 0.;
+    floatval_t init_loss = 0.;
+    floatval_t best_loss = DBL_MAX;
+    floatval_t eta = opt->calibration_eta;
+    floatval_t best_eta = opt->calibration_eta;
+    const int N = ds->num_instances;
+    const int S = MIN(N, opt->calibration_samples);
+    const int K = gm->num_features;
+    const floatval_t init_eta = opt->calibration_eta;
+    const floatval_t rate = opt->calibration_rate;
+    const floatval_t lambda = opt->lambda;
+
+    logging(lg, "Calibrating learning rate");
+
+    /* Initialize a permutation that shuffles the instances. */
+    dataset_shuffle(ds);
+
+    /* Initialize feature weights as zero. */
+    vecset(w, 0, K);
+
+    /* Compute the initial loss. */
+    gm->set_weights(gm, w, 1.);
+    init_loss = 0;
+    for (i = 0; i < S; ++i)
+    {
+        floatval_t score;
+        const crfsuite_instance_t *inst = dataset_get(ds, i);
+        gm->set_instance(gm, inst);
+        gm->score(gm, inst->labels, &score);
+        init_loss -= score;
+        gm->partition_factor(gm, &score);
+        init_loss += score;
+    }
+    init_loss += 0.5 * lambda * vecdot(w, w, K) * N;
+    logging(lg, "Initial training loss: %f", init_loss);
+
+    while (num > 0 || !dec)
+    {
+        /* Perform SGD for one epoch. */
+        l2sgd(
+            gm,
+            ds,
+            NULL,
+            w,
+            lg,
+            S, 1.0 / (lambda * eta), lambda, 1, 1, 1, 0., &loss);
+
+        /* Make sure that the learning rate decreases the log-likelihood. */
+        ok = isfinite(loss) && (loss < init_loss);
+
+        logging(lg, "Trial %d, learning rate %f, training loss: %f", trials, eta, loss);
+
+        if (ok)
+        {
+            --num;
+        }
+
+        if (isfinite(loss) && loss < best_loss)
+        {
+            best_loss = loss;
+            best_eta = eta;
+        }
+
+        if (!dec)
+        {
+            if (ok && 0 < num)
+            {
+                eta *= rate;
+            }
+            else
+            {
+                dec = 1;
+                num = opt->calibration_candidates;
+                eta = init_eta / rate;
+            }
+        }
+        else
+        {
+            eta /= rate;
+        }
+
+        ++trials;
+        if (opt->calibration_max_trials <= trials)
+        {
+            break;
+        }
+    }
+
+    eta = best_eta;
+    logging(lg, "Best learning rate: %f", eta);
+
+    return 1.0 / (lambda * eta);
+}
+
+int exchange_options(crfsuite_params_t *params, training_option_t *opt, int mode)
+{
+    BEGIN_PARAM_MAP(params, mode)
+    DDX_PARAM_FLOAT(
+        "c2", opt->c2, 1.,
+        "Coefficient for L2 regularization.")
+    DDX_PARAM_INT(
+        "max_iterations", opt->max_iterations, 1000,
+        "The maximum number of iterations (epochs) for SGD optimization.")
+    DDX_PARAM_INT(
+        "period", opt->period, 10,
+        "The duration of iterations to test the stopping criterion.")
+    DDX_PARAM_FLOAT(
+        "delta", opt->delta, 1e-6,
+        "The threshold for the stopping criterion; an optimization process stops when\n"
+        "the improvement of the log likelihood over the last ${period} iterations is no\n"
+        "greater than this threshold.")
+    DDX_PARAM_FLOAT(
+        "calibration.eta", opt->calibration_eta, 0.1,
+        "The initial value of learning rate (eta) used for calibration.")
+    DDX_PARAM_FLOAT(
+        "calibration.rate", opt->calibration_rate, 2.,
+        "The rate of increase/decrease of learning rate for calibration.")
+    DDX_PARAM_INT(
+        "calibration.samples", opt->calibration_samples, 1000,
+        "The number of instances used for calibration.")
+    DDX_PARAM_INT(
+        "calibration.candidates", opt->calibration_candidates, 10,
+        "The number of candidates of learning rate.")
+    DDX_PARAM_INT(
+        "calibration.max_trials", opt->calibration_max_trials, 20,
+        "The maximum number of trials of learning rates for calibration.")
+    END_PARAM_MAP()
+
+    return 0;
+}
+
+void crfsuite_train_l2sgd_init(crfsuite_params_t *params)
+{
+    exchange_options(params, NULL, 0);
+}
+
+int crfsuite_train_l2sgd(
+    encoder_t *gm,
+    dataset_t *trainset,
+    dataset_t *testset,
+    crfsuite_params_t *params,
+    logging_t *lg,
+    floatval_t **ptr_w)
+{
+    int ret = 0;
+    floatval_t *w = NULL;
+    clock_t clk_begin;
+    floatval_t loss = 0;
+    const int N = trainset->num_instances;
+    const int K = gm->num_features;
+    const int T = gm->cap_items;
+    training_option_t opt;
+
+    /* Obtain parameter values. */
+    exchange_options(params, &opt, -1);
+
+    /* Allocate arrays. */
+    w = (floatval_t *)calloc(sizeof(floatval_t), K);
+    if (w == NULL)
+    {
+        ret = CRFSUITEERR_OUTOFMEMORY;
+        goto error_exit;
+    }
+
+    opt.lambda = 2. * opt.c2 / N;
+
+    logging(lg, "Start training with SGD");
+    clk_begin = clock();
+
+    /* Calibrate the training rate (eta). */
+    opt.t0 = l2sgd_calibration(gm, trainset, w, lg, &opt);
+
+    /* Perform stochastic gradient descent. */
+    ret = l2sgd(
+        gm,
+        trainset,
+        testset,
+        w,
+        lg,
+        N,
+        opt.t0,
+        opt.lambda,
+        opt.max_iterations,
+        0,
+        opt.period,
+        opt.delta,
+        &loss);
+
+    *ptr_w = w;
+    return ret;
+
+error_exit:
+    free(w);
+    return ret;
+}
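
The header comment of train_l2sgd.c describes a lazy weight-scaling trick: instead of multiplying every one of the K feature weights by (1 - eta * lambda) at each step, the code accumulates that factor in decay, scales each gradient step by gain = eta / decay, and only materializes the accumulated decay once per epoch via vecscale. The following standalone C sketch is not part of the package; the toy gradient sin(t + j) and the sizes K and T are made up for illustration. It only demonstrates that the lazy update followed by a single final rescale reproduces the naive per-step update.

/* Minimal sketch of the lazy decay/gain trick; toy data, not package code. */
#include <math.h>
#include <stdio.h>

#define K 4   /* number of features (hypothetical) */
#define T 10  /* number of SGD updates (hypothetical) */

int main(void)
{
    const double lambda = 0.1, t0 = 5.0;
    double naive[K] = {0.0}, lazy[K] = {0.0};
    double decay = 1.0;
    int t, j;

    for (t = 0; t < T; ++t) {
        const double eta = 1.0 / (lambda * (t0 + t));
        double gain;

        /* Naive update: O(K) rescale by (1 - eta * lambda), then the step. */
        for (j = 0; j < K; ++j) {
            const double g = sin((double)(t + j)); /* stand-in for (oexp - mexp) */
            naive[j] = (1.0 - eta * lambda) * naive[j] + eta * g;
        }

        /* Lazy update: fold the rescale into decay and gain, as l2sgd() does. */
        decay *= (1.0 - eta * lambda);
        gain = eta / decay;
        for (j = 0; j < K; ++j) {
            lazy[j] += gain * sin((double)(t + j));
        }
    }

    /* Materialize the accumulated decay once, like vecscale(w, decay, K). */
    for (j = 0; j < K; ++j) {
        lazy[j] *= decay;
        printf("w[%d]: naive = %.12f, lazy = %.12f\n", j, naive[j], lazy[j]);
    }
    return 0;
}

In this sketch the two arrays agree to floating-point precision, which is why l2sgd() only needs to touch the weights that actually receive gradient mass on each instance and can defer the full O(K) rescale to the end of the epoch.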