chaine 3.13.1__cp312-cp312-musllinux_1_2_i686.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of chaine might be problematic. Click here for more details.
- chaine/__init__.py +2 -0
- chaine/_core/crf.cpp +19854 -0
- chaine/_core/crf.cpython-312-i386-linux-musl.so +0 -0
- chaine/_core/crf.pyx +271 -0
- chaine/_core/crfsuite/COPYING +27 -0
- chaine/_core/crfsuite/README +183 -0
- chaine/_core/crfsuite/include/crfsuite.h +1077 -0
- chaine/_core/crfsuite/include/crfsuite.hpp +649 -0
- chaine/_core/crfsuite/include/crfsuite_api.hpp +406 -0
- chaine/_core/crfsuite/include/os.h +65 -0
- chaine/_core/crfsuite/lib/cqdb/COPYING +28 -0
- chaine/_core/crfsuite/lib/cqdb/include/cqdb.h +518 -0
- chaine/_core/crfsuite/lib/cqdb/src/cqdb.c +639 -0
- chaine/_core/crfsuite/lib/cqdb/src/lookup3.c +1271 -0
- chaine/_core/crfsuite/lib/cqdb/src/main.c +184 -0
- chaine/_core/crfsuite/lib/crf/src/crf1d.h +354 -0
- chaine/_core/crfsuite/lib/crf/src/crf1d_context.c +788 -0
- chaine/_core/crfsuite/lib/crf/src/crf1d_encode.c +1020 -0
- chaine/_core/crfsuite/lib/crf/src/crf1d_feature.c +382 -0
- chaine/_core/crfsuite/lib/crf/src/crf1d_model.c +1085 -0
- chaine/_core/crfsuite/lib/crf/src/crf1d_tag.c +582 -0
- chaine/_core/crfsuite/lib/crf/src/crfsuite.c +500 -0
- chaine/_core/crfsuite/lib/crf/src/crfsuite_internal.h +233 -0
- chaine/_core/crfsuite/lib/crf/src/crfsuite_train.c +302 -0
- chaine/_core/crfsuite/lib/crf/src/dataset.c +115 -0
- chaine/_core/crfsuite/lib/crf/src/dictionary.c +127 -0
- chaine/_core/crfsuite/lib/crf/src/holdout.c +83 -0
- chaine/_core/crfsuite/lib/crf/src/json.c +1497 -0
- chaine/_core/crfsuite/lib/crf/src/json.h +120 -0
- chaine/_core/crfsuite/lib/crf/src/logging.c +85 -0
- chaine/_core/crfsuite/lib/crf/src/logging.h +49 -0
- chaine/_core/crfsuite/lib/crf/src/params.c +370 -0
- chaine/_core/crfsuite/lib/crf/src/params.h +84 -0
- chaine/_core/crfsuite/lib/crf/src/quark.c +180 -0
- chaine/_core/crfsuite/lib/crf/src/quark.h +46 -0
- chaine/_core/crfsuite/lib/crf/src/rumavl.c +1178 -0
- chaine/_core/crfsuite/lib/crf/src/rumavl.h +144 -0
- chaine/_core/crfsuite/lib/crf/src/train_arow.c +409 -0
- chaine/_core/crfsuite/lib/crf/src/train_averaged_perceptron.c +237 -0
- chaine/_core/crfsuite/lib/crf/src/train_l2sgd.c +491 -0
- chaine/_core/crfsuite/lib/crf/src/train_lbfgs.c +323 -0
- chaine/_core/crfsuite/lib/crf/src/train_passive_aggressive.c +442 -0
- chaine/_core/crfsuite/lib/crf/src/vecmath.h +360 -0
- chaine/_core/crfsuite/swig/crfsuite.cpp +1 -0
- chaine/_core/crfsuite_api.pxd +67 -0
- chaine/_core/liblbfgs/COPYING +22 -0
- chaine/_core/liblbfgs/README +71 -0
- chaine/_core/liblbfgs/include/lbfgs.h +745 -0
- chaine/_core/liblbfgs/lib/arithmetic_ansi.h +142 -0
- chaine/_core/liblbfgs/lib/arithmetic_sse_double.h +303 -0
- chaine/_core/liblbfgs/lib/arithmetic_sse_float.h +312 -0
- chaine/_core/liblbfgs/lib/lbfgs.c +1531 -0
- chaine/_core/tagger_wrapper.hpp +58 -0
- chaine/_core/trainer_wrapper.cpp +32 -0
- chaine/_core/trainer_wrapper.hpp +26 -0
- chaine/crf.py +505 -0
- chaine/logging.py +214 -0
- chaine/optimization/__init__.py +10 -0
- chaine/optimization/metrics.py +129 -0
- chaine/optimization/spaces.py +394 -0
- chaine/optimization/trial.py +103 -0
- chaine/optimization/utils.py +119 -0
- chaine/training.py +184 -0
- chaine/typing.py +18 -0
- chaine/validation.py +43 -0
- chaine-3.13.1.dist-info/METADATA +348 -0
- chaine-3.13.1.dist-info/RECORD +70 -0
- chaine-3.13.1.dist-info/WHEEL +4 -0
- chaine.libs/libgcc_s-1257a076.so.1 +0 -0
- chaine.libs/libstdc++-0530927c.so.6.0.32 +0 -0
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* CRFsuite internal interface.
|
|
3
|
+
*
|
|
4
|
+
* Copyright (c) 2007-2010, Naoaki Okazaki
|
|
5
|
+
* All rights reserved.
|
|
6
|
+
*
|
|
7
|
+
* Redistribution and use in source and binary forms, with or without
|
|
8
|
+
* modification, are permitted provided that the following conditions are met:
|
|
9
|
+
* * Redistributions of source code must retain the above copyright
|
|
10
|
+
* notice, this list of conditions and the following disclaimer.
|
|
11
|
+
* * Redistributions in binary form must reproduce the above copyright
|
|
12
|
+
* notice, this list of conditions and the following disclaimer in the
|
|
13
|
+
* documentation and/or other materials provided with the distribution.
|
|
14
|
+
* * Neither the names of the authors nor the names of its contributors
|
|
15
|
+
* may be used to endorse or promote products derived from this
|
|
16
|
+
* software without specific prior written permission.
|
|
17
|
+
*
|
|
18
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
19
|
+
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
20
|
+
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
21
|
+
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
|
22
|
+
* OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
23
|
+
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
24
|
+
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
25
|
+
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
|
26
|
+
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
|
27
|
+
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
28
|
+
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
29
|
+
*/
|
|
30
|
+
|
|
31
|
+
/* $Id$ */
|
|
32
|
+
|
|
33
|
+
#ifndef __CRFSUITE_INTERNAL_H__
|
|
34
|
+
#define __CRFSUITE_INTERNAL_H__
|
|
35
|
+
|
|
36
|
+
#include <crfsuite.h>
|
|
37
|
+
#include "logging.h"
|
|
38
|
+
|
|
39
|
+
/**
 * Feature types understood by the trainer.
 */
enum
{
    FTYPE_NONE  = 0,    /**< Unselected. */
    FTYPE_CRF1D = 1,    /**< 1st-order dyad features. */
    FTYPE_CRF1T = 2,    /**< 1st-order triad features. */
};
|
|
45
|
+
|
|
46
|
+
/**
 * Training algorithms selectable through the "train/crf1d/<name>" interface.
 */
enum
{
    TRAIN_NONE                = 0,  /**< Unselected. */
    TRAIN_LBFGS               = 1,  /**< L-BFGS batch training. */
    TRAIN_L2SGD               = 2,  /**< Pegasos online training. */
    TRAIN_AVERAGED_PERCEPTRON = 3,  /**< Averaged perceptron. */
    TRAIN_PASSIVE_AGGRESSIVE  = 4,  /**< Passive-aggressive training. */
    TRAIN_AROW                = 5,  /**< AROW training. */
};
|
|
55
|
+
|
|
56
|
+
struct tag_crfsuite_train_internal;
|
|
57
|
+
typedef struct tag_crfsuite_train_internal crfsuite_train_internal_t;
|
|
58
|
+
|
|
59
|
+
struct tag_encoder;
|
|
60
|
+
typedef struct tag_encoder encoder_t;
|
|
61
|
+
|
|
62
|
+
/**
 * A view over a subset of the instances of a crfsuite_data_t.
 *
 * The view does not copy instances; it stores indices into
 * data->instances. dataset_init_trainset() / dataset_init_testset()
 * allocate the index array and dataset_finish() frees it.
 */
typedef struct
{
    crfsuite_data_t *data;  /**< The underlying data (not owned by the view). */
    int *perm;              /**< Indices into data->instances, in access order. */
    int num_instances;      /**< The number of elements in perm. */
} dataset_t;
|
|
68
|
+
|
|
69
|
+
void dataset_init_trainset(dataset_t *ds, crfsuite_data_t *data, int holdout);
|
|
70
|
+
void dataset_init_testset(dataset_t *ds, crfsuite_data_t *data, int holdout);
|
|
71
|
+
void dataset_finish(dataset_t *ds);
|
|
72
|
+
void dataset_shuffle(dataset_t *ds);
|
|
73
|
+
crfsuite_instance_t *dataset_get(dataset_t *ds, int i);
|
|
74
|
+
|
|
75
|
+
typedef void (*crfsuite_encoder_features_on_path_callback)(void *instance, int fid, floatval_t value);
|
|
76
|
+
|
|
77
|
+
/**
 * Internal data structure for a trainer instance
 * (stored in crfsuite_trainer_t::internal).
 */
struct tag_crfsuite_train_internal
{
    encoder_t *gm;              /**< Interface to the graphical model. */
    crfsuite_params_t *params;  /**< Parameter interface. */
    logging_t *lg;              /**< Logging interface. */
    int feature_type;           /**< Feature type (one of FTYPE_*). */
    int algorithm;              /**< Training algorithm (one of TRAIN_*). */
};
|
|
88
|
+
|
|
89
|
+
/**
|
|
90
|
+
* Interface for a graphical model.
|
|
91
|
+
*/
|
|
92
|
+
struct tag_encoder
{
    /** Opaque pointer; presumably the implementation's private state -- TODO confirm. */
    void *internal;

    /** Feature weights; presumably those last given to set_weights() -- TODO confirm. */
    const floatval_t *w;
    /** Scale factor for the weights; presumably the one given to set_weights() -- TODO confirm. */
    floatval_t scale;

    /** Data set; presumably the one given to initialize() -- TODO confirm. */
    dataset_t *ds;
    /** Current instance; presumably the one given to set_instance() -- TODO confirm. */
    const crfsuite_instance_t *inst;
    /** NOTE(review): looks like the computation level ("Level 0..3" below)
     *  reached for the current instance -- confirm against the encoder. */
    int level;

    /** Number of features. */
    int num_features;
    /** NOTE(review): looks like a capacity expressed in items -- confirm. */
    int cap_items;

    /**
     * Exchanges options.
     *  @param  self        The encoder instance.
     *  @param  params      The parameter interface.
     *  @param  mode        The direction of parameter exchange.
     *  @return             A status code.
     */
    int (*exchange_options)(encoder_t *self, crfsuite_params_t *params, int mode);

    /**
     * Initializes the encoder with a training data set.
     *  @param  self        The encoder instance.
     *  @param  ds          The data set for training.
     *  @param  lg          The logging interface.
     *  @return             A status code.
     */
    int (*initialize)(encoder_t *self, dataset_t *ds, logging_t *lg);

    /**
     * Compute the objective value and gradients for the whole data set.
     *  @param  self        The encoder instance.
     *  @param  ds          The data set.
     *  @param  w           The feature weights.
     *  @param  f           The pointer to a floatval_t variable to which the
     *                      objective value is stored by this function.
     *  @param  g           The pointer to the array that receives gradients.
     *  @return             A status code.
     */
    int (*objective_and_gradients_batch)(encoder_t *self, dataset_t *ds, const floatval_t *w, floatval_t *f, floatval_t *g);

    /**
     * Enumerates features along a label path.
     * Presumably invokes func(instance, fid, value) for the features that
     * fire on the given path of the instance -- TODO confirm.
     *  @param  self        The encoder instance.
     *  @param  inst        The instance.
     *  @param  path        The label path.
     *  @param  func        The callback receiving (instance, fid, value).
     *  @param  instance    The user pointer handed to the callback.
     *  @return             A status code.
     */
    int (*features_on_path)(encoder_t *self, const crfsuite_instance_t *inst, const int *path, crfsuite_encoder_features_on_path_callback func, void *instance);

    /**
     * Sets the feature weights (and their scale factor).
     *  @param  self        The encoder instance.
     *  @param  w           The array of feature weights.
     *  @param  scale       The scale factor that should be applied to the
     *                      feature weights.
     *  @return             A status code.
     */
    int (*set_weights)(encoder_t *self, const floatval_t *w, floatval_t scale);

    /* Instance-wise operations. */
    int (*set_instance)(encoder_t *self, const crfsuite_instance_t *inst);

    /* Level 0. */

    /* Level 1 (feature weights). */
    int (*score)(encoder_t *self, const int *path, floatval_t *ptr_score);
    int (*viterbi)(encoder_t *self, int *path, floatval_t *ptr_score);

    /* Level 2 (forward-backward). */
    int (*partition_factor)(encoder_t *self, floatval_t *ptr_pf);

    /* Level 3 (marginals). */
    int (*objective_and_gradients)(encoder_t *self, floatval_t *f, floatval_t *g, floatval_t gain, floatval_t weight);

    /* Presumably writes a model built from the weights w to filename -- TODO confirm. */
    int (*save_model)(encoder_t *self, const char *filename, const floatval_t *w, logging_t *lg);

    /* Releases the encoder; presumably frees self as well -- TODO confirm. */
    void (*release)(encoder_t *self);
};
|
|
167
|
+
|
|
168
|
+
/**
|
|
169
|
+
* \defgroup crf1d_encode.c
|
|
170
|
+
*/
|
|
171
|
+
/** @{ */
|
|
172
|
+
|
|
173
|
+
encoder_t *crf1d_create_encoder();
|
|
174
|
+
|
|
175
|
+
/** @} */
|
|
176
|
+
|
|
177
|
+
void holdout_evaluation(
|
|
178
|
+
encoder_t *gm,
|
|
179
|
+
dataset_t *testset,
|
|
180
|
+
const floatval_t *w,
|
|
181
|
+
logging_t *lg);
|
|
182
|
+
|
|
183
|
+
int crfsuite_train_lbfgs(
|
|
184
|
+
encoder_t *gm,
|
|
185
|
+
dataset_t *trainset,
|
|
186
|
+
dataset_t *testset,
|
|
187
|
+
crfsuite_params_t *params,
|
|
188
|
+
logging_t *lg,
|
|
189
|
+
floatval_t **ptr_w);
|
|
190
|
+
|
|
191
|
+
void crfsuite_train_lbfgs_init(crfsuite_params_t *params);
|
|
192
|
+
|
|
193
|
+
void crfsuite_train_averaged_perceptron_init(crfsuite_params_t *params);
|
|
194
|
+
|
|
195
|
+
int crfsuite_train_averaged_perceptron(
|
|
196
|
+
encoder_t *gm,
|
|
197
|
+
dataset_t *trainset,
|
|
198
|
+
dataset_t *testset,
|
|
199
|
+
crfsuite_params_t *params,
|
|
200
|
+
logging_t *lg,
|
|
201
|
+
floatval_t **ptr_w);
|
|
202
|
+
|
|
203
|
+
void crfsuite_train_l2sgd_init(crfsuite_params_t *params);
|
|
204
|
+
|
|
205
|
+
int crfsuite_train_l2sgd(
|
|
206
|
+
encoder_t *gm,
|
|
207
|
+
dataset_t *trainset,
|
|
208
|
+
dataset_t *testset,
|
|
209
|
+
crfsuite_params_t *params,
|
|
210
|
+
logging_t *lg,
|
|
211
|
+
floatval_t **ptr_w);
|
|
212
|
+
|
|
213
|
+
void crfsuite_train_passive_aggressive_init(crfsuite_params_t *params);
|
|
214
|
+
|
|
215
|
+
int crfsuite_train_passive_aggressive(
|
|
216
|
+
encoder_t *gm,
|
|
217
|
+
dataset_t *trainset,
|
|
218
|
+
dataset_t *testset,
|
|
219
|
+
crfsuite_params_t *params,
|
|
220
|
+
logging_t *lg,
|
|
221
|
+
floatval_t **ptr_w);
|
|
222
|
+
|
|
223
|
+
void crfsuite_train_arow_init(crfsuite_params_t *params);
|
|
224
|
+
|
|
225
|
+
int crfsuite_train_arow(
|
|
226
|
+
encoder_t *gm,
|
|
227
|
+
dataset_t *trainset,
|
|
228
|
+
dataset_t *testset,
|
|
229
|
+
crfsuite_params_t *params,
|
|
230
|
+
logging_t *lg,
|
|
231
|
+
floatval_t **ptr_w);
|
|
232
|
+
|
|
233
|
+
#endif /*__CRFSUITE_INTERNAL_H__*/
|
|
@@ -0,0 +1,302 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Implementation of the training interface (crfsuite_trainer_t).
|
|
3
|
+
*
|
|
4
|
+
* Copyright (c) 2007-2010, Naoaki Okazaki
|
|
5
|
+
* All rights reserved.
|
|
6
|
+
*
|
|
7
|
+
* Redistribution and use in source and binary forms, with or without
|
|
8
|
+
* modification, are permitted provided that the following conditions are met:
|
|
9
|
+
* * Redistributions of source code must retain the above copyright
|
|
10
|
+
* notice, this list of conditions and the following disclaimer.
|
|
11
|
+
* * Redistributions in binary form must reproduce the above copyright
|
|
12
|
+
* notice, this list of conditions and the following disclaimer in the
|
|
13
|
+
* documentation and/or other materials provided with the distribution.
|
|
14
|
+
* * Neither the names of the authors nor the names of its contributors
|
|
15
|
+
* may be used to endorse or promote products derived from this
|
|
16
|
+
* software without specific prior written permission.
|
|
17
|
+
*
|
|
18
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
19
|
+
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
20
|
+
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
21
|
+
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
|
22
|
+
* OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
23
|
+
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
24
|
+
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
25
|
+
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
|
26
|
+
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
|
27
|
+
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
28
|
+
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
29
|
+
*/
|
|
30
|
+
|
|
31
|
+
/* $Id$ */
|
|
32
|
+
|
|
33
|
+
#ifdef HAVE_CONFIG_H
|
|
34
|
+
#include <config.h>
|
|
35
|
+
#endif /*HAVE_CONFIG_H*/
|
|
36
|
+
|
|
37
|
+
#include <os.h>
|
|
38
|
+
|
|
39
|
+
#include <stdlib.h>
|
|
40
|
+
#include <string.h>
|
|
41
|
+
|
|
42
|
+
#include <crfsuite.h>
|
|
43
|
+
#include "crfsuite_internal.h"
|
|
44
|
+
#include "params.h"
|
|
45
|
+
#include "logging.h"
|
|
46
|
+
#include "crf1d.h"
|
|
47
|
+
|
|
48
|
+
static crfsuite_train_internal_t *crfsuite_train_new(int ftype, int algorithm)
|
|
49
|
+
{
|
|
50
|
+
crfsuite_train_internal_t *tr = (crfsuite_train_internal_t *)calloc(1, sizeof(crfsuite_train_internal_t));
|
|
51
|
+
if (tr != NULL)
|
|
52
|
+
{
|
|
53
|
+
tr->lg = (logging_t *)calloc(1, sizeof(logging_t));
|
|
54
|
+
tr->params = params_create_instance();
|
|
55
|
+
tr->feature_type = ftype;
|
|
56
|
+
tr->algorithm = algorithm;
|
|
57
|
+
|
|
58
|
+
tr->gm = crf1d_create_encoder();
|
|
59
|
+
tr->gm->exchange_options(tr->gm, tr->params, 0);
|
|
60
|
+
|
|
61
|
+
/* Initialize parameters for the training algorithm. */
|
|
62
|
+
switch (algorithm)
|
|
63
|
+
{
|
|
64
|
+
case TRAIN_LBFGS:
|
|
65
|
+
crfsuite_train_lbfgs_init(tr->params);
|
|
66
|
+
break;
|
|
67
|
+
case TRAIN_L2SGD:
|
|
68
|
+
crfsuite_train_l2sgd_init(tr->params);
|
|
69
|
+
break;
|
|
70
|
+
case TRAIN_AVERAGED_PERCEPTRON:
|
|
71
|
+
crfsuite_train_averaged_perceptron_init(tr->params);
|
|
72
|
+
break;
|
|
73
|
+
case TRAIN_PASSIVE_AGGRESSIVE:
|
|
74
|
+
crfsuite_train_passive_aggressive_init(tr->params);
|
|
75
|
+
break;
|
|
76
|
+
case TRAIN_AROW:
|
|
77
|
+
crfsuite_train_arow_init(tr->params);
|
|
78
|
+
break;
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
return tr;
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
/**
 * Destroys a trainer: releases the encoder and the parameter interface
 * (when present), frees the logging object and the internal state, and
 * finally frees the trainer itself.
 *
 *  @param  self        The trainer to destroy.
 */
static void crfsuite_train_delete(crfsuite_trainer_t *self)
{
    crfsuite_train_internal_t *state = (crfsuite_train_internal_t *)self->internal;

    if (state != NULL)
    {
        encoder_t *encoder = state->gm;
        if (encoder != NULL)
        {
            encoder->release(encoder);
        }

        crfsuite_params_t *options = state->params;
        if (options != NULL)
        {
            options->release(options);
        }

        free(state->lg);
        free(state);
    }

    free(self);
}
|
|
103
|
+
|
|
104
|
+
/**
 * Increments the reference counter of a trainer.
 *
 *  @param  tr          The trainer.
 *  @return             The value returned by crfsuite_interlocked_increment().
 */
static int crfsuite_train_addref(crfsuite_trainer_t *tr)
{
    const int refs = crfsuite_interlocked_increment(&tr->nref);
    return refs;
}
|
|
108
|
+
|
|
109
|
+
/**
 * Decrements the reference counter of a trainer and destroys the trainer
 * when the counter reaches zero.
 *
 *  @param  self        The trainer.
 *  @return             The counter value after the decrement.
 */
static int crfsuite_train_release(crfsuite_trainer_t *self)
{
    const int remaining = crfsuite_interlocked_decrement(&self->nref);

    if (remaining != 0)
    {
        return remaining;
    }

    /* Last reference dropped. */
    crfsuite_train_delete(self);
    return remaining;
}
|
|
118
|
+
|
|
119
|
+
/**
 * Stores the message callback and its user pointer in the trainer's
 * logging object.
 *
 *  @param  self        The trainer.
 *  @param  instance    The user pointer stored alongside the callback.
 *  @param  cbm         The logging callback.
 */
static void crfsuite_train_set_message_callback(crfsuite_trainer_t *self, void *instance, crfsuite_logging_callback cbm)
{
    logging_t *logger = ((crfsuite_train_internal_t *)self->internal)->lg;

    logger->instance = instance;
    logger->func = cbm;
}
|
|
125
|
+
|
|
126
|
+
/**
 * Returns the parameter interface of a trainer.
 *
 * addref() is called on the interface before it is returned.
 *
 *  @param  self        The trainer.
 *  @return             The parameter interface.
 */
static crfsuite_params_t *crfsuite_train_params(crfsuite_trainer_t *self)
{
    crfsuite_params_t *shared = ((crfsuite_train_internal_t *)self->internal)->params;

    shared->addref(shared);
    return shared;
}
|
|
133
|
+
|
|
134
|
+
static int crfsuite_train_train(
|
|
135
|
+
crfsuite_trainer_t *self,
|
|
136
|
+
const crfsuite_data_t *data,
|
|
137
|
+
const char *filename,
|
|
138
|
+
int holdout)
|
|
139
|
+
{
|
|
140
|
+
char *algorithm = NULL;
|
|
141
|
+
crfsuite_train_internal_t *tr = (crfsuite_train_internal_t *)self->internal;
|
|
142
|
+
logging_t *lg = tr->lg;
|
|
143
|
+
encoder_t *gm = tr->gm;
|
|
144
|
+
floatval_t *w = NULL;
|
|
145
|
+
dataset_t trainset;
|
|
146
|
+
dataset_t testset;
|
|
147
|
+
|
|
148
|
+
/* Prepare the data set(s) for training (and holdout evaluation). */
|
|
149
|
+
dataset_init_trainset(&trainset, (crfsuite_data_t *)data, holdout);
|
|
150
|
+
if (0 <= holdout)
|
|
151
|
+
{
|
|
152
|
+
dataset_init_testset(&testset, (crfsuite_data_t *)data, holdout);
|
|
153
|
+
}
|
|
154
|
+
|
|
155
|
+
/* Set the training set to the CRF, and generate features. */
|
|
156
|
+
gm->exchange_options(gm, tr->params, -1);
|
|
157
|
+
gm->initialize(gm, &trainset, lg);
|
|
158
|
+
|
|
159
|
+
/* Call the training algorithm. */
|
|
160
|
+
switch (tr->algorithm)
|
|
161
|
+
{
|
|
162
|
+
case TRAIN_LBFGS:
|
|
163
|
+
crfsuite_train_lbfgs(
|
|
164
|
+
gm,
|
|
165
|
+
&trainset,
|
|
166
|
+
(holdout != -1 ? &testset : NULL),
|
|
167
|
+
tr->params,
|
|
168
|
+
lg,
|
|
169
|
+
&w);
|
|
170
|
+
break;
|
|
171
|
+
case TRAIN_L2SGD:
|
|
172
|
+
crfsuite_train_l2sgd(
|
|
173
|
+
gm,
|
|
174
|
+
&trainset,
|
|
175
|
+
(holdout != -1 ? &testset : NULL),
|
|
176
|
+
tr->params,
|
|
177
|
+
lg,
|
|
178
|
+
&w);
|
|
179
|
+
break;
|
|
180
|
+
case TRAIN_AVERAGED_PERCEPTRON:
|
|
181
|
+
crfsuite_train_averaged_perceptron(
|
|
182
|
+
gm,
|
|
183
|
+
&trainset,
|
|
184
|
+
(holdout != -1 ? &testset : NULL),
|
|
185
|
+
tr->params,
|
|
186
|
+
lg,
|
|
187
|
+
&w);
|
|
188
|
+
break;
|
|
189
|
+
case TRAIN_PASSIVE_AGGRESSIVE:
|
|
190
|
+
crfsuite_train_passive_aggressive(
|
|
191
|
+
gm,
|
|
192
|
+
&trainset,
|
|
193
|
+
(holdout != -1 ? &testset : NULL),
|
|
194
|
+
tr->params,
|
|
195
|
+
lg,
|
|
196
|
+
&w);
|
|
197
|
+
break;
|
|
198
|
+
case TRAIN_AROW:
|
|
199
|
+
crfsuite_train_arow(
|
|
200
|
+
gm,
|
|
201
|
+
&trainset,
|
|
202
|
+
(holdout != -1 ? &testset : NULL),
|
|
203
|
+
tr->params,
|
|
204
|
+
lg,
|
|
205
|
+
&w);
|
|
206
|
+
break;
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
/* Store the model file. */
|
|
210
|
+
if (filename != NULL && *filename != '\0')
|
|
211
|
+
{
|
|
212
|
+
gm->save_model(gm, filename, w, lg);
|
|
213
|
+
}
|
|
214
|
+
|
|
215
|
+
if (0 <= holdout)
|
|
216
|
+
{
|
|
217
|
+
dataset_finish(&testset);
|
|
218
|
+
}
|
|
219
|
+
dataset_finish(&trainset);
|
|
220
|
+
free(w);
|
|
221
|
+
|
|
222
|
+
return 0;
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
int crf1de_create_instance(const char *interface, void **ptr)
|
|
226
|
+
{
|
|
227
|
+
int ftype = FTYPE_NONE;
|
|
228
|
+
int algorithm = TRAIN_NONE;
|
|
229
|
+
|
|
230
|
+
/* Check if the interface name begins with "train/". */
|
|
231
|
+
if (strncmp(interface, "train/", 6) != 0)
|
|
232
|
+
{
|
|
233
|
+
return 1;
|
|
234
|
+
}
|
|
235
|
+
interface += 6;
|
|
236
|
+
|
|
237
|
+
/* Obtain the feature type. */
|
|
238
|
+
if (strncmp(interface, "crf1d/", 6) == 0)
|
|
239
|
+
{
|
|
240
|
+
ftype = FTYPE_CRF1D;
|
|
241
|
+
interface += 6;
|
|
242
|
+
}
|
|
243
|
+
else
|
|
244
|
+
{
|
|
245
|
+
return 1;
|
|
246
|
+
}
|
|
247
|
+
|
|
248
|
+
/* Obtain the training algorithm. */
|
|
249
|
+
if (strcmp(interface, "lbfgs") == 0)
|
|
250
|
+
{
|
|
251
|
+
algorithm = TRAIN_LBFGS;
|
|
252
|
+
}
|
|
253
|
+
else if (strcmp(interface, "l2sgd") == 0)
|
|
254
|
+
{
|
|
255
|
+
algorithm = TRAIN_L2SGD;
|
|
256
|
+
}
|
|
257
|
+
else if (strcmp(interface, "averaged-perceptron") == 0)
|
|
258
|
+
{
|
|
259
|
+
algorithm = TRAIN_AVERAGED_PERCEPTRON;
|
|
260
|
+
}
|
|
261
|
+
else if (strcmp(interface, "passive-aggressive") == 0)
|
|
262
|
+
{
|
|
263
|
+
algorithm = TRAIN_PASSIVE_AGGRESSIVE;
|
|
264
|
+
}
|
|
265
|
+
else if (strcmp(interface, "arow") == 0)
|
|
266
|
+
{
|
|
267
|
+
algorithm = TRAIN_AROW;
|
|
268
|
+
}
|
|
269
|
+
else
|
|
270
|
+
{
|
|
271
|
+
return 1;
|
|
272
|
+
}
|
|
273
|
+
|
|
274
|
+
/* Create an instance. */
|
|
275
|
+
if (ftype != FTYPE_NONE && algorithm != TRAIN_NONE)
|
|
276
|
+
{
|
|
277
|
+
crfsuite_trainer_t *trainer = (crfsuite_trainer_t *)calloc(1, sizeof(crfsuite_trainer_t));
|
|
278
|
+
if (trainer != NULL)
|
|
279
|
+
{
|
|
280
|
+
trainer->internal = crfsuite_train_new(ftype, algorithm);
|
|
281
|
+
if (trainer->internal != NULL)
|
|
282
|
+
{
|
|
283
|
+
trainer->nref = 1;
|
|
284
|
+
trainer->addref = crfsuite_train_addref;
|
|
285
|
+
trainer->release = crfsuite_train_release;
|
|
286
|
+
trainer->params = crfsuite_train_params;
|
|
287
|
+
trainer->set_message_callback = crfsuite_train_set_message_callback;
|
|
288
|
+
trainer->train = crfsuite_train_train;
|
|
289
|
+
|
|
290
|
+
*ptr = trainer;
|
|
291
|
+
return 0;
|
|
292
|
+
}
|
|
293
|
+
else
|
|
294
|
+
{
|
|
295
|
+
free(trainer);
|
|
296
|
+
trainer = NULL;
|
|
297
|
+
}
|
|
298
|
+
}
|
|
299
|
+
}
|
|
300
|
+
|
|
301
|
+
return 1;
|
|
302
|
+
}
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Implementation for data sets (dataset_t).
|
|
3
|
+
*
|
|
4
|
+
* Copyright (c) 2007-2010, Naoaki Okazaki
|
|
5
|
+
* All rights reserved.
|
|
6
|
+
*
|
|
7
|
+
* Redistribution and use in source and binary forms, with or without
|
|
8
|
+
* modification, are permitted provided that the following conditions are met:
|
|
9
|
+
* * Redistributions of source code must retain the above copyright
|
|
10
|
+
* notice, this list of conditions and the following disclaimer.
|
|
11
|
+
* * Redistributions in binary form must reproduce the above copyright
|
|
12
|
+
* notice, this list of conditions and the following disclaimer in the
|
|
13
|
+
* documentation and/or other materials provided with the distribution.
|
|
14
|
+
* * Neither the names of the authors nor the names of its contributors
|
|
15
|
+
* may be used to endorse or promote products derived from this
|
|
16
|
+
* software without specific prior written permission.
|
|
17
|
+
*
|
|
18
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
19
|
+
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
20
|
+
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
21
|
+
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
|
22
|
+
* OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
23
|
+
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
24
|
+
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
25
|
+
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
|
26
|
+
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
|
27
|
+
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
28
|
+
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
29
|
+
*/
|
|
30
|
+
|
|
31
|
+
/* $Id$ */
|
|
32
|
+
|
|
33
|
+
#ifdef HAVE_CONFIG_H
|
|
34
|
+
#include <config.h>
|
|
35
|
+
#endif /*HAVE_CONFIG_H*/
|
|
36
|
+
|
|
37
|
+
#include <os.h>
|
|
38
|
+
|
|
39
|
+
#include <stdlib.h>
|
|
40
|
+
#include <crfsuite.h>
|
|
41
|
+
#include "crfsuite_internal.h"
|
|
42
|
+
|
|
43
|
+
/**
 * Initializes a data set view with the training instances, i.e. every
 * instance of data whose group differs from holdout.
 *
 * The index array is allocated here and must be freed with
 * dataset_finish(). If there is no matching instance, or the allocation
 * fails, the view is left empty (perm == NULL, num_instances == 0) so
 * that no caller indexes a NULL array.
 *
 *  @param  ds          The data set view to initialize.
 *  @param  data        The underlying data.
 *  @param  holdout     The group number of the instances to exclude.
 */
void dataset_init_trainset(dataset_t *ds, crfsuite_data_t *data, int holdout)
{
    int i, n = 0;

    /* First pass: count the instances that belong to the view. */
    for (i = 0; i < data->num_instances; ++i)
    {
        if (data->instances[i].group != holdout)
        {
            ++n;
        }
    }

    ds->data = data;
    ds->num_instances = 0;
    ds->perm = NULL;
    if (n == 0)
    {
        return;
    }

    ds->perm = (int *)malloc(sizeof(int) * (size_t)n);
    if (ds->perm == NULL)
    {
        /* Out of memory: keep the view empty rather than writing to NULL. */
        return;
    }

    /* Second pass: record the indices of the selected instances. */
    for (i = 0; i < data->num_instances; ++i)
    {
        if (data->instances[i].group != holdout)
        {
            ds->perm[ds->num_instances++] = i;
        }
    }
}
|
|
68
|
+
|
|
69
|
+
/**
 * Initializes a data set view with the holdout instances, i.e. every
 * instance of data whose group equals holdout.
 *
 * The index array is allocated here and must be freed with
 * dataset_finish(). If there is no matching instance, or the allocation
 * fails, the view is left empty (perm == NULL, num_instances == 0) so
 * that no caller indexes a NULL array.
 *
 *  @param  ds          The data set view to initialize.
 *  @param  data        The underlying data.
 *  @param  holdout     The group number of the instances to include.
 */
void dataset_init_testset(dataset_t *ds, crfsuite_data_t *data, int holdout)
{
    int i, n = 0;

    /* First pass: count the instances that belong to the view. */
    for (i = 0; i < data->num_instances; ++i)
    {
        if (data->instances[i].group == holdout)
        {
            ++n;
        }
    }

    ds->data = data;
    ds->num_instances = 0;
    ds->perm = NULL;
    if (n == 0)
    {
        return;
    }

    ds->perm = (int *)malloc(sizeof(int) * (size_t)n);
    if (ds->perm == NULL)
    {
        /* Out of memory: keep the view empty rather than writing to NULL. */
        return;
    }

    /* Second pass: record the indices of the selected instances. */
    for (i = 0; i < data->num_instances; ++i)
    {
        if (data->instances[i].group == holdout)
        {
            ds->perm[ds->num_instances++] = i;
        }
    }
}
|
|
94
|
+
|
|
95
|
+
/**
 * Releases the index array of a data set view.
 *
 * The view is reset to the empty state afterwards, so calling this
 * function twice on the same view is harmless. The underlying data is
 * not freed.
 *
 *  @param  ds          The data set view.
 */
void dataset_finish(dataset_t *ds)
{
    free(ds->perm);
    ds->perm = NULL;
    ds->num_instances = 0;
}
|
|
99
|
+
|
|
100
|
+
/**
 * Shuffles the access order of a data set view in place.
 *
 * Every position is swapped once with a position drawn by rand() from
 * the whole index array.
 *
 * NOTE(review): drawing the partner from the whole range (rather than
 * from the not-yet-visited remainder, as Fisher-Yates does) does not
 * yield a uniform permutation; kept as is so that results for a given
 * rand() seed do not change.
 *
 *  @param  ds          The data set view.
 */
void dataset_shuffle(dataset_t *ds)
{
    const int count = ds->num_instances;
    int *order = ds->perm;
    int pos = 0;

    while (pos < count)
    {
        const int other = rand() % count;
        const int held = order[other];

        order[other] = order[pos];
        order[pos] = held;
        ++pos;
    }
}
|
|
111
|
+
|
|
112
|
+
/**
 * Returns the i-th instance of a data set view, following the view's
 * current access order. No bounds check is performed.
 *
 *  @param  ds          The data set view.
 *  @param  i           The position within the view.
 *  @return             A pointer to the instance inside the underlying data.
 */
crfsuite_instance_t *dataset_get(dataset_t *ds, int i)
{
    const int index = ds->perm[i];

    return ds->data->instances + index;
}
|