wapiti 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.autotest +13 -0
- data/.gitignore +5 -0
- data/.rspec +3 -0
- data/Gemfile +6 -0
- data/LICENSE +30 -0
- data/README.md +153 -0
- data/Rakefile +33 -0
- data/ext/wapiti/bcd.c +392 -0
- data/ext/wapiti/decoder.c +535 -0
- data/ext/wapiti/decoder.h +46 -0
- data/ext/wapiti/extconf.rb +8 -0
- data/ext/wapiti/gradient.c +818 -0
- data/ext/wapiti/gradient.h +81 -0
- data/ext/wapiti/lbfgs.c +294 -0
- data/ext/wapiti/model.c +296 -0
- data/ext/wapiti/model.h +100 -0
- data/ext/wapiti/native.c +1238 -0
- data/ext/wapiti/native.h +15 -0
- data/ext/wapiti/options.c +278 -0
- data/ext/wapiti/options.h +91 -0
- data/ext/wapiti/pattern.c +395 -0
- data/ext/wapiti/pattern.h +56 -0
- data/ext/wapiti/progress.c +167 -0
- data/ext/wapiti/progress.h +43 -0
- data/ext/wapiti/quark.c +272 -0
- data/ext/wapiti/quark.h +46 -0
- data/ext/wapiti/reader.c +553 -0
- data/ext/wapiti/reader.h +73 -0
- data/ext/wapiti/rprop.c +191 -0
- data/ext/wapiti/sequence.h +148 -0
- data/ext/wapiti/sgdl1.c +218 -0
- data/ext/wapiti/thread.c +171 -0
- data/ext/wapiti/thread.h +42 -0
- data/ext/wapiti/tools.c +202 -0
- data/ext/wapiti/tools.h +54 -0
- data/ext/wapiti/trainers.h +39 -0
- data/ext/wapiti/vmath.c +372 -0
- data/ext/wapiti/vmath.h +51 -0
- data/ext/wapiti/wapiti.c +288 -0
- data/ext/wapiti/wapiti.h +45 -0
- data/lib/wapiti.rb +30 -0
- data/lib/wapiti/errors.rb +17 -0
- data/lib/wapiti/model.rb +49 -0
- data/lib/wapiti/options.rb +113 -0
- data/lib/wapiti/utility.rb +15 -0
- data/lib/wapiti/version.rb +3 -0
- data/spec/fixtures/ch.mod +18550 -0
- data/spec/fixtures/chpattern.txt +52 -0
- data/spec/fixtures/chtest.txt +1973 -0
- data/spec/fixtures/chtrain.txt +19995 -0
- data/spec/fixtures/nppattern.txt +52 -0
- data/spec/fixtures/nptest.txt +1973 -0
- data/spec/fixtures/nptrain.txt +19995 -0
- data/spec/fixtures/pattern.txt +14 -0
- data/spec/fixtures/test.txt +60000 -0
- data/spec/fixtures/train.txt +1200 -0
- data/spec/spec_helper.rb +21 -0
- data/spec/wapiti/model_spec.rb +173 -0
- data/spec/wapiti/native_spec.rb +12 -0
- data/spec/wapiti/options_spec.rb +175 -0
- data/spec/wapiti/utility_spec.rb +22 -0
- data/wapiti.gemspec +35 -0
- metadata +178 -0
data/ext/wapiti/reader.h
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
/*
 *      Wapiti - A linear-chain CRF tool
 *
 * Copyright (c) 2009-2011  CNRS
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef reader_h
#define reader_h

#include <stdbool.h>
#include <stdio.h>

#include "wapiti.h"
#include "pattern.h"
#include "quark.h"
#include "sequence.h"

/* rdr_t:
 *   The reader object holds all the information needed to parse the input
 *   file: the patterns and the quarks (interning tables) for labels and
 *   observations. We keep separate counts for unigram and bigram patterns
 *   for simpler allocation of sequences. We also store the expected number
 *   of columns in the input data to check that patterns are applicable.
 */
typedef struct rdr_s rdr_t;
struct rdr_s {
	bool    maxent;     //      Is this a maxent reader
	int     npats;      //  P   Total number of patterns
	int     nuni, nbi;  //      Number of unigram and bigram patterns
	int     ntoks;      //      Expected number of tokens in input
	pat_t **pats;       // [P]  List of precompiled patterns
	qrk_t  *lbl;        //      Labels database
	qrk_t  *obs;        //      Observation database
};

/* Lifecycle of the reader and of the objects it produces. The rdr_freeraw /
 * rdr_freeseq / rdr_freedat helpers release the objects returned by the
 * reading functions below. */
rdr_t *rdr_new(bool maxent);
void rdr_free(rdr_t *rdr);
void rdr_freeraw(raw_t *raw);
void rdr_freeseq(seq_t *seq);
void rdr_freedat(dat_t *dat);

/* Reading: load patterns, then convert input data through the
 * raw_t -> seq_t -> dat_t pipeline (see sequence.h). The <lbl> flag tells
 * whether the last column of the input must be interpreted as a label. */
void rdr_loadpat(rdr_t *rdr, FILE *file);
raw_t *rdr_readraw(rdr_t *rdr, FILE *file);
seq_t *rdr_raw2seq(rdr_t *rdr, const raw_t *raw, bool lbl);
seq_t *rdr_readseq(rdr_t *rdr, FILE *file, bool lbl);
dat_t *rdr_readdat(rdr_t *rdr, FILE *file, bool lbl);

/* Persistence of the reader state (patterns and databases) to model files. */
void rdr_load(rdr_t *rdr, FILE *file);
void rdr_save(const rdr_t *rdr, FILE *file);

#endif
|
73
|
+
|
data/ext/wapiti/rprop.c
ADDED
@@ -0,0 +1,191 @@
|
|
1
|
+
/*
 *      Wapiti - A linear-chain CRF tool
 *
 * Copyright (c) 2009-2011  CNRS
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
#include <math.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>

#include "wapiti.h"
#include "gradient.h"
#include "model.h"
#include "options.h"
#include "progress.h"
#include "tools.h"
#include "thread.h"
#include "vmath.h"

/* sign/sqr: kept as macros for use on plain doubles; arguments are evaluated
 * more than once, so never call them with side-effecting expressions. */
#define sign(v) ((v) < 0.0 ? -1.0 : ((v) > 0.0 ? 1.0 : 0.0))
#define sqr(v)  ((v) * (v))

/******************************************************************************
 * Resilient propagation optimizer
 *
 *   This is an implementation of the RPROP algorithm (resilient propagation)
 *   described by Riedmiller and Braun in [1], with an adaptation to be usable
 *   with l1 regularization.
 *   The adaptation consists of using a pseudo-gradient similar to the one
 *   used in OWL-QN to choose an orthant at iteration steps and projecting the
 *   step in this orthant before the weight update.
 *
 *   [1] A direct adaptive method for faster backpropagation learning: The
 *       RPROP algorithm, Martin Riedmiller and Heinrich Braun, IEEE
 *       International Conference on Neural Networks, San Francisco, USA,
 *       586-591, March 1993.
 ******************************************************************************/
typedef struct rprop_s rprop_t;
struct rprop_s {
	mdl_t  *mdl;   // Model whose weights are being trained
	double *xp;    // [F] Previous weights (weight backtracking); may be
	               //     NULL when backtracking/projection is disabled
	double *stp;   // [F] Per-feature step size
	double *g;     // [F] Current gradient
	double *gp;    // [F] Gradient from the previous iteration
};
|
67
|
+
|
68
|
+
/* trn_rpropsub:
 *   Partial update of the weight vector, including partial gradient
 *   projection in case of l1 regularisation. The sub-vector updated depends
 *   on the <id> and <cnt> parameters given; the job scheduling system is not
 *   used here as we can easily split the processing in equal parts.
 *   Worker <id> of <cnt> handles features [F*id/cnt, F*(id+1)/cnt).
 */
static void trn_rpropsub(job_t *job, int id, int cnt, rprop_t *st) {
	unused(job);
	mdl_t *mdl = st->mdl;
	const size_t F = mdl->nftr;
	const double stpmin = mdl->opt->rprop.stpmin;
	const double stpmax = mdl->opt->rprop.stpmax;
	const double stpinc = mdl->opt->rprop.stpinc;
	const double stpdec = mdl->opt->rprop.stpdec;
	// wbt (weight backtracking) is enabled for every algorithm name except
	// exactly "rprop-" (strcmp returns non-zero on mismatch).
	const bool   wbt = strcmp(mdl->opt->algo, "rprop-");
	const double rho1 = mdl->opt->rho1;
	// l1 == 0: no l1 penalty; l1 == 1: pseudo-gradient projection;
	// l1 >= 2: cutoff scheme (rprop.cutoff + 1).
	const int    l1 = (rho1 != 0.0) ? mdl->opt->rprop.cutoff + 1 : 0;
	double *x  = mdl->theta;
	double *xp = st->xp, *stp = st->stp;
	double *g  = st->g,  *gp  = st->gp;
	const size_t from = F * id / cnt;
	const size_t to   = F * (id + 1) / cnt;
	for (size_t f = from; f < to; f++) {
		double pg = g[f];
		// If there is an l1 component in the regularization, we either
		// project the gradient in the current orthant or check for
		// cutdown depending on the projection scheme wanted.
		if (l1 == 1) {
			if      (x[f] < 0.0)   pg -= rho1;
			else if (x[f] > 0.0)   pg += rho1;
			else if (g[f] < -rho1) pg += rho1;
			else if (g[f] > rho1)  pg -= rho1;
			else                   pg  = 0.0;
		} else if (l1 && sqr(g[f] + rho1 * sign(x[f])) < sqr(rho1)) {
			// NOTE(review): this branch reads xp[f]; xp may be
			// NULL when wbt is false or cutoff is set — confirm
			// callers always allocate xp before reaching here.
			if (x[f] == 0.0 || (   gp[f] * g[f] < 0.0
			                    && xp[f] * x[f] < 0.0)) {
				if (wbt)
					xp[f] = x[f];
				x[f] = 0.0;
				gp[f] = g[f];
				continue;
			}
		}
		// Next we adjust the step depending on the new and previous
		// gradient values: grow on agreement, shrink on sign change.
		if (gp[f] * pg > 0.0)
			stp[f] = min(stp[f] * stpinc, stpmax);
		else if (gp[f] * pg < 0.0)
			stp[f] = max(stp[f] * stpdec, stpmin);
		// Finally update the weight. If there is an l1 penalty and the
		// pseudo-gradient projection is used, we have to project the
		// update back into the chosen orthant.
		if (!wbt || gp[f] * pg > 0.0) {
			double dlt = stp[f] * -sign(g[f]);
			if (l1 == 1 && dlt * pg >= 0.0)
				dlt = 0.0;
			if (wbt)
				xp[f] = x[f];
			x[f] += dlt;
		} else if (gp[f] * pg < 0.0) {
			// Sign change with backtracking: undo the last step and
			// clear the gradient so the next iteration re-decides.
			x[f] = xp[f];
			g[f] = 0.0;
		} else {
			xp[f] = x[f];
			if (l1 != 1)
				x[f] += stp[f] * -sign(pg);
		}
		gp[f] = g[f];
	}
}
|
138
|
+
|
139
|
+
/* trn_rprop:
 *   Train the model with the RPROP algorithm: allocate the per-feature state
 *   (step sizes, previous gradient and optionally previous weights for
 *   backtracking), then alternate distributed gradient computation with
 *   parallel per-feature weight updates (trn_rpropsub) until the iteration
 *   budget is exhausted or the user requests a stop.
 */
void trn_rprop(mdl_t *mdl) {
	const size_t F = mdl->nftr;
	const int    K = mdl->opt->maxiter;
	const size_t W = mdl->opt->nthread;
	// wbt is true for every algorithm name except exactly "rprop-".
	const bool   wbt = strcmp(mdl->opt->algo, "rprop-");
	const int    cut = mdl->opt->rprop.cutoff;
	// Allocate state memory and initialize it. xp is only needed when
	// weight backtracking is enabled and the cutoff scheme is not used.
	// NOTE(review): trn_rpropsub reads xp in the (wbt && cut) case as
	// well — confirm xp really may stay NULL under that configuration.
	double *xp = NULL, *stp = xvm_new(F);
	double *g = xvm_new(F), *gp = xvm_new(F);
	if (wbt && !cut)
		xp = xvm_new(F);
	for (size_t f = 0; f < F; f++) {
		if (wbt && !cut)
			xp[f] = 0.0;
		gp[f] = 0.0;
		stp[f] = 0.1;   // Common RPROP initial step size
	}
	// Prepare the rprop state used to send information to the rprop
	// workers about updating weights using the gradient. All workers
	// share the same state object and split the feature range by id.
	rprop_t *st = wapiti_xmalloc(sizeof(rprop_t));
	st->mdl = mdl;
	st->xp = xp; st->stp = stp;
	st->g  = g;  st->gp  = gp;
	rprop_t *rprop[W];
	for (size_t w = 0; w < W; w++)
		rprop[w] = st;
	// Prepare the gradient states for the distributed gradient
	// computation. Worker 0 accumulates directly into <g>.
	grd_t *grds[W];
	grds[0] = grd_new(mdl, g);
	for (size_t w = 1; w < W; w++)
		grds[w] = grd_new(mdl, xvm_new(F));
	// And iterate the gradient computation / weight update process until
	// convergence or stop request.
	for (int k = 0; !uit_stop && k < K; k++) {
		double fx = grd_gradient(mdl, g, grds);
		if (uit_stop)
			break;
		mth_spawn((func_t *)trn_rpropsub, W, (void **)rprop, 0, 0);
		if (uit_progress(mdl, k + 1, fx) == false)
			break;
	}
	// Free all allocated memory.
	if (wbt && !cut)
		xvm_free(xp);
	xvm_free(stp);   // Fix: <stp> was allocated with xvm_new but never freed
	xvm_free(g);
	xvm_free(gp);
	for (size_t w = 1; w < W; w++)
		xvm_free(grds[w]->g);
	for (size_t w = 0; w < W; w++)
		grd_free(grds[w]);
	free(st);
}
|
191
|
+
|
@@ -0,0 +1,148 @@
|
|
1
|
+
/*
 *      Wapiti - A linear-chain CRF tool
 *
 * Copyright (c) 2009-2011  CNRS
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef sequence_h
#define sequence_h

#include <stddef.h>

#include "wapiti.h"

/*******************************************************************************
 * Sequences and Dataset objects
 *
 *   Sequences represent the input data fed by the user to Wapiti, either for
 *   training or labelling. The internal form used here is very different from
 *   the data read from files and the conversion process is done in three
 *   steps illustrated here:
 *         +------+     +-------+     +-------+     +-------+
 *         | FILE | --> | raw_t | --> | tok_t | --> | seq_t |
 *         +------+     +-------+     +-------+     +-------+
 *   First the sequence is read as a set of lines from the input file, this
 *   gives a raw_t object. Next this set of lines is split in tokens and
 *   eventually the last one is separated as it will become a label; this
 *   results in a tok_t object.
 *   The last step consists in applying all the patterns given by the user to
 *   extract from these tokens the observations made on the sequence in order
 *   to build the seq_t object which can be used by the trainer and tagger.
 *
 *   A dataset object is just a container for a list of sequences in internal
 *   form used to store either the training or the development set.
 *
 *   All the conversion process is driven by the reader object and, as it is
 *   responsible for creating the objects with a quite special allocation
 *   scheme, we just have to implement functions for freeing these objects
 *   here.
 ******************************************************************************/

/* raw_t:
 *   Data-structure representing a raw sequence as a set of lines read from
 *   the input file. This is the result of the first step of the interning
 *   process. We keep this form separate from the tokenized one as we want to
 *   be able to output the sequence as it was read in labelling mode.
 *
 *   This represents a sequence of length <len> and for each position 't' you
 *   find the corresponding line at <lines>[t].
 *
 *   The <lines> array is allocated with the data structure (flexible array
 *   member), and the individual lines are allocated separately.
 */
typedef struct raw_s raw_t;
struct raw_s {
	int   len;      //   T     Sequence length
	char *lines[];  //  [T]    Raw lines directly from file
};

/* tok_t:
 *   Data-structure representing a tokenized sequence. This is the result of
 *   the second step of the interning process, after the raw sequence has been
 *   split in tokens and eventual labels separated from the observations.
 *
 *   For each position 't' in the sequence of length <len>, you find at
 *   <lbl>[t] the eventual label provided in the input file, and at <toks>[t]
 *   a list of string tokens of length <cnts>[t].
 *
 *   Memory allocation here is a bit special as the first token at each
 *   position points to a memory block which holds a copy of the raw line.
 *   All other tokens and the label are pointers into this block. This reduces
 *   memory fragmentation.
 */
typedef struct tok_s tok_t;
struct tok_s {
	int    len;     //   T     Sequence length
	char **lbl;     //  [T]    List of labels strings
	int   *cnts;    //  [T]    Length of tokens lists
	char **toks[];  //  [T][]  Tokens lists
};

/* seq_t:
 *   Data-structure representing a sequence of length <len> in the internal
 *   form used by the trainers and the tagger. For each position 't' in the
 *   sequence (0 <= t < <len>) there are some observations made on the data
 *   and an eventual label if provided in the input file.
 *
 *   There are two kinds of features: unigram and bigram ones, built by
 *   combining one observation and one or two labels. At position 't', the
 *   unigram features are built using the list of observations from <uobs>[t]
 *   which contains <ucnt>[t] items, and the label at <lbl>[t]. The bigram
 *   features are obtained in the same way using <bobs> and <bcnt>, and have
 *   to be combined also with <lbl>[t-1].
 *
 *   If the sequence is read from a file without labels, as is the case in
 *   labelling mode, the <lbl> field will be NULL and so the sequence cannot
 *   be used for training.
 *
 *   The <raw> field is private and used internally for efficient memory
 *   allocation. This allows allocating <lbl>, <*cnt>, and all the lists in
 *   <*obs> with the data structure itself.
 */
typedef struct pos_s pos_t;
typedef struct seq_s seq_t;
struct seq_s {
	int     len;
	size_t *raw;
	struct pos_s {
		size_t  lbl;
		size_t  ucnt, bcnt;
		size_t *uobs, *bobs;
	} pos[];
};

/* dat_t:
 *   Data-structure representing a full dataset: a collection of sequences
 *   ready to be used for training or to be labelled. It keeps track of the
 *   maximum sequence length as the trainer needs this for memory allocation.
 *   The dataset contains <nseq> sequences stored in <seq>. These sequences
 *   are labelled only if <lbl> is true.
 */
typedef struct dat_s dat_t;
struct dat_s {
	bool    lbl;    //         True iff sequences are labelled
	int     mlen;   //         Length of the longest sequence in the set
	size_t  nseq;   //   S     Number of sequences in the set
	seq_t **seq;    //  [S]    List of sequences
};

#endif
|
data/ext/wapiti/sgdl1.c
ADDED
@@ -0,0 +1,218 @@
|
|
1
|
+
/*
 *      Wapiti - A linear-chain CRF tool
 *
 * Copyright (c) 2009-2011  CNRS
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
#include <math.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>

#include "wapiti.h"
#include "gradient.h"
#include "model.h"
#include "options.h"
#include "progress.h"
#include "sequence.h"
#include "tools.h"

/******************************************************************************
 * The SGD-L1 trainer
 *
 *   Implementation of the stochastic gradient descent with L1 penalty
 *   described in [1] by Tsuruoka et al. This allows building really sparse
 *   models with the SGD method.
 *
 *   [1] Stochastic gradient descent training for L1-regularized log-linear
 *       models with cumulative penalty, Yoshimasa Tsuruoka and Jun'ichi
 *       Tsujii and Sophia Ananiadou, in Proceedings of the ACL and the 4th
 *       IJCNLP of the AFNLP, pages 477-485, August 2009
 ******************************************************************************/

/* sgd_idx_t:
 *   Per-sequence index of active observations: two none-terminated lists,
 *   one for unigram observations and one for bigram observations. */
typedef struct sgd_idx_s {
	size_t *uobs;
	size_t *bobs;
} sgd_idx_t;

/* applypenalty:
 *   This macro is quite ugly as it does a lot of things and uses local
 *   variables of the function below (w, u, q). I'm sorry for this but it
 *   allows not duplicating the code below. Due to the way unigram and bigram
 *   observations are stored we must use it twice. As this macro is dangerous
 *   when called outside of sgd-l1 we #undef it just after.
 *   This macro matches exactly the APPLYPENALTY function defined in [1]
 *   p. 481 and the formula in the middle of p. 480: it clips the weight
 *   toward zero by the cumulative penalty, never letting it cross zero.
 */
#define applypenalty(f) do {                               \
	const double z = w[f];                             \
	if      (z > 0.0) w[f] = max(0.0, z - (u + q[f])); \
	else if (z < 0.0) w[f] = min(0.0, z + (u - q[f])); \
	q[f] += w[f] - z;                                  \
} while (false)
|
73
|
+
|
74
|
+
/* sgd_add:
 *   Add the <new> value to the array <obs> of current size <*cnt>. If the
 *   value is already present we do nothing, else we append it and bump the
 *   count. The caller must guarantee the array has room for one more entry.
 */
static void sgd_add(size_t *obs, size_t *cnt, size_t new) {
	// First check if the value is already in the array. We do a linear
	// probing as it is simpler, and since these arrays will be very short
	// in practice, it's efficient enough.
	for (size_t p = 0; p < *cnt; p++)
		if (obs[p] == new)
			return;
	// Insert the new value at the end since we have not found it.
	obs[*cnt] = new;
	*cnt = *cnt + 1;
}
|
89
|
+
|
90
|
+
/* trn_sgdl1:
 *   Train the model with the SGD-l1 algorithm described by Tsuruoka et al.
 *   Builds a per-sequence index of active observations, then repeatedly
 *   processes the sequences in random order, applying the gradient with an
 *   exponentially decaying learning rate and the cumulative l1 penalty.
 */
void trn_sgdl1(mdl_t *mdl) {
	const size_t Y = mdl->nlbl;
	const size_t F = mdl->nftr;
	const int U = mdl->reader->nuni;
	const int B = mdl->reader->nbi;
	const int S = mdl->train->nseq;
	const int K = mdl->opt->maxiter;
	double *w = mdl->theta;
	// First we have to build an index which holds, for each sequence, the
	// list of active observations.
	// The index is a simple table indexed by sequence number. Each entry
	// points to two lists of observations terminated by <none>, one for
	// unigram obss and one for bigram obss.
	info("    - Build the index\n");
	sgd_idx_t *idx = wapiti_xmalloc(sizeof(sgd_idx_t) * S);
	for (int s = 0; s < S; s++) {
		const seq_t *seq = mdl->train->seq[s];
		const int T = seq->len;
		// NOTE(review): VLAs sized U*T+1 / B*T+1 live on the stack;
		// for very long sequences with many patterns this could
		// overflow the stack — confirm typical sizes are bounded.
		size_t uobs[U * T + 1], ucnt = 0;
		size_t bobs[B * T + 1], bcnt = 0;
		for (int t = 0; t < seq->len; t++) {
			const pos_t *pos = &seq->pos[t];
			for (size_t p = 0; p < pos->ucnt; p++)
				sgd_add(uobs, &ucnt, pos->uobs[p]);
			for (size_t p = 0; p < pos->bcnt; p++)
				sgd_add(bobs, &bcnt, pos->bobs[p]);
		}
		uobs[ucnt++] = none;
		bobs[bcnt++] = none;
		idx[s].uobs = wapiti_xmalloc(sizeof(size_t) * ucnt);
		idx[s].bobs = wapiti_xmalloc(sizeof(size_t) * bcnt);
		memcpy(idx[s].uobs, uobs, ucnt * sizeof(size_t));
		memcpy(idx[s].bobs, bobs, bcnt * sizeof(size_t));
	}
	info("      Done\n");
	// We will process sequences in random order in each iteration, so we
	// will have to permute them. The current permutation is stored in a
	// vector called <perm> shuffled at the start of each iteration. We
	// just initialize it with the identity permutation.
	// As we use the same gradient function as the other trainers, we need
	// an array to store it. These functions accumulate the gradient so we
	// need to clear it at start and before each new computation. As we
	// know which features are active and so which gradient cells are
	// updated, we can clear them selectively instead of fully clearing
	// the gradient each time.
	// We also need an additional vector named <q> which holds the penalty
	// already applied to each feature.
	int *perm = wapiti_xmalloc(sizeof(int) * S);
	for (int s = 0; s < S; s++)
		perm[s] = s;
	double *g = wapiti_xmalloc(sizeof(double) * F);
	double *q = wapiti_xmalloc(sizeof(double) * F);
	for (size_t f = 0; f < F; f++)
		g[f] = q[f] = 0.0;
	// We can now start training the model. We perform the requested
	// number of iterations, each of them going through all the sequences.
	// For computing the decay, we need to keep track of the number of
	// already processed sequences; this is tracked by the <i> variable.
	double u = 0.0;
	grd_t *grd = grd_new(mdl, g);
	for (int k = 0, i = 0; k < K && !uit_stop; k++) {
		// First we shuffle the sequences by making a lot of random
		// swaps of entries in the permutation index.
		for (int s = 0; s < S; s++) {
			const int a = rand() % S;
			const int b = rand() % S;
			const int t = perm[a];
			perm[a] = perm[b];
			perm[b] = t;
		}
		// And so, we can process sequences in a random order
		for (int sp = 0; sp < S && !uit_stop; sp++, i++) {
			const int s = perm[sp];
			const seq_t *seq = mdl->train->seq[s];
			grd_dospl(grd, seq);
			// Before applying the gradient, we have to compute the
			// learning rate to apply to this sequence. For this we
			// use an exponential decay [1, pp 481(5)]
			//       η_i = η_0 * α^{i/S}
			// And at the same time, we update the total penalty
			// that must have been applied to each feature
			//       u <- u + η * rho1 / S
			const double n0    = mdl->opt->sgdl1.eta0;
			const double alpha = mdl->opt->sgdl1.alpha;
			const double nk = n0 * pow(alpha, (double)i / S);
			u = u + nk * mdl->opt->rho1 / S;
			// Now we apply the update to all unigram and bigram
			// observations active in the current sequence. We must
			// not forget to clear the gradient for the next
			// sequence.
			for (size_t n = 0; idx[s].uobs[n] != none; n++) {
				size_t f = mdl->uoff[idx[s].uobs[n]];
				for (size_t y = 0; y < Y; y++, f++) {
					w[f] -= nk * g[f];
					applypenalty(f);
					g[f] = 0.0;
				}
			}
			for (size_t n = 0; idx[s].bobs[n] != none; n++) {
				size_t f = mdl->boff[idx[s].bobs[n]];
				for (size_t d = 0; d < Y * Y; d++, f++) {
					w[f] -= nk * g[f];
					applypenalty(f);
					g[f] = 0.0;
				}
			}
		}
		if (uit_stop)
			break;
		// Report progress back to the user
		if (!uit_progress(mdl, k + 1, -1.0))
			break;
	}
	grd_free(grd);
	// Cleanup allocated memory before returning
	for (int s = 0; s < S; s++) {
		free(idx[s].uobs);
		free(idx[s].bobs);
	}
	free(idx);
	free(perm);
	free(g);
	free(q);
}
#undef applypenalty
|
218
|
+
|