wapiti 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.autotest +13 -0
- data/.gitignore +5 -0
- data/.rspec +3 -0
- data/Gemfile +6 -0
- data/LICENSE +30 -0
- data/README.md +153 -0
- data/Rakefile +33 -0
- data/ext/wapiti/bcd.c +392 -0
- data/ext/wapiti/decoder.c +535 -0
- data/ext/wapiti/decoder.h +46 -0
- data/ext/wapiti/extconf.rb +8 -0
- data/ext/wapiti/gradient.c +818 -0
- data/ext/wapiti/gradient.h +81 -0
- data/ext/wapiti/lbfgs.c +294 -0
- data/ext/wapiti/model.c +296 -0
- data/ext/wapiti/model.h +100 -0
- data/ext/wapiti/native.c +1238 -0
- data/ext/wapiti/native.h +15 -0
- data/ext/wapiti/options.c +278 -0
- data/ext/wapiti/options.h +91 -0
- data/ext/wapiti/pattern.c +395 -0
- data/ext/wapiti/pattern.h +56 -0
- data/ext/wapiti/progress.c +167 -0
- data/ext/wapiti/progress.h +43 -0
- data/ext/wapiti/quark.c +272 -0
- data/ext/wapiti/quark.h +46 -0
- data/ext/wapiti/reader.c +553 -0
- data/ext/wapiti/reader.h +73 -0
- data/ext/wapiti/rprop.c +191 -0
- data/ext/wapiti/sequence.h +148 -0
- data/ext/wapiti/sgdl1.c +218 -0
- data/ext/wapiti/thread.c +171 -0
- data/ext/wapiti/thread.h +42 -0
- data/ext/wapiti/tools.c +202 -0
- data/ext/wapiti/tools.h +54 -0
- data/ext/wapiti/trainers.h +39 -0
- data/ext/wapiti/vmath.c +372 -0
- data/ext/wapiti/vmath.h +51 -0
- data/ext/wapiti/wapiti.c +288 -0
- data/ext/wapiti/wapiti.h +45 -0
- data/lib/wapiti.rb +30 -0
- data/lib/wapiti/errors.rb +17 -0
- data/lib/wapiti/model.rb +49 -0
- data/lib/wapiti/options.rb +113 -0
- data/lib/wapiti/utility.rb +15 -0
- data/lib/wapiti/version.rb +3 -0
- data/spec/fixtures/ch.mod +18550 -0
- data/spec/fixtures/chpattern.txt +52 -0
- data/spec/fixtures/chtest.txt +1973 -0
- data/spec/fixtures/chtrain.txt +19995 -0
- data/spec/fixtures/nppattern.txt +52 -0
- data/spec/fixtures/nptest.txt +1973 -0
- data/spec/fixtures/nptrain.txt +19995 -0
- data/spec/fixtures/pattern.txt +14 -0
- data/spec/fixtures/test.txt +60000 -0
- data/spec/fixtures/train.txt +1200 -0
- data/spec/spec_helper.rb +21 -0
- data/spec/wapiti/model_spec.rb +173 -0
- data/spec/wapiti/native_spec.rb +12 -0
- data/spec/wapiti/options_spec.rb +175 -0
- data/spec/wapiti/utility_spec.rb +22 -0
- data/wapiti.gemspec +35 -0
- metadata +178 -0
data/ext/wapiti/reader.h
ADDED
@@ -0,0 +1,73 @@
+/*
+ * Wapiti - A linear-chain CRF tool
+ *
+ * Copyright (c) 2009-2011 CNRS
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef reader_h
+#define reader_h
+
+#include <stdbool.h>
+#include <stdio.h>
+
+#include "wapiti.h"
+#include "pattern.h"
+#include "quark.h"
+#include "sequence.h"
+
+/* rdr_t:
+ *   The reader object holds all the information needed to parse the input
+ *   file: the patterns and the quarks for labels and observations. We keep
+ *   separate counts for unigram and bigram patterns for simpler allocation
+ *   of sequences. We also store the expected number of columns in the input
+ *   data to check that the patterns are applicable.
+ */
+typedef struct rdr_s rdr_t;
+struct rdr_s {
+        bool    maxent;     //       Is this a maxent reader
+        int     npats;      //  P    Total number of patterns
+        int     nuni, nbi;  //       Number of unigram and bigram patterns
+        int     ntoks;      //       Expected number of tokens in input
+        pat_t **pats;       // [P]   List of precompiled patterns
+        qrk_t  *lbl;        //       Labels database
+        qrk_t  *obs;        //       Observation database
+};
+
+rdr_t *rdr_new(bool maxent);
+void rdr_free(rdr_t *rdr);
+void rdr_freeraw(raw_t *raw);
+void rdr_freeseq(seq_t *seq);
+void rdr_freedat(dat_t *dat);
+
+void rdr_loadpat(rdr_t *rdr, FILE *file);
+raw_t *rdr_readraw(rdr_t *rdr, FILE *file);
+seq_t *rdr_raw2seq(rdr_t *rdr, const raw_t *raw, bool lbl);
+seq_t *rdr_readseq(rdr_t *rdr, FILE *file, bool lbl);
+dat_t *rdr_readdat(rdr_t *rdr, FILE *file, bool lbl);
+
+void rdr_load(rdr_t *rdr, FILE *file);
+void rdr_save(const rdr_t *rdr, FILE *file);
+
+#endif
+
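The reader API above composes as follows — a minimal usage sketch, assuming the declared functions behave as their comments suggest (error handling omitted; the file names are placeholders echoing the gem's data/spec/fixtures):

    #include <stdio.h>
    #include "reader.h"

    int main(void) {
        rdr_t *rdr = rdr_new(false);              // plain CRF reader, not maxent
        FILE *pats = fopen("pattern.txt", "r");
        rdr_loadpat(rdr, pats);                   // precompile the feature patterns
        fclose(pats);
        FILE *in = fopen("train.txt", "r");
        dat_t *dat = rdr_readdat(rdr, in, true);  // true: input carries labels
        fclose(in);
        rdr_freedat(dat);
        rdr_free(rdr);
        return 0;
    }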
data/ext/wapiti/rprop.c
ADDED
@@ -0,0 +1,191 @@
+/*
+ * Wapiti - A linear-chain CRF tool
+ *
+ * Copyright (c) 2009-2011 CNRS
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <math.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "wapiti.h"
+#include "gradient.h"
+#include "model.h"
+#include "options.h"
+#include "progress.h"
+#include "tools.h"
+#include "thread.h"
+#include "vmath.h"
+
+#define sign(v) ((v) < 0.0 ? -1.0 : ((v) > 0.0 ? 1.0 : 0.0))
+#define sqr(v)  ((v) * (v))
+
+/******************************************************************************
+ * Resilient propagation optimizer
+ *
+ *   This is an implementation of the RPROP algorithm (resilient propagation)
+ *   described by Riedmiller and Braun in [1], with an adaptation that makes
+ *   it usable with l1 regularization.
+ *   The adaptation consists of using a pseudo-gradient, similar to the one
+ *   used in OWL-QN, to choose an orthant at each iteration and projecting
+ *   the step into this orthant before the weight update.
+ *
+ *   [1] A direct adaptive method for faster backpropagation learning: The
+ *       RPROP algorithm, Martin Riedmiller and Heinrich Braun, IEEE Intl.
+ *       Conference on Neural Networks, San Francisco, USA, 586-591, March 1993.
+ ******************************************************************************/
+typedef struct rprop_s rprop_t;
+struct rprop_s {
+        mdl_t  *mdl;
+        double *xp;
+        double *stp;
+        double *g;
+        double *gp;
+};
+
+/* trn_rpropsub:
+ *   Partial update of the weight vector, including the partial gradient in
+ *   case of l1 regularisation. The sub-vector updated depends on the given
+ *   id and cnt parameters; the job scheduling system is not used here as we
+ *   can easily split the processing into equal parts.
+ */
+static void trn_rpropsub(job_t *job, int id, int cnt, rprop_t *st) {
+        unused(job);
+        mdl_t *mdl = st->mdl;
+        const size_t F = mdl->nftr;
+        const double stpmin = mdl->opt->rprop.stpmin;
+        const double stpmax = mdl->opt->rprop.stpmax;
+        const double stpinc = mdl->opt->rprop.stpinc;
+        const double stpdec = mdl->opt->rprop.stpdec;
+        const bool   wbt = strcmp(mdl->opt->algo, "rprop-");
+        const double rho1 = mdl->opt->rho1;
+        const int    l1 = (rho1 != 0.0) ? mdl->opt->rprop.cutoff + 1 : 0;
+        double *x = mdl->theta;
+        double *xp = st->xp, *stp = st->stp;
+        double *g = st->g, *gp = st->gp;
+        const size_t from = F * id / cnt;
+        const size_t to   = F * (id + 1) / cnt;
+        for (size_t f = from; f < to; f++) {
+                double pg = g[f];
+                // If there is an l1 component in the regularization, we
+                // either project the gradient in the current orthant or
+                // check for cutdown depending on the projection scheme wanted.
+                if (l1 == 1) {
+                        if      (x[f] < 0.0)   pg -= rho1;
+                        else if (x[f] > 0.0)   pg += rho1;
+                        else if (g[f] < -rho1) pg += rho1;
+                        else if (g[f] > rho1)  pg -= rho1;
+                        else                   pg  = 0.0;
+                } else if (l1 && sqr(g[f] + rho1 * sign(x[f])) < sqr(rho1)) {
+                        if (x[f] == 0.0 || (   gp[f] * g[f] < 0.0
+                                            && xp[f] * x[f] < 0.0)) {
+                                if (wbt)
+                                        xp[f] = x[f];
+                                x[f]  = 0.0;
+                                gp[f] = g[f];
+                                continue;
+                        }
+                }
+                // Next we adjust the step depending on the new and previous
+                // gradient values.
+                if (gp[f] * pg > 0.0)
+                        stp[f] = min(stp[f] * stpinc, stpmax);
+                else if (gp[f] * pg < 0.0)
+                        stp[f] = max(stp[f] * stpdec, stpmin);
+                // Finally update the weight. If there is an l1 penalty and
+                // the pseudo-gradient projection is used, we have to project
+                // the update back into the chosen orthant.
+                if (!wbt || gp[f] * pg > 0.0) {
+                        double dlt = stp[f] * -sign(g[f]);
+                        if (l1 == 1 && dlt * pg >= 0.0)
+                                dlt = 0.0;
+                        if (wbt)
+                                xp[f] = x[f];
+                        x[f] += dlt;
+                } else if (gp[f] * pg < 0.0) {
+                        x[f] = xp[f];
+                        g[f] = 0.0;
+                } else {
+                        xp[f] = x[f];
+                        if (l1 != 1)
+                                x[f] += stp[f] * -sign(pg);
+                }
+                gp[f] = g[f];
+        }
+}
+
+void trn_rprop(mdl_t *mdl) {
+        const size_t F   = mdl->nftr;
+        const int    K   = mdl->opt->maxiter;
+        const size_t W   = mdl->opt->nthread;
+        const bool   wbt = strcmp(mdl->opt->algo, "rprop-");
+        const int    cut = mdl->opt->rprop.cutoff;
+        // Allocate state memory and initialize it
+        double *xp = NULL, *stp = xvm_new(F);
+        double *g = xvm_new(F), *gp = xvm_new(F);
+        if (wbt && !cut)
+                xp = xvm_new(F);
+        for (unsigned f = 0; f < F; f++) {
+                if (wbt && !cut)
+                        xp[f] = 0.0;
+                gp[f]  = 0.0;
+                stp[f] = 0.1;
+        }
+        // Prepare the rprop state used to send information to the rprop
+        // worker about updating weights using the gradient.
+        rprop_t *st = wapiti_xmalloc(sizeof(rprop_t));
+        st->mdl = mdl;
+        st->xp  = xp; st->stp = stp;
+        st->g   = g;  st->gp  = gp;
+        rprop_t *rprop[W];
+        for (size_t w = 0; w < W; w++)
+                rprop[w] = st;
+        // Prepare the gradient state for the distributed gradient computation.
+        grd_t *grds[W];
+        grds[0] = grd_new(mdl, g);
+        for (size_t w = 1; w < W; w++)
+                grds[w] = grd_new(mdl, xvm_new(F));
+        // And iterate the gradient computation / weight update process until
+        // convergence or a stop request.
+        for (int k = 0; !uit_stop && k < K; k++) {
+                double fx = grd_gradient(mdl, g, grds);
+                if (uit_stop)
+                        break;
+                mth_spawn((func_t *)trn_rpropsub, W, (void **)rprop, 0, 0);
+                if (uit_progress(mdl, k + 1, fx) == false)
+                        break;
+        }
+        // Free all allocated memory
+        if (wbt && !cut)
+                xvm_free(xp);
+        xvm_free(g);
+        xvm_free(gp);
+        for (size_t w = 1; w < W; w++)
+                xvm_free(grds[w]->g);
+        for (size_t w = 0; w < W; w++)
+                grd_free(grds[w]);
+        free(st);
+}
+
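For context on the algorithm the header comment describes, here is a standalone sketch of the basic RPROP step rule from [1] on a single weight — the textbook form with the usual constants, not Wapiti's exact variant, which adds the orthant projection above:

    #include <math.h>

    /* One RPROP update of weight *x with per-weight step *stp; g is the
     * current gradient, *gp the previous one. */
    static void rprop_step(double *x, double *stp, double *gp, double g) {
        if (*gp * g > 0.0)                 // gradients agree: accelerate
            *stp = fmin(*stp * 1.2, 50.0);
        else if (*gp * g < 0.0) {          // sign flip: we overshot
            *stp = fmax(*stp * 0.5, 1e-8);
            g = 0.0;                       // skip this update (iRPROP- style)
        }
        if (g > 0.0)      *x -= *stp;      // move against the gradient sign
        else if (g < 0.0) *x += *stp;
        *gp = g;
    }

The per-weight step size, not the gradient magnitude, decides how far each weight moves, which is what makes the method robust to badly scaled gradients.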
data/ext/wapiti/sequence.h
ADDED
@@ -0,0 +1,148 @@
+/*
+ * Wapiti - A linear-chain CRF tool
+ *
+ * Copyright (c) 2009-2011 CNRS
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef sequence_h
+#define sequence_h
+
+#include <stddef.h>
+
+#include "wapiti.h"
+
+/*******************************************************************************
+ * Sequences and Dataset objects
+ *
+ *   Sequences represent the input data fed to Wapiti by the user, either for
+ *   training or labelling. The internal form used here is very different
+ *   from the data read from files, and the conversion is done in three steps
+ *   illustrated here:
+ *         +------+     +-------+     +-------+     +-------+
+ *         | FILE | --> | raw_t | --> | tok_t | --> | seq_t |
+ *         +------+     +-------+     +-------+     +-------+
+ *   First the sequence is read as a set of lines from the input file, which
+ *   gives a raw_t object. Next this set of lines is split into tokens, and
+ *   the last one may be separated out as it will become a label; the result
+ *   is a tok_t object.
+ *   The last step consists in applying all the patterns given by the user to
+ *   extract from these tokens the observations made on the sequence, in
+ *   order to build the seq_t object used by the trainer and tagger.
+ *
+ *   A dataset object is just a container for a list of sequences in internal
+ *   form, used to store either the training or the development set.
+ *
+ *   The whole conversion process is driven by the reader object and, as it
+ *   is responsible for creating the objects with a quite special allocation
+ *   scheme, we just have to implement functions for freeing them here.
+ ******************************************************************************/
+
+/* raw_t:
+ *   Data-structure representing a raw sequence as a set of lines read from
+ *   the input file. This is the result of the first step of the interning
+ *   process. We keep this form separate from the tokenized one as we want
+ *   to be able to output the sequence as it was read in labelling mode.
+ *
+ *   This represents a sequence of length <len>; for each position 't' you
+ *   find the corresponding line at <lines>[t].
+ *
+ *   The <lines> array is allocated with the data structure itself, and the
+ *   individual lines are allocated separately.
+ */
+typedef struct raw_s raw_t;
+struct raw_s {
+        int   len;      //   T     Sequence length
+        char *lines[];  //  [T]    Raw lines directly from the file
+};
+
+/* tok_t:
+ *   Data-structure representing a tokenized sequence. This is the result of
+ *   the second step of the interning process, after the raw sequence has
+ *   been split into tokens and any label separated from the observations.
+ *
+ *   For each position 't' in the sequence of length <len>, you find at
+ *   <lbl>[t] the optional label provided in the input file, and at <toks>[t]
+ *   a list of string tokens of length <cnts>[t].
+ *
+ *   Memory allocation here is a bit special: the first token at each
+ *   position points to a block holding a copy of the raw line; the other
+ *   tokens and the label point into this block, reducing fragmentation.
+ */
+typedef struct tok_s tok_t;
+struct tok_s {
+        int    len;     //   T     Sequence length
+        char **lbl;     //  [T]    List of label strings
+        int   *cnts;    //  [T]    Lengths of the token lists
+        char **toks[];  //  [T][]  Token lists
+};
+
+/* seq_t:
+ *   Data-structure representing a sequence of length <len> in the internal
+ *   form used by the trainers and the tagger. For each position 't' in the
+ *   sequence (0 <= t < <len>) there are some observations made on the data
+ *   and an optional label if provided in the input file.
+ *
+ *   There are two kinds of features, unigram and bigram ones, built by
+ *   combining one observation and one or two labels. At position 't', the
+ *   unigram features are built using the list of observations from
+ *   <uobs>[t], which contains <ucnt>[t] items, and the label at <lbl>[t].
+ *   The bigram features are obtained in the same way using <bobs> and
+ *   <bcnt>, and also have to be combined with <lbl>[t-1].
+ *
+ *   If the sequence is read from a file without labels, as is the case in
+ *   labelling mode, the <lbl> field will be NULL, so the sequence cannot be
+ *   used for training.
+ *
+ *   The raw field is private and used internally for efficient memory
+ *   allocation. It allows <lbl>, <*cnt>, and all the lists in <*obs> to be
+ *   allocated with the data structure itself.
+ */
+typedef struct pos_s pos_t;
+typedef struct seq_s seq_t;
+struct seq_s {
+        int     len;
+        size_t *raw;
+        struct pos_s {
+                size_t  lbl;
+                size_t  ucnt, bcnt;
+                size_t *uobs, *bobs;
+        } pos[];
+};
+
+/* dat_t:
+ *   Data-structure representing a full dataset: a collection of sequences
+ *   ready to be used for training or to be labelled. It keeps track of the
+ *   maximum sequence length as the trainer needs this for memory allocation.
+ *   The dataset contains <nseq> sequences stored in <seq>. These sequences
+ *   are labelled only if <lbl> is true.
+ */
+typedef struct dat_s dat_t;
+struct dat_s {
+        bool     lbl;   //       True iff sequences are labelled
+        int      mlen;  //       Length of the longest sequence in the set
+        size_t   nseq;  //  S    Number of sequences in the set
+        seq_t  **seq;   // [S]   List of sequences
+};
+
+#endif
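As a small illustration of the internal form described above — a sketch only, assuming a dat_t populated by the reader declared in reader.h:

    #include <stddef.h>
    #include "sequence.h"

    /* Sum the unigram observations over every position of every sequence;
     * each uobs entry is an id into the reader's observation quark. */
    static size_t count_uobs(const dat_t *dat) {
        size_t total = 0;
        for (size_t s = 0; s < dat->nseq; s++) {
            const seq_t *seq = dat->seq[s];
            for (int t = 0; t < seq->len; t++)
                total += seq->pos[t].ucnt;
        }
        return total;
    }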
data/ext/wapiti/sgdl1.c
ADDED
@@ -0,0 +1,218 @@
+/*
+ * Wapiti - A linear-chain CRF tool
+ *
+ * Copyright (c) 2009-2011 CNRS
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <math.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "wapiti.h"
+#include "gradient.h"
+#include "model.h"
+#include "options.h"
+#include "progress.h"
+#include "sequence.h"
+#include "tools.h"
+
+/******************************************************************************
+ * The SGD-L1 trainer
+ *
+ *   Implementation of the stochastic gradient descent with L1 penalty
+ *   described in [1] by Tsuruoka et al. This allows building really sparse
+ *   models with the SGD method.
+ *
+ *   [1] Stochastic gradient descent training for L1-regularized log-linear
+ *       models with cumulative penalty, Yoshimasa Tsuruoka, Jun'ichi Tsujii
+ *       and Sophia Ananiadou, in Proceedings of the ACL and the 4th IJCNLP
+ *       of the AFNLP, pages 477-485, August 2009.
+ ******************************************************************************/
+typedef struct sgd_idx_s {
+        size_t *uobs;
+        size_t *bobs;
+} sgd_idx_t;
+
+/* applypenalty:
+ *   This macro is quite ugly as it does a lot of things and uses local
+ *   variables of the function below. I'm sorry for this, but it allows us
+ *   not to duplicate the code below. Due to the way unigram and bigram
+ *   observations are stored, we must use it twice. As this macro is
+ *   dangerous when called outside of sgd-l1, we undef it just after.
+ *   This macro matches exactly the APPLYPENALTY function defined in [1]
+ *   p. 481 and the formula in the middle of page 480.
+ */
+#define applypenalty(f) do {                                   \
+        const double z = w[f];                                 \
+        if      (z > 0.0) w[f] = max(0.0, z - (u + q[f]));     \
+        else if (z < 0.0) w[f] = min(0.0, z + (u - q[f]));     \
+        q[f] += w[f] - z;                                      \
+} while (false)
+
+/* sgd_add:
+ *   Add the <new> value to the array <obs> of size <cnt>. If the value is
+ *   already present, we do nothing; else we append it.
+ */
+static void sgd_add(size_t *obs, size_t *cnt, size_t new) {
+        // First check if the value is already in the array. We do a linear
+        // probe as it is simpler, and since these arrays are very short in
+        // practice, it's efficient enough.
+        for (size_t p = 0; p < *cnt; p++)
+                if (obs[p] == new)
+                        return;
+        // Insert the new value at the end since we have not found it.
+        obs[*cnt] = new;
+        *cnt = *cnt + 1;
+}
+
+/* trn_sgdl1:
+ *   Train the model with the SGD-l1 algorithm described by Tsuruoka et al.
+ */
+void trn_sgdl1(mdl_t *mdl) {
+        const size_t Y = mdl->nlbl;
+        const size_t F = mdl->nftr;
+        const int    U = mdl->reader->nuni;
+        const int    B = mdl->reader->nbi;
+        const int    S = mdl->train->nseq;
+        const int    K = mdl->opt->maxiter;
+        double      *w = mdl->theta;
+        // First we have to build an index holding, for each sequence, the
+        // list of active observations.
+        // The index is a simple table indexed by sequence number. Each entry
+        // points to two lists of observations terminated by <none>, one for
+        // unigram obs and one for bigram obs.
+        info(" - Build the index\n");
+        sgd_idx_t *idx = wapiti_xmalloc(sizeof(sgd_idx_t) * S);
+        for (int s = 0; s < S; s++) {
+                const seq_t *seq = mdl->train->seq[s];
+                const int T = seq->len;
+                size_t uobs[U * T + 1], ucnt = 0;
+                size_t bobs[B * T + 1], bcnt = 0;
+                for (int t = 0; t < seq->len; t++) {
+                        const pos_t *pos = &seq->pos[t];
+                        for (size_t p = 0; p < pos->ucnt; p++)
+                                sgd_add(uobs, &ucnt, pos->uobs[p]);
+                        for (size_t p = 0; p < pos->bcnt; p++)
+                                sgd_add(bobs, &bcnt, pos->bobs[p]);
+                }
+                uobs[ucnt++] = none;
+                bobs[bcnt++] = none;
+                idx[s].uobs = wapiti_xmalloc(sizeof(size_t) * ucnt);
+                idx[s].bobs = wapiti_xmalloc(sizeof(size_t) * bcnt);
+                memcpy(idx[s].uobs, uobs, ucnt * sizeof(size_t));
+                memcpy(idx[s].bobs, bobs, bcnt * sizeof(size_t));
+        }
+        info(" Done\n");
+        // We process sequences in random order in each iteration, so we have
+        // to permute them. The current permutation is stored in a vector
+        // <perm> shuffled at the start of each iteration. We just initialize
+        // it with the identity permutation.
+        // As we use the same gradient function as the other trainers, we
+        // need an array to store it. These functions accumulate the
+        // gradient, so we need to clear it at start and before each new
+        // computation. As we know which features are active, and so which
+        // gradient cells are updated, we can clear them selectively instead
+        // of fully clearing the gradient each time.
+        // We also need an additional vector <q> holding the penalty already
+        // applied to each feature.
+        int *perm = wapiti_xmalloc(sizeof(int) * S);
+        for (int s = 0; s < S; s++)
+                perm[s] = s;
+        double *g = wapiti_xmalloc(sizeof(double) * F);
+        double *q = wapiti_xmalloc(sizeof(double) * F);
+        for (size_t f = 0; f < F; f++)
+                g[f] = q[f] = 0.0;
+        // We can now start training the model. We perform the requested
+        // number of iterations, each going through all the sequences. For
+        // computing the decay, we need to keep track of the number of
+        // sequences processed so far; this is tracked by the <i> variable.
+        double u = 0.0;
+        grd_t *grd = grd_new(mdl, g);
+        for (int k = 0, i = 0; k < K && !uit_stop; k++) {
+                // First we shuffle the sequences by making a lot of random
+                // swaps of entries in the permutation index.
+                for (int s = 0; s < S; s++) {
+                        const int a = rand() % S;
+                        const int b = rand() % S;
+                        const int t = perm[a];
+                        perm[a] = perm[b];
+                        perm[b] = t;
+                }
+                // And so, we can process the sequences in random order
+                for (int sp = 0; sp < S && !uit_stop; sp++, i++) {
+                        const int s = perm[sp];
+                        const seq_t *seq = mdl->train->seq[s];
+                        grd_dospl(grd, seq);
+                        // Before applying the gradient, we have to compute
+                        // the learning rate to apply to this sequence. For
+                        // this we use an exponential decay [1, pp 481(5)]
+                        //   η_i = η_0 * α^{i/S}
+                        // And at the same time, we update the total penalty
+                        // that should have been applied to each feature:
+                        //   u <- u + η * rho1 / S
+                        const double n0    = mdl->opt->sgdl1.eta0;
+                        const double alpha = mdl->opt->sgdl1.alpha;
+                        const double nk = n0 * pow(alpha, (double)i / S);
+                        u = u + nk * mdl->opt->rho1 / S;
+                        // Now we apply the update to all unigram and bigram
+                        // observations active in the current sequence. We
+                        // must not forget to clear the gradient for the next
+                        // sequence.
+                        for (size_t n = 0; idx[s].uobs[n] != none; n++) {
+                                size_t f = mdl->uoff[idx[s].uobs[n]];
+                                for (size_t y = 0; y < Y; y++, f++) {
+                                        w[f] -= nk * g[f];
+                                        applypenalty(f);
+                                        g[f] = 0.0;
+                                }
+                        }
+                        for (size_t n = 0; idx[s].bobs[n] != none; n++) {
+                                size_t f = mdl->boff[idx[s].bobs[n]];
+                                for (size_t d = 0; d < Y * Y; d++, f++) {
+                                        w[f] -= nk * g[f];
+                                        applypenalty(f);
+                                        g[f] = 0.0;
+                                }
+                        }
+                }
+                if (uit_stop)
+                        break;
+                // Report progress back to the user
+                if (!uit_progress(mdl, k + 1, -1.0))
+                        break;
+        }
+        grd_free(grd);
+        // Cleanup allocated memory before returning
+        for (int s = 0; s < S; s++) {
+                free(idx[s].uobs);
+                free(idx[s].bobs);
+        }
+        free(idx);
+        free(perm);
+        free(g);
+        free(q);
+}
+#undef applypenalty
+
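To make the cumulative penalty concrete, here is a scalar restatement of the clipping step the applypenalty macro performs (following [1]; the function and parameter names are illustrative, not Wapiti's):

    #include <math.h>

    /* u is the total L1 penalty every weight should have received so far,
     * *q the amount this weight actually received; the clipping keeps the
     * weight from stepping across zero. */
    static void apply_l1(double *w, double *q, double u) {
        const double z = *w;
        if (z > 0.0)
            *w = fmax(0.0, z - (u + *q));
        else if (z < 0.0)
            *w = fmin(0.0, z + (u - *q));
        *q += *w - z;
    }

Deferring the penalty this way means the trainer only ever touches the weights of features active in the current sequence, which is exactly what the observation index built at the top of trn_sgdl1 makes cheap.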