wapiti 0.0.1

Files changed (63)
  1. data/.autotest +13 -0
  2. data/.gitignore +5 -0
  3. data/.rspec +3 -0
  4. data/Gemfile +6 -0
  5. data/LICENSE +30 -0
  6. data/README.md +153 -0
  7. data/Rakefile +33 -0
  8. data/ext/wapiti/bcd.c +392 -0
  9. data/ext/wapiti/decoder.c +535 -0
  10. data/ext/wapiti/decoder.h +46 -0
  11. data/ext/wapiti/extconf.rb +8 -0
  12. data/ext/wapiti/gradient.c +818 -0
  13. data/ext/wapiti/gradient.h +81 -0
  14. data/ext/wapiti/lbfgs.c +294 -0
  15. data/ext/wapiti/model.c +296 -0
  16. data/ext/wapiti/model.h +100 -0
  17. data/ext/wapiti/native.c +1238 -0
  18. data/ext/wapiti/native.h +15 -0
  19. data/ext/wapiti/options.c +278 -0
  20. data/ext/wapiti/options.h +91 -0
  21. data/ext/wapiti/pattern.c +395 -0
  22. data/ext/wapiti/pattern.h +56 -0
  23. data/ext/wapiti/progress.c +167 -0
  24. data/ext/wapiti/progress.h +43 -0
  25. data/ext/wapiti/quark.c +272 -0
  26. data/ext/wapiti/quark.h +46 -0
  27. data/ext/wapiti/reader.c +553 -0
  28. data/ext/wapiti/reader.h +73 -0
  29. data/ext/wapiti/rprop.c +191 -0
  30. data/ext/wapiti/sequence.h +148 -0
  31. data/ext/wapiti/sgdl1.c +218 -0
  32. data/ext/wapiti/thread.c +171 -0
  33. data/ext/wapiti/thread.h +42 -0
  34. data/ext/wapiti/tools.c +202 -0
  35. data/ext/wapiti/tools.h +54 -0
  36. data/ext/wapiti/trainers.h +39 -0
  37. data/ext/wapiti/vmath.c +372 -0
  38. data/ext/wapiti/vmath.h +51 -0
  39. data/ext/wapiti/wapiti.c +288 -0
  40. data/ext/wapiti/wapiti.h +45 -0
  41. data/lib/wapiti.rb +30 -0
  42. data/lib/wapiti/errors.rb +17 -0
  43. data/lib/wapiti/model.rb +49 -0
  44. data/lib/wapiti/options.rb +113 -0
  45. data/lib/wapiti/utility.rb +15 -0
  46. data/lib/wapiti/version.rb +3 -0
  47. data/spec/fixtures/ch.mod +18550 -0
  48. data/spec/fixtures/chpattern.txt +52 -0
  49. data/spec/fixtures/chtest.txt +1973 -0
  50. data/spec/fixtures/chtrain.txt +19995 -0
  51. data/spec/fixtures/nppattern.txt +52 -0
  52. data/spec/fixtures/nptest.txt +1973 -0
  53. data/spec/fixtures/nptrain.txt +19995 -0
  54. data/spec/fixtures/pattern.txt +14 -0
  55. data/spec/fixtures/test.txt +60000 -0
  56. data/spec/fixtures/train.txt +1200 -0
  57. data/spec/spec_helper.rb +21 -0
  58. data/spec/wapiti/model_spec.rb +173 -0
  59. data/spec/wapiti/native_spec.rb +12 -0
  60. data/spec/wapiti/options_spec.rb +175 -0
  61. data/spec/wapiti/utility_spec.rb +22 -0
  62. data/wapiti.gemspec +35 -0
  63. metadata +178 -0
data/ext/wapiti/reader.h
@@ -0,0 +1,73 @@
+ /*
+  * Wapiti - A linear-chain CRF tool
+  *
+  * Copyright (c) 2009-2011 CNRS
+  * All rights reserved.
+  *
+  * Redistribution and use in source and binary forms, with or without
+  * modification, are permitted provided that the following conditions are met:
+  *     * Redistributions of source code must retain the above copyright
+  *       notice, this list of conditions and the following disclaimer.
+  *     * Redistributions in binary form must reproduce the above copyright
+  *       notice, this list of conditions and the following disclaimer in the
+  *       documentation and/or other materials provided with the distribution.
+  *
+  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  * POSSIBILITY OF SUCH DAMAGE.
+  */
+
+ #ifndef reader_h
+ #define reader_h
+
+ #include <stdbool.h>
+ #include <stdio.h>
+
+ #include "wapiti.h"
+ #include "pattern.h"
+ #include "quark.h"
+ #include "sequence.h"
+
+ /* rdr_t:
+  *   The reader object holds all the information needed to parse the input
+  *   file: the patterns and the quark databases for labels and observations.
+  *   We keep separate counts for unigram and bigram patterns to simplify the
+  *   allocation of sequences, and we store the expected number of columns in
+  *   the input data to check that the patterns are applicable.
+  */
+ typedef struct rdr_s rdr_t;
+ struct rdr_s {
+     bool    maxent;     //       Is this a maxent reader
+     int     npats;      //  P    Total number of patterns
+     int     nuni, nbi;  //       Number of unigram and bigram patterns
+     int     ntoks;      //       Expected number of tokens in input
+     pat_t **pats;       // [P]   List of precompiled patterns
+     qrk_t  *lbl;        //       Labels database
+     qrk_t  *obs;        //       Observation database
+ };
+
+ rdr_t *rdr_new(bool maxent);
+ void   rdr_free(rdr_t *rdr);
+ void   rdr_freeraw(raw_t *raw);
+ void   rdr_freeseq(seq_t *seq);
+ void   rdr_freedat(dat_t *dat);
+
+ void   rdr_loadpat(rdr_t *rdr, FILE *file);
+ raw_t *rdr_readraw(rdr_t *rdr, FILE *file);
+ seq_t *rdr_raw2seq(rdr_t *rdr, const raw_t *raw, bool lbl);
+ seq_t *rdr_readseq(rdr_t *rdr, FILE *file, bool lbl);
+ dat_t *rdr_readdat(rdr_t *rdr, FILE *file, bool lbl);
+
+ void   rdr_load(rdr_t *rdr, FILE *file);
+ void   rdr_save(const rdr_t *rdr, FILE *file);
+
+ #endif
+
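For orientation, the reader API above composes as follows. This is a minimal, hypothetical sketch, not part of the gem: it assumes a pattern file and a labelled training file exist on disk (error checks omitted), and that the types come from the headers shown in this diff.

    #include <stdio.h>
    #include "reader.h"

    int main(void) {
        rdr_t *rdr = rdr_new(false);            // CRF reader (not maxent)
        FILE *pats = fopen("pattern.txt", "r");
        rdr_loadpat(rdr, pats);                 // compile the feature patterns
        fclose(pats);
        FILE *train = fopen("train.txt", "r");
        dat_t *dat = rdr_readdat(rdr, train, true);  // read labelled sequences
        fclose(train);
        printf("%zu sequences read\n", dat->nseq);
        rdr_freedat(dat);
        rdr_free(rdr);
        return 0;
    }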
data/ext/wapiti/rprop.c
@@ -0,0 +1,191 @@
+ /*
+  * Wapiti - A linear-chain CRF tool
+  *
+  * Copyright (c) 2009-2011 CNRS
+  * All rights reserved.
+  *
+  * Redistribution and use in source and binary forms, with or without
+  * modification, are permitted provided that the following conditions are met:
+  *     * Redistributions of source code must retain the above copyright
+  *       notice, this list of conditions and the following disclaimer.
+  *     * Redistributions in binary form must reproduce the above copyright
+  *       notice, this list of conditions and the following disclaimer in the
+  *       documentation and/or other materials provided with the distribution.
+  *
+  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  * POSSIBILITY OF SUCH DAMAGE.
+  */
+ #include <math.h>
+ #include <stdbool.h>
+ #include <stddef.h>
+ #include <stdlib.h>
+ #include <string.h>
+
+ #include "wapiti.h"
+ #include "gradient.h"
+ #include "model.h"
+ #include "options.h"
+ #include "progress.h"
+ #include "tools.h"
+ #include "thread.h"
+ #include "vmath.h"
+
+ #define sign(v) ((v) < 0.0 ? -1.0 : ((v) > 0.0 ? 1.0 : 0.0))
+ #define sqr(v)  ((v) * (v))
+
+ /******************************************************************************
+  * Resilient propagation optimizer
+  *
+  *   This is an implementation of the RPROP algorithm (resilient propagation)
+  *   described by Riedmiller and Braun in [1], with an adaptation making it
+  *   usable with l1 regularization.
+  *   The adaptation consists of using a pseudo-gradient, similar to the one
+  *   used in OWL-QN, to choose an orthant at each iteration, and of projecting
+  *   the step into this orthant before the weight update.
+  *
+  *   [1] A direct adaptive method for faster backpropagation learning: The
+  *       RPROP algorithm, Martin Riedmiller and Heinrich Braun, IEEE
+  *       International Conference on Neural Networks, San Francisco, USA,
+  *       586-591, March 1993.
+  ******************************************************************************/
+ typedef struct rprop_s rprop_t;
+ struct rprop_s {
+     mdl_t  *mdl;
+     double *xp;
+     double *stp;
+     double *g;
+     double *gp;
+ };
+
+ /* trn_rpropsub:
+  *   Partial update of the weight vector, including the partial gradient in
+  *   case of l1 regularization. The sub-vector updated depends on the given
+  *   <id> and <cnt> parameters; the job scheduling system is not used here,
+  *   as we can easily split the processing into equal parts.
+  */
+ static void trn_rpropsub(job_t *job, int id, int cnt, rprop_t *st) {
+     unused(job);
+     mdl_t *mdl = st->mdl;
+     const size_t F      = mdl->nftr;
+     const double stpmin = mdl->opt->rprop.stpmin;
+     const double stpmax = mdl->opt->rprop.stpmax;
+     const double stpinc = mdl->opt->rprop.stpinc;
+     const double stpdec = mdl->opt->rprop.stpdec;
+     const bool   wbt    = strcmp(mdl->opt->algo, "rprop-");
+     const double rho1   = mdl->opt->rho1;
+     const int    l1     = (rho1 != 0.0) ? mdl->opt->rprop.cutoff + 1 : 0;
+     double *x  = mdl->theta;
+     double *xp = st->xp, *stp = st->stp;
+     double *g  = st->g,  *gp  = st->gp;
+     const size_t from = F * id / cnt;
+     const size_t to   = F * (id + 1) / cnt;
+     for (size_t f = from; f < to; f++) {
+         double pg = g[f];
+         // If there is an l1 component in the regularization, we either
+         // project the gradient into the current orthant or check for
+         // cutdown, depending on the projection scheme requested.
+         if (l1 == 1) {
+             if      (x[f] < 0.0)   pg -= rho1;
+             else if (x[f] > 0.0)   pg += rho1;
+             else if (g[f] < -rho1) pg += rho1;
+             else if (g[f] > rho1)  pg -= rho1;
+             else                   pg  = 0.0;
+         } else if (l1 && sqr(g[f] + rho1 * sign(x[f])) < sqr(rho1)) {
+             if (x[f] == 0.0 || (gp[f] * g[f] < 0.0
+                             &&  xp[f] * x[f] < 0.0)) {
+                 if (wbt)
+                     xp[f] = x[f];
+                 x[f] = 0.0;
+                 gp[f] = g[f];
+                 continue;
+             }
+         }
+         // Next we adjust the step depending on the new and previous
+         // gradient values.
+         if (gp[f] * pg > 0.0)
+             stp[f] = min(stp[f] * stpinc, stpmax);
+         else if (gp[f] * pg < 0.0)
+             stp[f] = max(stp[f] * stpdec, stpmin);
+         // Finally, update the weight. If there is an l1 penalty and the
+         // pseudo-gradient projection is used, we have to project the
+         // update back into the chosen orthant.
+         if (!wbt || gp[f] * pg > 0.0) {
+             double dlt = stp[f] * -sign(g[f]);
+             if (l1 == 1 && dlt * pg >= 0.0)
+                 dlt = 0.0;
+             if (wbt)
+                 xp[f] = x[f];
+             x[f] += dlt;
+         } else if (gp[f] * pg < 0.0) {
+             x[f] = xp[f];
+             g[f] = 0.0;
+         } else {
+             xp[f] = x[f];
+             if (l1 != 1)
+                 x[f] += stp[f] * -sign(pg);
+         }
+         gp[f] = g[f];
+     }
+ }
+
+ void trn_rprop(mdl_t *mdl) {
+     const size_t F   = mdl->nftr;
+     const int    K   = mdl->opt->maxiter;
+     const size_t W   = mdl->opt->nthread;
+     const bool   wbt = strcmp(mdl->opt->algo, "rprop-");
+     const int    cut = mdl->opt->rprop.cutoff;
+     // Allocate state memory and initialize it
+     double *xp = NULL, *stp = xvm_new(F);
+     double *g = xvm_new(F), *gp = xvm_new(F);
+     if (wbt && !cut)
+         xp = xvm_new(F);
+     for (unsigned f = 0; f < F; f++) {
+         if (wbt && !cut)
+             xp[f] = 0.0;
+         gp[f]  = 0.0;
+         stp[f] = 0.1;
+     }
+     // Prepare the rprop state used to send information to the rprop
+     // workers about updating the weights using the gradient.
+     rprop_t *st = wapiti_xmalloc(sizeof(rprop_t));
+     st->mdl = mdl;
+     st->xp  = xp; st->stp = stp;
+     st->g   = g;  st->gp  = gp;
+     rprop_t *rprop[W];
+     for (size_t w = 0; w < W; w++)
+         rprop[w] = st;
+     // Prepare the gradient state for the distributed gradient computation.
+     grd_t *grds[W];
+     grds[0] = grd_new(mdl, g);
+     for (size_t w = 1; w < W; w++)
+         grds[w] = grd_new(mdl, xvm_new(F));
+     // And iterate the gradient computation / weight update process until
+     // convergence or a stop request.
+     for (int k = 0; !uit_stop && k < K; k++) {
+         double fx = grd_gradient(mdl, g, grds);
+         if (uit_stop)
+             break;
+         mth_spawn((func_t *)trn_rpropsub, W, (void **)rprop, 0, 0);
+         if (uit_progress(mdl, k + 1, fx) == false)
+             break;
+     }
+     // Free all allocated memory
+     if (wbt && !cut)
+         xvm_free(xp);
+     xvm_free(g);
+     xvm_free(gp);
+     for (size_t w = 1; w < W; w++)
+         xvm_free(grds[w]->g);
+     for (size_t w = 0; w < W; w++)
+         grd_free(grds[w]);
+     free(st);
+ }
+
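The heart of trn_rpropsub is the classic RPROP rule from [1]: every weight keeps its own step size, which grows while the gradient keeps its sign and shrinks when the sign flips. The following toy program, a hypothetical illustration independent of the wapiti data structures, applies just that rule to minimize f(x) = (x - 3)^2 in one dimension, using the standard increase/decrease factors 1.2 and 0.5.

    #include <stdio.h>

    int main(void) {
        double x = 0.0, stp = 0.1, gp = 0.0;
        for (int k = 0; k < 40; k++) {
            double g = 2.0 * (x - 3.0);            // gradient of (x - 3)^2
            if (gp * g > 0.0)                      // same sign: grow the step
                stp = stp * 1.2 < 50.0 ? stp * 1.2 : 50.0;
            else if (gp * g < 0.0)                 // sign flip: shrink the step
                stp = stp * 0.5 > 1e-8 ? stp * 0.5 : 1e-8;
            // Move against the sign of the gradient, by the adapted step.
            x += (g < 0.0) ? stp : (g > 0.0) ? -stp : 0.0;
            gp = g;
        }
        printf("x = %f (expected 3)\n", x);
        return 0;
    }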
data/ext/wapiti/sequence.h
@@ -0,0 +1,148 @@
+ /*
+  * Wapiti - A linear-chain CRF tool
+  *
+  * Copyright (c) 2009-2011 CNRS
+  * All rights reserved.
+  *
+  * Redistribution and use in source and binary forms, with or without
+  * modification, are permitted provided that the following conditions are met:
+  *     * Redistributions of source code must retain the above copyright
+  *       notice, this list of conditions and the following disclaimer.
+  *     * Redistributions in binary form must reproduce the above copyright
+  *       notice, this list of conditions and the following disclaimer in the
+  *       documentation and/or other materials provided with the distribution.
+  *
+  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  * POSSIBILITY OF SUCH DAMAGE.
+  */
+
+ #ifndef sequence_h
+ #define sequence_h
+
+ #include <stddef.h>
+
+ #include "wapiti.h"
+
+ /*******************************************************************************
+  * Sequences and Dataset objects
+  *
+  *   Sequences represent the input data fed to Wapiti by the user, either for
+  *   training or for labelling. The internal form used here is very different
+  *   from the data read from files, and the conversion process is done in
+  *   three steps, illustrated here:
+  *       +------+     +-------+     +-------+     +-------+
+  *       | FILE | --> | raw_t | --> | tok_t | --> | seq_t |
+  *       +------+     +-------+     +-------+     +-------+
+  *   First the sequence is read as a set of lines from the input file, giving
+  *   a raw_t object. Next this set of lines is split into tokens, and the
+  *   last token is optionally separated out as it will become a label; this
+  *   results in a tok_t object.
+  *   The last step consists in applying all the patterns given by the user to
+  *   extract from these tokens the observations made on the sequence,
+  *   building the seq_t object that can be used by the trainer and the
+  *   tagger.
+  *
+  *   A dataset object is just a container for a list of sequences in internal
+  *   form, used to store either the training or the development set.
+  *
+  *   The whole conversion process is driven by the reader object and, as it
+  *   is responsible for creating the objects with a rather special allocation
+  *   scheme, we only have to implement functions for freeing these objects
+  *   here.
+  ******************************************************************************/
+
+ /* raw_t:
+  *   Data-structure representing a raw sequence as a set of lines read from
+  *   the input file. This is the result of the first step of the interning
+  *   process. We keep this form separate from the tokenized one as we want to
+  *   be able to output the sequence as it was read, in labelling mode.
+  *
+  *   This represents a sequence of length <len>; for each position 't' you
+  *   find the corresponding line at <lines>[t].
+  *
+  *   The <lines> array is allocated with the data structure itself; the
+  *   individual lines are allocated separately.
+  */
+ typedef struct raw_s raw_t;
+ struct raw_s {
+     int   len;      //  T    Sequence length
+     char *lines[];  // [T]   Raw lines directly from file
+ };
+
+ /* tok_t:
+  *   Data-structure representing a tokenized sequence. This is the result of
+  *   the second step of the interning process, after the raw sequence has
+  *   been split into tokens and the optional labels separated from the
+  *   observations.
+  *
+  *   For each position 't' in the sequence of length <len>, you find at
+  *   <lbl>[t] the optional label provided in the input file, and at <toks>[t]
+  *   a list of string tokens of length <cnts>[t].
+  *
+  *   Memory allocation here is a bit special: the first token at each
+  *   position points to a memory block holding a copy of the raw line, and
+  *   all other tokens and the label are pointers into this block. This
+  *   reduces memory fragmentation.
+  */
+ typedef struct tok_s tok_t;
+ struct tok_s {
+     int    len;     //  T      Sequence length
+     char **lbl;     // [T]     List of label strings
+     int   *cnts;    // [T]     Lengths of the token lists
+     char **toks[];  // [T][]   Token lists
+ };
+
+ /* seq_t:
+  *   Data-structure representing a sequence of length <len> in the internal
+  *   form used by the trainers and the tagger. For each position 't' in the
+  *   sequence (0 <= t < <len>) there are some observations made on the data
+  *   and an optional label if provided in the input file.
+  *
+  *   There are two kinds of features, unigram and bigram ones, built by
+  *   combining one observation and one or two labels. At position 't', the
+  *   unigram features are built using the list of observations from
+  *   <uobs>[t], which contains <ucnt>[t] items, and the label at <lbl>[t].
+  *   The bigram features are obtained in the same way using <bobs> and
+  *   <bcnt>, and also have to be combined with <lbl>[t-1].
+  *
+  *   If the sequence is read from a file without labels, as is the case in
+  *   labelling mode, the <lbl> field will be NULL, and so the sequence cannot
+  *   be used for training.
+  *
+  *   The raw field is private and used internally for efficient memory
+  *   allocation. This allows allocating <lbl>, <*cnt>, and all the lists in
+  *   <*obs> with the data structure itself.
+  */
+ typedef struct pos_s pos_t;
+ typedef struct seq_s seq_t;
+ struct seq_s {
+     int     len;
+     size_t *raw;
+     struct pos_s {
+         size_t  lbl;
+         size_t  ucnt, bcnt;
+         size_t *uobs, *bobs;
+     } pos[];
+ };
+
+ /* dat_t:
+  *   Data-structure representing a full dataset: a collection of sequences
+  *   ready to be used for training or to be labelled. It keeps track of the
+  *   maximum sequence length, as the trainer needs this for memory
+  *   allocation. The dataset contains <nseq> sequences stored in <seq>.
+  *   These sequences are labelled only if <lbl> is true.
+  */
+ typedef struct dat_s dat_t;
+ struct dat_s {
+     bool    lbl;   //        True iff sequences are labelled
+     int     mlen;  //        Length of the longest sequence in the set
+     size_t  nseq;  //  S     Number of sequences in the set
+     seq_t **seq;   // [S]    List of sequences
+ };
+
+ #endif
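To make the layout concrete, here is a small hypothetical sketch, not part of the gem, that walks a dat_t and counts all the unigram observations, using only the fields declared above.

    #include "sequence.h"

    // Count all unigram observations in a dataset.
    static size_t count_uobs(const dat_t *dat) {
        size_t total = 0;
        for (size_t s = 0; s < dat->nseq; s++) {
            const seq_t *seq = dat->seq[s];
            for (int t = 0; t < seq->len; t++)
                total += seq->pos[t].ucnt;   // observations at position t
        }
        return total;
    }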
data/ext/wapiti/sgdl1.c
@@ -0,0 +1,218 @@
+ /*
+  * Wapiti - A linear-chain CRF tool
+  *
+  * Copyright (c) 2009-2011 CNRS
+  * All rights reserved.
+  *
+  * Redistribution and use in source and binary forms, with or without
+  * modification, are permitted provided that the following conditions are met:
+  *     * Redistributions of source code must retain the above copyright
+  *       notice, this list of conditions and the following disclaimer.
+  *     * Redistributions in binary form must reproduce the above copyright
+  *       notice, this list of conditions and the following disclaimer in the
+  *       documentation and/or other materials provided with the distribution.
+  *
+  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  * POSSIBILITY OF SUCH DAMAGE.
+  */
+ #include <math.h>
+ #include <stdbool.h>
+ #include <stddef.h>
+ #include <stdlib.h>
+ #include <string.h>
+
+ #include "wapiti.h"
+ #include "gradient.h"
+ #include "model.h"
+ #include "options.h"
+ #include "progress.h"
+ #include "sequence.h"
+ #include "tools.h"
+
+ /******************************************************************************
+  * The SGD-L1 trainer
+  *
+  *   Implementation of stochastic gradient descent with L1 penalty, described
+  *   in [1] by Tsuruoka et al. This allows building really sparse models with
+  *   the SGD method.
+  *
+  *   [1] Stochastic gradient descent training for L1-regularized log-linear
+  *       models with cumulative penalty, Yoshimasa Tsuruoka, Jun'ichi Tsujii,
+  *       and Sophia Ananiadou, in Proceedings of the ACL and the 4th IJCNLP
+  *       of the AFNLP, pages 477-485, August 2009
+  ******************************************************************************/
+ typedef struct sgd_idx_s {
+     size_t *uobs;
+     size_t *bobs;
+ } sgd_idx_t;
+
+ /* applypenalty:
+  *   This macro is quite ugly as it does a lot of things and uses local
+  *   variables of the function below. I'm sorry for this, but it allows us
+  *   not to duplicate the code below. Due to the way unigram and bigram
+  *   observations are stored, we must use it twice. As this macro is
+  *   dangerous when called outside of sgd-l1, we undef it just after use.
+  *   It matches exactly the APPLYPENALTY procedure defined in [1] p. 481 and
+  *   the formula in the middle of page 480.
+  */
+ #define applypenalty(f) do {                               \
+     const double z = w[f];                                 \
+     if      (z > 0.0) w[f] = max(0.0, z - (u + q[f]));     \
+     else if (z < 0.0) w[f] = min(0.0, z + (u - q[f]));     \
+     q[f] += w[f] - z;                                      \
+ } while (false)
+
+ /* sgd_add:
+  *   Add the <new> value to the array <obs> of size <cnt>. If the value is
+  *   already present, we do nothing; else we add it.
+  */
+ static void sgd_add(size_t *obs, size_t *cnt, size_t new) {
+     // First check if the value is already in the array. We do a linear
+     // probe as it is simpler, and since these arrays will be very short
+     // in practice, it is efficient enough.
+     for (size_t p = 0; p < *cnt; p++)
+         if (obs[p] == new)
+             return;
+     // Insert the new value at the end since we have not found it.
+     obs[*cnt] = new;
+     *cnt = *cnt + 1;
+ }
+
+ /* trn_sgdl1:
+  *   Train the model with the SGD-l1 algorithm described by Tsuruoka et al.
+  */
+ void trn_sgdl1(mdl_t *mdl) {
+     const size_t Y = mdl->nlbl;
+     const size_t F = mdl->nftr;
+     const int    U = mdl->reader->nuni;
+     const int    B = mdl->reader->nbi;
+     const int    S = mdl->train->nseq;
+     const int    K = mdl->opt->maxiter;
+     double      *w = mdl->theta;
+     // First we have to build an index holding, for each sequence, the
+     // list of active observations.
+     // The index is a simple table indexed by sequence number. Each entry
+     // points to two lists of observations terminated by <none>, one for
+     // unigram obs and one for bigram obs.
+     info(" - Build the index\n");
+     sgd_idx_t *idx = wapiti_xmalloc(sizeof(sgd_idx_t) * S);
+     for (int s = 0; s < S; s++) {
+         const seq_t *seq = mdl->train->seq[s];
+         const int T = seq->len;
+         size_t uobs[U * T + 1], ucnt = 0;
+         size_t bobs[B * T + 1], bcnt = 0;
+         for (int t = 0; t < seq->len; t++) {
+             const pos_t *pos = &seq->pos[t];
+             for (size_t p = 0; p < pos->ucnt; p++)
+                 sgd_add(uobs, &ucnt, pos->uobs[p]);
+             for (size_t p = 0; p < pos->bcnt; p++)
+                 sgd_add(bobs, &bcnt, pos->bobs[p]);
+         }
+         uobs[ucnt++] = none;
+         bobs[bcnt++] = none;
+         idx[s].uobs = wapiti_xmalloc(sizeof(size_t) * ucnt);
+         idx[s].bobs = wapiti_xmalloc(sizeof(size_t) * bcnt);
+         memcpy(idx[s].uobs, uobs, ucnt * sizeof(size_t));
+         memcpy(idx[s].bobs, bobs, bcnt * sizeof(size_t));
+     }
+     info(" Done\n");
+     // We will process sequences in random order in each iteration, so we
+     // will have to permute them. The current permutation is stored in a
+     // vector called <perm>, shuffled at the start of each iteration. We
+     // just initialize it with the identity permutation.
+     // As we use the same gradient functions as the other trainers, we
+     // need an array to store the gradient. These functions accumulate
+     // into it, so we need to clear it at the start and before each new
+     // computation. As we know which features are active, and so which
+     // gradient cells are updated, we can clear them selectively instead
+     // of fully clearing the gradient each time.
+     // We also need an additional vector named <q> which holds the penalty
+     // already applied to each feature.
+     int *perm = wapiti_xmalloc(sizeof(int) * S);
+     for (int s = 0; s < S; s++)
+         perm[s] = s;
+     double *g = wapiti_xmalloc(sizeof(double) * F);
+     double *q = wapiti_xmalloc(sizeof(double) * F);
+     for (size_t f = 0; f < F; f++)
+         g[f] = q[f] = 0.0;
+     // We can now start training the model. We perform the requested
+     // number of iterations, each of them going through all the sequences.
+     // For computing the decay, we need to keep track of the number of
+     // sequences processed so far; this is tracked by the <i> variable.
+     double u = 0.0;
+     grd_t *grd = grd_new(mdl, g);
+     for (int k = 0, i = 0; k < K && !uit_stop; k++) {
+         // First we shuffle the sequences by making a lot of random
+         // swaps of entries in the permutation index.
+         for (int s = 0; s < S; s++) {
+             const int a = rand() % S;
+             const int b = rand() % S;
+             const int t = perm[a];
+             perm[a] = perm[b];
+             perm[b] = t;
+         }
+         // And so, we can process the sequences in a random order
+         for (int sp = 0; sp < S && !uit_stop; sp++, i++) {
+             const int s = perm[sp];
+             const seq_t *seq = mdl->train->seq[s];
+             grd_dospl(grd, seq);
+             // Before applying the gradient, we have to compute the
+             // learning rate to apply to this sequence. For this we
+             // use an exponential decay [1, pp 481(5)]
+             //     η_i = η_0 * α^{i/S}
+             // and, at the same time, we update the total penalty
+             // that should have been applied to each feature
+             //     u <- u + η_i * rho1 / S
+             const double n0    = mdl->opt->sgdl1.eta0;
+             const double alpha = mdl->opt->sgdl1.alpha;
+             const double nk = n0 * pow(alpha, (double)i / S);
+             u = u + nk * mdl->opt->rho1 / S;
+             // Now we apply the update to all unigram and bigram
+             // observations active in the current sequence. We must
+             // not forget to clear the gradient for the next
+             // sequence.
+             for (size_t n = 0; idx[s].uobs[n] != none; n++) {
+                 size_t f = mdl->uoff[idx[s].uobs[n]];
+                 for (size_t y = 0; y < Y; y++, f++) {
+                     w[f] -= nk * g[f];
+                     applypenalty(f);
+                     g[f] = 0.0;
+                 }
+             }
+             for (size_t n = 0; idx[s].bobs[n] != none; n++) {
+                 size_t f = mdl->boff[idx[s].bobs[n]];
+                 for (size_t d = 0; d < Y * Y; d++, f++) {
+                     w[f] -= nk * g[f];
+                     applypenalty(f);
+                     g[f] = 0.0;
+                 }
+             }
+         }
+         if (uit_stop)
+             break;
+         // Report progress back to the user
+         if (!uit_progress(mdl, k + 1, -1.0))
+             break;
+     }
+     grd_free(grd);
+     // Cleanup allocated memory before returning
+     for (int s = 0; s < S; s++) {
+         free(idx[s].uobs);
+         free(idx[s].bobs);
+     }
+     free(idx);
+     free(perm);
+     free(g);
+     free(q);
+ }
+ #undef applypenalty
+
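The cumulative-penalty trick above, from [1], clips each weight toward zero by the total L1 penalty u accrued so far minus the amount q[f] already charged to that weight, so rarely-touched features still receive their full share of regularization. A standalone toy version of the clipping step, hypothetical code mirroring the applypenalty macro, looks like this:

    #include <stdio.h>

    // Cumulative L1 clipping for one weight, as in Tsuruoka et al. [1]:
    // w is the weight, u the total penalty accrued so far, *q the penalty
    // already applied to this weight. Returns the new weight, updates *q.
    static double clip_l1(double w, double u, double *q) {
        const double z = w;
        if (z > 0.0)
            w = z - (u + *q) > 0.0 ? z - (u + *q) : 0.0;
        else if (z < 0.0)
            w = z + (u - *q) < 0.0 ? z + (u - *q) : 0.0;
        *q += w - z;
        return w;
    }

    int main(void) {
        double q = 0.0;
        double w = clip_l1(0.5, 0.2, &q);  // clipped to 0.3, q becomes -0.2
        printf("w = %.2f, q = %.2f\n", w, q);
        return 0;
    }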