wapiti 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. data/.autotest +13 -0
  2. data/.gitignore +5 -0
  3. data/.rspec +3 -0
  4. data/Gemfile +6 -0
  5. data/LICENSE +30 -0
  6. data/README.md +153 -0
  7. data/Rakefile +33 -0
  8. data/ext/wapiti/bcd.c +392 -0
  9. data/ext/wapiti/decoder.c +535 -0
  10. data/ext/wapiti/decoder.h +46 -0
  11. data/ext/wapiti/extconf.rb +8 -0
  12. data/ext/wapiti/gradient.c +818 -0
  13. data/ext/wapiti/gradient.h +81 -0
  14. data/ext/wapiti/lbfgs.c +294 -0
  15. data/ext/wapiti/model.c +296 -0
  16. data/ext/wapiti/model.h +100 -0
  17. data/ext/wapiti/native.c +1238 -0
  18. data/ext/wapiti/native.h +15 -0
  19. data/ext/wapiti/options.c +278 -0
  20. data/ext/wapiti/options.h +91 -0
  21. data/ext/wapiti/pattern.c +395 -0
  22. data/ext/wapiti/pattern.h +56 -0
  23. data/ext/wapiti/progress.c +167 -0
  24. data/ext/wapiti/progress.h +43 -0
  25. data/ext/wapiti/quark.c +272 -0
  26. data/ext/wapiti/quark.h +46 -0
  27. data/ext/wapiti/reader.c +553 -0
  28. data/ext/wapiti/reader.h +73 -0
  29. data/ext/wapiti/rprop.c +191 -0
  30. data/ext/wapiti/sequence.h +148 -0
  31. data/ext/wapiti/sgdl1.c +218 -0
  32. data/ext/wapiti/thread.c +171 -0
  33. data/ext/wapiti/thread.h +42 -0
  34. data/ext/wapiti/tools.c +202 -0
  35. data/ext/wapiti/tools.h +54 -0
  36. data/ext/wapiti/trainers.h +39 -0
  37. data/ext/wapiti/vmath.c +372 -0
  38. data/ext/wapiti/vmath.h +51 -0
  39. data/ext/wapiti/wapiti.c +288 -0
  40. data/ext/wapiti/wapiti.h +45 -0
  41. data/lib/wapiti.rb +30 -0
  42. data/lib/wapiti/errors.rb +17 -0
  43. data/lib/wapiti/model.rb +49 -0
  44. data/lib/wapiti/options.rb +113 -0
  45. data/lib/wapiti/utility.rb +15 -0
  46. data/lib/wapiti/version.rb +3 -0
  47. data/spec/fixtures/ch.mod +18550 -0
  48. data/spec/fixtures/chpattern.txt +52 -0
  49. data/spec/fixtures/chtest.txt +1973 -0
  50. data/spec/fixtures/chtrain.txt +19995 -0
  51. data/spec/fixtures/nppattern.txt +52 -0
  52. data/spec/fixtures/nptest.txt +1973 -0
  53. data/spec/fixtures/nptrain.txt +19995 -0
  54. data/spec/fixtures/pattern.txt +14 -0
  55. data/spec/fixtures/test.txt +60000 -0
  56. data/spec/fixtures/train.txt +1200 -0
  57. data/spec/spec_helper.rb +21 -0
  58. data/spec/wapiti/model_spec.rb +173 -0
  59. data/spec/wapiti/native_spec.rb +12 -0
  60. data/spec/wapiti/options_spec.rb +175 -0
  61. data/spec/wapiti/utility_spec.rb +22 -0
  62. data/wapiti.gemspec +35 -0
  63. metadata +178 -0
data/ext/wapiti/reader.h
@@ -0,0 +1,73 @@
+ /*
+  * Wapiti - A linear-chain CRF tool
+  *
+  * Copyright (c) 2009-2011 CNRS
+  * All rights reserved.
+  *
+  * Redistribution and use in source and binary forms, with or without
+  * modification, are permitted provided that the following conditions are met:
+  *     * Redistributions of source code must retain the above copyright
+  *       notice, this list of conditions and the following disclaimer.
+  *     * Redistributions in binary form must reproduce the above copyright
+  *       notice, this list of conditions and the following disclaimer in the
+  *       documentation and/or other materials provided with the distribution.
+  *
+  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  * POSSIBILITY OF SUCH DAMAGE.
+  */
+
+ #ifndef reader_h
+ #define reader_h
+
+ #include <stdbool.h>
+ #include <stdio.h>
+
+ #include "wapiti.h"
+ #include "pattern.h"
+ #include "quark.h"
+ #include "sequence.h"
+
+ /* rdr_t:
+  *   The reader object holds all the information needed to parse the input
+  *   file: the patterns and the quarks for labels and observations. We keep
+  *   separate counts for unigram and bigram patterns to simplify the
+  *   allocation of sequences. We also store the expected number of columns
+  *   in the input data so we can check that the patterns are applicable.
+  */
+ typedef struct rdr_s rdr_t;
+ struct rdr_s {
+     bool    maxent;     // Is this a maxent reader
+     int     npats;      // P   Total number of patterns
+     int     nuni, nbi;  //     Number of unigram and bigram patterns
+     int     ntoks;      //     Expected number of tokens in input
+     pat_t **pats;       // [P] List of precompiled patterns
+     qrk_t  *lbl;        //     Labels database
+     qrk_t  *obs;        //     Observation database
+ };
+
+ rdr_t *rdr_new(bool maxent);
+ void rdr_free(rdr_t *rdr);
+ void rdr_freeraw(raw_t *raw);
+ void rdr_freeseq(seq_t *seq);
+ void rdr_freedat(dat_t *dat);
+
+ void rdr_loadpat(rdr_t *rdr, FILE *file);
+ raw_t *rdr_readraw(rdr_t *rdr, FILE *file);
+ seq_t *rdr_raw2seq(rdr_t *rdr, const raw_t *raw, bool lbl);
+ seq_t *rdr_readseq(rdr_t *rdr, FILE *file, bool lbl);
+ dat_t *rdr_readdat(rdr_t *rdr, FILE *file, bool lbl);
+
+ void rdr_load(rdr_t *rdr, FILE *file);
+ void rdr_save(const rdr_t *rdr, FILE *file);
+
+ #endif
+
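For orientation, here is a minimal sketch (not part of the package) of how the reader API declared above is meant to chain together. The pattern and training file names are hypothetical, and error handling is reduced to the bare minimum.

/* Hypothetical usage of the reader API above; illustration only. */
#include <stdio.h>
#include <stdlib.h>
#include "reader.h"

int main(void) {
    rdr_t *rdr = rdr_new(false);                // false: linear-chain CRF, not maxent
    FILE *pat = fopen("pattern.txt", "r");      // hypothetical pattern file
    if (pat == NULL)
        return EXIT_FAILURE;
    rdr_loadpat(rdr, pat);                      // precompile the feature patterns
    fclose(pat);
    FILE *trn = fopen("train.txt", "r");        // hypothetical labelled data
    if (trn == NULL)
        return EXIT_FAILURE;
    dat_t *dat = rdr_readdat(rdr, trn, true);   // true: keep labels for training
    fclose(trn);
    /* ... hand <rdr> and <dat> to a trainer ... */
    rdr_freedat(dat);
    rdr_free(rdr);
    return EXIT_SUCCESS;
}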
data/ext/wapiti/rprop.c
@@ -0,0 +1,191 @@
+ /*
+  * Wapiti - A linear-chain CRF tool
+  *
+  * Copyright (c) 2009-2011 CNRS
+  * All rights reserved.
+  *
+  * Redistribution and use in source and binary forms, with or without
+  * modification, are permitted provided that the following conditions are met:
+  *     * Redistributions of source code must retain the above copyright
+  *       notice, this list of conditions and the following disclaimer.
+  *     * Redistributions in binary form must reproduce the above copyright
+  *       notice, this list of conditions and the following disclaimer in the
+  *       documentation and/or other materials provided with the distribution.
+  *
+  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  * POSSIBILITY OF SUCH DAMAGE.
+  */
+ #include <math.h>
+ #include <stdbool.h>
+ #include <stddef.h>
+ #include <stdlib.h>
+ #include <string.h>
+
+ #include "wapiti.h"
+ #include "gradient.h"
+ #include "model.h"
+ #include "options.h"
+ #include "progress.h"
+ #include "tools.h"
+ #include "thread.h"
+ #include "vmath.h"
+
+ #define sign(v) ((v) < 0.0 ? -1.0 : ((v) > 0.0 ? 1.0 : 0.0))
+ #define sqr(v)  ((v) * (v))
+
+ /******************************************************************************
+  * Resilient propagation optimizer
+  *
+  *   This is an implementation of the RPROP algorithm (resilient propagation)
+  *   described by Riedmiller and Braun in [1], with an adaptation that makes
+  *   it usable with L1 regularization.
+  *   The adaptation consists of using a pseudo-gradient similar to the one
+  *   used in OWL-QN to choose an orthant at each iteration, and projecting
+  *   the step into this orthant before the weight update.
+  *
+  *   [1] A direct adaptive method for faster backpropagation learning: The
+  *       RPROP algorithm, Martin Riedmiller and Heinrich Braun, IEEE
+  *       International Conference on Neural Networks, San Francisco, USA,
+  *       586-591, March 1993.
+  ******************************************************************************/
+ typedef struct rprop_s rprop_t;
+ struct rprop_s {
+     mdl_t  *mdl;
+     double *xp;
+     double *stp;
+     double *g;
+     double *gp;
+ };
+
+ /* trn_rpropsub:
+  *   Partial update of the weight vector, including the partial gradient in
+  *   case of L1 regularization. The sub-vector updated depends on the given
+  *   <id> and <cnt> parameters; the job scheduling system is not used here,
+  *   as we can easily split the processing into equal parts.
+  */
+ static void trn_rpropsub(job_t *job, int id, int cnt, rprop_t *st) {
+     unused(job);
+     mdl_t *mdl = st->mdl;
+     const size_t F = mdl->nftr;
+     const double stpmin = mdl->opt->rprop.stpmin;
+     const double stpmax = mdl->opt->rprop.stpmax;
+     const double stpinc = mdl->opt->rprop.stpinc;
+     const double stpdec = mdl->opt->rprop.stpdec;
+     const bool wbt = strcmp(mdl->opt->algo, "rprop-");
+     const double rho1 = mdl->opt->rho1;
+     const int l1 = (rho1 != 0.0) ? mdl->opt->rprop.cutoff + 1 : 0;
+     double *x = mdl->theta;
+     double *xp = st->xp, *stp = st->stp;
+     double *g = st->g, *gp = st->gp;
+     const size_t from = F * id / cnt;
+     const size_t to = F * (id + 1) / cnt;
+     for (size_t f = from; f < to; f++) {
+         double pg = g[f];
+         // If there is an L1 component in the regularization, we either
+         // project the gradient into the current orthant or check for a
+         // cutdown, depending on the requested projection scheme.
+         if (l1 == 1) {
+             if (x[f] < 0.0)        pg -= rho1;
+             else if (x[f] > 0.0)   pg += rho1;
+             else if (g[f] < -rho1) pg += rho1;
+             else if (g[f] > rho1)  pg -= rho1;
+             else                   pg = 0.0;
+         } else if (l1 && sqr(g[f] + rho1 * sign(x[f])) < sqr(rho1)) {
+             if (x[f] == 0.0 || (gp[f] * g[f] < 0.0
+                             &&  xp[f] * x[f] < 0.0)) {
+                 if (wbt)
+                     xp[f] = x[f];
+                 x[f] = 0.0;
+                 gp[f] = g[f];
+                 continue;
+             }
+         }
+         // Next we adjust the step depending on the new and previous
+         // gradient values.
+         if (gp[f] * pg > 0.0)
+             stp[f] = min(stp[f] * stpinc, stpmax);
+         else if (gp[f] * pg < 0.0)
+             stp[f] = max(stp[f] * stpdec, stpmin);
+         // Finally, update the weight. If there is an L1 penalty and the
+         // pseudo-gradient projection is used, we have to project the
+         // update back into the chosen orthant.
+         if (!wbt || gp[f] * pg > 0.0) {
+             double dlt = stp[f] * -sign(g[f]);
+             if (l1 == 1 && dlt * pg >= 0.0)
+                 dlt = 0.0;
+             if (wbt)
+                 xp[f] = x[f];
+             x[f] += dlt;
+         } else if (gp[f] * pg < 0.0) {
+             x[f] = xp[f];
+             g[f] = 0.0;
+         } else {
+             xp[f] = x[f];
+             if (l1 != 1)
+                 x[f] += stp[f] * -sign(pg);
+         }
+         gp[f] = g[f];
+     }
+ }
+
+ void trn_rprop(mdl_t *mdl) {
+     const size_t F = mdl->nftr;
+     const int    K = mdl->opt->maxiter;
+     const size_t W = mdl->opt->nthread;
+     const bool   wbt = strcmp(mdl->opt->algo, "rprop-");
+     const int    cut = mdl->opt->rprop.cutoff;
+     // Allocate state memory and initialize it
+     double *xp = NULL, *stp = xvm_new(F);
+     double *g = xvm_new(F), *gp = xvm_new(F);
+     if (wbt && !cut)
+         xp = xvm_new(F);
+     for (size_t f = 0; f < F; f++) {
+         if (wbt && !cut)
+             xp[f] = 0.0;
+         gp[f] = 0.0;
+         stp[f] = 0.1;
+     }
+     // Prepare the rprop state used to send information to the rprop
+     // workers about updating the weights using the gradient.
+     rprop_t *st = wapiti_xmalloc(sizeof(rprop_t));
+     st->mdl = mdl;
+     st->xp = xp; st->stp = stp;
+     st->g = g;   st->gp = gp;
+     rprop_t *rprop[W];
+     for (size_t w = 0; w < W; w++)
+         rprop[w] = st;
+     // Prepare the gradient state for the distributed gradient computation.
+     grd_t *grds[W];
+     grds[0] = grd_new(mdl, g);
+     for (size_t w = 1; w < W; w++)
+         grds[w] = grd_new(mdl, xvm_new(F));
+     // And iterate the gradient computation / weight update process until
+     // convergence or a stop request.
+     for (int k = 0; !uit_stop && k < K; k++) {
+         double fx = grd_gradient(mdl, g, grds);
+         if (uit_stop)
+             break;
+         mth_spawn((func_t *)trn_rpropsub, W, (void **)rprop, 0, 0);
+         if (uit_progress(mdl, k + 1, fx) == false)
+             break;
+     }
+     // Free all allocated memory
+     if (wbt && !cut)
+         xvm_free(xp);
+     xvm_free(g);
+     xvm_free(gp);
+     for (size_t w = 1; w < W; w++)
+         xvm_free(grds[w]->g);
+     for (size_t w = 0; w < W; w++)
+         grd_free(grds[w]);
+     free(st);
+ }
+
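The step-size adaptation at the heart of the trainer above is easier to see in isolation. The following standalone sketch tracks a single weight through five iterations: the step grows while the gradient keeps its sign and shrinks after a sign flip, as in the "rprop-" variant (no weight backtracking). It is illustration only, with made-up gradients and commonly used RPROP constants, not code or defaults from the package.

/* Standalone illustration of the RPROP step update; not package code. */
#include <stdio.h>

int main(void) {
    const double stpmin = 1e-8, stpmax = 50.0;        // assumed step bounds
    const double stpinc = 1.2,  stpdec = 0.5;         // usual RPROP factors
    const double g[] = {0.4, 0.3, -0.2, -0.1, 0.05};  // made-up gradients of one weight
    double x = 0.0, stp = 0.1, gp = 0.0;
    for (int k = 0; k < 5; k++) {
        if (gp * g[k] > 0.0)        // same sign as before: accelerate
            stp = (stp * stpinc < stpmax) ? stp * stpinc : stpmax;
        else if (gp * g[k] < 0.0)   // sign flip: we overshot, so slow down
            stp = (stp * stpdec > stpmin) ? stp * stpdec : stpmin;
        const double s = (g[k] > 0.0) ? 1.0 : ((g[k] < 0.0) ? -1.0 : 0.0);
        x -= stp * s;               // move against the sign of the gradient
        gp = g[k];
        printf("k=%d  step=%.4f  x=%.4f\n", k, stp, x);
    }
    return 0;
}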
data/ext/wapiti/sequence.h
@@ -0,0 +1,148 @@
+ /*
+  * Wapiti - A linear-chain CRF tool
+  *
+  * Copyright (c) 2009-2011 CNRS
+  * All rights reserved.
+  *
+  * Redistribution and use in source and binary forms, with or without
+  * modification, are permitted provided that the following conditions are met:
+  *     * Redistributions of source code must retain the above copyright
+  *       notice, this list of conditions and the following disclaimer.
+  *     * Redistributions in binary form must reproduce the above copyright
+  *       notice, this list of conditions and the following disclaimer in the
+  *       documentation and/or other materials provided with the distribution.
+  *
+  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  * POSSIBILITY OF SUCH DAMAGE.
+  */
+
+ #ifndef sequence_h
+ #define sequence_h
+
+ #include <stddef.h>
+
+ #include "wapiti.h"
+
+ /*******************************************************************************
+  * Sequences and Dataset objects
+  *
+  *   Sequences represent the input data fed to Wapiti by the user, either
+  *   for training or for labelling. The internal form used here is very
+  *   different from the data read from files, and the conversion process is
+  *   done in three steps, illustrated here:
+  *       +------+     +-------+     +-------+     +-------+
+  *       | FILE | --> | raw_t | --> | tok_t | --> | seq_t |
+  *       +------+     +-------+     +-------+     +-------+
+  *   First the sequence is read as a set of lines from the input file; this
+  *   gives a raw_t object. Next this set of lines is split into tokens, and
+  *   the last one is optionally separated out, as it will become a label;
+  *   this results in a tok_t object.
+  *   The last step consists in applying all the patterns given by the user
+  *   to extract from these tokens the observations made on the sequence, in
+  *   order to build the seq_t object used by the trainer and the tagger.
+  *
+  *   A dataset object is just a container for a list of sequences in
+  *   internal form, used to store either the training or the development set.
+  *
+  *   The whole conversion process is driven by the reader object and, as it
+  *   is responsible for creating the objects with a rather special allocation
+  *   scheme, we only have to implement functions for freeing these objects.
+  ******************************************************************************/
+
+ /* raw_t:
+  *   Data-structure representing a raw sequence as a set of lines read from
+  *   the input file. This is the result of the first step of the interning
+  *   process. We keep this form separate from the tokenized one, as we want
+  *   to be able to output the sequence as it was read in labelling mode.
+  *
+  *   This represents a sequence of length <len>; for each position 't' you
+  *   find the corresponding line at <lines>[t].
+  *
+  *   The <lines> array is allocated with the data structure itself, and the
+  *   individual lines are allocated separately.
+  */
+ typedef struct raw_s raw_t;
+ struct raw_s {
+     int   len;      // T   Sequence length
+     char *lines[];  // [T] Raw lines directly from file
+ };
+
+ /* tok_t:
+  *   Data-structure representing a tokenized sequence: the result of the
+  *   second step of the interning process, after the raw sequence has been
+  *   split into tokens and any labels separated from the observations.
+  *
+  *   For each position 't' in the sequence of length <len>, you find at
+  *   <lbl>[t] the label provided in the input file, if any, and at <toks>[t]
+  *   a list of string tokens of length <cnts>[t].
+  *
+  *   Memory allocation is a bit special: the first token at each position
+  *   points to a memory block holding a copy of the raw line; the other
+  *   tokens and the label are pointers into this block (less fragmentation).
+  */
+ typedef struct tok_s tok_t;
+ struct tok_s {
+     int    len;     // T     Sequence length
+     char **lbl;     // [T]   List of label strings
+     int   *cnts;    // [T]   Lengths of the token lists
+     char **toks[];  // [T][] Token lists
+ };
+
+ /* seq_t:
+  *   Data-structure representing a sequence of length <len> in the internal
+  *   form used by the trainers and the tagger. For each position 't' in the
+  *   sequence (0 <= t < <len>) there are some observations made on the data
+  *   and, if provided in the input file, a label.
+  *
+  *   There are two kinds of features, unigram and bigram ones, built by
+  *   combining one observation with one or two labels. At position 't', the
+  *   unigram features are built from the list of observations in <uobs>[t],
+  *   which contains <ucnt>[t] items, and the label at <lbl>[t]. The bigram
+  *   features are obtained in the same way using <bobs> and <bcnt>, and
+  *   also have to be combined with <lbl>[t-1].
+  *
+  *   If the sequence is read from a file without labels, as is the case in
+  *   labelling mode, the <lbl> field will be NULL, and so the sequence
+  *   cannot be used for training.
+  *
+  *   The raw field is private, used internally for efficient memory
+  *   allocation: it allows allocating <lbl>, <*cnt>, and all the lists in
+  *   <*obs> together with the data structure itself.
+  */
+ typedef struct pos_s pos_t;
+ typedef struct seq_s seq_t;
+ struct seq_s {
+     int     len;
+     size_t *raw;
+     struct pos_s {
+         size_t  lbl;
+         size_t  ucnt, bcnt;
+         size_t *uobs, *bobs;
+     } pos[];
+ };
+
+ /* dat_t:
+  *   Data-structure representing a full dataset: a collection of sequences
+  *   ready to be used for training or to be labelled. It keeps track of the
+  *   maximum sequence length, as the trainer needs it for memory allocation.
+  *   The dataset contains <nseq> sequences stored in <seq>; these sequences
+  *   are labelled only if <lbl> is true.
+  */
+ typedef struct dat_s dat_t;
+ struct dat_s {
+     bool    lbl;   // True iff sequences are labelled
+     int     mlen;  // Length of the longest sequence in the set
+     size_t  nseq;  // S   Number of sequences in the set
+     seq_t **seq;   // [S] List of sequences
+ };
+
+ #endif
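As a reading aid for the layout documented above, here is a sketch (not part of the package) of how a consumer walks a seq_t: at each position the unigram observations pair with the label at that position, while bigram observations would additionally involve the previous label.

/* Illustrative walk over the seq_t layout above; not package code. */
#include <stdio.h>
#include "sequence.h"

static void seq_dump(const seq_t *seq) {
    for (int t = 0; t < seq->len; t++) {
        const pos_t *pos = &seq->pos[t];
        printf("t=%d lbl=%zu\n", t, pos->lbl);
        for (size_t p = 0; p < pos->ucnt; p++)   // observations for unigram features
            printf("  uobs[%zu]=%zu\n", p, pos->uobs[p]);
        for (size_t p = 0; p < pos->bcnt; p++)   // observations for bigram features
            printf("  bobs[%zu]=%zu\n", p, pos->bobs[p]);
    }
}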
data/ext/wapiti/sgdl1.c
@@ -0,0 +1,218 @@
+ /*
+  * Wapiti - A linear-chain CRF tool
+  *
+  * Copyright (c) 2009-2011 CNRS
+  * All rights reserved.
+  *
+  * Redistribution and use in source and binary forms, with or without
+  * modification, are permitted provided that the following conditions are met:
+  *     * Redistributions of source code must retain the above copyright
+  *       notice, this list of conditions and the following disclaimer.
+  *     * Redistributions in binary form must reproduce the above copyright
+  *       notice, this list of conditions and the following disclaimer in the
+  *       documentation and/or other materials provided with the distribution.
+  *
+  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  * POSSIBILITY OF SUCH DAMAGE.
+  */
+ #include <math.h>
+ #include <stdbool.h>
+ #include <stddef.h>
+ #include <stdlib.h>
+ #include <string.h>
+
+ #include "wapiti.h"
+ #include "gradient.h"
+ #include "model.h"
+ #include "options.h"
+ #include "progress.h"
+ #include "sequence.h"
+ #include "tools.h"
+
+ /******************************************************************************
+  * The SGD-L1 trainer
+  *
+  *   Implementation of stochastic gradient descent with L1 penalty, as
+  *   described in [1] by Tsuruoka et al. This allows building really sparse
+  *   models with the SGD method.
+  *
+  *   [1] Stochastic gradient descent training for L1-regularized log-linear
+  *       models with cumulative penalty, Yoshimasa Tsuruoka, Jun'ichi
+  *       Tsujii, and Sophia Ananiadou, in Proceedings of the ACL and the
+  *       4th IJCNLP of the AFNLP, pages 477-485, August 2009.
+  ******************************************************************************/
+ typedef struct sgd_idx_s {
+     size_t *uobs;
+     size_t *bobs;
+ } sgd_idx_t;
+
+ /* applypenalty:
+  *   This macro is quite ugly, as it does a lot of things and uses local
+  *   variables of the function below. Sorry for this, but it lets us avoid
+  *   duplicating the code below: due to the way unigram and bigram
+  *   observations are stored, we must use it twice. As this macro is
+  *   dangerous when called outside of sgd-l1, we undef it just after.
+  *   It matches exactly the APPLYPENALTY function defined in [1] p. 481
+  *   and the formula in the middle of page 480.
+  */
+ #define applypenalty(f) do {                           \
+     const double z = w[f];                             \
+     if (z > 0.0)      w[f] = max(0.0, z - (u + q[f])); \
+     else if (z < 0.0) w[f] = min(0.0, z + (u - q[f])); \
+     q[f] += w[f] - z;                                  \
+ } while (false)
+
+ /* sgd_add:
+  *   Add the value <new> to the array <obs> of size <cnt>. If the value is
+  *   already present we do nothing, else we append it.
+  */
+ static void sgd_add(size_t *obs, size_t *cnt, size_t new) {
+     // First check if the value is already in the array; we do a linear
+     // scan, as it is simpler, and since these arrays will be very short
+     // in practice it is efficient enough.
+     for (size_t p = 0; p < *cnt; p++)
+         if (obs[p] == new)
+             return;
+     // Insert the new value at the end since we have not found it.
+     obs[*cnt] = new;
+     *cnt = *cnt + 1;
+ }
+
+ /* trn_sgdl1:
+  *   Train the model with the SGD-L1 algorithm described by Tsuruoka et al.
+  */
+ void trn_sgdl1(mdl_t *mdl) {
+     const size_t Y = mdl->nlbl;
+     const size_t F = mdl->nftr;
+     const int    U = mdl->reader->nuni;
+     const int    B = mdl->reader->nbi;
+     const int    S = mdl->train->nseq;
+     const int    K = mdl->opt->maxiter;
+     double *w = mdl->theta;
+     // First we have to build an index holding, for each sequence, the
+     // list of active observations.
+     // The index is a simple table indexed by sequence number. Each entry
+     // points to two lists of observations terminated by <none>, one for
+     // unigram observations and one for bigram observations.
+     info("    - Build the index\n");
+     sgd_idx_t *idx = wapiti_xmalloc(sizeof(sgd_idx_t) * S);
+     for (int s = 0; s < S; s++) {
+         const seq_t *seq = mdl->train->seq[s];
+         const int T = seq->len;
+         size_t uobs[U * T + 1], ucnt = 0;
+         size_t bobs[B * T + 1], bcnt = 0;
+         for (int t = 0; t < seq->len; t++) {
+             const pos_t *pos = &seq->pos[t];
+             for (size_t p = 0; p < pos->ucnt; p++)
+                 sgd_add(uobs, &ucnt, pos->uobs[p]);
+             for (size_t p = 0; p < pos->bcnt; p++)
+                 sgd_add(bobs, &bcnt, pos->bobs[p]);
+         }
+         uobs[ucnt++] = none;
+         bobs[bcnt++] = none;
+         idx[s].uobs = wapiti_xmalloc(sizeof(size_t) * ucnt);
+         idx[s].bobs = wapiti_xmalloc(sizeof(size_t) * bcnt);
+         memcpy(idx[s].uobs, uobs, ucnt * sizeof(size_t));
+         memcpy(idx[s].bobs, bobs, bcnt * sizeof(size_t));
+     }
+     info("      Done\n");
+     // We will process the sequences in random order at each iteration, so
+     // we will have to permute them. The current permutation is stored in
+     // a vector <perm>, shuffled at the start of each iteration; we just
+     // initialize it with the identity permutation.
+     // As we use the same gradient function as the other trainers, we need
+     // an array to store it. These functions accumulate the gradient, so
+     // we must clear it at the start and before each new computation. As
+     // we know which features are active, and so which gradient cells are
+     // updated, we can clear them selectively instead of fully clearing
+     // the gradient each time.
+     // We also need an additional vector <q> holding the penalty already
+     // applied to each feature.
+     int *perm = wapiti_xmalloc(sizeof(int) * S);
+     for (int s = 0; s < S; s++)
+         perm[s] = s;
+     double *g = wapiti_xmalloc(sizeof(double) * F);
+     double *q = wapiti_xmalloc(sizeof(double) * F);
+     for (size_t f = 0; f < F; f++)
+         g[f] = q[f] = 0.0;
+     // We can now start training the model. We perform the requested
+     // number of iterations, each going through all the sequences. To
+     // compute the decay we need to track the number of sequences already
+     // processed; this is done with the <i> variable.
+     double u = 0.0;
+     grd_t *grd = grd_new(mdl, g);
+     for (int k = 0, i = 0; k < K && !uit_stop; k++) {
+         // First we shuffle the sequences by making a lot of random swaps
+         // of entries in the permutation index.
+         for (int s = 0; s < S; s++) {
+             const int a = rand() % S;
+             const int b = rand() % S;
+             const int t = perm[a];
+             perm[a] = perm[b];
+             perm[b] = t;
+         }
+         // And so we can process the sequences in a random order
+         for (int sp = 0; sp < S && !uit_stop; sp++, i++) {
+             const int s = perm[sp];
+             const seq_t *seq = mdl->train->seq[s];
+             grd_dospl(grd, seq);
+             // Before applying the gradient, we have to compute the
+             // learning rate to apply to this sequence. For this we
+             // use an exponential decay [1, p. 481 (5)]:
+             //     η_i = η_0 * α^{i/S}
+             // At the same time, we update the total penalty that
+             // should have been applied to each feature:
+             //     u <- u + η_i * rho1 / S
+             const double n0    = mdl->opt->sgdl1.eta0;
+             const double alpha = mdl->opt->sgdl1.alpha;
+             const double nk = n0 * pow(alpha, (double)i / S);
+             u = u + nk * mdl->opt->rho1 / S;
+             // Now we apply the update to all unigram and bigram
+             // observations active in the current sequence. We must
+             // not forget to clear the gradient for the next
+             // sequence.
+             for (size_t n = 0; idx[s].uobs[n] != none; n++) {
+                 size_t f = mdl->uoff[idx[s].uobs[n]];
+                 for (size_t y = 0; y < Y; y++, f++) {
+                     w[f] -= nk * g[f];
+                     applypenalty(f);
+                     g[f] = 0.0;
+                 }
+             }
+             for (size_t n = 0; idx[s].bobs[n] != none; n++) {
+                 size_t f = mdl->boff[idx[s].bobs[n]];
+                 for (size_t d = 0; d < Y * Y; d++, f++) {
+                     w[f] -= nk * g[f];
+                     applypenalty(f);
+                     g[f] = 0.0;
+                 }
+             }
+         }
+         if (uit_stop)
+             break;
+         // Report progress back to the user
+         if (!uit_progress(mdl, k + 1, -1.0))
+             break;
+     }
+     grd_free(grd);
+     // Cleanup allocated memory before returning
+     for (int s = 0; s < S; s++) {
+         free(idx[s].uobs);
+         free(idx[s].bobs);
+     }
+     free(idx);
+     free(perm);
+     free(g);
+     free(q);
+ }
+ #undef applypenalty
+
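To make the cumulative penalty concrete, here is a standalone sketch mirroring the applypenalty macro above, with made-up numbers: <u> is the total L1 penalty each weight should have received so far, and q[f] records how much was actually applied to weight f, so small weights are clipped to exactly zero instead of oscillating around it.

/* Standalone illustration of the cumulative L1 penalty of [1]; not package code. */
#include <stdio.h>

int main(void) {
    double w[3] = {0.30, -0.05, 0.0};  // weights right after a plain gradient step
    double q[3] = {0.0, 0.0, 0.0};     // penalty actually applied to each weight so far
    const double u = 0.10;             // cumulative penalty owed by now
    for (int f = 0; f < 3; f++) {
        const double z = w[f];
        if (z > 0.0)                   // clip towards zero, never across it
            w[f] = (z - (u + q[f]) > 0.0) ? z - (u + q[f]) : 0.0;
        else if (z < 0.0)
            w[f] = (z + (u - q[f]) < 0.0) ? z + (u - q[f]) : 0.0;
        q[f] += w[f] - z;              // remember how much was really applied
        printf("w[%d]=%+.2f  q[%d]=%+.2f\n", f, w[f], f, q[f]);
    }
    return 0;                          // w ends up {0.20, 0.00, 0.00}
}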