wapiti 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.autotest +13 -0
- data/.gitignore +5 -0
- data/.rspec +3 -0
- data/Gemfile +6 -0
- data/LICENSE +30 -0
- data/README.md +153 -0
- data/Rakefile +33 -0
- data/ext/wapiti/bcd.c +392 -0
- data/ext/wapiti/decoder.c +535 -0
- data/ext/wapiti/decoder.h +46 -0
- data/ext/wapiti/extconf.rb +8 -0
- data/ext/wapiti/gradient.c +818 -0
- data/ext/wapiti/gradient.h +81 -0
- data/ext/wapiti/lbfgs.c +294 -0
- data/ext/wapiti/model.c +296 -0
- data/ext/wapiti/model.h +100 -0
- data/ext/wapiti/native.c +1238 -0
- data/ext/wapiti/native.h +15 -0
- data/ext/wapiti/options.c +278 -0
- data/ext/wapiti/options.h +91 -0
- data/ext/wapiti/pattern.c +395 -0
- data/ext/wapiti/pattern.h +56 -0
- data/ext/wapiti/progress.c +167 -0
- data/ext/wapiti/progress.h +43 -0
- data/ext/wapiti/quark.c +272 -0
- data/ext/wapiti/quark.h +46 -0
- data/ext/wapiti/reader.c +553 -0
- data/ext/wapiti/reader.h +73 -0
- data/ext/wapiti/rprop.c +191 -0
- data/ext/wapiti/sequence.h +148 -0
- data/ext/wapiti/sgdl1.c +218 -0
- data/ext/wapiti/thread.c +171 -0
- data/ext/wapiti/thread.h +42 -0
- data/ext/wapiti/tools.c +202 -0
- data/ext/wapiti/tools.h +54 -0
- data/ext/wapiti/trainers.h +39 -0
- data/ext/wapiti/vmath.c +372 -0
- data/ext/wapiti/vmath.h +51 -0
- data/ext/wapiti/wapiti.c +288 -0
- data/ext/wapiti/wapiti.h +45 -0
- data/lib/wapiti.rb +30 -0
- data/lib/wapiti/errors.rb +17 -0
- data/lib/wapiti/model.rb +49 -0
- data/lib/wapiti/options.rb +113 -0
- data/lib/wapiti/utility.rb +15 -0
- data/lib/wapiti/version.rb +3 -0
- data/spec/fixtures/ch.mod +18550 -0
- data/spec/fixtures/chpattern.txt +52 -0
- data/spec/fixtures/chtest.txt +1973 -0
- data/spec/fixtures/chtrain.txt +19995 -0
- data/spec/fixtures/nppattern.txt +52 -0
- data/spec/fixtures/nptest.txt +1973 -0
- data/spec/fixtures/nptrain.txt +19995 -0
- data/spec/fixtures/pattern.txt +14 -0
- data/spec/fixtures/test.txt +60000 -0
- data/spec/fixtures/train.txt +1200 -0
- data/spec/spec_helper.rb +21 -0
- data/spec/wapiti/model_spec.rb +173 -0
- data/spec/wapiti/native_spec.rb +12 -0
- data/spec/wapiti/options_spec.rb +175 -0
- data/spec/wapiti/utility_spec.rb +22 -0
- data/wapiti.gemspec +35 -0
- metadata +178 -0
data/ext/wapiti/native.h
ADDED
@@ -0,0 +1,278 @@
|
|
1
|
+
/*
|
2
|
+
* Wapiti - A linear-chain CRF tool
|
3
|
+
*
|
4
|
+
* Copyright (c) 2009-2011 CNRS
|
5
|
+
* All rights reserved.
|
6
|
+
*
|
7
|
+
* Redistribution and use in source and binary forms, with or without
|
8
|
+
* modification, are permitted provided that the following conditions are met:
|
9
|
+
* * Redistributions of source code must retain the above copyright
|
10
|
+
* notice, this list of conditions and the following disclaimer.
|
11
|
+
* * Redistributions in binary form must reproduce the above copyright
|
12
|
+
* notice, this list of conditions and the following disclaimer in the
|
13
|
+
* documentation and/or other materials provided with the distribution.
|
14
|
+
*
|
15
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
16
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
18
|
+
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
19
|
+
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
20
|
+
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
21
|
+
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
22
|
+
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
23
|
+
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
24
|
+
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
25
|
+
* POSSIBILITY OF SUCH DAMAGE.
|
26
|
+
*/
|
27
|
+
|
28
|
+
#include <limits.h>
|
29
|
+
#include <stdbool.h>
|
30
|
+
#include <stddef.h>
|
31
|
+
#include <stdlib.h>
|
32
|
+
#include <stdio.h>
|
33
|
+
#include <string.h>
|
34
|
+
|
35
|
+
#include "wapiti.h"
|
36
|
+
#include "tools.h"
|
37
|
+
#include "options.h"
|
38
|
+
#include "vmath.h"
|
39
|
+
|
40
|
+
/******************************************************************************
|
41
|
+
* Command line parsing
|
42
|
+
*
|
43
|
+
* This module handles command line parsing and puts all things defined by the
|
44
|
+
* user in a special structure in order to make them accessible to the
|
45
|
+
* remaining of the program.
|
46
|
+
******************************************************************************/
|
47
|
+
|
48
|
+
/* opt_help:
 *   Write the usage summary to stderr: the global switches followed by the
 *   synopsis and switch list of the train, label and dump modes. The program
 *   name given in <pname> is substituted in each synopsis line through the
 *   positional "%1$s" conversion.
 */
static void opt_help(const char *pname) {
	static const char usage[] =
		// Switches valid without any mode
		"Global switchs:\n"
		"\t-h | --help display this help message\n"
		"\t | --version display version information\n"
		"\n"
		// Training mode
		"Training mode:\n"
		" %1$s train [options] [input data] [model file]\n"
		"\t | --me force maxent mode\n"
		"\t-a | --algo STRING training algorithm to use\n"
		"\t-p | --pattern FILE patterns for extracting features\n"
		"\t-m | --model FILE model file to preload\n"
		"\t-d | --devel FILE development dataset\n"
		"\t-c | --compact compact model after training\n"
		"\t-t | --nthread INT number of worker threads\n"
		"\t-j | --jobsize INT job size for worker threads\n"
		"\t-s | --sparse enable sparse forward/backward\n"
		"\t-i | --maxiter INT maximum number of iterations\n"
		"\t-1 | --rho1 FLOAT l1 penalty parameter\n"
		"\t-2 | --rho2 FLOAT l2 penalty parameter\n"
		"\t-o | --objwin INT convergence window size\n"
		"\t-w | --stopwin INT stop window size\n"
		"\t-e | --stopeps FLOAT stop epsilon value\n"
		"\t | --clip (l-bfgs) clip gradient\n"
		"\t | --histsz INT (l-bfgs) history size\n"
		"\t | --maxls INT (l-bfgs) max linesearch iters\n"
		"\t | --eta0 FLOAT (sgd-l1) learning rate\n"
		"\t | --alpha FLOAT (sgd-l1) exp decay parameter\n"
		"\t | --kappa FLOAT (bcd) stability parameter\n"
		"\t | --stpmin FLOAT (rprop) minimum step size\n"
		"\t | --stpmax FLOAT (rprop) maximum step size\n"
		"\t | --stpinc FLOAT (rprop) step increment factor\n"
		"\t | --stpdec FLOAT (rprop) step decrement factor\n"
		"\t | --cutoff (rprop) alternate projection\n"
		"\n"
		// Labelling mode
		"Labelling mode:\n"
		" %1$s label [options] [input data] [output data]\n"
		"\t | --me force maxent mode\n"
		"\t-m | --model FILE model file to load\n"
		"\t-l | --label output only labels\n"
		"\t-c | --check input is already labeled\n"
		"\t-s | --score add scores to output\n"
		"\t-p | --post label using posteriors\n"
		"\t-n | --nbest INT output n-best list\n"
		"\n"
		// Dumping mode
		"Dumping mode\n"
		" %1$s dump [input model] [output text]\n";

	fprintf(stderr, usage, pname);
}
|
100
|
+
|
101
|
+
/* opt_defaults:
|
102
|
+
* Default values for all parameters of the model.
|
103
|
+
*/
|
104
|
+
const opt_t opt_defaults = {
|
105
|
+
.mode = -1,
|
106
|
+
.input = NULL, .output = NULL,
|
107
|
+
.maxent = false,
|
108
|
+
.algo = "l-bfgs", .pattern = NULL, .model = NULL, .devel = NULL,
|
109
|
+
.compact = false, .sparse = false,
|
110
|
+
.nthread = 1, .jobsize = 64, .maxiter = 0,
|
111
|
+
.rho1 = 0.5, .rho2 = 0.0001,
|
112
|
+
.objwin = 5, .stopwin = 5, .stopeps = 0.02,
|
113
|
+
.lbfgs = {.clip = false, .histsz = 5, .maxls = 40},
|
114
|
+
.sgdl1 = {.eta0 = 0.8, .alpha = 0.85},
|
115
|
+
.bcd = {.kappa = 1.5},
|
116
|
+
.rprop = {.stpmin = 1e-8, .stpmax = 50.0, .stpinc = 1.2, .stpdec = 0.5,
|
117
|
+
.cutoff = false},
|
118
|
+
.label = false, .check = false, .outsc = false,
|
119
|
+
.lblpost = false, .nbest = 1
|
120
|
+
};
|
121
|
+
|
122
|
+
/* opt_switch:
|
123
|
+
* Define available switchs for the different modes in a readable way for the
|
124
|
+
* command line argument parser.
|
125
|
+
*/
|
126
|
+
struct {
|
127
|
+
int mode;
|
128
|
+
char *dshort;
|
129
|
+
char *dlong;
|
130
|
+
char kind;
|
131
|
+
size_t offset;
|
132
|
+
} opt_switch[] = {
|
133
|
+
{0, "##", "--me", 'B', offsetof(opt_t, maxent )},
|
134
|
+
{0, "-a", "--algo", 'S', offsetof(opt_t, algo )},
|
135
|
+
{0, "-p", "--pattern", 'S', offsetof(opt_t, pattern )},
|
136
|
+
{0, "-m", "--model", 'S', offsetof(opt_t, model )},
|
137
|
+
{0, "-d", "--devel", 'S', offsetof(opt_t, devel )},
|
138
|
+
{0, "-c", "--compact", 'B', offsetof(opt_t, compact )},
|
139
|
+
{0, "-s", "--sparse", 'B', offsetof(opt_t, sparse )},
|
140
|
+
{0, "-t", "--nthread", 'I', offsetof(opt_t, nthread )},
|
141
|
+
{0, "-j", "--josize", 'I', offsetof(opt_t, jobsize )},
|
142
|
+
{0, "-i", "--maxiter", 'I', offsetof(opt_t, maxiter )},
|
143
|
+
{0, "-1", "--rho1", 'F', offsetof(opt_t, rho1 )},
|
144
|
+
{0, "-2", "--rho2", 'F', offsetof(opt_t, rho2 )},
|
145
|
+
{0, "-o", "--objsz", 'I', offsetof(opt_t, objwin )},
|
146
|
+
{0, "-w", "--stopwin", 'I', offsetof(opt_t, stopwin )},
|
147
|
+
{0, "-e", "--stopeps", 'F', offsetof(opt_t, stopeps )},
|
148
|
+
{0, "##", "--clip", 'B', offsetof(opt_t, lbfgs.clip )},
|
149
|
+
{0, "##", "--histsz", 'I', offsetof(opt_t, lbfgs.histsz)},
|
150
|
+
{0, "##", "--maxls", 'I', offsetof(opt_t, lbfgs.maxls )},
|
151
|
+
{0, "##", "--eta0", 'F', offsetof(opt_t, sgdl1.eta0 )},
|
152
|
+
{0," ##", "--alpha", 'F', offsetof(opt_t, sgdl1.alpha )},
|
153
|
+
{0, "##", "--kappa", 'F', offsetof(opt_t, bcd.kappa )},
|
154
|
+
{0, "##", "--stpmin", 'F', offsetof(opt_t, rprop.stpmin)},
|
155
|
+
{0, "##", "--stpmax", 'F', offsetof(opt_t, rprop.stpmax)},
|
156
|
+
{0, "##", "--stpinc", 'F', offsetof(opt_t, rprop.stpinc)},
|
157
|
+
{0, "##", "--stpdec", 'F', offsetof(opt_t, rprop.stpdec)},
|
158
|
+
{0, "##", "--cutoff", 'B', offsetof(opt_t, rprop.cutoff)},
|
159
|
+
{1, "##", "--me", 'B', offsetof(opt_t, maxent )},
|
160
|
+
{1, "-m", "--model", 'S', offsetof(opt_t, model )},
|
161
|
+
{1, "-l", "--label", 'B', offsetof(opt_t, label )},
|
162
|
+
{1, "-c", "--check", 'B', offsetof(opt_t, check )},
|
163
|
+
{1, "-s", "--score", 'B', offsetof(opt_t, outsc )},
|
164
|
+
{1, "-p", "--post", 'B', offsetof(opt_t, lblpost )},
|
165
|
+
{1, "-n", "--nbest", 'I', offsetof(opt_t, nbest )},
|
166
|
+
{-1, NULL, NULL, '\0', 0}
|
167
|
+
};
|
168
|
+
|
169
|
+
/* argparse:
|
170
|
+
* This is the main function for command line parsing. It use the previous
|
171
|
+
* table to known how to interpret the switchs and store values in the opt_t
|
172
|
+
* structure.
|
173
|
+
*/
|
174
|
+
void opt_parse(int argc, char *argv[argc], opt_t *opt) {
|
175
|
+
static const char *err_badval = "invalid value for switch '%s'";
|
176
|
+
const char *pname = argv[0];
|
177
|
+
argc--, argv++;
|
178
|
+
if (argc == 0) {
|
179
|
+
opt_help(pname);
|
180
|
+
fatal("no mode specified");
|
181
|
+
}
|
182
|
+
// First special handling for help and version
|
183
|
+
if (!strcmp(argv[0], "-h") || !strcmp(argv[0], "--help")) {
|
184
|
+
opt_help(pname);
|
185
|
+
exit(EXIT_FAILURE);
|
186
|
+
} else if (!strcmp(argv[0], "--version")) {
|
187
|
+
fprintf(stderr, "Wapiti v" VERSION "\n");
|
188
|
+
fprintf(stderr, " Optimization mode: %s\n", xvm_mode());
|
189
|
+
exit(EXIT_SUCCESS);
|
190
|
+
}
|
191
|
+
// Get the mode to use
|
192
|
+
if (!strcmp(argv[0], "t") || !strcmp(argv[0], "train")) {
|
193
|
+
opt->mode = 0;
|
194
|
+
} else if (!strcmp(argv[0], "l") || !strcmp(argv[0], "label")) {
|
195
|
+
opt->mode = 1;
|
196
|
+
} else if (!strcmp(argv[0], "d") || !strcmp(argv[0], "dump")) {
|
197
|
+
opt->mode = 2;
|
198
|
+
} else {
|
199
|
+
fatal("unknown mode <%s>", argv[0]);
|
200
|
+
}
|
201
|
+
argc--, argv++;
|
202
|
+
// Parse remaining arguments
|
203
|
+
opt->input = NULL;
|
204
|
+
opt->output = NULL;
|
205
|
+
while (argc > 0) {
|
206
|
+
const char *arg = argv[0];
|
207
|
+
int idx;
|
208
|
+
// Check if this argument is a filename or an option
|
209
|
+
if (arg[0] != '-') {
|
210
|
+
if (opt->input == NULL)
|
211
|
+
opt->input = argv[0];
|
212
|
+
else if (opt->output == NULL)
|
213
|
+
opt->output = argv[0];
|
214
|
+
else
|
215
|
+
fatal("too much input files on command line");
|
216
|
+
argc--, argv++;
|
217
|
+
continue;
|
218
|
+
}
|
219
|
+
// Search the current switch in the table or fail if it cannot
|
220
|
+
// be found.
|
221
|
+
for (idx = 0; opt_switch[idx].mode != -1; idx++) {
|
222
|
+
if (opt_switch[idx].mode != opt->mode)
|
223
|
+
continue;
|
224
|
+
if (!strcmp(arg, opt_switch[idx].dshort))
|
225
|
+
break;
|
226
|
+
if (!strcmp(arg, opt_switch[idx].dlong))
|
227
|
+
break;
|
228
|
+
}
|
229
|
+
if (opt_switch[idx].mode == -1)
|
230
|
+
fatal("unknown option '%s'", arg);
|
231
|
+
// Decode the argument and store it in the structure
|
232
|
+
if (opt_switch[idx].kind != 'B' && argc < 2)
|
233
|
+
fatal("missing argument for switch '%s'", arg);
|
234
|
+
void *ptr = (void *)((char *)opt + opt_switch[idx].offset);
|
235
|
+
switch (opt_switch[idx].kind) {
|
236
|
+
case 'S':
|
237
|
+
*((char **)ptr) = argv[1];
|
238
|
+
argc -= 2, argv += 2;
|
239
|
+
break;
|
240
|
+
case 'I':
|
241
|
+
if (sscanf(argv[1], "%d", (int *)ptr) != 1)
|
242
|
+
fatal(err_badval, arg);
|
243
|
+
argc -= 2, argv += 2;
|
244
|
+
break;
|
245
|
+
case 'F': {
|
246
|
+
double tmp;
|
247
|
+
if (sscanf(argv[1], "%lf", &tmp) != 1)
|
248
|
+
fatal(err_badval, arg);
|
249
|
+
*((double *)ptr) = tmp;
|
250
|
+
argc -= 2, argv += 2;
|
251
|
+
break; }
|
252
|
+
case 'B':
|
253
|
+
*((bool *)ptr) = true;
|
254
|
+
argc--, argv++;
|
255
|
+
break;
|
256
|
+
}
|
257
|
+
}
|
258
|
+
// Small trick for the maxiter switch
|
259
|
+
if (opt->maxiter == 0)
|
260
|
+
opt->maxiter = INT_MAX;
|
261
|
+
// Check that all options are valid
|
262
|
+
#define argchecksub(name, test) \
|
263
|
+
if (!(test)) \
|
264
|
+
fatal("invalid value for <"name">");
|
265
|
+
argchecksub("--thread", opt->nthread > 0 );
|
266
|
+
argchecksub("--jobsize", opt->jobsize > 0 );
|
267
|
+
argchecksub("--rho1", opt->rho1 >= 0.0);
|
268
|
+
argchecksub("--rho2", opt->rho2 >= 0.0);
|
269
|
+
argchecksub("--histsz", opt->lbfgs.histsz > 0 );
|
270
|
+
argchecksub("--maxls", opt->lbfgs.maxls > 0 );
|
271
|
+
argchecksub("--eta0", opt->sgdl1.eta0 > 0.0);
|
272
|
+
argchecksub("--alpha", opt->sgdl1.alpha > 0.0);
|
273
|
+
argchecksub("--nbest", opt->nbest > 0 );
|
274
|
+
#undef argchecksub
|
275
|
+
if (opt->maxent && !strcmp(opt->algo, "bcd"))
|
276
|
+
fatal("BCD not supported for training maxent models");
|
277
|
+
}
|
278
|
+
|
@@ -0,0 +1,91 @@
|
|
1
|
+
/*
|
2
|
+
* Wapiti - A linear-chain CRF tool
|
3
|
+
*
|
4
|
+
* Copyright (c) 2009-2011 CNRS
|
5
|
+
* All rights reserved.
|
6
|
+
*
|
7
|
+
* Redistribution and use in source and binary forms, with or without
|
8
|
+
* modification, are permitted provided that the following conditions are met:
|
9
|
+
* * Redistributions of source code must retain the above copyright
|
10
|
+
* notice, this list of conditions and the following disclaimer.
|
11
|
+
* * Redistributions in binary form must reproduce the above copyright
|
12
|
+
* notice, this list of conditions and the following disclaimer in the
|
13
|
+
* documentation and/or other materials provided with the distribution.
|
14
|
+
*
|
15
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
16
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
18
|
+
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
19
|
+
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
20
|
+
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
21
|
+
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
22
|
+
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
23
|
+
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
24
|
+
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
25
|
+
* POSSIBILITY OF SUCH DAMAGE.
|
26
|
+
*/
|
27
|
+
#ifndef options_h
|
28
|
+
#define options_h
|
29
|
+
|
30
|
+
#include <stdbool.h>
|
31
|
+
|
32
|
+
#include "wapiti.h"
|
33
|
+
|
34
|
+
/* opt_t:
 *   Holds every user configurable parameter of Wapiti. The structure is
 *   filled by opt_parse from the command line; opt_defaults provides the
 *   initial values.
 */
typedef struct opt_s opt_t;
struct opt_s {
	int     mode;       // Selected mode, see opt_parse
	char   *input;      // First file name of the command line
	char   *output;     // Second file name of the command line
	bool    maxent;     // --me
	// Options for training
	char   *algo;       // --algo
	char   *pattern;    // --pattern
	char   *model;      // --model
	char   *devel;      // --devel
	bool    compact;    // --compact
	bool    sparse;     // --sparse
	int     nthread;    // --nthread
	int     jobsize;    // --jobsize
	int     maxiter;    // --maxiter
	double  rho1;       // --rho1
	double  rho2;       // --rho2
	// Window size criterion
	int     objwin;     // --objwin
	int     stopwin;    // --stopwin
	double  stopeps;    // --stopeps
	// Options specific to L-BFGS
	struct {
		bool   clip;    // --clip
		int    histsz;  // --histsz
		int    maxls;   // --maxls
	} lbfgs;
	// Options specific to SGD-L1
	struct {
		double eta0;    // --eta0
		double alpha;   // --alpha
	} sgdl1;
	// Options specific to BCD
	struct {
		double kappa;   // --kappa
	} bcd;
	// Options specific to RPROP
	struct {
		double stpmin;  // --stpmin
		double stpmax;  // --stpmax
		double stpinc;  // --stpinc
		double stpdec;  // --stpdec
		bool   cutoff;  // --cutoff
	} rprop;
	// Options for labelling
	bool    label;      // --label
	bool    check;      // --check
	bool    outsc;      // --score
	bool    lblpost;    // --post
	int     nbest;      // --nbest
};
|
85
|
+
|
86
|
+
extern const opt_t opt_defaults;
|
87
|
+
|
88
|
+
void opt_parse(int argc, char *argv[argc], opt_t *opt);
|
89
|
+
|
90
|
+
#endif
|
91
|
+
|
@@ -0,0 +1,395 @@
|
|
1
|
+
/*
|
2
|
+
* Wapiti - A linear-chain CRF tool
|
3
|
+
*
|
4
|
+
* Copyright (c) 2009-2011 CNRS
|
5
|
+
* All rights reserved.
|
6
|
+
*
|
7
|
+
* Redistribution and use in source and binary forms, with or without
|
8
|
+
* modification, are permitted provided that the following conditions are met:
|
9
|
+
* * Redistributions of source code must retain the above copyright
|
10
|
+
* notice, this list of conditions and the following disclaimer.
|
11
|
+
* * Redistributions in binary form must reproduce the above copyright
|
12
|
+
* notice, this list of conditions and the following disclaimer in the
|
13
|
+
* documentation and/or other materials provided with the distribution.
|
14
|
+
*
|
15
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
16
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
18
|
+
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
19
|
+
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
20
|
+
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
21
|
+
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
22
|
+
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
23
|
+
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
24
|
+
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
25
|
+
* POSSIBILITY OF SUCH DAMAGE.
|
26
|
+
*/
|
27
|
+
|
28
|
+
#include <ctype.h>
|
29
|
+
#include <stdbool.h>
|
30
|
+
#include <stddef.h>
|
31
|
+
#include <stdio.h>
|
32
|
+
#include <stdlib.h>
|
33
|
+
#include <string.h>
|
34
|
+
|
35
|
+
#include "pattern.h"
|
36
|
+
#include "sequence.h"
|
37
|
+
#include "tools.h"
|
38
|
+
|
39
|
+
/******************************************************************************
|
40
|
+
* A simple regular expression matcher
|
41
|
+
*
|
42
|
+
* This module implements a simple regular expression matcher. It implements
|
43
|
+
* just a subset of the classical regexp simple to implement but sufficient
|
44
|
+
* for most usages and avoid to add a dependency to a full regexp library.
|
45
|
+
*
|
46
|
+
* The recognized subset is quite simple. First for matching characters :
|
47
|
+
* . -> match any characters
|
48
|
+
* \x -> match a character class (in uppercase, match the complement)
|
49
|
+
* \d : digit \a : alpha \w : alpha + digit
|
50
|
+
* \l : lowercase \u : uppercase \p : punctuation
|
51
|
+
* \s : space
|
52
|
+
* or escape a character
|
53
|
+
* x -> any other character match itself
|
54
|
+
* And the constructs :
|
55
|
+
* ^ -> at the beginning of the regexp, anchor it at start of string
|
56
|
+
* $ -> at the end of regexp, anchor it at end of string
|
57
|
+
* * -> match any number of repetition of the previous character
|
58
|
+
* ? -> optionally match the previous character
|
59
|
+
*
|
60
|
+
* This subset is implemented quite efficiently using recursion. All recursive
|
61
|
+
* calls are tail-call so they should be optimized by the compiler. As we do
|
62
|
+
* direct interpretation, we have to backtrack so performance can be very poor
|
63
|
+
* on specially designed regexp. This is not a problem as the regexp as well as
|
64
|
+
* the string is expected to be very simple here. If this is not the case, you
|
65
|
+
* better have to prepare your data better.
|
66
|
+
******************************************************************************/
|
67
|
+
|
68
|
+
/* rex_matchit:
 *   Match a single character at the start of the string. <ch> points to the
 *   regexp element to test, which is either a dot (any character), a
 *   backslash followed by a class letter or by a character to take
 *   literally, or a plain character matching itself. Returns true if str[0]
 *   is matched; the end of string never matches.
 *
 *   Characters are converted to unsigned char before being handed to the
 *   <ctype.h> classifiers, as passing a negative char value to them is
 *   undefined behavior.
 */
static bool rex_matchit(const char *ch, const char *str) {
	if (str[0] == '\0')
		return false;
	if (ch[0] == '.')
		return true;
	if (ch[0] == '\\') {
		const unsigned char c = (unsigned char)str[0];
		// Lowercase letter select a class, uppercase its complement
		switch (ch[1]) {
			case 'a': return isalpha(c) != 0;
			case 'd': return isdigit(c) != 0;
			case 'l': return islower(c) != 0;
			case 'p': return ispunct(c) != 0;
			case 's': return isspace(c) != 0;
			case 'u': return isupper(c) != 0;
			case 'w': return isalnum(c) != 0;
			case 'A': return isalpha(c) == 0;
			case 'D': return isdigit(c) == 0;
			case 'L': return islower(c) == 0;
			case 'P': return ispunct(c) == 0;
			case 'S': return isspace(c) == 0;
			case 'U': return isupper(c) == 0;
			case 'W': return isalnum(c) == 0;
			default:  break;
		}
		// Not a class, so this is an escaped literal character
		return ch[1] == str[0];
	}
	return ch[0] == str[0];
}
|
98
|
+
|
99
|
+
/* rex_matchme:
 *   Match a regular expression at the start of the string. Returns true if
 *   the whole of <re> can be matched starting at str[0]. <len> is incremented
 *   by the number of characters consumed, so the caller must set it to zero
 *   beforehand; its value is only meaningful when true is returned.
 *
 *   Matching is recursive: the first element of the regexp is handled here
 *   and the remainder is given to a recursive call. Star and optional
 *   constructs backtrack by trying the remainder at each candidate position.
 *
 *   A '*' or '?' found where an element is expected is reported through
 *   fatal(), which is assumed not to return.
 */
static bool rex_matchme(const char *re, const char *str, int *len) {
	// Special check for end of regexp
	if (re[0] == '\0')
		return true;
	if (re[0] == '$' && re[1] == '\0')
		return (str[0] == '\0');
	// Get the first element of the regexp and locate what follows it. An
	// escape spans two chars, but only if there is a char after the
	// backslash: on a trailing backslash we must not step over the
	// terminator, else the next reads would be out of bounds.
	const char *ch  = re;
	const char *nxt = re + 1;
	if (ch[0] == '\\' && ch[1] != '\0')
		nxt++;
	// Special check for the construct "x**" where the first star is
	// consumed normally but would lead the second (which is wrong) to be
	// interpreted as a char to match as if it was escaped (and same for
	// the optional construct).
	if (*ch == '*' || *ch == '?')
		fatal("unescaped * or ? in regexp: %s", re);
	// Handle star repetition: try the remainder after zero, one, two...
	// repetitions of the element until the element stops matching.
	if (nxt[0] == '*') {
		nxt++;
		do {
			const int save = *len;
			if (rex_matchme(nxt, str, len))
				return true;
			*len = save + 1;
		} while (rex_matchit(ch, str++));
		return false;
	}
	// Handle optional: first try with the element, then without it
	if (nxt[0] == '?') {
		nxt++;
		if (rex_matchit(ch, str)) {
			(*len)++;
			if (rex_matchme(nxt, str + 1, len))
				return true;
			(*len)--;
		}
		return rex_matchme(nxt, str, len);
	}
	// Classical char matching
	(*len)++;
	if (rex_matchit(ch, str))
		return rex_matchme(nxt, str + 1, len);
	return false;
}
|
147
|
+
|
148
|
+
/* rex_match:
 *   Search for a regular expression in the given string. On success, the
 *   position of the start of the match is returned and its length is stored
 *   in <len>. If nothing matches, -1 is returned.
 *
 *   A regexp starting with '^' is only tried at position zero; otherwise all
 *   positions are tried in turn, the end of string included.
 */
static int rex_match(const char *re, const char *str, int *len) {
	// Anchored regexp: a single attempt at the start of the string
	if (re[0] == '^') {
		*len = 0;
		return rex_matchme(re + 1, str, len) ? 0 : -1;
	}
	// General case: slide the start position along the string
	for (int start = 0; ; start++) {
		*len = 0;
		if (rex_matchme(re, str + start, len))
			return start;
		if (str[start] == '\0')
			break;
	}
	// Matching failed
	return -1;
}
|
171
|
+
|
172
|
+
/*******************************************************************************
|
173
|
+
* Pattern handling
|
174
|
+
*
|
175
|
+
* Patterns are the heart of the data input process, they provide a way to tell
|
176
|
+
* Wapiti how the interesting information can be extracted from the input
|
177
|
+
* data. A pattern is simply a string who embed special commands about tokens
|
178
|
+
* to extract from the input sequence. They are compiled to a special form
|
179
|
+
* used during data loading.
|
180
|
+
* For training, each position of a sequence hold a list of observation made
|
181
|
+
* at this position, pattern give a way to specify these observations.
|
182
|
+
*
|
183
|
+
* During sequence loading, all patterns are applied at each position to
|
184
|
+
* produce a list of string representing the observations which will be in
|
185
|
+
* turn transformed to numerical identifiers. This module take care of
|
186
|
+
* building the string representation.
|
187
|
+
*
|
188
|
+
* As said, a pattern is a string with specific commands in the forms %c[...]
|
189
|
+
* where 'c' is the command with arguments between the bracket. All commands
|
190
|
+
* take at least two numerical arguments which define a token in the input
|
191
|
+
* sequence. The first one is an offset from the current position and the
|
192
|
+
* second one is a column number. With these two parameters, we get a string
|
193
|
+
* in the input sequence on which we apply the command.
|
194
|
+
*
|
195
|
+
* All command are specified with a character and result in a string which
|
196
|
+
* will replace the command in the pattern string. If the command character is
|
197
|
+
* lower case, the result is copied verbatim, if it is uppercase, the result
|
198
|
+
* is copied with casing removed. The following commands are available:
|
199
|
+
* 'x' -- result is the token itself
|
200
|
+
* 't' -- test if a regular expression match the token. Result will be
|
201
|
+
* either "true" or "false"
|
202
|
+
* 'm' -- match a regular expression on the token. Result is the first
|
203
|
+
* substring matched.
|
204
|
+
******************************************************************************/
|
205
|
+
|
206
|
+
/* pat_comp:
|
207
|
+
* Compile the pattern to a form more suitable to easily apply it on tokens
|
208
|
+
* list during data reading. The given pattern string is interned in the
|
209
|
+
* compiled pattern and will be freed with it, so you don't have to take care
|
210
|
+
* of it and must not modify it after the compilation.
|
211
|
+
*/
|
212
|
+
pat_t *pat_comp(char *p) {
|
213
|
+
pat_t *pat = NULL;
|
214
|
+
// Allocate memory for the compiled pattern, the allocation is based
|
215
|
+
// on an over-estimation of the number of required item. As compiled
|
216
|
+
// pattern take a neglectible amount of memory, this waste is not
|
217
|
+
// important.
|
218
|
+
int mitems = 0;
|
219
|
+
for (int pos = 0; p[pos] != '\0'; pos++)
|
220
|
+
if (p[pos] == '%')
|
221
|
+
mitems++;
|
222
|
+
mitems = mitems * 2 + 1;
|
223
|
+
pat = wapiti_xmalloc(sizeof(pat_t) + sizeof(pat->items[0]) * mitems);
|
224
|
+
pat->src = p;
|
225
|
+
// Next, we go through the pattern compiling the items as they are
|
226
|
+
// found. Commands are parsed and put in a corresponding item, and
|
227
|
+
// segment of char not in a command are put in a 's' item.
|
228
|
+
int nitems = 0;
|
229
|
+
int ntoks = 0;
|
230
|
+
int pos = 0;
|
231
|
+
while (p[pos] != '\0') {
|
232
|
+
pat_item_t *item = &(pat->items[nitems++]);
|
233
|
+
item->value = NULL;
|
234
|
+
if (p[pos] == '%') {
|
235
|
+
// This is a command, so first parse its type and check
|
236
|
+
// its a valid one. Next prepare the item.
|
237
|
+
const char type = tolower(p[pos + 1]);
|
238
|
+
if (type != 'x' && type != 't' && type != 'm')
|
239
|
+
fatal("unknown command type: '%c'", type);
|
240
|
+
item->type = type;
|
241
|
+
item->caps = (p[pos + 1] != type);
|
242
|
+
pos += 2;
|
243
|
+
// Next we parse the offset and column and store them in
|
244
|
+
// the item.
|
245
|
+
const char *at = p + pos;
|
246
|
+
int off, col, nch;
|
247
|
+
item->absolute = false;
|
248
|
+
if (sscanf(at, "[@%d,%d%n", &off, &col, &nch) == 2)
|
249
|
+
item->absolute = true;
|
250
|
+
else if (sscanf(at, "[%d,%d%n", &off, &col, &nch) != 2)
|
251
|
+
fatal("invalid pattern: %s", p);
|
252
|
+
if (col < 0)
|
253
|
+
fatal("invalid column number: %d", col);
|
254
|
+
item->offset = off;
|
255
|
+
item->column = col;
|
256
|
+
ntoks = max(ntoks, col);
|
257
|
+
pos += nch;
|
258
|
+
// And parse the end of the argument list, for 'x' there
|
259
|
+
// is nothing to read but for 't' and 'm' we have to get
|
260
|
+
// read the regexp.
|
261
|
+
if (type == 't' || type == 'm') {
|
262
|
+
if (p[pos] != ',' && p[pos + 1] != '"')
|
263
|
+
fatal("missing arg in pattern: %s", p);
|
264
|
+
const int start = (pos += 2);
|
265
|
+
while (p[pos] != '\0') {
|
266
|
+
if (p[pos] == '"')
|
267
|
+
break;
|
268
|
+
if (p[pos] == '\\' && p[pos+1] != '\0')
|
269
|
+
pos++;
|
270
|
+
pos++;
|
271
|
+
}
|
272
|
+
if (p[pos] != '"')
|
273
|
+
fatal("unended argument: %s", p);
|
274
|
+
const int len = pos - start;
|
275
|
+
item->value = wapiti_xmalloc(sizeof(char) * (len + 1));
|
276
|
+
memcpy(item->value, p + start, len);
|
277
|
+
item->value[len] = '\0';
|
278
|
+
pos++;
|
279
|
+
}
|
280
|
+
// Just check the end of the arg list and loop.
|
281
|
+
if (p[pos] != ']')
|
282
|
+
fatal("missing end of pattern: %s", p);
|
283
|
+
pos++;
|
284
|
+
} else {
|
285
|
+
// No command here, so build an 's' item with the chars
|
286
|
+
// until end of pattern or next command and put it in
|
287
|
+
// the list.
|
288
|
+
const int start = pos;
|
289
|
+
while (p[pos] != '\0' && p[pos] != '%')
|
290
|
+
pos++;
|
291
|
+
const int len = pos - start;
|
292
|
+
item->type = 's';
|
293
|
+
item->caps = false;
|
294
|
+
item->value = wapiti_xmalloc(sizeof(char) * (len + 1));
|
295
|
+
memcpy(item->value, p + start, len);
|
296
|
+
item->value[len] = '\0';
|
297
|
+
}
|
298
|
+
}
|
299
|
+
pat->ntoks = ntoks;
|
300
|
+
pat->nitems = nitems;
|
301
|
+
return pat;
|
302
|
+
}
|
303
|
+
|
304
|
+
/* pat_exec:
|
305
|
+
* Execute a compiled pattern at position 'at' in the given tokens sequences
|
306
|
+
* in order to produce an observation string. The string is returned as a
|
307
|
+
* newly allocated memory block and the caller is responsible to free it when
|
308
|
+
* not needed anymore.
|
309
|
+
*/
|
310
|
+
char *pat_exec(const pat_t *pat, const tok_t *tok, int at) {
|
311
|
+
static char *bval[] = {"_x-1", "_x-2", "_x-3", "_x-4", "_x-#"};
|
312
|
+
static char *eval[] = {"_x+1", "_x+2", "_x+3", "_x+4", "_x+#"};
|
313
|
+
const int T = tok->len;
|
314
|
+
// Prepare the buffer who will hold the result
|
315
|
+
int size = 16, pos = 0;
|
316
|
+
char *buffer = wapiti_xmalloc(sizeof(char) * size);
|
317
|
+
// And loop over the compiled items
|
318
|
+
for (int it = 0; it < pat->nitems; it++) {
|
319
|
+
const pat_item_t *item = &(pat->items[it]);
|
320
|
+
char *value = NULL;
|
321
|
+
int len = 0;
|
322
|
+
// First, if needed, we retrieve the token at the referenced
|
323
|
+
// position in the sequence. We store it in value and let the
|
324
|
+
// command handler do what it need with it.
|
325
|
+
if (item->type != 's') {
|
326
|
+
int pos = item->offset;
|
327
|
+
if (item->absolute) {
|
328
|
+
if (item->offset < 0)
|
329
|
+
pos += T;
|
330
|
+
else
|
331
|
+
pos--;
|
332
|
+
} else {
|
333
|
+
pos += at;
|
334
|
+
}
|
335
|
+
int col = item->column;
|
336
|
+
if (pos < 0)
|
337
|
+
value = bval[min(-pos - 1, 4)];
|
338
|
+
else if (pos >= T)
|
339
|
+
value = eval[min( pos - T, 4)];
|
340
|
+
else if (col >= tok->cnts[pos])
|
341
|
+
fatal("missing tokens, cannot apply pattern");
|
342
|
+
else
|
343
|
+
value = tok->toks[pos][col];
|
344
|
+
}
|
345
|
+
// Next, we handle the command, 's' and 'x' are very simple but
|
346
|
+
// 't' and 'm' require us to call the regexp matcher.
|
347
|
+
if (item->type == 's') {
|
348
|
+
value = item->value;
|
349
|
+
len = strlen(value);
|
350
|
+
} else if (item->type == 'x') {
|
351
|
+
len = strlen(value);
|
352
|
+
} else if (item->type == 't') {
|
353
|
+
if (rex_match(item->value, value, &len) == -1)
|
354
|
+
value = "false";
|
355
|
+
else
|
356
|
+
value = "true";
|
357
|
+
len = strlen(value);
|
358
|
+
} else if (item->type == 'm') {
|
359
|
+
int pos = rex_match(item->value, value, &len);
|
360
|
+
if (pos == -1)
|
361
|
+
len = 0;
|
362
|
+
value += pos;
|
363
|
+
}
|
364
|
+
// And we add it to the buffer, growing it if needed. If the
|
365
|
+
// user requested it, we also remove caps from the string.
|
366
|
+
if (pos + len >= size - 1) {
|
367
|
+
while (pos + len >= size - 1)
|
368
|
+
size = size * 1.4;
|
369
|
+
buffer = wapiti_xrealloc(buffer, sizeof(char) * size);
|
370
|
+
}
|
371
|
+
memcpy(buffer + pos, value, len);
|
372
|
+
if (item->caps)
|
373
|
+
for (int i = pos; i < pos + len; i++)
|
374
|
+
buffer[i] = tolower(buffer[i]);
|
375
|
+
pos += len;
|
376
|
+
}
|
377
|
+
// Adjust the result and return it.
|
378
|
+
buffer[pos++] = '\0';
|
379
|
+
buffer = wapiti_xrealloc(buffer, sizeof(char) * pos);
|
380
|
+
return buffer;
|
381
|
+
}
|
382
|
+
|
383
|
+
/* pat_free:
|
384
|
+
* Free all memory used by a compiled pattern object. Note that this will free
|
385
|
+
* the pointer to the source string given to pat_comp so you must be sure to
|
386
|
+
* not use this pointer again.
|
387
|
+
*/
|
388
|
+
void pat_free(pat_t *pat) {
|
389
|
+
for (int it = 0; it < pat->nitems; it++)
|
390
|
+
free(pat->items[it].value);
|
391
|
+
free(pat->src);
|
392
|
+
free(pat);
|
393
|
+
}
|
394
|
+
|
395
|
+
|