wapiti 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.autotest +13 -0
- data/.gitignore +5 -0
- data/.rspec +3 -0
- data/Gemfile +6 -0
- data/LICENSE +30 -0
- data/README.md +153 -0
- data/Rakefile +33 -0
- data/ext/wapiti/bcd.c +392 -0
- data/ext/wapiti/decoder.c +535 -0
- data/ext/wapiti/decoder.h +46 -0
- data/ext/wapiti/extconf.rb +8 -0
- data/ext/wapiti/gradient.c +818 -0
- data/ext/wapiti/gradient.h +81 -0
- data/ext/wapiti/lbfgs.c +294 -0
- data/ext/wapiti/model.c +296 -0
- data/ext/wapiti/model.h +100 -0
- data/ext/wapiti/native.c +1238 -0
- data/ext/wapiti/native.h +15 -0
- data/ext/wapiti/options.c +278 -0
- data/ext/wapiti/options.h +91 -0
- data/ext/wapiti/pattern.c +395 -0
- data/ext/wapiti/pattern.h +56 -0
- data/ext/wapiti/progress.c +167 -0
- data/ext/wapiti/progress.h +43 -0
- data/ext/wapiti/quark.c +272 -0
- data/ext/wapiti/quark.h +46 -0
- data/ext/wapiti/reader.c +553 -0
- data/ext/wapiti/reader.h +73 -0
- data/ext/wapiti/rprop.c +191 -0
- data/ext/wapiti/sequence.h +148 -0
- data/ext/wapiti/sgdl1.c +218 -0
- data/ext/wapiti/thread.c +171 -0
- data/ext/wapiti/thread.h +42 -0
- data/ext/wapiti/tools.c +202 -0
- data/ext/wapiti/tools.h +54 -0
- data/ext/wapiti/trainers.h +39 -0
- data/ext/wapiti/vmath.c +372 -0
- data/ext/wapiti/vmath.h +51 -0
- data/ext/wapiti/wapiti.c +288 -0
- data/ext/wapiti/wapiti.h +45 -0
- data/lib/wapiti.rb +30 -0
- data/lib/wapiti/errors.rb +17 -0
- data/lib/wapiti/model.rb +49 -0
- data/lib/wapiti/options.rb +113 -0
- data/lib/wapiti/utility.rb +15 -0
- data/lib/wapiti/version.rb +3 -0
- data/spec/fixtures/ch.mod +18550 -0
- data/spec/fixtures/chpattern.txt +52 -0
- data/spec/fixtures/chtest.txt +1973 -0
- data/spec/fixtures/chtrain.txt +19995 -0
- data/spec/fixtures/nppattern.txt +52 -0
- data/spec/fixtures/nptest.txt +1973 -0
- data/spec/fixtures/nptrain.txt +19995 -0
- data/spec/fixtures/pattern.txt +14 -0
- data/spec/fixtures/test.txt +60000 -0
- data/spec/fixtures/train.txt +1200 -0
- data/spec/spec_helper.rb +21 -0
- data/spec/wapiti/model_spec.rb +173 -0
- data/spec/wapiti/native_spec.rb +12 -0
- data/spec/wapiti/options_spec.rb +175 -0
- data/spec/wapiti/utility_spec.rb +22 -0
- data/wapiti.gemspec +35 -0
- metadata +178 -0
data/ext/wapiti/native.h
ADDED
@@ -0,0 +1,278 @@
|
|
1
|
+
/*
|
2
|
+
* Wapiti - A linear-chain CRF tool
|
3
|
+
*
|
4
|
+
* Copyright (c) 2009-2011 CNRS
|
5
|
+
* All rights reserved.
|
6
|
+
*
|
7
|
+
* Redistribution and use in source and binary forms, with or without
|
8
|
+
* modification, are permitted provided that the following conditions are met:
|
9
|
+
* * Redistributions of source code must retain the above copyright
|
10
|
+
* notice, this list of conditions and the following disclaimer.
|
11
|
+
* * Redistributions in binary form must reproduce the above copyright
|
12
|
+
* notice, this list of conditions and the following disclaimer in the
|
13
|
+
* documentation and/or other materials provided with the distribution.
|
14
|
+
*
|
15
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
16
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
18
|
+
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
19
|
+
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
20
|
+
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
21
|
+
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
22
|
+
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
23
|
+
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
24
|
+
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
25
|
+
* POSSIBILITY OF SUCH DAMAGE.
|
26
|
+
*/
|
27
|
+
|
28
|
+
#include <limits.h>
|
29
|
+
#include <stdbool.h>
|
30
|
+
#include <stddef.h>
|
31
|
+
#include <stdlib.h>
|
32
|
+
#include <stdio.h>
|
33
|
+
#include <string.h>
|
34
|
+
|
35
|
+
#include "wapiti.h"
|
36
|
+
#include "tools.h"
|
37
|
+
#include "options.h"
|
38
|
+
#include "vmath.h"
|
39
|
+
|
40
|
+
/******************************************************************************
|
41
|
+
* Command line parsing
|
42
|
+
*
|
43
|
+
* This module handles command line parsing and puts all things defined by the
|
44
|
+
* user in a special structure in order to make them accessible to the
|
45
|
+
* rest of the program.
|
46
|
+
******************************************************************************/
|
47
|
+
|
48
|
+
/* opt_help:
 *   Print the usage message on stderr. The text is kept in one string per
 *   section (global switches, training, labelling, dumping); sections that
 *   show an invocation line reference the program name through the
 *   positional conversion %1$s, so 'pname' is the only argument needed.
 */
static void opt_help(const char *pname) {
	// Switches valid before any mode is given; no conversion in this part
	// so it can be written verbatim.
	static const char usage_global[] =
		"Global switchs:\n"
		"\t-h | --help display this help message\n"
		"\t | --version display version information\n"
		"\n";
	static const char usage_train[] =
		"Training mode:\n"
		" %1$s train [options] [input data] [model file]\n"
		"\t | --me force maxent mode\n"
		"\t-a | --algo STRING training algorithm to use\n"
		"\t-p | --pattern FILE patterns for extracting features\n"
		"\t-m | --model FILE model file to preload\n"
		"\t-d | --devel FILE development dataset\n"
		"\t-c | --compact compact model after training\n"
		"\t-t | --nthread INT number of worker threads\n"
		"\t-j | --jobsize INT job size for worker threads\n"
		"\t-s | --sparse enable sparse forward/backward\n"
		"\t-i | --maxiter INT maximum number of iterations\n"
		"\t-1 | --rho1 FLOAT l1 penalty parameter\n"
		"\t-2 | --rho2 FLOAT l2 penalty parameter\n"
		"\t-o | --objwin INT convergence window size\n"
		"\t-w | --stopwin INT stop window size\n"
		"\t-e | --stopeps FLOAT stop epsilon value\n"
		"\t | --clip (l-bfgs) clip gradient\n"
		"\t | --histsz INT (l-bfgs) history size\n"
		"\t | --maxls INT (l-bfgs) max linesearch iters\n"
		"\t | --eta0 FLOAT (sgd-l1) learning rate\n"
		"\t | --alpha FLOAT (sgd-l1) exp decay parameter\n"
		"\t | --kappa FLOAT (bcd) stability parameter\n"
		"\t | --stpmin FLOAT (rprop) minimum step size\n"
		"\t | --stpmax FLOAT (rprop) maximum step size\n"
		"\t | --stpinc FLOAT (rprop) step increment factor\n"
		"\t | --stpdec FLOAT (rprop) step decrement factor\n"
		"\t | --cutoff (rprop) alternate projection\n"
		"\n";
	static const char usage_label[] =
		"Labelling mode:\n"
		" %1$s label [options] [input data] [output data]\n"
		"\t | --me force maxent mode\n"
		"\t-m | --model FILE model file to load\n"
		"\t-l | --label output only labels\n"
		"\t-c | --check input is already labeled\n"
		"\t-s | --score add scores to output\n"
		"\t-p | --post label using posteriors\n"
		"\t-n | --nbest INT output n-best list\n"
		"\n";
	static const char usage_dump[] =
		"Dumping mode\n"
		" %1$s dump [input model] [output text]\n";
	fputs(usage_global, stderr);
	fprintf(stderr, usage_train, pname);
	fprintf(stderr, usage_label, pname);
	fprintf(stderr, usage_dump,  pname);
}
|
100
|
+
|
101
|
+
/* opt_defaults:
|
102
|
+
* Default values for all parameters of the model.
|
103
|
+
*/
|
104
|
+
const opt_t opt_defaults = {
|
105
|
+
.mode = -1,
|
106
|
+
.input = NULL, .output = NULL,
|
107
|
+
.maxent = false,
|
108
|
+
.algo = "l-bfgs", .pattern = NULL, .model = NULL, .devel = NULL,
|
109
|
+
.compact = false, .sparse = false,
|
110
|
+
.nthread = 1, .jobsize = 64, .maxiter = 0,
|
111
|
+
.rho1 = 0.5, .rho2 = 0.0001,
|
112
|
+
.objwin = 5, .stopwin = 5, .stopeps = 0.02,
|
113
|
+
.lbfgs = {.clip = false, .histsz = 5, .maxls = 40},
|
114
|
+
.sgdl1 = {.eta0 = 0.8, .alpha = 0.85},
|
115
|
+
.bcd = {.kappa = 1.5},
|
116
|
+
.rprop = {.stpmin = 1e-8, .stpmax = 50.0, .stpinc = 1.2, .stpdec = 0.5,
|
117
|
+
.cutoff = false},
|
118
|
+
.label = false, .check = false, .outsc = false,
|
119
|
+
.lblpost = false, .nbest = 1
|
120
|
+
};
|
121
|
+
|
122
|
+
/* opt_switch:
|
123
|
+
* Define available switchs for the different modes in a readable way for the
|
124
|
+
* command line argument parser.
|
125
|
+
*/
|
126
|
+
struct {
|
127
|
+
int mode;
|
128
|
+
char *dshort;
|
129
|
+
char *dlong;
|
130
|
+
char kind;
|
131
|
+
size_t offset;
|
132
|
+
} opt_switch[] = {
|
133
|
+
{0, "##", "--me", 'B', offsetof(opt_t, maxent )},
|
134
|
+
{0, "-a", "--algo", 'S', offsetof(opt_t, algo )},
|
135
|
+
{0, "-p", "--pattern", 'S', offsetof(opt_t, pattern )},
|
136
|
+
{0, "-m", "--model", 'S', offsetof(opt_t, model )},
|
137
|
+
{0, "-d", "--devel", 'S', offsetof(opt_t, devel )},
|
138
|
+
{0, "-c", "--compact", 'B', offsetof(opt_t, compact )},
|
139
|
+
{0, "-s", "--sparse", 'B', offsetof(opt_t, sparse )},
|
140
|
+
{0, "-t", "--nthread", 'I', offsetof(opt_t, nthread )},
|
141
|
+
{0, "-j", "--josize", 'I', offsetof(opt_t, jobsize )},
|
142
|
+
{0, "-i", "--maxiter", 'I', offsetof(opt_t, maxiter )},
|
143
|
+
{0, "-1", "--rho1", 'F', offsetof(opt_t, rho1 )},
|
144
|
+
{0, "-2", "--rho2", 'F', offsetof(opt_t, rho2 )},
|
145
|
+
{0, "-o", "--objsz", 'I', offsetof(opt_t, objwin )},
|
146
|
+
{0, "-w", "--stopwin", 'I', offsetof(opt_t, stopwin )},
|
147
|
+
{0, "-e", "--stopeps", 'F', offsetof(opt_t, stopeps )},
|
148
|
+
{0, "##", "--clip", 'B', offsetof(opt_t, lbfgs.clip )},
|
149
|
+
{0, "##", "--histsz", 'I', offsetof(opt_t, lbfgs.histsz)},
|
150
|
+
{0, "##", "--maxls", 'I', offsetof(opt_t, lbfgs.maxls )},
|
151
|
+
{0, "##", "--eta0", 'F', offsetof(opt_t, sgdl1.eta0 )},
|
152
|
+
{0," ##", "--alpha", 'F', offsetof(opt_t, sgdl1.alpha )},
|
153
|
+
{0, "##", "--kappa", 'F', offsetof(opt_t, bcd.kappa )},
|
154
|
+
{0, "##", "--stpmin", 'F', offsetof(opt_t, rprop.stpmin)},
|
155
|
+
{0, "##", "--stpmax", 'F', offsetof(opt_t, rprop.stpmax)},
|
156
|
+
{0, "##", "--stpinc", 'F', offsetof(opt_t, rprop.stpinc)},
|
157
|
+
{0, "##", "--stpdec", 'F', offsetof(opt_t, rprop.stpdec)},
|
158
|
+
{0, "##", "--cutoff", 'B', offsetof(opt_t, rprop.cutoff)},
|
159
|
+
{1, "##", "--me", 'B', offsetof(opt_t, maxent )},
|
160
|
+
{1, "-m", "--model", 'S', offsetof(opt_t, model )},
|
161
|
+
{1, "-l", "--label", 'B', offsetof(opt_t, label )},
|
162
|
+
{1, "-c", "--check", 'B', offsetof(opt_t, check )},
|
163
|
+
{1, "-s", "--score", 'B', offsetof(opt_t, outsc )},
|
164
|
+
{1, "-p", "--post", 'B', offsetof(opt_t, lblpost )},
|
165
|
+
{1, "-n", "--nbest", 'I', offsetof(opt_t, nbest )},
|
166
|
+
{-1, NULL, NULL, '\0', 0}
|
167
|
+
};
|
168
|
+
|
169
|
+
/* argparse:
|
170
|
+
* This is the main function for command line parsing. It use the previous
|
171
|
+
* table to known how to interpret the switchs and store values in the opt_t
|
172
|
+
* structure.
|
173
|
+
*/
|
174
|
+
void opt_parse(int argc, char *argv[argc], opt_t *opt) {
|
175
|
+
static const char *err_badval = "invalid value for switch '%s'";
|
176
|
+
const char *pname = argv[0];
|
177
|
+
argc--, argv++;
|
178
|
+
if (argc == 0) {
|
179
|
+
opt_help(pname);
|
180
|
+
fatal("no mode specified");
|
181
|
+
}
|
182
|
+
// First special handling for help and version
|
183
|
+
if (!strcmp(argv[0], "-h") || !strcmp(argv[0], "--help")) {
|
184
|
+
opt_help(pname);
|
185
|
+
exit(EXIT_FAILURE);
|
186
|
+
} else if (!strcmp(argv[0], "--version")) {
|
187
|
+
fprintf(stderr, "Wapiti v" VERSION "\n");
|
188
|
+
fprintf(stderr, " Optimization mode: %s\n", xvm_mode());
|
189
|
+
exit(EXIT_SUCCESS);
|
190
|
+
}
|
191
|
+
// Get the mode to use
|
192
|
+
if (!strcmp(argv[0], "t") || !strcmp(argv[0], "train")) {
|
193
|
+
opt->mode = 0;
|
194
|
+
} else if (!strcmp(argv[0], "l") || !strcmp(argv[0], "label")) {
|
195
|
+
opt->mode = 1;
|
196
|
+
} else if (!strcmp(argv[0], "d") || !strcmp(argv[0], "dump")) {
|
197
|
+
opt->mode = 2;
|
198
|
+
} else {
|
199
|
+
fatal("unknown mode <%s>", argv[0]);
|
200
|
+
}
|
201
|
+
argc--, argv++;
|
202
|
+
// Parse remaining arguments
|
203
|
+
opt->input = NULL;
|
204
|
+
opt->output = NULL;
|
205
|
+
while (argc > 0) {
|
206
|
+
const char *arg = argv[0];
|
207
|
+
int idx;
|
208
|
+
// Check if this argument is a filename or an option
|
209
|
+
if (arg[0] != '-') {
|
210
|
+
if (opt->input == NULL)
|
211
|
+
opt->input = argv[0];
|
212
|
+
else if (opt->output == NULL)
|
213
|
+
opt->output = argv[0];
|
214
|
+
else
|
215
|
+
fatal("too much input files on command line");
|
216
|
+
argc--, argv++;
|
217
|
+
continue;
|
218
|
+
}
|
219
|
+
// Search the current switch in the table or fail if it cannot
|
220
|
+
// be found.
|
221
|
+
for (idx = 0; opt_switch[idx].mode != -1; idx++) {
|
222
|
+
if (opt_switch[idx].mode != opt->mode)
|
223
|
+
continue;
|
224
|
+
if (!strcmp(arg, opt_switch[idx].dshort))
|
225
|
+
break;
|
226
|
+
if (!strcmp(arg, opt_switch[idx].dlong))
|
227
|
+
break;
|
228
|
+
}
|
229
|
+
if (opt_switch[idx].mode == -1)
|
230
|
+
fatal("unknown option '%s'", arg);
|
231
|
+
// Decode the argument and store it in the structure
|
232
|
+
if (opt_switch[idx].kind != 'B' && argc < 2)
|
233
|
+
fatal("missing argument for switch '%s'", arg);
|
234
|
+
void *ptr = (void *)((char *)opt + opt_switch[idx].offset);
|
235
|
+
switch (opt_switch[idx].kind) {
|
236
|
+
case 'S':
|
237
|
+
*((char **)ptr) = argv[1];
|
238
|
+
argc -= 2, argv += 2;
|
239
|
+
break;
|
240
|
+
case 'I':
|
241
|
+
if (sscanf(argv[1], "%d", (int *)ptr) != 1)
|
242
|
+
fatal(err_badval, arg);
|
243
|
+
argc -= 2, argv += 2;
|
244
|
+
break;
|
245
|
+
case 'F': {
|
246
|
+
double tmp;
|
247
|
+
if (sscanf(argv[1], "%lf", &tmp) != 1)
|
248
|
+
fatal(err_badval, arg);
|
249
|
+
*((double *)ptr) = tmp;
|
250
|
+
argc -= 2, argv += 2;
|
251
|
+
break; }
|
252
|
+
case 'B':
|
253
|
+
*((bool *)ptr) = true;
|
254
|
+
argc--, argv++;
|
255
|
+
break;
|
256
|
+
}
|
257
|
+
}
|
258
|
+
// Small trick for the maxiter switch
|
259
|
+
if (opt->maxiter == 0)
|
260
|
+
opt->maxiter = INT_MAX;
|
261
|
+
// Check that all options are valid
|
262
|
+
#define argchecksub(name, test) \
|
263
|
+
if (!(test)) \
|
264
|
+
fatal("invalid value for <"name">");
|
265
|
+
argchecksub("--thread", opt->nthread > 0 );
|
266
|
+
argchecksub("--jobsize", opt->jobsize > 0 );
|
267
|
+
argchecksub("--rho1", opt->rho1 >= 0.0);
|
268
|
+
argchecksub("--rho2", opt->rho2 >= 0.0);
|
269
|
+
argchecksub("--histsz", opt->lbfgs.histsz > 0 );
|
270
|
+
argchecksub("--maxls", opt->lbfgs.maxls > 0 );
|
271
|
+
argchecksub("--eta0", opt->sgdl1.eta0 > 0.0);
|
272
|
+
argchecksub("--alpha", opt->sgdl1.alpha > 0.0);
|
273
|
+
argchecksub("--nbest", opt->nbest > 0 );
|
274
|
+
#undef argchecksub
|
275
|
+
if (opt->maxent && !strcmp(opt->algo, "bcd"))
|
276
|
+
fatal("BCD not supported for training maxent models");
|
277
|
+
}
|
278
|
+
|
@@ -0,0 +1,91 @@
|
|
1
|
+
/*
|
2
|
+
* Wapiti - A linear-chain CRF tool
|
3
|
+
*
|
4
|
+
* Copyright (c) 2009-2011 CNRS
|
5
|
+
* All rights reserved.
|
6
|
+
*
|
7
|
+
* Redistribution and use in source and binary forms, with or without
|
8
|
+
* modification, are permitted provided that the following conditions are met:
|
9
|
+
* * Redistributions of source code must retain the above copyright
|
10
|
+
* notice, this list of conditions and the following disclaimer.
|
11
|
+
* * Redistributions in binary form must reproduce the above copyright
|
12
|
+
* notice, this list of conditions and the following disclaimer in the
|
13
|
+
* documentation and/or other materials provided with the distribution.
|
14
|
+
*
|
15
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
16
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
18
|
+
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
19
|
+
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
20
|
+
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
21
|
+
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
22
|
+
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
23
|
+
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
24
|
+
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
25
|
+
* POSSIBILITY OF SUCH DAMAGE.
|
26
|
+
*/
|
27
|
+
#ifndef options_h
|
28
|
+
#define options_h
|
29
|
+
|
30
|
+
#include <stdbool.h>
|
31
|
+
|
32
|
+
#include "wapiti.h"
|
33
|
+
|
34
|
+
/* opt_t:
 *   Holds every user-configurable parameter of Wapiti. It is filled from
 *   the command line; string members point to storage owned by the caller.
 */
typedef struct opt_s opt_t;
struct opt_s {
	int     mode;       // Selected mode, -1 until one is chosen
	char   *input;      // Input file name, or NULL
	char   *output;     // Output file name, or NULL
	bool    maxent;     // Force maxent mode
	// Options for training
	char   *algo;       // Training algorithm name
	char   *pattern;    // Pattern file name
	char   *model;      // Model file to load
	char   *devel;      // Development dataset file name
	bool    compact;    // Compact model after training
	bool    sparse;     // Enable sparse forward/backward
	int     nthread;    // Number of worker threads
	int     jobsize;    // Job size for worker threads
	int     maxiter;    // Maximum number of iterations
	double  rho1;       // L1 penalty parameter
	double  rho2;       // L2 penalty parameter
	// Window size criterion
	int     objwin;     // Convergence window size
	int     stopwin;    // Stop window size
	double  stopeps;    // Stop epsilon value
	// Options specific to L-BFGS
	struct {
		bool    clip;   // Clip gradient
		int     histsz; // History size
		int     maxls;  // Maximum linesearch iterations
	} lbfgs;
	// Options specific to SGD-L1
	struct {
		double  eta0;   // Learning rate
		double  alpha;  // Exponential decay parameter
	} sgdl1;
	// Options specific to BCD
	struct {
		double  kappa;  // Stability parameter
	} bcd;
	// Options specific to RPROP
	struct {
		double  stpmin; // Minimum step size
		double  stpmax; // Maximum step size
		double  stpinc; // Step increment factor
		double  stpdec; // Step decrement factor
		bool    cutoff; // Alternate projection
	} rprop;
	// Options for labelling
	bool    label;      // Output only labels
	bool    check;      // Input is already labeled
	bool    outsc;      // Add scores to output
	bool    lblpost;    // Label using posteriors
	int     nbest;      // Size of the n-best list to output
};
|
85
|
+
|
86
|
+
extern const opt_t opt_defaults;
|
87
|
+
|
88
|
+
void opt_parse(int argc, char *argv[argc], opt_t *opt);
|
89
|
+
|
90
|
+
#endif
|
91
|
+
|
@@ -0,0 +1,395 @@
|
|
1
|
+
/*
|
2
|
+
* Wapiti - A linear-chain CRF tool
|
3
|
+
*
|
4
|
+
* Copyright (c) 2009-2011 CNRS
|
5
|
+
* All rights reserved.
|
6
|
+
*
|
7
|
+
* Redistribution and use in source and binary forms, with or without
|
8
|
+
* modification, are permitted provided that the following conditions are met:
|
9
|
+
* * Redistributions of source code must retain the above copyright
|
10
|
+
* notice, this list of conditions and the following disclaimer.
|
11
|
+
* * Redistributions in binary form must reproduce the above copyright
|
12
|
+
* notice, this list of conditions and the following disclaimer in the
|
13
|
+
* documentation and/or other materials provided with the distribution.
|
14
|
+
*
|
15
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
16
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
18
|
+
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
19
|
+
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
20
|
+
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
21
|
+
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
22
|
+
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
23
|
+
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
24
|
+
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
25
|
+
* POSSIBILITY OF SUCH DAMAGE.
|
26
|
+
*/
|
27
|
+
|
28
|
+
#include <ctype.h>
|
29
|
+
#include <stdbool.h>
|
30
|
+
#include <stddef.h>
|
31
|
+
#include <stdio.h>
|
32
|
+
#include <stdlib.h>
|
33
|
+
#include <string.h>
|
34
|
+
|
35
|
+
#include "pattern.h"
|
36
|
+
#include "sequence.h"
|
37
|
+
#include "tools.h"
|
38
|
+
|
39
|
+
/******************************************************************************
|
40
|
+
* A simple regular expression matcher
|
41
|
+
*
|
42
|
+
* This module implements a simple regular expression matcher, it implements
|
43
|
+
* just a subset of the classical regexp simple to implement but sufficient
|
44
|
+
* for most usages and avoid to add a dependency to a full regexp library.
|
45
|
+
*
|
46
|
+
* The recognized subset is quite simple. First for matching characters :
|
47
|
+
* . -> match any characters
|
48
|
+
* \x -> match a character class (in uppercase, match the complement)
|
49
|
+
* \d : digit \a : alpha \w : alpha + digit
|
50
|
+
* \l : lowercase \u : uppercase \p : punctuation
|
51
|
+
* \s : space
|
52
|
+
* or escape a character
|
53
|
+
* x -> any other character match itself
|
54
|
+
* And the constructs :
|
55
|
+
* ^ -> at the beginning of the regexp, anchor it at start of string
|
56
|
+
* $ -> at the end of regexp, anchor it at end of string
|
57
|
+
* * -> match any number of repetition of the previous character
|
58
|
+
* ? -> optionally match the previous character
|
59
|
+
*
|
60
|
+
* This subset is implemented quite efficiently using recursion. All recursive
|
61
|
+
* calls are tail-call so they should be optimized by the compiler. As we do
|
62
|
+
* direct interpretation, we have to backtrack so performance can be very poor
|
63
|
+
* on specially designed regexps. This is not a problem as the regexp as well as
|
64
|
+
* the string is expected to be very simple here. If this is not the case, you
|
65
|
+
* better have to prepare your data better.
|
66
|
+
******************************************************************************/
|
67
|
+
|
68
|
+
/* rex_matchit:
 *   Match a single character at the start of the string. 'ch' points to the
 *   regexp item to match, which might be a plain char, a dot or a
 *   backslash-introduced char class (lowercase letter for the class,
 *   uppercase for its complement); a backslash followed by any other char
 *   matches that char literally. Return true if the first char of 'str'
 *   matches; the end of string never matches.
 */
static bool rex_matchit(const char *ch, const char *str) {
	// The <ctype.h> classifiers require a value representable as unsigned
	// char (or EOF); a plain char may be negative for non-ASCII bytes, so
	// convert once here.
	const unsigned char c = (unsigned char)str[0];
	if (c == '\0')
		return false;
	if (ch[0] == '.')
		return true;
	if (ch[0] == '\\') {
		switch (ch[1]) {
			case 'a': return  isalpha(c);
			case 'd': return  isdigit(c);
			case 'l': return  islower(c);
			case 'p': return  ispunct(c);
			case 's': return  isspace(c);
			case 'u': return  isupper(c);
			case 'w': return  isalnum(c);
			case 'A': return !isalpha(c);
			case 'D': return !isdigit(c);
			case 'L': return !islower(c);
			case 'P': return !ispunct(c);
			case 'S': return !isspace(c);
			case 'U': return !isupper(c);
			case 'W': return !isalnum(c);
		}
		// Not a class: the backslash just escapes the next char
		return ch[1] == str[0];
	}
	return ch[0] == str[0];
}
|
98
|
+
|
99
|
+
/* rex_matchme:
 *   Match a regular expression at the start of the string. On success true
 *   is returned and the length of the match has been added to *len, which
 *   the caller must have initialized. On failure false is returned and the
 *   content of *len is unspecified. A '*' or '?' with nothing to apply to is
 *   reported through fatal().
 */
static bool rex_matchme(const char *re, const char *str, int *len) {
	// Special check for end of regexp
	if (re[0] == '\0')
		return true;
	if (re[0] == '$' && re[1] == '\0')
		return (str[0] == '\0');
	// Get first char of regexp. An escape spans two chars, except for a
	// lone backslash at the very end of the regexp where skipping two
	// chars would step over the terminator.
	const char *ch  = re;
	const char *nxt = re + 1 + (ch[0] == '\\' && ch[1] != '\0');
	// Special check for the following construct "x**" where the first star
	// is consumed normally but leads the second (which is wrong) to be
	// interpreted as a char to match as if it was escaped (and same for the
	// optional construct)
	if (*ch == '*' || *ch == '?')
		fatal("unescaped * or ? in regexp: %s", re);
	// Handle star repetition: try the remaining of the regexp after each
	// possible number of repetitions, shortest first.
	if (nxt[0] == '*') {
		nxt++;
		do {
			const int save = *len;
			if (rex_matchme(nxt, str, len))
				return true;
			// The failed attempt may have changed *len, so restore it
			// and account for the char consumed by the star.
			*len = save + 1;
		} while (rex_matchit(ch, str++));
		return false;
	}
	// Handle optional: first try with the char consumed, then without.
	if (nxt[0] == '?') {
		nxt++;
		const int save = *len;
		if (rex_matchit(ch, str)) {
			*len = save + 1;
			if (rex_matchme(nxt, str + 1, len))
				return true;
		}
		// The failed attempt may have advanced *len by more than one, so
		// restore the saved value instead of just decrementing it.
		*len = save;
		return rex_matchme(nxt, str, len);
	}
	// Classical char matching
	(*len)++;
	if (rex_matchit(ch, str))
		return rex_matchme(nxt, str + 1, len);
	return false;
}
|
147
|
+
|
148
|
+
/* rex_match:
 *   Search for a match of the regular expression in the given string. If one
 *   is found, the position of its start is returned and its length is stored
 *   in *len, else -1 is returned. A regexp starting with '^' is only tried
 *   at position 0.
 */
static int rex_match(const char *re, const char *str, int *len) {
	// Anchored regexp: a single attempt at the start of the string
	if (re[0] == '^') {
		*len = 0;
		return rex_matchme(re + 1, str, len) ? 0 : -1;
	}
	// Else try every start position in turn, including the one of the
	// terminator so a regexp matching the empty string can succeed there.
	for (int start = 0; ; start++) {
		*len = 0;
		if (rex_matchme(re, str + start, len))
			return start;
		if (str[start] == '\0')
			break;
	}
	// Matching failed
	return -1;
}
|
171
|
+
|
172
|
+
/*******************************************************************************
|
173
|
+
* Pattern handling
|
174
|
+
*
|
175
|
+
* Patterns are the heart of the data input process, they provide a way to tell
|
176
|
+
* Wapiti how the interesting information can be extracted from the input
|
177
|
+
* data. A pattern is simply a string which embeds special commands about tokens
|
178
|
+
* to extract from the input sequence. They are compiled to a special form
|
179
|
+
* used during data loading.
|
180
|
+
* For training, each position of a sequence hold a list of observation made
|
181
|
+
* at this position, pattern give a way to specify these observations.
|
182
|
+
*
|
183
|
+
* During sequence loading, all patterns are applied at each position to
|
184
|
+
* produce a list of string representing the observations which will be in
|
185
|
+
* turn transformed to numerical identifiers. This module take care of
|
186
|
+
* building the string representation.
|
187
|
+
*
|
188
|
+
* As said, a pattern is a string with specific commands in the forms %c[...]
|
189
|
+
* where 'c' is the command with arguments between the bracket. All commands
|
190
|
+
* take at least two numerical arguments which define a token in the input
|
191
|
+
* sequence. The first one is an offset from the current position and the
|
192
|
+
* second one is a column number. With these two parameters, we get a string
|
193
|
+
* in the input sequence on which we apply the command.
|
194
|
+
*
|
195
|
+
* All command are specified with a character and result in a string which
|
196
|
+
* will replace the command in the pattern string. If the command character is
|
197
|
+
* lower case, the result is copied verbatim, if it is uppercase, the result
|
198
|
+
* is copied with casing removed. The following commands are available:
|
199
|
+
* 'x' -- result is the token itself
|
200
|
+
* 't' -- test if a regular expression match the token. Result will be
|
201
|
+
* either "true" or "false"
|
202
|
+
* 'm' -- match a regular expression on the token. Result is the first
|
203
|
+
* substring matched.
|
204
|
+
******************************************************************************/
|
205
|
+
|
206
|
+
/* pat_comp:
|
207
|
+
* Compile the pattern to a form more suitable to easily apply it on tokens
|
208
|
+
* list during data reading. The given pattern string is interned in the
|
209
|
+
* compiled pattern and will be freed with it, so you don't have to take care
|
210
|
+
* of it and must not modify it after the compilation.
|
211
|
+
*/
|
212
|
+
pat_t *pat_comp(char *p) {
|
213
|
+
pat_t *pat = NULL;
|
214
|
+
// Allocate memory for the compiled pattern, the allocation is based
|
215
|
+
// on an over-estimation of the number of required item. As compiled
|
216
|
+
// pattern take a neglectible amount of memory, this waste is not
|
217
|
+
// important.
|
218
|
+
int mitems = 0;
|
219
|
+
for (int pos = 0; p[pos] != '\0'; pos++)
|
220
|
+
if (p[pos] == '%')
|
221
|
+
mitems++;
|
222
|
+
mitems = mitems * 2 + 1;
|
223
|
+
pat = wapiti_xmalloc(sizeof(pat_t) + sizeof(pat->items[0]) * mitems);
|
224
|
+
pat->src = p;
|
225
|
+
// Next, we go through the pattern compiling the items as they are
|
226
|
+
// found. Commands are parsed and put in a corresponding item, and
|
227
|
+
// segment of char not in a command are put in a 's' item.
|
228
|
+
int nitems = 0;
|
229
|
+
int ntoks = 0;
|
230
|
+
int pos = 0;
|
231
|
+
while (p[pos] != '\0') {
|
232
|
+
pat_item_t *item = &(pat->items[nitems++]);
|
233
|
+
item->value = NULL;
|
234
|
+
if (p[pos] == '%') {
|
235
|
+
// This is a command, so first parse its type and check
|
236
|
+
// its a valid one. Next prepare the item.
|
237
|
+
const char type = tolower(p[pos + 1]);
|
238
|
+
if (type != 'x' && type != 't' && type != 'm')
|
239
|
+
fatal("unknown command type: '%c'", type);
|
240
|
+
item->type = type;
|
241
|
+
item->caps = (p[pos + 1] != type);
|
242
|
+
pos += 2;
|
243
|
+
// Next we parse the offset and column and store them in
|
244
|
+
// the item.
|
245
|
+
const char *at = p + pos;
|
246
|
+
int off, col, nch;
|
247
|
+
item->absolute = false;
|
248
|
+
if (sscanf(at, "[@%d,%d%n", &off, &col, &nch) == 2)
|
249
|
+
item->absolute = true;
|
250
|
+
else if (sscanf(at, "[%d,%d%n", &off, &col, &nch) != 2)
|
251
|
+
fatal("invalid pattern: %s", p);
|
252
|
+
if (col < 0)
|
253
|
+
fatal("invalid column number: %d", col);
|
254
|
+
item->offset = off;
|
255
|
+
item->column = col;
|
256
|
+
ntoks = max(ntoks, col);
|
257
|
+
pos += nch;
|
258
|
+
// And parse the end of the argument list, for 'x' there
|
259
|
+
// is nothing to read but for 't' and 'm' we have to get
|
260
|
+
// read the regexp.
|
261
|
+
if (type == 't' || type == 'm') {
|
262
|
+
if (p[pos] != ',' && p[pos + 1] != '"')
|
263
|
+
fatal("missing arg in pattern: %s", p);
|
264
|
+
const int start = (pos += 2);
|
265
|
+
while (p[pos] != '\0') {
|
266
|
+
if (p[pos] == '"')
|
267
|
+
break;
|
268
|
+
if (p[pos] == '\\' && p[pos+1] != '\0')
|
269
|
+
pos++;
|
270
|
+
pos++;
|
271
|
+
}
|
272
|
+
if (p[pos] != '"')
|
273
|
+
fatal("unended argument: %s", p);
|
274
|
+
const int len = pos - start;
|
275
|
+
item->value = wapiti_xmalloc(sizeof(char) * (len + 1));
|
276
|
+
memcpy(item->value, p + start, len);
|
277
|
+
item->value[len] = '\0';
|
278
|
+
pos++;
|
279
|
+
}
|
280
|
+
// Just check the end of the arg list and loop.
|
281
|
+
if (p[pos] != ']')
|
282
|
+
fatal("missing end of pattern: %s", p);
|
283
|
+
pos++;
|
284
|
+
} else {
|
285
|
+
// No command here, so build an 's' item with the chars
|
286
|
+
// until end of pattern or next command and put it in
|
287
|
+
// the list.
|
288
|
+
const int start = pos;
|
289
|
+
while (p[pos] != '\0' && p[pos] != '%')
|
290
|
+
pos++;
|
291
|
+
const int len = pos - start;
|
292
|
+
item->type = 's';
|
293
|
+
item->caps = false;
|
294
|
+
item->value = wapiti_xmalloc(sizeof(char) * (len + 1));
|
295
|
+
memcpy(item->value, p + start, len);
|
296
|
+
item->value[len] = '\0';
|
297
|
+
}
|
298
|
+
}
|
299
|
+
pat->ntoks = ntoks;
|
300
|
+
pat->nitems = nitems;
|
301
|
+
return pat;
|
302
|
+
}
|
303
|
+
|
304
|
+
/* pat_exec:
|
305
|
+
* Execute a compiled pattern at position 'at' in the given tokens sequences
|
306
|
+
* in order to produce an observation string. The string is returned as a
|
307
|
+
* newly allocated memory block and the caller is responsible to free it when
|
308
|
+
* not needed anymore.
|
309
|
+
*/
|
310
|
+
char *pat_exec(const pat_t *pat, const tok_t *tok, int at) {
|
311
|
+
static char *bval[] = {"_x-1", "_x-2", "_x-3", "_x-4", "_x-#"};
|
312
|
+
static char *eval[] = {"_x+1", "_x+2", "_x+3", "_x+4", "_x+#"};
|
313
|
+
const int T = tok->len;
|
314
|
+
// Prepare the buffer who will hold the result
|
315
|
+
int size = 16, pos = 0;
|
316
|
+
char *buffer = wapiti_xmalloc(sizeof(char) * size);
|
317
|
+
// And loop over the compiled items
|
318
|
+
for (int it = 0; it < pat->nitems; it++) {
|
319
|
+
const pat_item_t *item = &(pat->items[it]);
|
320
|
+
char *value = NULL;
|
321
|
+
int len = 0;
|
322
|
+
// First, if needed, we retrieve the token at the referenced
|
323
|
+
// position in the sequence. We store it in value and let the
|
324
|
+
// command handler do what it need with it.
|
325
|
+
if (item->type != 's') {
|
326
|
+
int pos = item->offset;
|
327
|
+
if (item->absolute) {
|
328
|
+
if (item->offset < 0)
|
329
|
+
pos += T;
|
330
|
+
else
|
331
|
+
pos--;
|
332
|
+
} else {
|
333
|
+
pos += at;
|
334
|
+
}
|
335
|
+
int col = item->column;
|
336
|
+
if (pos < 0)
|
337
|
+
value = bval[min(-pos - 1, 4)];
|
338
|
+
else if (pos >= T)
|
339
|
+
value = eval[min( pos - T, 4)];
|
340
|
+
else if (col >= tok->cnts[pos])
|
341
|
+
fatal("missing tokens, cannot apply pattern");
|
342
|
+
else
|
343
|
+
value = tok->toks[pos][col];
|
344
|
+
}
|
345
|
+
// Next, we handle the command, 's' and 'x' are very simple but
|
346
|
+
// 't' and 'm' require us to call the regexp matcher.
|
347
|
+
if (item->type == 's') {
|
348
|
+
value = item->value;
|
349
|
+
len = strlen(value);
|
350
|
+
} else if (item->type == 'x') {
|
351
|
+
len = strlen(value);
|
352
|
+
} else if (item->type == 't') {
|
353
|
+
if (rex_match(item->value, value, &len) == -1)
|
354
|
+
value = "false";
|
355
|
+
else
|
356
|
+
value = "true";
|
357
|
+
len = strlen(value);
|
358
|
+
} else if (item->type == 'm') {
|
359
|
+
int pos = rex_match(item->value, value, &len);
|
360
|
+
if (pos == -1)
|
361
|
+
len = 0;
|
362
|
+
value += pos;
|
363
|
+
}
|
364
|
+
// And we add it to the buffer, growing it if needed. If the
|
365
|
+
// user requested it, we also remove caps from the string.
|
366
|
+
if (pos + len >= size - 1) {
|
367
|
+
while (pos + len >= size - 1)
|
368
|
+
size = size * 1.4;
|
369
|
+
buffer = wapiti_xrealloc(buffer, sizeof(char) * size);
|
370
|
+
}
|
371
|
+
memcpy(buffer + pos, value, len);
|
372
|
+
if (item->caps)
|
373
|
+
for (int i = pos; i < pos + len; i++)
|
374
|
+
buffer[i] = tolower(buffer[i]);
|
375
|
+
pos += len;
|
376
|
+
}
|
377
|
+
// Adjust the result and return it.
|
378
|
+
buffer[pos++] = '\0';
|
379
|
+
buffer = wapiti_xrealloc(buffer, sizeof(char) * pos);
|
380
|
+
return buffer;
|
381
|
+
}
|
382
|
+
|
383
|
+
/* pat_free:
|
384
|
+
* Free all memory used by a compiled pattern object. Note that this will free
|
385
|
+
* the pointer to the source string given to pat_comp so you must be sure to
|
386
|
+
* not use this pointer again.
|
387
|
+
*/
|
388
|
+
void pat_free(pat_t *pat) {
|
389
|
+
for (int it = 0; it < pat->nitems; it++)
|
390
|
+
free(pat->items[it].value);
|
391
|
+
free(pat->src);
|
392
|
+
free(pat);
|
393
|
+
}
|
394
|
+
|
395
|
+
|