wapiti 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (63) hide show
  1. data/.autotest +13 -0
  2. data/.gitignore +5 -0
  3. data/.rspec +3 -0
  4. data/Gemfile +6 -0
  5. data/LICENSE +30 -0
  6. data/README.md +153 -0
  7. data/Rakefile +33 -0
  8. data/ext/wapiti/bcd.c +392 -0
  9. data/ext/wapiti/decoder.c +535 -0
  10. data/ext/wapiti/decoder.h +46 -0
  11. data/ext/wapiti/extconf.rb +8 -0
  12. data/ext/wapiti/gradient.c +818 -0
  13. data/ext/wapiti/gradient.h +81 -0
  14. data/ext/wapiti/lbfgs.c +294 -0
  15. data/ext/wapiti/model.c +296 -0
  16. data/ext/wapiti/model.h +100 -0
  17. data/ext/wapiti/native.c +1238 -0
  18. data/ext/wapiti/native.h +15 -0
  19. data/ext/wapiti/options.c +278 -0
  20. data/ext/wapiti/options.h +91 -0
  21. data/ext/wapiti/pattern.c +395 -0
  22. data/ext/wapiti/pattern.h +56 -0
  23. data/ext/wapiti/progress.c +167 -0
  24. data/ext/wapiti/progress.h +43 -0
  25. data/ext/wapiti/quark.c +272 -0
  26. data/ext/wapiti/quark.h +46 -0
  27. data/ext/wapiti/reader.c +553 -0
  28. data/ext/wapiti/reader.h +73 -0
  29. data/ext/wapiti/rprop.c +191 -0
  30. data/ext/wapiti/sequence.h +148 -0
  31. data/ext/wapiti/sgdl1.c +218 -0
  32. data/ext/wapiti/thread.c +171 -0
  33. data/ext/wapiti/thread.h +42 -0
  34. data/ext/wapiti/tools.c +202 -0
  35. data/ext/wapiti/tools.h +54 -0
  36. data/ext/wapiti/trainers.h +39 -0
  37. data/ext/wapiti/vmath.c +372 -0
  38. data/ext/wapiti/vmath.h +51 -0
  39. data/ext/wapiti/wapiti.c +288 -0
  40. data/ext/wapiti/wapiti.h +45 -0
  41. data/lib/wapiti.rb +30 -0
  42. data/lib/wapiti/errors.rb +17 -0
  43. data/lib/wapiti/model.rb +49 -0
  44. data/lib/wapiti/options.rb +113 -0
  45. data/lib/wapiti/utility.rb +15 -0
  46. data/lib/wapiti/version.rb +3 -0
  47. data/spec/fixtures/ch.mod +18550 -0
  48. data/spec/fixtures/chpattern.txt +52 -0
  49. data/spec/fixtures/chtest.txt +1973 -0
  50. data/spec/fixtures/chtrain.txt +19995 -0
  51. data/spec/fixtures/nppattern.txt +52 -0
  52. data/spec/fixtures/nptest.txt +1973 -0
  53. data/spec/fixtures/nptrain.txt +19995 -0
  54. data/spec/fixtures/pattern.txt +14 -0
  55. data/spec/fixtures/test.txt +60000 -0
  56. data/spec/fixtures/train.txt +1200 -0
  57. data/spec/spec_helper.rb +21 -0
  58. data/spec/wapiti/model_spec.rb +173 -0
  59. data/spec/wapiti/native_spec.rb +12 -0
  60. data/spec/wapiti/options_spec.rb +175 -0
  61. data/spec/wapiti/utility_spec.rb +22 -0
  62. data/wapiti.gemspec +35 -0
  63. metadata +178 -0
@@ -0,0 +1,15 @@
1
+ #ifndef native_h
2
+ #define native_h
3
+
4
+ #include <ruby.h>
5
+
6
+ extern VALUE mWapiti;
7
+ extern VALUE mNative;
8
+
9
+ extern VALUE cOptions;
10
+ extern VALUE cModel;
11
+
12
+ extern VALUE cNativeError;
13
+ extern VALUE cLogger;
14
+
15
+ #endif
@@ -0,0 +1,278 @@
1
+ /*
2
+ * Wapiti - A linear-chain CRF tool
3
+ *
4
+ * Copyright (c) 2009-2011 CNRS
5
+ * All rights reserved.
6
+ *
7
+ * Redistribution and use in source and binary forms, with or without
8
+ * modification, are permitted provided that the following conditions are met:
9
+ * * Redistributions of source code must retain the above copyright
10
+ * notice, this list of conditions and the following disclaimer.
11
+ * * Redistributions in binary form must reproduce the above copyright
12
+ * notice, this list of conditions and the following disclaimer in the
13
+ * documentation and/or other materials provided with the distribution.
14
+ *
15
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25
+ * POSSIBILITY OF SUCH DAMAGE.
26
+ */
27
+
28
+ #include <limits.h>
29
+ #include <stdbool.h>
30
+ #include <stddef.h>
31
+ #include <stdlib.h>
32
+ #include <stdio.h>
33
+ #include <string.h>
34
+
35
+ #include "wapiti.h"
36
+ #include "tools.h"
37
+ #include "options.h"
38
+ #include "vmath.h"
39
+
40
+ /******************************************************************************
41
+ * Command line parsing
42
+ *
43
+ * This module handles command line parsing and puts all things defined by the
44
+ * user in a special structure in order to make them accessible to the
45
+ * remaining of the program.
46
+ ******************************************************************************/
47
+
48
/* opt_help:
 *   Write the usage message to stderr, one section per mode. The program
 *   name given in pname is substituted in each mode synopsis.
 */
static void opt_help(const char *pname) {
	static const char global_help[] =
		"Global switchs:\n"
		"\t-h | --help display this help message\n"
		"\t | --version display version information\n"
		"\n";
	static const char train_help[] =
		"Training mode:\n"
		" %1$s train [options] [input data] [model file]\n"
		"\t | --me force maxent mode\n"
		"\t-a | --algo STRING training algorithm to use\n"
		"\t-p | --pattern FILE patterns for extracting features\n"
		"\t-m | --model FILE model file to preload\n"
		"\t-d | --devel FILE development dataset\n"
		"\t-c | --compact compact model after training\n"
		"\t-t | --nthread INT number of worker threads\n"
		"\t-j | --jobsize INT job size for worker threads\n"
		"\t-s | --sparse enable sparse forward/backward\n"
		"\t-i | --maxiter INT maximum number of iterations\n"
		"\t-1 | --rho1 FLOAT l1 penalty parameter\n"
		"\t-2 | --rho2 FLOAT l2 penalty parameter\n"
		"\t-o | --objwin INT convergence window size\n"
		"\t-w | --stopwin INT stop window size\n"
		"\t-e | --stopeps FLOAT stop epsilon value\n"
		"\t | --clip (l-bfgs) clip gradient\n"
		"\t | --histsz INT (l-bfgs) history size\n"
		"\t | --maxls INT (l-bfgs) max linesearch iters\n"
		"\t | --eta0 FLOAT (sgd-l1) learning rate\n"
		"\t | --alpha FLOAT (sgd-l1) exp decay parameter\n"
		"\t | --kappa FLOAT (bcd) stability parameter\n"
		"\t | --stpmin FLOAT (rprop) minimum step size\n"
		"\t | --stpmax FLOAT (rprop) maximum step size\n"
		"\t | --stpinc FLOAT (rprop) step increment factor\n"
		"\t | --stpdec FLOAT (rprop) step decrement factor\n"
		"\t | --cutoff (rprop) alternate projection\n"
		"\n";
	static const char label_help[] =
		"Labelling mode:\n"
		" %1$s label [options] [input data] [output data]\n"
		"\t | --me force maxent mode\n"
		"\t-m | --model FILE model file to load\n"
		"\t-l | --label output only labels\n"
		"\t-c | --check input is already labeled\n"
		"\t-s | --score add scores to output\n"
		"\t-p | --post label using posteriors\n"
		"\t-n | --nbest INT output n-best list\n"
		"\n";
	static const char dump_help[] =
		"Dumping mode\n"
		" %1$s dump [input model] [output text]\n";
	// The global section has no conversion so it is written verbatim, the
	// three mode sections each consume the program name.
	fputs(global_help, stderr);
	fprintf(stderr, train_help, pname);
	fprintf(stderr, label_help, pname);
	fprintf(stderr, dump_help, pname);
}
100
+
101
+ /* opt_defaults:
102
+ * Default values for all parameters of the model.
103
+ */
104
+ const opt_t opt_defaults = {
105
+ .mode = -1,
106
+ .input = NULL, .output = NULL,
107
+ .maxent = false,
108
+ .algo = "l-bfgs", .pattern = NULL, .model = NULL, .devel = NULL,
109
+ .compact = false, .sparse = false,
110
+ .nthread = 1, .jobsize = 64, .maxiter = 0,
111
+ .rho1 = 0.5, .rho2 = 0.0001,
112
+ .objwin = 5, .stopwin = 5, .stopeps = 0.02,
113
+ .lbfgs = {.clip = false, .histsz = 5, .maxls = 40},
114
+ .sgdl1 = {.eta0 = 0.8, .alpha = 0.85},
115
+ .bcd = {.kappa = 1.5},
116
+ .rprop = {.stpmin = 1e-8, .stpmax = 50.0, .stpinc = 1.2, .stpdec = 0.5,
117
+ .cutoff = false},
118
+ .label = false, .check = false, .outsc = false,
119
+ .lblpost = false, .nbest = 1
120
+ };
121
+
122
+ /* opt_switch:
123
+ * Define available switchs for the different modes in a readable way for the
124
+ * command line argument parser.
125
+ */
126
+ struct {
127
+ int mode;
128
+ char *dshort;
129
+ char *dlong;
130
+ char kind;
131
+ size_t offset;
132
+ } opt_switch[] = {
133
+ {0, "##", "--me", 'B', offsetof(opt_t, maxent )},
134
+ {0, "-a", "--algo", 'S', offsetof(opt_t, algo )},
135
+ {0, "-p", "--pattern", 'S', offsetof(opt_t, pattern )},
136
+ {0, "-m", "--model", 'S', offsetof(opt_t, model )},
137
+ {0, "-d", "--devel", 'S', offsetof(opt_t, devel )},
138
+ {0, "-c", "--compact", 'B', offsetof(opt_t, compact )},
139
+ {0, "-s", "--sparse", 'B', offsetof(opt_t, sparse )},
140
+ {0, "-t", "--nthread", 'I', offsetof(opt_t, nthread )},
141
+ {0, "-j", "--josize", 'I', offsetof(opt_t, jobsize )},
142
+ {0, "-i", "--maxiter", 'I', offsetof(opt_t, maxiter )},
143
+ {0, "-1", "--rho1", 'F', offsetof(opt_t, rho1 )},
144
+ {0, "-2", "--rho2", 'F', offsetof(opt_t, rho2 )},
145
+ {0, "-o", "--objsz", 'I', offsetof(opt_t, objwin )},
146
+ {0, "-w", "--stopwin", 'I', offsetof(opt_t, stopwin )},
147
+ {0, "-e", "--stopeps", 'F', offsetof(opt_t, stopeps )},
148
+ {0, "##", "--clip", 'B', offsetof(opt_t, lbfgs.clip )},
149
+ {0, "##", "--histsz", 'I', offsetof(opt_t, lbfgs.histsz)},
150
+ {0, "##", "--maxls", 'I', offsetof(opt_t, lbfgs.maxls )},
151
+ {0, "##", "--eta0", 'F', offsetof(opt_t, sgdl1.eta0 )},
152
+ {0," ##", "--alpha", 'F', offsetof(opt_t, sgdl1.alpha )},
153
+ {0, "##", "--kappa", 'F', offsetof(opt_t, bcd.kappa )},
154
+ {0, "##", "--stpmin", 'F', offsetof(opt_t, rprop.stpmin)},
155
+ {0, "##", "--stpmax", 'F', offsetof(opt_t, rprop.stpmax)},
156
+ {0, "##", "--stpinc", 'F', offsetof(opt_t, rprop.stpinc)},
157
+ {0, "##", "--stpdec", 'F', offsetof(opt_t, rprop.stpdec)},
158
+ {0, "##", "--cutoff", 'B', offsetof(opt_t, rprop.cutoff)},
159
+ {1, "##", "--me", 'B', offsetof(opt_t, maxent )},
160
+ {1, "-m", "--model", 'S', offsetof(opt_t, model )},
161
+ {1, "-l", "--label", 'B', offsetof(opt_t, label )},
162
+ {1, "-c", "--check", 'B', offsetof(opt_t, check )},
163
+ {1, "-s", "--score", 'B', offsetof(opt_t, outsc )},
164
+ {1, "-p", "--post", 'B', offsetof(opt_t, lblpost )},
165
+ {1, "-n", "--nbest", 'I', offsetof(opt_t, nbest )},
166
+ {-1, NULL, NULL, '\0', 0}
167
+ };
168
+
169
+ /* argparse:
170
+ * This is the main function for command line parsing. It use the previous
171
+ * table to known how to interpret the switchs and store values in the opt_t
172
+ * structure.
173
+ */
174
+ void opt_parse(int argc, char *argv[argc], opt_t *opt) {
175
+ static const char *err_badval = "invalid value for switch '%s'";
176
+ const char *pname = argv[0];
177
+ argc--, argv++;
178
+ if (argc == 0) {
179
+ opt_help(pname);
180
+ fatal("no mode specified");
181
+ }
182
+ // First special handling for help and version
183
+ if (!strcmp(argv[0], "-h") || !strcmp(argv[0], "--help")) {
184
+ opt_help(pname);
185
+ exit(EXIT_FAILURE);
186
+ } else if (!strcmp(argv[0], "--version")) {
187
+ fprintf(stderr, "Wapiti v" VERSION "\n");
188
+ fprintf(stderr, " Optimization mode: %s\n", xvm_mode());
189
+ exit(EXIT_SUCCESS);
190
+ }
191
+ // Get the mode to use
192
+ if (!strcmp(argv[0], "t") || !strcmp(argv[0], "train")) {
193
+ opt->mode = 0;
194
+ } else if (!strcmp(argv[0], "l") || !strcmp(argv[0], "label")) {
195
+ opt->mode = 1;
196
+ } else if (!strcmp(argv[0], "d") || !strcmp(argv[0], "dump")) {
197
+ opt->mode = 2;
198
+ } else {
199
+ fatal("unknown mode <%s>", argv[0]);
200
+ }
201
+ argc--, argv++;
202
+ // Parse remaining arguments
203
+ opt->input = NULL;
204
+ opt->output = NULL;
205
+ while (argc > 0) {
206
+ const char *arg = argv[0];
207
+ int idx;
208
+ // Check if this argument is a filename or an option
209
+ if (arg[0] != '-') {
210
+ if (opt->input == NULL)
211
+ opt->input = argv[0];
212
+ else if (opt->output == NULL)
213
+ opt->output = argv[0];
214
+ else
215
+ fatal("too much input files on command line");
216
+ argc--, argv++;
217
+ continue;
218
+ }
219
+ // Search the current switch in the table or fail if it cannot
220
+ // be found.
221
+ for (idx = 0; opt_switch[idx].mode != -1; idx++) {
222
+ if (opt_switch[idx].mode != opt->mode)
223
+ continue;
224
+ if (!strcmp(arg, opt_switch[idx].dshort))
225
+ break;
226
+ if (!strcmp(arg, opt_switch[idx].dlong))
227
+ break;
228
+ }
229
+ if (opt_switch[idx].mode == -1)
230
+ fatal("unknown option '%s'", arg);
231
+ // Decode the argument and store it in the structure
232
+ if (opt_switch[idx].kind != 'B' && argc < 2)
233
+ fatal("missing argument for switch '%s'", arg);
234
+ void *ptr = (void *)((char *)opt + opt_switch[idx].offset);
235
+ switch (opt_switch[idx].kind) {
236
+ case 'S':
237
+ *((char **)ptr) = argv[1];
238
+ argc -= 2, argv += 2;
239
+ break;
240
+ case 'I':
241
+ if (sscanf(argv[1], "%d", (int *)ptr) != 1)
242
+ fatal(err_badval, arg);
243
+ argc -= 2, argv += 2;
244
+ break;
245
+ case 'F': {
246
+ double tmp;
247
+ if (sscanf(argv[1], "%lf", &tmp) != 1)
248
+ fatal(err_badval, arg);
249
+ *((double *)ptr) = tmp;
250
+ argc -= 2, argv += 2;
251
+ break; }
252
+ case 'B':
253
+ *((bool *)ptr) = true;
254
+ argc--, argv++;
255
+ break;
256
+ }
257
+ }
258
+ // Small trick for the maxiter switch
259
+ if (opt->maxiter == 0)
260
+ opt->maxiter = INT_MAX;
261
+ // Check that all options are valid
262
+ #define argchecksub(name, test) \
263
+ if (!(test)) \
264
+ fatal("invalid value for <"name">");
265
+ argchecksub("--thread", opt->nthread > 0 );
266
+ argchecksub("--jobsize", opt->jobsize > 0 );
267
+ argchecksub("--rho1", opt->rho1 >= 0.0);
268
+ argchecksub("--rho2", opt->rho2 >= 0.0);
269
+ argchecksub("--histsz", opt->lbfgs.histsz > 0 );
270
+ argchecksub("--maxls", opt->lbfgs.maxls > 0 );
271
+ argchecksub("--eta0", opt->sgdl1.eta0 > 0.0);
272
+ argchecksub("--alpha", opt->sgdl1.alpha > 0.0);
273
+ argchecksub("--nbest", opt->nbest > 0 );
274
+ #undef argchecksub
275
+ if (opt->maxent && !strcmp(opt->algo, "bcd"))
276
+ fatal("BCD not supported for training maxent models");
277
+ }
278
+
@@ -0,0 +1,91 @@
1
+ /*
2
+ * Wapiti - A linear-chain CRF tool
3
+ *
4
+ * Copyright (c) 2009-2011 CNRS
5
+ * All rights reserved.
6
+ *
7
+ * Redistribution and use in source and binary forms, with or without
8
+ * modification, are permitted provided that the following conditions are met:
9
+ * * Redistributions of source code must retain the above copyright
10
+ * notice, this list of conditions and the following disclaimer.
11
+ * * Redistributions in binary form must reproduce the above copyright
12
+ * notice, this list of conditions and the following disclaimer in the
13
+ * documentation and/or other materials provided with the distribution.
14
+ *
15
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25
+ * POSSIBILITY OF SUCH DAMAGE.
26
+ */
27
+ #ifndef options_h
28
+ #define options_h
29
+
30
+ #include <stdbool.h>
31
+
32
+ #include "wapiti.h"
33
+
34
/* opt_t:
 *   Holds every user configurable parameter of Wapiti. It is filled from
 *   the command line by opt_parse. The order of the fields is kept as is.
 */
typedef struct opt_s opt_t;
struct opt_s {
	int     mode;      // 0 = train, 1 = label, 2 = dump (set by opt_parse)
	char   *input;     // first file name on the command line
	char   *output;    // second file name on the command line
	bool    maxent;    // --me: force maxent mode
	// Options for training
	char   *algo;      // --algo: training algorithm to use
	char   *pattern;   // --pattern: patterns for extracting features
	char   *model;     // --model: model file to (pre)load
	char   *devel;     // --devel: development dataset
	bool    compact;   // --compact: compact model after training
	bool    sparse;    // --sparse: enable sparse forward/backward
	int     nthread;   // --nthread: number of worker threads
	int     jobsize;   // --jobsize: job size for worker threads
	int     maxiter;   // --maxiter: maximum number of iterations
	double  rho1;      // --rho1: l1 penalty parameter
	double  rho2;      // --rho2: l2 penalty parameter
	// Window size criterion
	int     objwin;    // --objwin: convergence window size
	int     stopwin;   // --stopwin: stop window size
	double  stopeps;   // --stopeps: stop epsilon value
	// Options specific to L-BFGS
	struct {
		bool    clip;      // --clip: clip gradient
		int     histsz;    // --histsz: history size
		int     maxls;     // --maxls: max linesearch iterations
	} lbfgs;
	// Options specific to SGD-L1
	struct {
		double  eta0;      // --eta0: learning rate
		double  alpha;     // --alpha: exp decay parameter
	} sgdl1;
	// Options specific to BCD
	struct {
		double  kappa;     // --kappa: stability parameter
	} bcd;
	// Options specific to RPROP
	struct {
		double  stpmin;    // --stpmin: minimum step size
		double  stpmax;    // --stpmax: maximum step size
		double  stpinc;    // --stpinc: step increment factor
		double  stpdec;    // --stpdec: step decrement factor
		bool    cutoff;    // --cutoff: alternate projection
	} rprop;
	// Options for labelling
	bool    label;     // --label: output only labels
	bool    check;     // --check: input is already labeled
	bool    outsc;     // --score: add scores to output
	bool    lblpost;   // --post: label using posteriors
	int     nbest;     // --nbest: output n-best list
};
85
+
86
+ extern const opt_t opt_defaults;
87
+
88
+ void opt_parse(int argc, char *argv[argc], opt_t *opt);
89
+
90
+ #endif
91
+
@@ -0,0 +1,395 @@
1
+ /*
2
+ * Wapiti - A linear-chain CRF tool
3
+ *
4
+ * Copyright (c) 2009-2011 CNRS
5
+ * All rights reserved.
6
+ *
7
+ * Redistribution and use in source and binary forms, with or without
8
+ * modification, are permitted provided that the following conditions are met:
9
+ * * Redistributions of source code must retain the above copyright
10
+ * notice, this list of conditions and the following disclaimer.
11
+ * * Redistributions in binary form must reproduce the above copyright
12
+ * notice, this list of conditions and the following disclaimer in the
13
+ * documentation and/or other materials provided with the distribution.
14
+ *
15
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25
+ * POSSIBILITY OF SUCH DAMAGE.
26
+ */
27
+
28
+ #include <ctype.h>
29
+ #include <stdbool.h>
30
+ #include <stddef.h>
31
+ #include <stdio.h>
32
+ #include <stdlib.h>
33
+ #include <string.h>
34
+
35
+ #include "pattern.h"
36
+ #include "sequence.h"
37
+ #include "tools.h"
38
+
39
+ /******************************************************************************
40
+ * A simple regular expression matcher
41
+ *
42
+ * This module implements a simple regular expression matcher; it implements
43
+ * just a subset of the classical regexp simple to implement but sufficient
44
+ * for most usages and avoid to add a dependency to a full regexp library.
45
+ *
46
+ * The recognized subset is quite simple. First for matching characters :
47
+ * . -> match any characters
48
+ * \x -> match a character class (in uppercase, match the complement)
49
+ * \d : digit \a : alpha \w : alpha + digit
50
+ * \l : lowercase \u : uppercase \p : punctuation
51
+ * \s : space
52
+ * or escape a character
53
+ * x -> any other character match itself
54
+ * And the constructs :
55
+ * ^ -> at the beginning of the regexp, anchor it at start of string
56
+ * $ -> at the end of regexp, anchor it at end of string
57
+ * * -> match any number of repetition of the previous character
58
+ * ? -> optionally match the previous character
59
+ *
60
+ * This subset is implemented quite efficiently using recursion. All recursive
61
+ * calls are tail-call so they should be optimized by the compiler. As we do
62
+ * direct interpretation, we have to backtrack so performance can be very poor
63
+ * on specially designed regexps. This is not a problem as the regexp as well as
64
+ * the string is expected to be very simple here. If this is not the case, you
65
+ * better have to prepare your data better.
66
+ ******************************************************************************/
67
+
68
/* rex_matchit:
 *   Match a single character at the start of the string. The regexp item
 *   pointed to by ch may be a plain char, a dot, or a backslash followed by
 *   either a class letter or a char to take literally.
 *
 *   ch  -- first char of the regexp item (two chars are read after a '\')
 *   str -- string to match against
 *
 *   Return true if the first char of str is matched by the item. The end of
 *   the string is never matched, not even by a dot or a complemented class.
 */
static bool rex_matchit(const char *ch, const char *str) {
	// The <ctype.h> functions are only defined for values representable
	// as unsigned char (or EOF), so plain char must not be passed directly
	// or bytes above 0x7f would be undefined behaviour.
	const unsigned char c = (unsigned char)str[0];
	if (c == '\0')
		return false;
	if (ch[0] == '.')
		return true;
	if (ch[0] != '\\')
		return ch[0] == str[0];
	switch (ch[1]) {
		case 'a': return  isalpha(c);
		case 'd': return  isdigit(c);
		case 'l': return  islower(c);
		case 'p': return  ispunct(c);
		case 's': return  isspace(c);
		case 'u': return  isupper(c);
		case 'w': return  isalnum(c);
		case 'A': return !isalpha(c);
		case 'D': return !isdigit(c);
		case 'L': return !islower(c);
		case 'P': return !ispunct(c);
		case 'S': return !isspace(c);
		case 'U': return !isupper(c);
		case 'W': return !isalnum(c);
		default:  break;
	}
	// Not a class, so this is just an escaped character
	return ch[1] == str[0];
}
98
+
99
/* rex_matchme:
 *   Match a regular expression at the start of the string. The number of
 *   chars consumed is accumulated in *len, so the caller has to set it to
 *   zero before the first call.
 *
 *   re  -- regexp to match, without the leading '^' anchor
 *   str -- string in which the match must start at the first char
 *   len -- in/out: length matched so far, only meaningful when the function
 *          returns true
 *
 *   Return true if the regexp matches. A regexp starting with an unescaped
 *   '*' or '?' is reported through fatal().
 */
static bool rex_matchme(const char *re, const char *str, int *len) {
	// Special check for end of regexp
	if (re[0] == '\0')
		return true;
	if (re[0] == '$' && re[1] == '\0')
		return (str[0] == '\0');
	// Get first item of the regexp. An escape takes two chars, except for a
	// lone backslash ending the regexp where stepping over two chars would
	// go past the terminator.
	const char *ch  = re;
	const char *nxt = re + 1 + (ch[0] == '\\' && ch[1] != '\0');
	// Special check for the following construct "x**" where the first star
	// is consumed normally but lead the second (which is wrong) to be
	// interpreted as a char to match as if it was escaped (and same for the
	// optional construct)
	if (*ch == '*' || *ch == '?')
		fatal("unescaped * or ? in regexp: %s", re);
	// Handle star repetition: try the remaining of the regexp after zero,
	// one, two... repetitions of the item, resetting the length after each
	// failed attempt.
	if (nxt[0] == '*') {
		nxt++;
		do {
			const int save = *len;
			if (rex_matchme(nxt, str, len))
				return true;
			*len = save + 1;
		} while (rex_matchit(ch, str++));
		return false;
	}
	// Handle optional: first try with the item consumed, then without it.
	// The failed attempt may have advanced *len by any amount, so it is
	// restored from a saved copy instead of being decremented. Without
	// this "a?ab" on "ab" would report a length of 3.
	if (nxt[0] == '?') {
		nxt++;
		const int save = *len;
		if (rex_matchit(ch, str)) {
			*len = save + 1;
			if (rex_matchme(nxt, str + 1, len))
				return true;
		}
		*len = save;
		return rex_matchme(nxt, str, len);
	}
	// Classical char matching
	(*len)++;
	if (rex_matchit(ch, str))
		return rex_matchme(nxt, str + 1, len);
	return false;
}
147
+
148
/* rex_match:
 *   Search for a regular expression in the given string. On success the
 *   position where the match starts is returned and its length is stored in
 *   len, else -1 is returned. A regexp starting with '^' is only tried at
 *   the start of the string.
 */
static int rex_match(const char *re, const char *str, int *len) {
	// Anchored regexp: a single attempt at position zero
	if (re[0] == '^') {
		*len = 0;
		return rex_matchme(re + 1, str, len) ? 0 : -1;
	}
	// Else try each position in turn, including the one of the terminator
	// so that a regexp matching the empty string can succeed there.
	for (int start = 0; ; start++) {
		*len = 0;
		if (rex_matchme(re, str + start, len))
			return start;
		if (str[start] == '\0')
			break;
	}
	return -1;
}
171
+
172
+ /*******************************************************************************
173
+ * Pattern handling
174
+ *
175
+ * Patterns are the heart of the data input process, they provide a way to tell
176
+ * Wapiti how the interesting information can be extracted from the input
177
+ * data. A pattern is simply a string who embed special commands about tokens
178
+ * to extract from the input sequence. They are compiled to a special form
179
+ * used during data loading.
180
+ * For training, each position of a sequence hold a list of observation made
181
+ * at this position, pattern give a way to specify these observations.
182
+ *
183
+ * During sequence loading, all patterns are applied at each position to
184
+ * produce a list of string representing the observations which will be in
185
+ * turn transformed to numerical identifiers. This module take care of
186
+ * building the string representation.
187
+ *
188
+ * As said, a pattern is a string with specific commands in the forms %c[...]
189
+ * where 'c' is the command with arguments between the bracket. All commands
190
+ * take at least two numerical arguments which define a token in the input
191
+ * sequence. The first one is an offset from the current position and the
192
+ * second one is a column number. With these two parameters, we get a string
193
+ * in the input sequence on which we apply the command.
194
+ *
195
+ * All command are specified with a character and result in a string which
196
+ * will replace the command in the pattern string. If the command character is
197
+ * lower case, the result is copied verbatim, if it is uppercase, the result
198
+ * is copied with casing removed. The following commands are available:
199
+ * 'x' -- result is the token itself
200
+ * 't' -- test if a regular expression match the token. Result will be
201
+ * either "true" or "false"
202
+ * 'm' -- match a regular expression on the token. Result is the first
203
+ * substring matched.
204
+ ******************************************************************************/
205
+
206
+ /* pat_comp:
207
+ * Compile the pattern to a form more suitable to easily apply it on tokens
208
+ * list during data reading. The given pattern string is interned in the
209
+ * compiled pattern and will be freed with it, so you don't have to take care
210
+ * of it and must not modify it after the compilation.
211
+ */
212
+ pat_t *pat_comp(char *p) {
213
+ pat_t *pat = NULL;
214
+ // Allocate memory for the compiled pattern, the allocation is based
215
+ // on an over-estimation of the number of required item. As compiled
216
+ // pattern take a neglectible amount of memory, this waste is not
217
+ // important.
218
+ int mitems = 0;
219
+ for (int pos = 0; p[pos] != '\0'; pos++)
220
+ if (p[pos] == '%')
221
+ mitems++;
222
+ mitems = mitems * 2 + 1;
223
+ pat = wapiti_xmalloc(sizeof(pat_t) + sizeof(pat->items[0]) * mitems);
224
+ pat->src = p;
225
+ // Next, we go through the pattern compiling the items as they are
226
+ // found. Commands are parsed and put in a corresponding item, and
227
+ // segment of char not in a command are put in a 's' item.
228
+ int nitems = 0;
229
+ int ntoks = 0;
230
+ int pos = 0;
231
+ while (p[pos] != '\0') {
232
+ pat_item_t *item = &(pat->items[nitems++]);
233
+ item->value = NULL;
234
+ if (p[pos] == '%') {
235
+ // This is a command, so first parse its type and check
236
+ // its a valid one. Next prepare the item.
237
+ const char type = tolower(p[pos + 1]);
238
+ if (type != 'x' && type != 't' && type != 'm')
239
+ fatal("unknown command type: '%c'", type);
240
+ item->type = type;
241
+ item->caps = (p[pos + 1] != type);
242
+ pos += 2;
243
+ // Next we parse the offset and column and store them in
244
+ // the item.
245
+ const char *at = p + pos;
246
+ int off, col, nch;
247
+ item->absolute = false;
248
+ if (sscanf(at, "[@%d,%d%n", &off, &col, &nch) == 2)
249
+ item->absolute = true;
250
+ else if (sscanf(at, "[%d,%d%n", &off, &col, &nch) != 2)
251
+ fatal("invalid pattern: %s", p);
252
+ if (col < 0)
253
+ fatal("invalid column number: %d", col);
254
+ item->offset = off;
255
+ item->column = col;
256
+ ntoks = max(ntoks, col);
257
+ pos += nch;
258
+ // And parse the end of the argument list, for 'x' there
259
+ // is nothing to read but for 't' and 'm' we have to get
260
+ // read the regexp.
261
+ if (type == 't' || type == 'm') {
262
+ if (p[pos] != ',' && p[pos + 1] != '"')
263
+ fatal("missing arg in pattern: %s", p);
264
+ const int start = (pos += 2);
265
+ while (p[pos] != '\0') {
266
+ if (p[pos] == '"')
267
+ break;
268
+ if (p[pos] == '\\' && p[pos+1] != '\0')
269
+ pos++;
270
+ pos++;
271
+ }
272
+ if (p[pos] != '"')
273
+ fatal("unended argument: %s", p);
274
+ const int len = pos - start;
275
+ item->value = wapiti_xmalloc(sizeof(char) * (len + 1));
276
+ memcpy(item->value, p + start, len);
277
+ item->value[len] = '\0';
278
+ pos++;
279
+ }
280
+ // Just check the end of the arg list and loop.
281
+ if (p[pos] != ']')
282
+ fatal("missing end of pattern: %s", p);
283
+ pos++;
284
+ } else {
285
+ // No command here, so build an 's' item with the chars
286
+ // until end of pattern or next command and put it in
287
+ // the list.
288
+ const int start = pos;
289
+ while (p[pos] != '\0' && p[pos] != '%')
290
+ pos++;
291
+ const int len = pos - start;
292
+ item->type = 's';
293
+ item->caps = false;
294
+ item->value = wapiti_xmalloc(sizeof(char) * (len + 1));
295
+ memcpy(item->value, p + start, len);
296
+ item->value[len] = '\0';
297
+ }
298
+ }
299
+ pat->ntoks = ntoks;
300
+ pat->nitems = nitems;
301
+ return pat;
302
+ }
303
+
304
+ /* pat_exec:
305
+ * Execute a compiled pattern at position 'at' in the given tokens sequences
306
+ * in order to produce an observation string. The string is returned as a
307
+ * newly allocated memory block and the caller is responsible to free it when
308
+ * not needed anymore.
309
+ */
310
+ char *pat_exec(const pat_t *pat, const tok_t *tok, int at) {
311
+ static char *bval[] = {"_x-1", "_x-2", "_x-3", "_x-4", "_x-#"};
312
+ static char *eval[] = {"_x+1", "_x+2", "_x+3", "_x+4", "_x+#"};
313
+ const int T = tok->len;
314
+ // Prepare the buffer who will hold the result
315
+ int size = 16, pos = 0;
316
+ char *buffer = wapiti_xmalloc(sizeof(char) * size);
317
+ // And loop over the compiled items
318
+ for (int it = 0; it < pat->nitems; it++) {
319
+ const pat_item_t *item = &(pat->items[it]);
320
+ char *value = NULL;
321
+ int len = 0;
322
+ // First, if needed, we retrieve the token at the referenced
323
+ // position in the sequence. We store it in value and let the
324
+ // command handler do what it need with it.
325
+ if (item->type != 's') {
326
+ int pos = item->offset;
327
+ if (item->absolute) {
328
+ if (item->offset < 0)
329
+ pos += T;
330
+ else
331
+ pos--;
332
+ } else {
333
+ pos += at;
334
+ }
335
+ int col = item->column;
336
+ if (pos < 0)
337
+ value = bval[min(-pos - 1, 4)];
338
+ else if (pos >= T)
339
+ value = eval[min( pos - T, 4)];
340
+ else if (col >= tok->cnts[pos])
341
+ fatal("missing tokens, cannot apply pattern");
342
+ else
343
+ value = tok->toks[pos][col];
344
+ }
345
+ // Next, we handle the command, 's' and 'x' are very simple but
346
+ // 't' and 'm' require us to call the regexp matcher.
347
+ if (item->type == 's') {
348
+ value = item->value;
349
+ len = strlen(value);
350
+ } else if (item->type == 'x') {
351
+ len = strlen(value);
352
+ } else if (item->type == 't') {
353
+ if (rex_match(item->value, value, &len) == -1)
354
+ value = "false";
355
+ else
356
+ value = "true";
357
+ len = strlen(value);
358
+ } else if (item->type == 'm') {
359
+ int pos = rex_match(item->value, value, &len);
360
+ if (pos == -1)
361
+ len = 0;
362
+ value += pos;
363
+ }
364
+ // And we add it to the buffer, growing it if needed. If the
365
+ // user requested it, we also remove caps from the string.
366
+ if (pos + len >= size - 1) {
367
+ while (pos + len >= size - 1)
368
+ size = size * 1.4;
369
+ buffer = wapiti_xrealloc(buffer, sizeof(char) * size);
370
+ }
371
+ memcpy(buffer + pos, value, len);
372
+ if (item->caps)
373
+ for (int i = pos; i < pos + len; i++)
374
+ buffer[i] = tolower(buffer[i]);
375
+ pos += len;
376
+ }
377
+ // Adjust the result and return it.
378
+ buffer[pos++] = '\0';
379
+ buffer = wapiti_xrealloc(buffer, sizeof(char) * pos);
380
+ return buffer;
381
+ }
382
+
383
+ /* pat_free:
384
+ * Free all memory used by a compiled pattern object. Note that this will free
385
+ * the pointer to the source string given to pat_comp so you must be sure to
386
+ * not use this pointer again.
387
+ */
388
+ void pat_free(pat_t *pat) {
389
+ for (int it = 0; it < pat->nitems; it++)
390
+ free(pat->items[it].value);
391
+ free(pat->src);
392
+ free(pat);
393
+ }
394
+
395
+