wapiti 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. data/.autotest +13 -0
  2. data/.gitignore +5 -0
  3. data/.rspec +3 -0
  4. data/Gemfile +6 -0
  5. data/LICENSE +30 -0
  6. data/README.md +153 -0
  7. data/Rakefile +33 -0
  8. data/ext/wapiti/bcd.c +392 -0
  9. data/ext/wapiti/decoder.c +535 -0
  10. data/ext/wapiti/decoder.h +46 -0
  11. data/ext/wapiti/extconf.rb +8 -0
  12. data/ext/wapiti/gradient.c +818 -0
  13. data/ext/wapiti/gradient.h +81 -0
  14. data/ext/wapiti/lbfgs.c +294 -0
  15. data/ext/wapiti/model.c +296 -0
  16. data/ext/wapiti/model.h +100 -0
  17. data/ext/wapiti/native.c +1238 -0
  18. data/ext/wapiti/native.h +15 -0
  19. data/ext/wapiti/options.c +278 -0
  20. data/ext/wapiti/options.h +91 -0
  21. data/ext/wapiti/pattern.c +395 -0
  22. data/ext/wapiti/pattern.h +56 -0
  23. data/ext/wapiti/progress.c +167 -0
  24. data/ext/wapiti/progress.h +43 -0
  25. data/ext/wapiti/quark.c +272 -0
  26. data/ext/wapiti/quark.h +46 -0
  27. data/ext/wapiti/reader.c +553 -0
  28. data/ext/wapiti/reader.h +73 -0
  29. data/ext/wapiti/rprop.c +191 -0
  30. data/ext/wapiti/sequence.h +148 -0
  31. data/ext/wapiti/sgdl1.c +218 -0
  32. data/ext/wapiti/thread.c +171 -0
  33. data/ext/wapiti/thread.h +42 -0
  34. data/ext/wapiti/tools.c +202 -0
  35. data/ext/wapiti/tools.h +54 -0
  36. data/ext/wapiti/trainers.h +39 -0
  37. data/ext/wapiti/vmath.c +372 -0
  38. data/ext/wapiti/vmath.h +51 -0
  39. data/ext/wapiti/wapiti.c +288 -0
  40. data/ext/wapiti/wapiti.h +45 -0
  41. data/lib/wapiti.rb +30 -0
  42. data/lib/wapiti/errors.rb +17 -0
  43. data/lib/wapiti/model.rb +49 -0
  44. data/lib/wapiti/options.rb +113 -0
  45. data/lib/wapiti/utility.rb +15 -0
  46. data/lib/wapiti/version.rb +3 -0
  47. data/spec/fixtures/ch.mod +18550 -0
  48. data/spec/fixtures/chpattern.txt +52 -0
  49. data/spec/fixtures/chtest.txt +1973 -0
  50. data/spec/fixtures/chtrain.txt +19995 -0
  51. data/spec/fixtures/nppattern.txt +52 -0
  52. data/spec/fixtures/nptest.txt +1973 -0
  53. data/spec/fixtures/nptrain.txt +19995 -0
  54. data/spec/fixtures/pattern.txt +14 -0
  55. data/spec/fixtures/test.txt +60000 -0
  56. data/spec/fixtures/train.txt +1200 -0
  57. data/spec/spec_helper.rb +21 -0
  58. data/spec/wapiti/model_spec.rb +173 -0
  59. data/spec/wapiti/native_spec.rb +12 -0
  60. data/spec/wapiti/options_spec.rb +175 -0
  61. data/spec/wapiti/utility_spec.rb +22 -0
  62. data/wapiti.gemspec +35 -0
  63. metadata +178 -0
@@ -0,0 +1,15 @@
1
+ #ifndef native_h
2
+ #define native_h
3
+
4
+ #include <ruby.h>
5
+
6
+ extern VALUE mWapiti;
7
+ extern VALUE mNative;
8
+
9
+ extern VALUE cOptions;
10
+ extern VALUE cModel;
11
+
12
+ extern VALUE cNativeError;
13
+ extern VALUE cLogger;
14
+
15
+ #endif
@@ -0,0 +1,278 @@
1
+ /*
2
+ * Wapiti - A linear-chain CRF tool
3
+ *
4
+ * Copyright (c) 2009-2011 CNRS
5
+ * All rights reserved.
6
+ *
7
+ * Redistribution and use in source and binary forms, with or without
8
+ * modification, are permitted provided that the following conditions are met:
9
+ * * Redistributions of source code must retain the above copyright
10
+ * notice, this list of conditions and the following disclaimer.
11
+ * * Redistributions in binary form must reproduce the above copyright
12
+ * notice, this list of conditions and the following disclaimer in the
13
+ * documentation and/or other materials provided with the distribution.
14
+ *
15
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25
+ * POSSIBILITY OF SUCH DAMAGE.
26
+ */
27
+
28
+ #include <limits.h>
29
+ #include <stdbool.h>
30
+ #include <stddef.h>
31
+ #include <stdlib.h>
32
+ #include <stdio.h>
33
+ #include <string.h>
34
+
35
+ #include "wapiti.h"
36
+ #include "tools.h"
37
+ #include "options.h"
38
+ #include "vmath.h"
39
+
40
+ /******************************************************************************
41
+ * Command line parsing
42
+ *
43
+ * This module handles command line parsing and puts everything defined by the
44
+ * user in a special structure in order to make it accessible to the
45
+ * remainder of the program.
46
+ ******************************************************************************/
47
+
48
/* opt_help:
 *   Write the usage text to stderr. The text lists the global switches and
 *   then the switches of the training, labelling and dumping modes. The
 *   program name given in <pname> is substituted at each "%1$s" reference
 *   (numbered-argument conversion, a POSIX printf extension).
 */
static void opt_help(const char *pname) {
	static const char usage[] =
		"Global switchs:\n"
		"\t-h | --help display this help message\n"
		"\t | --version display version information\n"
		"\n"
		"Training mode:\n"
		" %1$s train [options] [input data] [model file]\n"
		"\t | --me force maxent mode\n"
		"\t-a | --algo STRING training algorithm to use\n"
		"\t-p | --pattern FILE patterns for extracting features\n"
		"\t-m | --model FILE model file to preload\n"
		"\t-d | --devel FILE development dataset\n"
		"\t-c | --compact compact model after training\n"
		"\t-t | --nthread INT number of worker threads\n"
		"\t-j | --jobsize INT job size for worker threads\n"
		"\t-s | --sparse enable sparse forward/backward\n"
		"\t-i | --maxiter INT maximum number of iterations\n"
		"\t-1 | --rho1 FLOAT l1 penalty parameter\n"
		"\t-2 | --rho2 FLOAT l2 penalty parameter\n"
		"\t-o | --objwin INT convergence window size\n"
		"\t-w | --stopwin INT stop window size\n"
		"\t-e | --stopeps FLOAT stop epsilon value\n"
		"\t | --clip (l-bfgs) clip gradient\n"
		"\t | --histsz INT (l-bfgs) history size\n"
		"\t | --maxls INT (l-bfgs) max linesearch iters\n"
		"\t | --eta0 FLOAT (sgd-l1) learning rate\n"
		"\t | --alpha FLOAT (sgd-l1) exp decay parameter\n"
		"\t | --kappa FLOAT (bcd) stability parameter\n"
		"\t | --stpmin FLOAT (rprop) minimum step size\n"
		"\t | --stpmax FLOAT (rprop) maximum step size\n"
		"\t | --stpinc FLOAT (rprop) step increment factor\n"
		"\t | --stpdec FLOAT (rprop) step decrement factor\n"
		"\t | --cutoff (rprop) alternate projection\n"
		"\n"
		"Labelling mode:\n"
		" %1$s label [options] [input data] [output data]\n"
		"\t | --me force maxent mode\n"
		"\t-m | --model FILE model file to load\n"
		"\t-l | --label output only labels\n"
		"\t-c | --check input is already labeled\n"
		"\t-s | --score add scores to output\n"
		"\t-p | --post label using posteriors\n"
		"\t-n | --nbest INT output n-best list\n"
		"\n"
		"Dumping mode\n"
		" %1$s dump [input model] [output text]\n";
	fprintf(stderr, usage, pname);
}
100
+
101
+ /* opt_defaults:
102
+ * Default values for all parameters of the model.
103
+ */
104
+ const opt_t opt_defaults = {
105
+ .mode = -1,
106
+ .input = NULL, .output = NULL,
107
+ .maxent = false,
108
+ .algo = "l-bfgs", .pattern = NULL, .model = NULL, .devel = NULL,
109
+ .compact = false, .sparse = false,
110
+ .nthread = 1, .jobsize = 64, .maxiter = 0,
111
+ .rho1 = 0.5, .rho2 = 0.0001,
112
+ .objwin = 5, .stopwin = 5, .stopeps = 0.02,
113
+ .lbfgs = {.clip = false, .histsz = 5, .maxls = 40},
114
+ .sgdl1 = {.eta0 = 0.8, .alpha = 0.85},
115
+ .bcd = {.kappa = 1.5},
116
+ .rprop = {.stpmin = 1e-8, .stpmax = 50.0, .stpinc = 1.2, .stpdec = 0.5,
117
+ .cutoff = false},
118
+ .label = false, .check = false, .outsc = false,
119
+ .lblpost = false, .nbest = 1
120
+ };
121
+
122
+ /* opt_switch:
123
+ * Define available switchs for the different modes in a readable way for the
124
+ * command line argument parser.
125
+ */
126
+ struct {
127
+ int mode;
128
+ char *dshort;
129
+ char *dlong;
130
+ char kind;
131
+ size_t offset;
132
+ } opt_switch[] = {
133
+ {0, "##", "--me", 'B', offsetof(opt_t, maxent )},
134
+ {0, "-a", "--algo", 'S', offsetof(opt_t, algo )},
135
+ {0, "-p", "--pattern", 'S', offsetof(opt_t, pattern )},
136
+ {0, "-m", "--model", 'S', offsetof(opt_t, model )},
137
+ {0, "-d", "--devel", 'S', offsetof(opt_t, devel )},
138
+ {0, "-c", "--compact", 'B', offsetof(opt_t, compact )},
139
+ {0, "-s", "--sparse", 'B', offsetof(opt_t, sparse )},
140
+ {0, "-t", "--nthread", 'I', offsetof(opt_t, nthread )},
141
+ {0, "-j", "--josize", 'I', offsetof(opt_t, jobsize )},
142
+ {0, "-i", "--maxiter", 'I', offsetof(opt_t, maxiter )},
143
+ {0, "-1", "--rho1", 'F', offsetof(opt_t, rho1 )},
144
+ {0, "-2", "--rho2", 'F', offsetof(opt_t, rho2 )},
145
+ {0, "-o", "--objsz", 'I', offsetof(opt_t, objwin )},
146
+ {0, "-w", "--stopwin", 'I', offsetof(opt_t, stopwin )},
147
+ {0, "-e", "--stopeps", 'F', offsetof(opt_t, stopeps )},
148
+ {0, "##", "--clip", 'B', offsetof(opt_t, lbfgs.clip )},
149
+ {0, "##", "--histsz", 'I', offsetof(opt_t, lbfgs.histsz)},
150
+ {0, "##", "--maxls", 'I', offsetof(opt_t, lbfgs.maxls )},
151
+ {0, "##", "--eta0", 'F', offsetof(opt_t, sgdl1.eta0 )},
152
+ {0," ##", "--alpha", 'F', offsetof(opt_t, sgdl1.alpha )},
153
+ {0, "##", "--kappa", 'F', offsetof(opt_t, bcd.kappa )},
154
+ {0, "##", "--stpmin", 'F', offsetof(opt_t, rprop.stpmin)},
155
+ {0, "##", "--stpmax", 'F', offsetof(opt_t, rprop.stpmax)},
156
+ {0, "##", "--stpinc", 'F', offsetof(opt_t, rprop.stpinc)},
157
+ {0, "##", "--stpdec", 'F', offsetof(opt_t, rprop.stpdec)},
158
+ {0, "##", "--cutoff", 'B', offsetof(opt_t, rprop.cutoff)},
159
+ {1, "##", "--me", 'B', offsetof(opt_t, maxent )},
160
+ {1, "-m", "--model", 'S', offsetof(opt_t, model )},
161
+ {1, "-l", "--label", 'B', offsetof(opt_t, label )},
162
+ {1, "-c", "--check", 'B', offsetof(opt_t, check )},
163
+ {1, "-s", "--score", 'B', offsetof(opt_t, outsc )},
164
+ {1, "-p", "--post", 'B', offsetof(opt_t, lblpost )},
165
+ {1, "-n", "--nbest", 'I', offsetof(opt_t, nbest )},
166
+ {-1, NULL, NULL, '\0', 0}
167
+ };
168
+
169
+ /* argparse:
170
+ * This is the main function for command line parsing. It use the previous
171
+ * table to known how to interpret the switchs and store values in the opt_t
172
+ * structure.
173
+ */
174
+ void opt_parse(int argc, char *argv[argc], opt_t *opt) {
175
+ static const char *err_badval = "invalid value for switch '%s'";
176
+ const char *pname = argv[0];
177
+ argc--, argv++;
178
+ if (argc == 0) {
179
+ opt_help(pname);
180
+ fatal("no mode specified");
181
+ }
182
+ // First special handling for help and version
183
+ if (!strcmp(argv[0], "-h") || !strcmp(argv[0], "--help")) {
184
+ opt_help(pname);
185
+ exit(EXIT_FAILURE);
186
+ } else if (!strcmp(argv[0], "--version")) {
187
+ fprintf(stderr, "Wapiti v" VERSION "\n");
188
+ fprintf(stderr, " Optimization mode: %s\n", xvm_mode());
189
+ exit(EXIT_SUCCESS);
190
+ }
191
+ // Get the mode to use
192
+ if (!strcmp(argv[0], "t") || !strcmp(argv[0], "train")) {
193
+ opt->mode = 0;
194
+ } else if (!strcmp(argv[0], "l") || !strcmp(argv[0], "label")) {
195
+ opt->mode = 1;
196
+ } else if (!strcmp(argv[0], "d") || !strcmp(argv[0], "dump")) {
197
+ opt->mode = 2;
198
+ } else {
199
+ fatal("unknown mode <%s>", argv[0]);
200
+ }
201
+ argc--, argv++;
202
+ // Parse remaining arguments
203
+ opt->input = NULL;
204
+ opt->output = NULL;
205
+ while (argc > 0) {
206
+ const char *arg = argv[0];
207
+ int idx;
208
+ // Check if this argument is a filename or an option
209
+ if (arg[0] != '-') {
210
+ if (opt->input == NULL)
211
+ opt->input = argv[0];
212
+ else if (opt->output == NULL)
213
+ opt->output = argv[0];
214
+ else
215
+ fatal("too much input files on command line");
216
+ argc--, argv++;
217
+ continue;
218
+ }
219
+ // Search the current switch in the table or fail if it cannot
220
+ // be found.
221
+ for (idx = 0; opt_switch[idx].mode != -1; idx++) {
222
+ if (opt_switch[idx].mode != opt->mode)
223
+ continue;
224
+ if (!strcmp(arg, opt_switch[idx].dshort))
225
+ break;
226
+ if (!strcmp(arg, opt_switch[idx].dlong))
227
+ break;
228
+ }
229
+ if (opt_switch[idx].mode == -1)
230
+ fatal("unknown option '%s'", arg);
231
+ // Decode the argument and store it in the structure
232
+ if (opt_switch[idx].kind != 'B' && argc < 2)
233
+ fatal("missing argument for switch '%s'", arg);
234
+ void *ptr = (void *)((char *)opt + opt_switch[idx].offset);
235
+ switch (opt_switch[idx].kind) {
236
+ case 'S':
237
+ *((char **)ptr) = argv[1];
238
+ argc -= 2, argv += 2;
239
+ break;
240
+ case 'I':
241
+ if (sscanf(argv[1], "%d", (int *)ptr) != 1)
242
+ fatal(err_badval, arg);
243
+ argc -= 2, argv += 2;
244
+ break;
245
+ case 'F': {
246
+ double tmp;
247
+ if (sscanf(argv[1], "%lf", &tmp) != 1)
248
+ fatal(err_badval, arg);
249
+ *((double *)ptr) = tmp;
250
+ argc -= 2, argv += 2;
251
+ break; }
252
+ case 'B':
253
+ *((bool *)ptr) = true;
254
+ argc--, argv++;
255
+ break;
256
+ }
257
+ }
258
+ // Small trick for the maxiter switch
259
+ if (opt->maxiter == 0)
260
+ opt->maxiter = INT_MAX;
261
+ // Check that all options are valid
262
+ #define argchecksub(name, test) \
263
+ if (!(test)) \
264
+ fatal("invalid value for <"name">");
265
+ argchecksub("--thread", opt->nthread > 0 );
266
+ argchecksub("--jobsize", opt->jobsize > 0 );
267
+ argchecksub("--rho1", opt->rho1 >= 0.0);
268
+ argchecksub("--rho2", opt->rho2 >= 0.0);
269
+ argchecksub("--histsz", opt->lbfgs.histsz > 0 );
270
+ argchecksub("--maxls", opt->lbfgs.maxls > 0 );
271
+ argchecksub("--eta0", opt->sgdl1.eta0 > 0.0);
272
+ argchecksub("--alpha", opt->sgdl1.alpha > 0.0);
273
+ argchecksub("--nbest", opt->nbest > 0 );
274
+ #undef argchecksub
275
+ if (opt->maxent && !strcmp(opt->algo, "bcd"))
276
+ fatal("BCD not supported for training maxent models");
277
+ }
278
+
@@ -0,0 +1,91 @@
1
+ /*
2
+ * Wapiti - A linear-chain CRF tool
3
+ *
4
+ * Copyright (c) 2009-2011 CNRS
5
+ * All rights reserved.
6
+ *
7
+ * Redistribution and use in source and binary forms, with or without
8
+ * modification, are permitted provided that the following conditions are met:
9
+ * * Redistributions of source code must retain the above copyright
10
+ * notice, this list of conditions and the following disclaimer.
11
+ * * Redistributions in binary form must reproduce the above copyright
12
+ * notice, this list of conditions and the following disclaimer in the
13
+ * documentation and/or other materials provided with the distribution.
14
+ *
15
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25
+ * POSSIBILITY OF SUCH DAMAGE.
26
+ */
27
+ #ifndef options_h
28
+ #define options_h
29
+
30
+ #include <stdbool.h>
31
+
32
+ #include "wapiti.h"
33
+
34
/* opt_t:
 *   All the user configurable parameters of Wapiti, as filled from the
 *   command line by opt_parse.
 */
typedef struct opt_s opt_t;
struct opt_s {
	int     mode;      // Selected mode
	char   *input;     // First file name on the command line
	char   *output;    // Second file name on the command line
	bool    maxent;    // --me switch

	// Options for training
	char   *algo;
	char   *pattern;
	char   *model;
	char   *devel;
	bool    compact;
	bool    sparse;
	int     nthread;
	int     jobsize;
	int     maxiter;
	double  rho1;
	double  rho2;

	// Window size criterion
	int     objwin;
	int     stopwin;
	double  stopeps;

	// Options specific to L-BFGS
	struct {
		bool    clip;
		int     histsz;
		int     maxls;
	} lbfgs;

	// Options specific to SGD-L1
	struct {
		double  eta0;
		double  alpha;
	} sgdl1;

	// Options specific to BCD
	struct {
		double  kappa;
	} bcd;

	// Options specific to RPROP
	struct {
		double  stpmin;
		double  stpmax;
		double  stpinc;
		double  stpdec;
		bool    cutoff;
	} rprop;

	// Options for labelling
	bool    label;
	bool    check;
	bool    outsc;
	bool    lblpost;
	int     nbest;
};
85
+
86
+ extern const opt_t opt_defaults;
87
+
88
+ void opt_parse(int argc, char *argv[argc], opt_t *opt);
89
+
90
+ #endif
91
+
@@ -0,0 +1,395 @@
1
+ /*
2
+ * Wapiti - A linear-chain CRF tool
3
+ *
4
+ * Copyright (c) 2009-2011 CNRS
5
+ * All rights reserved.
6
+ *
7
+ * Redistribution and use in source and binary forms, with or without
8
+ * modification, are permitted provided that the following conditions are met:
9
+ * * Redistributions of source code must retain the above copyright
10
+ * notice, this list of conditions and the following disclaimer.
11
+ * * Redistributions in binary form must reproduce the above copyright
12
+ * notice, this list of conditions and the following disclaimer in the
13
+ * documentation and/or other materials provided with the distribution.
14
+ *
15
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25
+ * POSSIBILITY OF SUCH DAMAGE.
26
+ */
27
+
28
+ #include <ctype.h>
29
+ #include <stdbool.h>
30
+ #include <stddef.h>
31
+ #include <stdio.h>
32
+ #include <stdlib.h>
33
+ #include <string.h>
34
+
35
+ #include "pattern.h"
36
+ #include "sequence.h"
37
+ #include "tools.h"
38
+
39
+ /******************************************************************************
40
+ * A simple regular expression matcher
41
+ *
42
+ * This module implements a simple regular expression matcher. It implements
43
+ * just a subset of classical regexps, simple to implement but sufficient
44
+ * for most usages, and avoids adding a dependency on a full regexp library.
45
+ *
46
+ * The recognized subset is quite simple. First for matching characters :
47
+ * . -> match any characters
48
+ * \x -> match a character class (in uppercase, match the complement)
49
+ * \d : digit \a : alpha \w : alpha + digit
50
+ * \l : lowercase \u : uppercase \p : punctuation
51
+ * \s : space
52
+ * or escape a character
53
+ * x -> any other character match itself
54
+ * And the constructs :
55
+ * ^ -> at the beginning of the regexp, anchor it at start of string
56
+ * $ -> at the end of regexp, anchor it at end of string
57
+ * * -> match any number of repetition of the previous character
58
+ * ? -> optionally match the previous character
59
+ *
60
+ * This subset is implemented quite efficiently using recursion. All recursive
61
+ * calls are tail-call so they should be optimized by the compiler. As we do
62
+ * direct interpretation, we have to backtrack so performance can be very poor
63
+ * on specially designed regexps. This is not a problem as the regexp as well
64
+ * as the string are expected to be very simple here. If this is not the case,
65
+ * you should prepare your data better.
66
+ ******************************************************************************/
67
+
68
/* rex_matchit:
 *   Match a single character at the start of the string. <ch> points to the
 *   regexp item to match: a dot (any character), a backslash followed by a
 *   class letter (a, d, l, p, s, u, w, or the uppercase letter for the
 *   complement), a backslash followed by any other character (that character
 *   itself), or a plain character. Return true if <str> is not empty and its
 *   first character matches the item.
 *
 *   The character is converted to unsigned char before being given to the
 *   <ctype.h> functions, as they are only defined for values representable
 *   as unsigned char (or EOF); bytes above 0x7F would otherwise be passed as
 *   negative values when char is signed.
 */
static bool rex_matchit(const char *ch, const char *str) {
	const unsigned char c = (unsigned char)str[0];
	if (c == '\0')
		return false;
	if (ch[0] == '.')
		return true;
	if (ch[0] != '\\')
		return ch[0] == str[0];
	switch (ch[1]) {
		case 'a': return  isalpha(c);
		case 'd': return  isdigit(c);
		case 'l': return  islower(c);
		case 'p': return  ispunct(c);
		case 's': return  isspace(c);
		case 'u': return  isupper(c);
		case 'w': return  isalnum(c);
		case 'A': return !isalpha(c);
		case 'D': return !isdigit(c);
		case 'L': return !islower(c);
		case 'P': return !ispunct(c);
		case 'S': return !isspace(c);
		case 'U': return !isupper(c);
		case 'W': return !isalnum(c);
		default:  break;
	}
	// Not a class letter: the backslash just escapes the next character
	return ch[1] == str[0];
}
98
+
99
/* rex_matchme:
 *   Match a regular expression at the start of the string. On entry <len>
 *   holds the number of characters already consumed by the enclosing match;
 *   on success it holds the total length of the match. On failure the value
 *   left in <len> is unspecified and must be reset by the caller.
 *
 *   The star construct tries the shortest repetition first. Backtracking is
 *   done through recursion, so each alternative restores <len> to the value
 *   it had before the failed attempt.
 */
static bool rex_matchme(const char *re, const char *str, int *len) {
	// Special check for end of regexp
	if (re[0] == '\0')
		return true;
	if (re[0] == '$' && re[1] == '\0')
		return (str[0] == '\0');
	// Get first item of regexp. An escape spans two characters, except for
	// a lone backslash at the very end which must not make us step past
	// the terminator.
	const char *ch  = re;
	const char *nxt = re + 1 + (ch[0] == '\\' && ch[1] != '\0');
	// Special check for the following construct "x**" where the first star
	// is consumed normally but lead the second (which is wrong) to be
	// interpreted as a char to match as if it was escaped (and same for
	// the optional construct)
	if (*ch == '*' || *ch == '?')
		fatal("unescaped * or ? in regexp: %s", re);
	// Handle star repetition: try the remaining of the regexp after zero,
	// one, two... repetitions of the item.
	if (nxt[0] == '*') {
		nxt++;
		do {
			const int save = *len;
			if (rex_matchme(nxt, str, len))
				return true;
			*len = save + 1;
		} while (rex_matchit(ch, str++));
		return false;
	}
	// Handle optional: first try with the item consumed, then without it.
	// The failed attempt may have advanced <len> by more than one, so it
	// is restored from the saved value and not just decremented.
	if (nxt[0] == '?') {
		nxt++;
		const int save = *len;
		if (rex_matchit(ch, str)) {
			*len = save + 1;
			if (rex_matchme(nxt, str + 1, len))
				return true;
		}
		*len = save;
		return rex_matchme(nxt, str, len);
	}
	// Classical char matching
	if (!rex_matchit(ch, str))
		return false;
	(*len)++;
	return rex_matchme(nxt, str + 1, len);
}
147
+
148
/* rex_match:
 *   Search for a match of the regular expression in the given string. When
 *   one is found, its start position is returned and its length is stored in
 *   <len>; otherwise -1 is returned. A regexp starting with '^' is only
 *   tried at position 0, any other one is tried at each position up to and
 *   including the end of the string.
 */
static int rex_match(const char *re, const char *str, int *len) {
	// Anchored regexp: a single attempt at the start of the string
	if (re[0] == '^') {
		*len = 0;
		return rex_matchme(re + 1, str, len) ? 0 : -1;
	}
	// General case: slide the start position along the string, the last
	// attempt being made on the terminator itself.
	for (int start = 0; ; start++) {
		*len = 0;
		if (rex_matchme(re, str + start, len))
			return start;
		if (str[start] == '\0')
			break;
	}
	// Matching failed
	return -1;
}
171
+
172
+ /*******************************************************************************
173
+ * Pattern handling
174
+ *
175
+ * Patterns are the heart of the data input process, they provide a way to tell
176
+ * Wapiti how the interesting information can be extracted from the input
177
+ * data. A pattern is simply a string who embed special commands about tokens
178
+ * to extract from the input sequence. They are compiled to a special form
179
+ * used during data loading.
180
+ * For training, each position of a sequence hold a list of observation made
181
+ * at this position, pattern give a way to specify these observations.
182
+ *
183
+ * During sequence loading, all patterns are applied at each position to
184
+ * produce a list of string representing the observations which will be in
185
+ * turn transformed to numerical identifiers. This module take care of
186
+ * building the string representation.
187
+ *
188
+ * As said, a pattern is a string with specific commands of the form %c[...]
189
+ * where 'c' is the command with arguments between the brackets. All commands
190
+ * take at least two numerical arguments which define a token in the input
191
+ * sequence. The first one is an offset from the current position and the
192
+ * second one is a column number. With these two parameters, we get a string
193
+ * in the input sequence on which we apply the command.
194
+ *
195
+ * All command are specified with a character and result in a string which
196
+ * will replace the command in the pattern string. If the command character is
197
+ * lower case, the result is copied verbatim, if it is uppercase, the result
198
+ * is copied with casing removed. The following commands are available:
199
+ * 'x' -- result is the token itself
200
+ * 't' -- test if a regular expression match the token. Result will be
201
+ * either "true" or "false"
202
+ * 'm' -- match a regular expression on the token. Result is the first
203
+ * substring matched.
204
+ ******************************************************************************/
205
+
206
+ /* pat_comp:
207
+ * Compile the pattern to a form more suitable to easily apply it on tokens
208
+ * list during data reading. The given pattern string is interned in the
209
+ * compiled pattern and will be freed with it, so you don't have to take care
210
+ * of it and must not modify it after the compilation.
211
+ */
212
+ pat_t *pat_comp(char *p) {
213
+ pat_t *pat = NULL;
214
+ // Allocate memory for the compiled pattern, the allocation is based
215
+ // on an over-estimation of the number of required item. As compiled
216
+ // pattern take a neglectible amount of memory, this waste is not
217
+ // important.
218
+ int mitems = 0;
219
+ for (int pos = 0; p[pos] != '\0'; pos++)
220
+ if (p[pos] == '%')
221
+ mitems++;
222
+ mitems = mitems * 2 + 1;
223
+ pat = wapiti_xmalloc(sizeof(pat_t) + sizeof(pat->items[0]) * mitems);
224
+ pat->src = p;
225
+ // Next, we go through the pattern compiling the items as they are
226
+ // found. Commands are parsed and put in a corresponding item, and
227
+ // segment of char not in a command are put in a 's' item.
228
+ int nitems = 0;
229
+ int ntoks = 0;
230
+ int pos = 0;
231
+ while (p[pos] != '\0') {
232
+ pat_item_t *item = &(pat->items[nitems++]);
233
+ item->value = NULL;
234
+ if (p[pos] == '%') {
235
+ // This is a command, so first parse its type and check
236
+ // its a valid one. Next prepare the item.
237
+ const char type = tolower(p[pos + 1]);
238
+ if (type != 'x' && type != 't' && type != 'm')
239
+ fatal("unknown command type: '%c'", type);
240
+ item->type = type;
241
+ item->caps = (p[pos + 1] != type);
242
+ pos += 2;
243
+ // Next we parse the offset and column and store them in
244
+ // the item.
245
+ const char *at = p + pos;
246
+ int off, col, nch;
247
+ item->absolute = false;
248
+ if (sscanf(at, "[@%d,%d%n", &off, &col, &nch) == 2)
249
+ item->absolute = true;
250
+ else if (sscanf(at, "[%d,%d%n", &off, &col, &nch) != 2)
251
+ fatal("invalid pattern: %s", p);
252
+ if (col < 0)
253
+ fatal("invalid column number: %d", col);
254
+ item->offset = off;
255
+ item->column = col;
256
+ ntoks = max(ntoks, col);
257
+ pos += nch;
258
+ // And parse the end of the argument list, for 'x' there
259
+ // is nothing to read but for 't' and 'm' we have to get
260
+ // read the regexp.
261
+ if (type == 't' || type == 'm') {
262
+ if (p[pos] != ',' && p[pos + 1] != '"')
263
+ fatal("missing arg in pattern: %s", p);
264
+ const int start = (pos += 2);
265
+ while (p[pos] != '\0') {
266
+ if (p[pos] == '"')
267
+ break;
268
+ if (p[pos] == '\\' && p[pos+1] != '\0')
269
+ pos++;
270
+ pos++;
271
+ }
272
+ if (p[pos] != '"')
273
+ fatal("unended argument: %s", p);
274
+ const int len = pos - start;
275
+ item->value = wapiti_xmalloc(sizeof(char) * (len + 1));
276
+ memcpy(item->value, p + start, len);
277
+ item->value[len] = '\0';
278
+ pos++;
279
+ }
280
+ // Just check the end of the arg list and loop.
281
+ if (p[pos] != ']')
282
+ fatal("missing end of pattern: %s", p);
283
+ pos++;
284
+ } else {
285
+ // No command here, so build an 's' item with the chars
286
+ // until end of pattern or next command and put it in
287
+ // the list.
288
+ const int start = pos;
289
+ while (p[pos] != '\0' && p[pos] != '%')
290
+ pos++;
291
+ const int len = pos - start;
292
+ item->type = 's';
293
+ item->caps = false;
294
+ item->value = wapiti_xmalloc(sizeof(char) * (len + 1));
295
+ memcpy(item->value, p + start, len);
296
+ item->value[len] = '\0';
297
+ }
298
+ }
299
+ pat->ntoks = ntoks;
300
+ pat->nitems = nitems;
301
+ return pat;
302
+ }
303
+
304
+ /* pat_exec:
305
+ * Execute a compiled pattern at position 'at' in the given tokens sequences
306
+ * in order to produce an observation string. The string is returned as a
307
+ * newly allocated memory block and the caller is responsible to free it when
308
+ * not needed anymore.
309
+ */
310
+ char *pat_exec(const pat_t *pat, const tok_t *tok, int at) {
311
+ static char *bval[] = {"_x-1", "_x-2", "_x-3", "_x-4", "_x-#"};
312
+ static char *eval[] = {"_x+1", "_x+2", "_x+3", "_x+4", "_x+#"};
313
+ const int T = tok->len;
314
+ // Prepare the buffer who will hold the result
315
+ int size = 16, pos = 0;
316
+ char *buffer = wapiti_xmalloc(sizeof(char) * size);
317
+ // And loop over the compiled items
318
+ for (int it = 0; it < pat->nitems; it++) {
319
+ const pat_item_t *item = &(pat->items[it]);
320
+ char *value = NULL;
321
+ int len = 0;
322
+ // First, if needed, we retrieve the token at the referenced
323
+ // position in the sequence. We store it in value and let the
324
+ // command handler do what it need with it.
325
+ if (item->type != 's') {
326
+ int pos = item->offset;
327
+ if (item->absolute) {
328
+ if (item->offset < 0)
329
+ pos += T;
330
+ else
331
+ pos--;
332
+ } else {
333
+ pos += at;
334
+ }
335
+ int col = item->column;
336
+ if (pos < 0)
337
+ value = bval[min(-pos - 1, 4)];
338
+ else if (pos >= T)
339
+ value = eval[min( pos - T, 4)];
340
+ else if (col >= tok->cnts[pos])
341
+ fatal("missing tokens, cannot apply pattern");
342
+ else
343
+ value = tok->toks[pos][col];
344
+ }
345
+ // Next, we handle the command, 's' and 'x' are very simple but
346
+ // 't' and 'm' require us to call the regexp matcher.
347
+ if (item->type == 's') {
348
+ value = item->value;
349
+ len = strlen(value);
350
+ } else if (item->type == 'x') {
351
+ len = strlen(value);
352
+ } else if (item->type == 't') {
353
+ if (rex_match(item->value, value, &len) == -1)
354
+ value = "false";
355
+ else
356
+ value = "true";
357
+ len = strlen(value);
358
+ } else if (item->type == 'm') {
359
+ int pos = rex_match(item->value, value, &len);
360
+ if (pos == -1)
361
+ len = 0;
362
+ value += pos;
363
+ }
364
+ // And we add it to the buffer, growing it if needed. If the
365
+ // user requested it, we also remove caps from the string.
366
+ if (pos + len >= size - 1) {
367
+ while (pos + len >= size - 1)
368
+ size = size * 1.4;
369
+ buffer = wapiti_xrealloc(buffer, sizeof(char) * size);
370
+ }
371
+ memcpy(buffer + pos, value, len);
372
+ if (item->caps)
373
+ for (int i = pos; i < pos + len; i++)
374
+ buffer[i] = tolower(buffer[i]);
375
+ pos += len;
376
+ }
377
+ // Adjust the result and return it.
378
+ buffer[pos++] = '\0';
379
+ buffer = wapiti_xrealloc(buffer, sizeof(char) * pos);
380
+ return buffer;
381
+ }
382
+
383
+ /* pat_free:
384
+ * Free all memory used by a compiled pattern object. Note that this will free
385
+ * the pointer to the source string given to pat_comp so you must be sure to
386
+ * not use this pointer again.
387
+ */
388
+ void pat_free(pat_t *pat) {
389
+ for (int it = 0; it < pat->nitems; it++)
390
+ free(pat->items[it].value);
391
+ free(pat->src);
392
+ free(pat);
393
+ }
394
+
395
+