wapiti 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.autotest +13 -0
- data/.gitignore +5 -0
- data/.rspec +3 -0
- data/Gemfile +6 -0
- data/LICENSE +30 -0
- data/README.md +153 -0
- data/Rakefile +33 -0
- data/ext/wapiti/bcd.c +392 -0
- data/ext/wapiti/decoder.c +535 -0
- data/ext/wapiti/decoder.h +46 -0
- data/ext/wapiti/extconf.rb +8 -0
- data/ext/wapiti/gradient.c +818 -0
- data/ext/wapiti/gradient.h +81 -0
- data/ext/wapiti/lbfgs.c +294 -0
- data/ext/wapiti/model.c +296 -0
- data/ext/wapiti/model.h +100 -0
- data/ext/wapiti/native.c +1238 -0
- data/ext/wapiti/native.h +15 -0
- data/ext/wapiti/options.c +278 -0
- data/ext/wapiti/options.h +91 -0
- data/ext/wapiti/pattern.c +395 -0
- data/ext/wapiti/pattern.h +56 -0
- data/ext/wapiti/progress.c +167 -0
- data/ext/wapiti/progress.h +43 -0
- data/ext/wapiti/quark.c +272 -0
- data/ext/wapiti/quark.h +46 -0
- data/ext/wapiti/reader.c +553 -0
- data/ext/wapiti/reader.h +73 -0
- data/ext/wapiti/rprop.c +191 -0
- data/ext/wapiti/sequence.h +148 -0
- data/ext/wapiti/sgdl1.c +218 -0
- data/ext/wapiti/thread.c +171 -0
- data/ext/wapiti/thread.h +42 -0
- data/ext/wapiti/tools.c +202 -0
- data/ext/wapiti/tools.h +54 -0
- data/ext/wapiti/trainers.h +39 -0
- data/ext/wapiti/vmath.c +372 -0
- data/ext/wapiti/vmath.h +51 -0
- data/ext/wapiti/wapiti.c +288 -0
- data/ext/wapiti/wapiti.h +45 -0
- data/lib/wapiti.rb +30 -0
- data/lib/wapiti/errors.rb +17 -0
- data/lib/wapiti/model.rb +49 -0
- data/lib/wapiti/options.rb +113 -0
- data/lib/wapiti/utility.rb +15 -0
- data/lib/wapiti/version.rb +3 -0
- data/spec/fixtures/ch.mod +18550 -0
- data/spec/fixtures/chpattern.txt +52 -0
- data/spec/fixtures/chtest.txt +1973 -0
- data/spec/fixtures/chtrain.txt +19995 -0
- data/spec/fixtures/nppattern.txt +52 -0
- data/spec/fixtures/nptest.txt +1973 -0
- data/spec/fixtures/nptrain.txt +19995 -0
- data/spec/fixtures/pattern.txt +14 -0
- data/spec/fixtures/test.txt +60000 -0
- data/spec/fixtures/train.txt +1200 -0
- data/spec/spec_helper.rb +21 -0
- data/spec/wapiti/model_spec.rb +173 -0
- data/spec/wapiti/native_spec.rb +12 -0
- data/spec/wapiti/options_spec.rb +175 -0
- data/spec/wapiti/utility_spec.rb +22 -0
- data/wapiti.gemspec +35 -0
- metadata +178 -0
data/ext/wapiti/wapiti.c
ADDED
@@ -0,0 +1,288 @@
|
|
1
|
+
/*
|
2
|
+
* Wapiti - A linear-chain CRF tool
|
3
|
+
*
|
4
|
+
* Copyright (c) 2009-2011 CNRS
|
5
|
+
* All rights reserved.
|
6
|
+
*
|
7
|
+
* Redistribution and use in source and binary forms, with or without
|
8
|
+
* modification, are permitted provided that the following conditions are met:
|
9
|
+
* * Redistributions of source code must retain the above copyright
|
10
|
+
* notice, this list of conditions and the following disclaimer.
|
11
|
+
* * Redistributions in binary form must reproduce the above copyright
|
12
|
+
* notice, this list of conditions and the following disclaimer in the
|
13
|
+
* documentation and/or other materials provided with the distribution.
|
14
|
+
*
|
15
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
16
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
18
|
+
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
19
|
+
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
20
|
+
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
21
|
+
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
22
|
+
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
23
|
+
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
24
|
+
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
25
|
+
* POSSIBILITY OF SUCH DAMAGE.
|
26
|
+
*/
|
27
|
+
#include <stdbool.h>
|
28
|
+
#include <stddef.h>
|
29
|
+
#include <stdlib.h>
|
30
|
+
#include <stdio.h>
|
31
|
+
#include <string.h>
|
32
|
+
|
33
|
+
#include "decoder.h"
|
34
|
+
#include "model.h"
|
35
|
+
#include "options.h"
|
36
|
+
#include "progress.h"
|
37
|
+
#include "quark.h"
|
38
|
+
#include "reader.h"
|
39
|
+
#include "sequence.h"
|
40
|
+
#include "tools.h"
|
41
|
+
#include "trainers.h"
|
42
|
+
#include "wapiti.h"
|
43
|
+
|
44
|
+
/*******************************************************************************
|
45
|
+
* Training
|
46
|
+
******************************************************************************/
|
47
|
+
/* trn_auto:
 *   Two-stage trainer: run the SGD-L1 trainer for exactly three iterations,
 *   then hand the model to the L-BFGS trainer with the iteration limit the
 *   user originally configured.
 */
static void trn_auto(mdl_t *mdl) {
	// Remember the configured limit; it is overridden for the first stage.
	const int user_limit = mdl->opt->maxiter;

	// Stage 1: a short SGD-L1 run capped at three iterations.
	mdl->opt->maxiter = 3;
	trn_sgdl1(mdl);

	// Stage 2: put the configured limit back and continue with L-BFGS.
	mdl->opt->maxiter = user_limit;
	trn_lbfgs(mdl);
}
|
54
|
+
|
55
|
+
static const struct {
|
56
|
+
char *name;
|
57
|
+
void (* train)(mdl_t *mdl);
|
58
|
+
} trn_lst[] = {
|
59
|
+
{"l-bfgs", trn_lbfgs},
|
60
|
+
{"sgd-l1", trn_sgdl1},
|
61
|
+
{"bcd", trn_bcd },
|
62
|
+
{"rprop", trn_rprop},
|
63
|
+
{"rprop+", trn_rprop},
|
64
|
+
{"rprop-", trn_rprop},
|
65
|
+
{"auto", trn_auto }
|
66
|
+
};
|
67
|
+
static const int trn_cnt = sizeof(trn_lst) / sizeof(trn_lst[0]);
|
68
|
+
|
69
|
+
/* dotrain:
 *   Train a model as described by mdl->opt and save the result.
 *
 *   Steps, in order: resolve the requested algorithm in trn_lst (or print
 *   the list of names and exit when the algorithm is "list"), optionally
 *   load a previous model and a pattern file, read the training data from
 *   opt->input or stdin, optionally read a development set, synchronize the
 *   model, run the trainer, optionally compact the model, and write it to
 *   opt->output or stdout.
 *
 *   Failures are reported through fatal()/pfatal(); the code below relies on
 *   those calls not returning (defined elsewhere -- TODO confirm).
 */
void dotrain(mdl_t *mdl) {
	// Check if the user requested the trainer list. If this is not the
	// case, search for the trainer.
	if (!strcmp(mdl->opt->algo, "list")) {
		info("Available training algorithms:\n");
		for (int i = 0; i < trn_cnt; i++)
			info("\t%s\n", trn_lst[i].name);
		exit(EXIT_SUCCESS);
	}
	int trn;
	for (trn = 0; trn < trn_cnt; trn++)
		if (!strcmp(mdl->opt->algo, trn_lst[trn].name))
			break;
	if (trn == trn_cnt)
		fatal("unknown algorithm '%s'", mdl->opt->algo);
	// Load a previous model to train again if specified by the user.
	if (mdl->opt->model != NULL) {
		info("* Load previous model\n");
		FILE *file = fopen(mdl->opt->model, "r");
		if (file == NULL)
			pfatal("cannot open input model file");
		mdl_load(mdl, file);
		// Release the stream once the model is loaded, as dodump() does
		// right after its own mdl_load(); it was previously leaked.
		fclose(file);
	}
	// Load the pattern file. This will unlock the database if previously
	// locked by loading a model.
	if (mdl->opt->pattern != NULL) {
		info("* Load patterns\n");
		FILE *file = fopen(mdl->opt->pattern, "r");
		if (file == NULL)
			pfatal("cannot open pattern file");
		rdr_loadpat(mdl->reader, file);
		fclose(file);
		qrk_lock(mdl->reader->obs, false);
	}
	// Load the training data. When this is done we lock the quarks as we
	// don't want to put in the model information present only in the
	// development set.
	info("* Load training data\n");
	FILE *file = stdin;
	if (mdl->opt->input != NULL) {
		file = fopen(mdl->opt->input, "r");
		if (file == NULL)
			pfatal("cannot open input data file");
	}
	mdl->train = rdr_readdat(mdl->reader, file, true);
	// Only close what we opened ourselves; stdin is left alone.
	if (mdl->opt->input != NULL)
		fclose(file);
	qrk_lock(mdl->reader->lbl, true);
	qrk_lock(mdl->reader->obs, true);
	if (mdl->train == NULL || mdl->train->nseq == 0)
		fatal("no train data loaded");
	// If present, load the development set in the model. If not specified,
	// the training dataset will be used instead.
	if (mdl->opt->devel != NULL) {
		info("* Load development data\n");
		FILE *file = fopen(mdl->opt->devel, "r");
		if (file == NULL)
			pfatal("cannot open development file");
		mdl->devel = rdr_readdat(mdl->reader, file, true);
		fclose(file);
	}
	// Initialize the model. If a previous model was loaded, this will be
	// just a resync, else the model structure will be created.
	if (mdl->theta == NULL)
		info("* Initialize the model\n");
	else
		info("* Resync the model\n");
	mdl_sync(mdl);
	// Display some statistics as we all love this.
	info("* Summary\n");
	info(" nb train: %d\n", mdl->train->nseq);
	if (mdl->devel != NULL)
		info(" nb devel: %d\n", mdl->devel->nseq);
	info(" nb labels: %zu\n", mdl->nlbl);
	info(" nb blocks: %zu\n", mdl->nobs);
	info(" nb features: %zu\n", mdl->nftr);
	// And train the model...
	info("* Train the model with %s\n", mdl->opt->algo);
	uit_setup(mdl);
	trn_lst[trn].train(mdl);
	uit_cleanup(mdl);
	// If requested compact the model.
	if (mdl->opt->compact) {
		// Keep the sizes from before compaction so the number of
		// removed entries can be reported.
		const size_t O = mdl->nobs;
		const size_t F = mdl->nftr;
		info("* Compacting the model\n");
		mdl_compact(mdl);
		info(" %8zu observations removed\n", O - mdl->nobs);
		info(" %8zu features removed\n", F - mdl->nftr);
	}
	// And save the trained model
	info("* Save the model\n");
	file = stdout;
	if (mdl->opt->output != NULL) {
		file = fopen(mdl->opt->output, "w");
		if (file == NULL)
			pfatal("cannot open output model");
	}
	mdl_save(mdl, file);
	if (mdl->opt->output != NULL)
		fclose(file);
	info("* Done\n");
}
|
172
|
+
|
173
|
+
/*******************************************************************************
|
174
|
+
* Labeling
|
175
|
+
******************************************************************************/
|
176
|
+
/* dolabel:
 *   Label sequences with an existing model. The model named by opt->model
 *   is mandatory and is loaded first; sequences are then read from
 *   opt->input (or stdin) and the result of tag_label() is written to
 *   opt->output (or stdout).
 *
 *   Failures are reported through fatal()/pfatal(); the code below relies on
 *   those calls not returning (defined elsewhere -- TODO confirm).
 */
void dolabel(mdl_t *mdl) {
	// First, load the model provided by the user. This is mandatory to
	// label new data ;-)
	if (mdl->opt->model == NULL)
		fatal("you must specify a model");
	info("* Load model\n");
	FILE *file = fopen(mdl->opt->model, "r");
	if (file == NULL)
		pfatal("cannot open input model file");
	mdl_load(mdl, file);
	// Release the stream once the model is loaded, as dodump() does right
	// after its own mdl_load(); it was previously leaked.
	fclose(file);
	// Open input and output files
	FILE *fin = stdin, *fout = stdout;
	if (mdl->opt->input != NULL) {
		fin = fopen(mdl->opt->input, "r");
		if (fin == NULL)
			pfatal("cannot open input data file");
	}
	if (mdl->opt->output != NULL) {
		fout = fopen(mdl->opt->output, "w");
		if (fout == NULL)
			pfatal("cannot open output data file");
	}
	// Do the labelling
	info("* Label sequences\n");
	tag_label(mdl, fin, fout);
	info("* Done\n");
	// And close files; the standard streams are left open.
	if (mdl->opt->input != NULL)
		fclose(fin);
	if (mdl->opt->output != NULL)
		fclose(fout);
}
|
208
|
+
|
209
|
+
/*******************************************************************************
|
210
|
+
* Dumping
|
211
|
+
******************************************************************************/
|
212
|
+
/* dodump:
 *   Write the non-zero weights of a model as tab-separated text. The model
 *   is read from opt->input (or stdin) and the dump goes to opt->output (or
 *   stdout). For each observation, one line is printed per non-zero weight:
 *   rows taken from the uoff[] offset carry "#" in the second column, rows
 *   taken from the boff[] offset carry a pair of labels. A blank line
 *   follows every observation that produced at least one row.
 */
void dodump(mdl_t *mdl) {
	// Read the model, from the named file when there is one.
	info("* Load model\n");
	FILE *src = stdin;
	if (mdl->opt->input != NULL) {
		src = fopen(mdl->opt->input, "r");
		if (src == NULL)
			pfatal("cannot open input data file");
	}
	mdl_load(mdl, src);
	if (mdl->opt->input != NULL)
		fclose(src);

	// Pick the destination of the dump.
	FILE *dst = stdout;
	if (mdl->opt->output != NULL) {
		dst = fopen(mdl->opt->output, "w");
		if (dst == NULL)
			pfatal("cannot open output data file");
	}

	// Walk over every observation and print its non-zero weights.
	info("* Dump model\n");
	const size_t nlbl = mdl->nlbl;
	const size_t nobs = mdl->nobs;
	const qrk_t *lbl_names = mdl->reader->lbl;
	const qrk_t *obs_names = mdl->reader->obs;
	for (size_t o = 0; o < nobs; o++) {
		const char *obs_name = qrk_id2str(obs_names, o);
		bool printed = false;

		// Bit 0 of kind[o]: weights indexed by a single label.
		if (mdl->kind[o] & 1) {
			const double *uw = mdl->theta + mdl->uoff[o];
			for (size_t y = 0; y < nlbl; y++) {
				if (uw[y] != 0.0) {
					const char *cur = qrk_id2str(lbl_names, y);
					fprintf(dst, "%s\t#\t%s\t%f\n", obs_name, cur, uw[y]);
					printed = true;
				}
			}
		}

		// Bit 1 of kind[o]: weights indexed by a label pair, stored as a
		// flat nlbl * nlbl array (d / nlbl printed first, then d % nlbl).
		if (mdl->kind[o] & 2) {
			const double *bw = mdl->theta + mdl->boff[o];
			const size_t npairs = nlbl * nlbl;
			for (size_t d = 0; d < npairs; d++) {
				if (bw[d] != 0.0) {
					const char *cur = qrk_id2str(lbl_names, d % nlbl);
					const char *prev = qrk_id2str(lbl_names, d / nlbl);
					fprintf(dst, "%s\t%s\t%s\t%f\n", obs_name, prev, cur,
					        bw[d]);
					printed = true;
				}
			}
		}

		// Separate observations that produced output with a blank line.
		if (printed)
			fprintf(dst, "\n");
	}
	if (mdl->opt->output != NULL)
		fclose(dst);
}
|
268
|
+
|
269
|
+
/*******************************************************************************
|
270
|
+
* Entry point
|
271
|
+
******************************************************************************/
|
272
|
+
int wapiti_main(int argc, char *argv[argc]) {
|
273
|
+
// We first parse command line switchs
|
274
|
+
opt_t opt = opt_defaults;
|
275
|
+
opt_parse(argc, argv, &opt);
|
276
|
+
// Next we prepare the model
|
277
|
+
mdl_t *mdl = mdl_new(rdr_new(opt.maxent));
|
278
|
+
mdl->opt = &opt;
|
279
|
+
// And switch to requested mode
|
280
|
+
switch (opt.mode) {
|
281
|
+
case 0: dotrain(mdl); break;
|
282
|
+
case 1: dolabel(mdl); break;
|
283
|
+
case 2: dodump(mdl); break;
|
284
|
+
}
|
285
|
+
// And cleanup
|
286
|
+
mdl_free(mdl);
|
287
|
+
return EXIT_SUCCESS;
|
288
|
+
}
|
data/ext/wapiti/wapiti.h
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
/*
|
2
|
+
* Wapiti - A linear-chain CRF tool
|
3
|
+
*
|
4
|
+
* Copyright (c) 2009-2011 CNRS
|
5
|
+
* All rights reserved.
|
6
|
+
*
|
7
|
+
* Redistribution and use in source and binary forms, with or without
|
8
|
+
* modification, are permitted provided that the following conditions are met:
|
9
|
+
* * Redistributions of source code must retain the above copyright
|
10
|
+
* notice, this list of conditions and the following disclaimer.
|
11
|
+
* * Redistributions in binary form must reproduce the above copyright
|
12
|
+
* notice, this list of conditions and the following disclaimer in the
|
13
|
+
* documentation and/or other materials provided with the distribution.
|
14
|
+
*
|
15
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
16
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
18
|
+
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
19
|
+
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
20
|
+
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
21
|
+
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
22
|
+
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
23
|
+
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
24
|
+
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
25
|
+
* POSSIBILITY OF SUCH DAMAGE.
|
26
|
+
*/
|
27
|
+
#ifndef wapiti_h
|
28
|
+
#define wapiti_h
|
29
|
+
|
30
|
+
#define VERSION "1.2.0"
|
31
|
+
|
32
|
+
/* XVM_ANSI:
|
33
|
+
* By uncommenting the following define, you can force wapiti to not use SSE2
|
34
|
+
* even if available.
|
35
|
+
*/
|
36
|
+
//#define XVM_ANSI
|
37
|
+
|
38
|
+
/* MTH_ANSI:
|
39
|
+
* By uncommenting the following define, you can disable the use of POSIX
|
40
|
+
* threads in the multi-threading part of Wapiti, for non-POSIX systems.
|
41
|
+
*/
|
42
|
+
//#define MTH_ANSI
|
43
|
+
|
44
|
+
#endif
|
45
|
+
|
data/lib/wapiti.rb
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
|
2
|
+
require 'logger'
|
3
|
+
require 'tempfile'
|
4
|
+
|
5
|
+
require 'wapiti/version'
|
6
|
+
|
7
|
+
module Wapiti

  # Library-wide logger. It writes to standard output and starts at the
  # WARN level.
  Logger = ::Logger.new(STDOUT)
  Logger.level = ::Logger::WARN

  class << self
    # Returns the library-wide logger.
    def log
      Logger
    end

    # Switches the library-wide logger to the DEBUG level.
    #
    # The level must be assigned here; the previous implementation only
    # compared it against DEBUG and therefore never changed anything.
    def debug!
      log.level = ::Logger::DEBUG
    end
  end

end
|
23
|
+
|
24
|
+
require 'wapiti/errors'
|
25
|
+
require 'wapiti/native'
|
26
|
+
|
27
|
+
require 'wapiti/options'
|
28
|
+
require 'wapiti/model'
|
29
|
+
|
30
|
+
require 'wapiti/utility'
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Wapiti

  # Base class of the errors defined in this module. In addition to the
  # message, it carries the exception that was current when the error was
  # created.
  class Error < StandardError

    # The wrapped exception, if any.
    attr_reader :original
    attr_writer :original

    # message:: text of the error (empty by default)
    # original:: exception to wrap; defaults to $!, the exception currently
    #            being handled
    def initialize(message = '', original = $!)
      @original = original
      super(message)
    end

  end

  class NativeError < Error
  end

  class ConfigurationError < Error
  end

end
|
data/lib/wapiti/model.rb
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
module Wapiti

  class Model

    # Builds an Options object from +options+ (and the optional block),
    # checks it, and trains a new model on +data+.
    #
    # Raises ConfigurationError when no pattern is set or when the options
    # fail validation.
    def self.train(data, options, &block)
      config = Options.new(options, &block)

      # check configuration
      raise ConfigurationError, 'invalid options: no pattern specified' if config.pattern.empty?
      raise ConfigurationError, "invalid options: #{ config.validate.join('; ') }" unless config.valid?

      new(config).train(data)
    end

    # Creates a model, points it at +filename+, loads it and returns it.
    def self.load(filename)
      new.tap do |model|
        model.path = filename
        model.load
      end
    end

    attr_accessor :path

    # Returns the pattern stored in the model's options.
    def pattern
      options.pattern
    end

    # Stores +filename+ as the pattern in the model's options.
    def pattern=(filename)
      options.pattern = filename
    end

    # Returns +input+ unchanged.
    def tokenize(input)
      input
    end
    private :tokenize

  end

end
|
@@ -0,0 +1,113 @@
|
|
1
|
+
module Wapiti

  # Ruby-side helpers for the option set. The attribute readers and writers
  # used below (threads, rho1, algorithm, ...) are not defined in this file.
  class Options

    include Comparable

    class << self

      # Returns a sorted list of available option attributes.
      def attribute_names
        @attribute_names ||= %w{ stop_window convergence_window posterior
          max_iterations jobsize threads rho1 rho2 stop_epsilon score check
          algorithm pattern development_data maxent compact sparse label
        }.sort.map(&:to_sym).freeze
      end

      # Returns the default options.
      def defaults
        @defaults ||= new.attributes
      end

      # Returns the list of supported algorithm options.
      def algorithms
        @algorithms ||= %w{ l-bfgs sgd-l1 bcd rprop rprop+ rprop- auto }.freeze
      end

    end

    # Returns the value of the attribute identified by +name+ or nil
    # if there is no such attribute.
    def [](name)
      has_attribute?(name) ? send(name) : nil
    end

    # Updates the value of the attribute identified by +name+ with the
    # passed-in +value+. Raises ArgumentError for an unknown name.
    def []=(name, value)
      raise ArgumentError, "bad attribute name: #{name}" unless has_attribute?(name)
      send("#{name}=", value)
    end

    # Updates all the attributes from the passed-in hash. Keys without a
    # matching writer are ignored. Returns self.
    def update(attributes = {})
      attributes.each_pair do |k,v|
        mid = "#{k}="
        send(mid, v) if respond_to?(mid)
      end
      self
    end

    alias update_attributes update

    # Returns the L-BFGS specific settings as a hash.
    def lbfgs
      { :clip => clip, :histsz => histsz, :maxls => maxls }
    end

    # Returns the SGD-L1 specific settings as a hash.
    def sgdl1
      { :eta0 => eta0, :alpha => alpha }
    end

    # Returns the BCD specific settings as a hash.
    def bcd
      { :kappa => kappa }
    end

    # Returns the RPROP specific settings as a hash.
    def rprop
      {
        :stpmin => stpmin, :stpmax => stpmax, :stpinc => stpinc,
        :stpdec => stpdec, :cutoff => cutoff
      }
    end

    # Returns a hash of all the attributes with their names and values.
    #
    # The hash is built from [name, value] pairs directly; flattening the
    # pairs first would break up any attribute whose value is an array.
    def attributes
      Hash[Options.attribute_names.map { |a| [a, send(a)] }]
    end

    alias to_hash attributes

    # Returns true if +attribute+ is one of the names listed by
    # Options.attribute_names (which are symbols).
    def has_attribute?(attribute)
      Options.attribute_names.include?(attribute)
    end

    # Returns true if the selected algorithm is one of Options.algorithms.
    def valid_algorithm?
      self.class.algorithms.include?(algorithm)
    end

    # Returns true if #validate reports no problem.
    def valid?
      validate.empty?
    end

    # Checks the option values and returns a list of error messages; the
    # list is empty when everything is fine.
    def validate
      e = []

      # Each of these must be strictly positive. Every name is listed once
      # so that a bad value is reported once.
      %w{ threads jobsize alpha histsz maxls eta0 nbest }.each do |name|
        e << "invalid value for #{name}: #{send(name)}" unless send(name) > 0
      end

      # These may be zero but not negative.
      %w{ rho1 rho2 }.each do |name|
        e << "invalid value for #{name}: #{send(name)}" unless send(name) >= 0.0
      end

      e << "unknown algorithm: #{algorithm}" unless valid_algorithm?
      e << "BCD not supported for training maxent models" if maxent && algorithm == 'bcd'
      e
    end

    # Compares two option sets through their attribute hashes; returns nil
    # when +other+ does not expose attributes.
    def <=>(other)
      other.respond_to?(:attributes) ? attributes <=> other.attributes : nil
    end

  end

end
|