wapiti 0.0.5 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.simplecov +3 -0
- data/Gemfile +25 -2
- data/HISTORY.md +5 -1
- data/LICENSE +14 -13
- data/README.md +9 -16
- data/Rakefile +38 -8
- data/ext/wapiti/bcd.c +126 -124
- data/ext/wapiti/decoder.c +203 -124
- data/ext/wapiti/decoder.h +6 -4
- data/ext/wapiti/extconf.rb +2 -2
- data/ext/wapiti/gradient.c +491 -320
- data/ext/wapiti/gradient.h +52 -34
- data/ext/wapiti/lbfgs.c +74 -33
- data/ext/wapiti/model.c +47 -37
- data/ext/wapiti/model.h +22 -20
- data/ext/wapiti/native.c +850 -839
- data/ext/wapiti/native.h +1 -1
- data/ext/wapiti/options.c +52 -20
- data/ext/wapiti/options.h +37 -30
- data/ext/wapiti/pattern.c +35 -33
- data/ext/wapiti/pattern.h +12 -11
- data/ext/wapiti/progress.c +14 -13
- data/ext/wapiti/progress.h +3 -2
- data/ext/wapiti/quark.c +14 -16
- data/ext/wapiti/quark.h +6 -5
- data/ext/wapiti/reader.c +83 -69
- data/ext/wapiti/reader.h +11 -9
- data/ext/wapiti/rprop.c +84 -43
- data/ext/wapiti/sequence.h +18 -16
- data/ext/wapiti/sgdl1.c +45 -43
- data/ext/wapiti/thread.c +19 -17
- data/ext/wapiti/thread.h +5 -4
- data/ext/wapiti/tools.c +7 -7
- data/ext/wapiti/tools.h +3 -4
- data/ext/wapiti/trainers.h +1 -1
- data/ext/wapiti/vmath.c +40 -38
- data/ext/wapiti/vmath.h +12 -11
- data/ext/wapiti/wapiti.c +159 -37
- data/ext/wapiti/wapiti.h +18 -4
- data/lib/wapiti.rb +15 -15
- data/lib/wapiti/errors.rb +15 -15
- data/lib/wapiti/model.rb +92 -84
- data/lib/wapiti/options.rb +123 -124
- data/lib/wapiti/utility.rb +14 -14
- data/lib/wapiti/version.rb +2 -2
- data/spec/spec_helper.rb +29 -9
- data/spec/wapiti/model_spec.rb +230 -194
- data/spec/wapiti/native_spec.rb +7 -8
- data/spec/wapiti/options_spec.rb +184 -174
- data/wapiti.gemspec +22 -8
- metadata +38 -42
- data/.gitignore +0 -5
data/ext/wapiti/wapiti.c
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
/*
|
2
2
|
* Wapiti - A linear-chain CRF tool
|
3
3
|
*
|
4
|
-
* Copyright (c) 2009-
|
4
|
+
* Copyright (c) 2009-2013 CNRS
|
5
5
|
* All rights reserved.
|
6
6
|
*
|
7
7
|
* Redistribution and use in source and binary forms, with or without
|
@@ -24,19 +24,22 @@
|
|
24
24
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
25
25
|
* POSSIBILITY OF SUCH DAMAGE.
|
26
26
|
*/
|
27
|
+
#include <ctype.h>
|
28
|
+
#include <inttypes.h>
|
27
29
|
#include <stdbool.h>
|
28
30
|
#include <stddef.h>
|
31
|
+
#include <stdint.h>
|
29
32
|
#include <stdlib.h>
|
30
33
|
#include <stdio.h>
|
31
34
|
#include <string.h>
|
32
35
|
|
33
36
|
#include "decoder.h"
|
34
|
-
#include "model.h"
|
35
37
|
#include "options.h"
|
36
38
|
#include "progress.h"
|
37
39
|
#include "quark.h"
|
38
40
|
#include "reader.h"
|
39
41
|
#include "sequence.h"
|
42
|
+
#include "model.h"
|
40
43
|
#include "tools.h"
|
41
44
|
#include "trainers.h"
|
42
45
|
#include "wapiti.h"
|
@@ -44,16 +47,15 @@
|
|
44
47
|
/*******************************************************************************
|
45
48
|
* Training
|
46
49
|
******************************************************************************/
|
47
|
-
static
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
}
|
50
|
+
static const char *typ_lst[] = {
|
51
|
+
"maxent",
|
52
|
+
"memm",
|
53
|
+
"crf"
|
54
|
+
};
|
55
|
+
static const uint32_t typ_cnt = sizeof(typ_lst) / sizeof(typ_lst[0]);
|
54
56
|
|
55
57
|
static const struct {
|
56
|
-
char *name;
|
58
|
+
const char *name;
|
57
59
|
void (* train)(mdl_t *mdl);
|
58
60
|
} trn_lst[] = {
|
59
61
|
{"l-bfgs", trn_lbfgs},
|
@@ -62,20 +64,31 @@ static const struct {
|
|
62
64
|
{"rprop", trn_rprop},
|
63
65
|
{"rprop+", trn_rprop},
|
64
66
|
{"rprop-", trn_rprop},
|
65
|
-
{"auto", trn_auto }
|
66
67
|
};
|
67
|
-
static const
|
68
|
+
static const uint32_t trn_cnt = sizeof(trn_lst) / sizeof(trn_lst[0]);
|
68
69
|
|
69
70
|
void dotrain(mdl_t *mdl) {
|
70
|
-
// Check if the user requested the trainer list. If this is not
|
71
|
-
// case, search the
|
71
|
+
// Check if the user requested the type or trainer list. If this is not
|
72
|
+
// the case, search them in the lists.
|
73
|
+
if (!strcmp(mdl->opt->type, "list")) {
|
74
|
+
info("Available types of models:\n");
|
75
|
+
for (uint32_t i = 0; i < typ_cnt; i++)
|
76
|
+
info("\t%s\n", typ_lst[i]);
|
77
|
+
exit(EXIT_SUCCESS);
|
78
|
+
}
|
72
79
|
if (!strcmp(mdl->opt->algo, "list")) {
|
73
80
|
info("Available training algorithms:\n");
|
74
|
-
for (
|
81
|
+
for (uint32_t i = 0; i < trn_cnt; i++)
|
75
82
|
info("\t%s\n", trn_lst[i].name);
|
76
83
|
exit(EXIT_SUCCESS);
|
77
84
|
}
|
78
|
-
|
85
|
+
uint32_t typ, trn;
|
86
|
+
for (typ = 0; typ < typ_cnt; typ++)
|
87
|
+
if (!strcmp(mdl->opt->type, typ_lst[typ]))
|
88
|
+
break;
|
89
|
+
if (typ == typ_cnt)
|
90
|
+
fatal("unknown model type '%s'", mdl->opt->type);
|
91
|
+
mdl->type = typ;
|
79
92
|
for (trn = 0; trn < trn_cnt; trn++)
|
80
93
|
if (!strcmp(mdl->opt->algo, trn_lst[trn].name))
|
81
94
|
break;
|
@@ -136,12 +149,12 @@ void dotrain(mdl_t *mdl) {
|
|
136
149
|
mdl_sync(mdl);
|
137
150
|
// Display some statistics as we all love this.
|
138
151
|
info("* Summary\n");
|
139
|
-
info(" nb train: %
|
152
|
+
info(" nb train: %"PRIu32"\n", mdl->train->nseq);
|
140
153
|
if (mdl->devel != NULL)
|
141
|
-
info(" nb devel: %
|
142
|
-
info(" nb labels: %
|
143
|
-
info(" nb blocks: %
|
144
|
-
info(" nb features: %
|
154
|
+
info(" nb devel: %"PRIu32"\n", mdl->devel->nseq);
|
155
|
+
info(" nb labels: %"PRIu32"\n", mdl->nlbl);
|
156
|
+
info(" nb blocks: %"PRIu64"\n", mdl->nobs);
|
157
|
+
info(" nb features: %"PRIu64"\n", mdl->nftr);
|
145
158
|
// And train the model...
|
146
159
|
info("* Train the model with %s\n", mdl->opt->algo);
|
147
160
|
uit_setup(mdl);
|
@@ -149,12 +162,12 @@ void dotrain(mdl_t *mdl) {
|
|
149
162
|
uit_cleanup(mdl);
|
150
163
|
// If requested compact the model.
|
151
164
|
if (mdl->opt->compact) {
|
152
|
-
const
|
153
|
-
const
|
165
|
+
const uint64_t O = mdl->nobs;
|
166
|
+
const uint64_t F = mdl->nftr;
|
154
167
|
info("* Compacting the model\n");
|
155
168
|
mdl_compact(mdl);
|
156
|
-
info(" %
|
157
|
-
info(" %
|
169
|
+
info(" %8"PRIu64" observations removed\n", O - mdl->nobs);
|
170
|
+
info(" %8"PRIu64" features removed\n", F - mdl->nftr);
|
158
171
|
}
|
159
172
|
// And save the trained model
|
160
173
|
info("* Save the model\n");
|
@@ -209,7 +222,7 @@ void dolabel(mdl_t *mdl) {
|
|
209
222
|
/*******************************************************************************
|
210
223
|
* Dumping
|
211
224
|
******************************************************************************/
|
212
|
-
void dodump(mdl_t *mdl) {
|
225
|
+
static void dodump(mdl_t *mdl) {
|
213
226
|
// Load input model file
|
214
227
|
info("* Load model\n");
|
215
228
|
FILE *fin = stdin;
|
@@ -230,32 +243,35 @@ void dodump(mdl_t *mdl) {
|
|
230
243
|
}
|
231
244
|
// Dump model
|
232
245
|
info("* Dump model\n");
|
233
|
-
const
|
234
|
-
const
|
246
|
+
const uint32_t Y = mdl->nlbl;
|
247
|
+
const uint64_t O = mdl->nobs;
|
235
248
|
const qrk_t *Qlbl = mdl->reader->lbl;
|
236
249
|
const qrk_t *Qobs = mdl->reader->obs;
|
237
|
-
|
250
|
+
char fmt[16];
|
251
|
+
sprintf(fmt, "%%.%df\n", mdl->opt->prec);
|
252
|
+
for (uint64_t o = 0; o < O; o++) {
|
238
253
|
const char *obs = qrk_id2str(Qobs, o);
|
239
254
|
bool empty = true;
|
240
255
|
if (mdl->kind[o] & 1) {
|
241
256
|
const double *w = mdl->theta + mdl->uoff[o];
|
242
|
-
for (
|
243
|
-
if (w[y] == 0.0)
|
257
|
+
for (uint32_t y = 0; y < Y; y++) {
|
258
|
+
if (!mdl->opt->all && w[y] == 0.0)
|
244
259
|
continue;
|
245
260
|
const char *ly = qrk_id2str(Qlbl, y);
|
246
|
-
fprintf(fout, "%s\t#\t%s\t
|
261
|
+
fprintf(fout, "%s\t#\t%s\t", obs, ly);
|
262
|
+
fprintf(fout, fmt, w[y]);
|
247
263
|
empty = false;
|
248
264
|
}
|
249
265
|
}
|
250
266
|
if (mdl->kind[o] & 2) {
|
251
267
|
const double *w = mdl->theta + mdl->boff[o];
|
252
|
-
for (
|
253
|
-
if (w[d] == 0.0)
|
268
|
+
for (uint32_t d = 0; d < Y * Y; d++) {
|
269
|
+
if (!mdl->opt->all && w[d] == 0.0)
|
254
270
|
continue;
|
255
271
|
const char *ly = qrk_id2str(Qlbl, d % Y);
|
256
272
|
const char *lyp = qrk_id2str(Qlbl, d / Y);
|
257
|
-
fprintf(fout, "%s\t%s\t%s\t
|
258
|
-
|
273
|
+
fprintf(fout, "%s\t%s\t%s\t", obs, lyp, ly);
|
274
|
+
fprintf(fout, fmt, w[d]);
|
259
275
|
empty = false;
|
260
276
|
}
|
261
277
|
}
|
@@ -266,6 +282,110 @@ void dodump(mdl_t *mdl) {
|
|
266
282
|
fclose(fout);
|
267
283
|
}
|
268
284
|
|
285
|
+
|
286
|
+
/*******************************************************************************
|
287
|
+
* Updating
|
288
|
+
******************************************************************************/
|
289
|
+
void doupdt(mdl_t *mdl) {
|
290
|
+
// Load input model file
|
291
|
+
info("* Load model\n");
|
292
|
+
if (mdl->opt->model == NULL)
|
293
|
+
fatal("no model file provided");
|
294
|
+
FILE *Min = fopen(mdl->opt->model, "r");
|
295
|
+
if (Min == NULL)
|
296
|
+
pfatal("cannot open model file %s", mdl->opt->model);
|
297
|
+
mdl_load(mdl, Min);
|
298
|
+
fclose(Min);
|
299
|
+
// Open patch file
|
300
|
+
info("* Update model\n");
|
301
|
+
FILE *fin = stdin;
|
302
|
+
if (mdl->opt->input != NULL) {
|
303
|
+
fin = fopen(mdl->opt->input, "r");
|
304
|
+
if (fin == NULL)
|
305
|
+
pfatal("cannot open update file");
|
306
|
+
}
|
307
|
+
int nline = 0;
|
308
|
+
while (!feof(fin)) {
|
309
|
+
char *raw = rdr_readline(fin);
|
310
|
+
if (raw == NULL)
|
311
|
+
break;
|
312
|
+
char *line = raw;
|
313
|
+
nline++;
|
314
|
+
// First we split the line in space separated tokens. We expect
|
315
|
+
// four of them and skip empty lines.
|
316
|
+
char *toks[4];
|
317
|
+
int ntoks = 0;
|
318
|
+
while (ntoks < 4) {
|
319
|
+
while (isspace(*line))
|
320
|
+
line++;
|
321
|
+
if (*line == '\0')
|
322
|
+
break;
|
323
|
+
toks[ntoks++] = line;
|
324
|
+
while (*line != '\0' && !isspace(*line))
|
325
|
+
line++;
|
326
|
+
if (*line == '\0')
|
327
|
+
break;
|
328
|
+
*line++ = '\0';
|
329
|
+
}
|
330
|
+
if (ntoks == 0) {
|
331
|
+
free(raw);
|
332
|
+
continue;
|
333
|
+
} else if (ntoks != 4) {
|
334
|
+
fatal("invalid line at %d", nline);
|
335
|
+
}
|
336
|
+
// Parse the tokens, the first three should be strings mapping to
|
337
|
+
// observations and labels and the last should be the weight.
|
338
|
+
uint64_t obs = none, yp = none, y = none;
|
339
|
+
obs = qrk_str2id(mdl->reader->obs, toks[0]);
|
340
|
+
if (obs == none)
|
341
|
+
fatal("bad on observation on line %d", nline);
|
342
|
+
if (strcmp(toks[1], "#")) {
|
343
|
+
yp = qrk_str2id(mdl->reader->lbl, toks[1]);
|
344
|
+
if (yp == none)
|
345
|
+
fatal("bad label <%s> line %d", toks[1], nline);
|
346
|
+
}
|
347
|
+
y = qrk_str2id(mdl->reader->lbl, toks[2]);
|
348
|
+
if (y == none)
|
349
|
+
fatal("bad label <%s> line %d", toks[2], nline);
|
350
|
+
double wgh = 0.0;
|
351
|
+
if (sscanf(toks[3], "%lf", &wgh) != 1)
|
352
|
+
fatal("bad weight on line %d", nline);
|
353
|
+
|
354
|
+
const uint32_t Y = mdl->nlbl;
|
355
|
+
if (yp == none) {
|
356
|
+
double *w = mdl->theta + mdl->uoff[obs];
|
357
|
+
w[y] = wgh;
|
358
|
+
} else {
|
359
|
+
double *w = mdl->theta + mdl->boff[obs];
|
360
|
+
w[yp * Y + y] = wgh;
|
361
|
+
}
|
362
|
+
free(raw);
|
363
|
+
}
|
364
|
+
if (mdl->opt->input != NULL)
|
365
|
+
fclose(fin);
|
366
|
+
// If requested compact the model.
|
367
|
+
if (mdl->opt->compact) {
|
368
|
+
const uint64_t O = mdl->nobs;
|
369
|
+
const uint64_t F = mdl->nftr;
|
370
|
+
info("* Compacting the model\n");
|
371
|
+
mdl_compact(mdl);
|
372
|
+
info(" %8"PRIu64" observations removed\n", O - mdl->nobs);
|
373
|
+
info(" %8"PRIu64" features removed\n", F - mdl->nftr);
|
374
|
+
}
|
375
|
+
// And save the updated model
|
376
|
+
info("* Save the model\n");
|
377
|
+
FILE *file = stdout;
|
378
|
+
if (mdl->opt->output != NULL) {
|
379
|
+
file = fopen(mdl->opt->output, "w");
|
380
|
+
if (file == NULL)
|
381
|
+
pfatal("cannot open output model");
|
382
|
+
}
|
383
|
+
mdl_save(mdl, file);
|
384
|
+
if (mdl->opt->output != NULL)
|
385
|
+
fclose(file);
|
386
|
+
info("* Done\n");
|
387
|
+
}
|
388
|
+
|
269
389
|
/*******************************************************************************
|
270
390
|
* Entry point
|
271
391
|
******************************************************************************/
|
@@ -280,9 +400,11 @@ int wapiti_main(int argc, char *argv[argc]) {
|
|
280
400
|
switch (opt.mode) {
|
281
401
|
case 0: dotrain(mdl); break;
|
282
402
|
case 1: dolabel(mdl); break;
|
283
|
-
case 2: dodump(mdl);
|
403
|
+
case 2: dodump(mdl); break;
|
404
|
+
case 3: doupdt(mdl); break;
|
284
405
|
}
|
285
406
|
// And cleanup
|
286
407
|
mdl_free(mdl);
|
287
408
|
return EXIT_SUCCESS;
|
288
409
|
}
|
410
|
+
|
data/ext/wapiti/wapiti.h
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
/*
|
2
2
|
* Wapiti - A linear-chain CRF tool
|
3
3
|
*
|
4
|
-
* Copyright (c) 2009-
|
4
|
+
* Copyright (c) 2009-2013 CNRS
|
5
5
|
* All rights reserved.
|
6
6
|
*
|
7
7
|
* Redistribution and use in source and binary forms, with or without
|
@@ -27,7 +27,7 @@
|
|
27
27
|
#ifndef wapiti_h
|
28
28
|
#define wapiti_h
|
29
29
|
|
30
|
-
#define VERSION "1.
|
30
|
+
#define VERSION "1.5.0"
|
31
31
|
|
32
32
|
/* XVM_ANSI:
|
33
33
|
* By uncommenting the following define, you can force wapiti to not use SSE2
|
@@ -36,10 +36,24 @@
|
|
36
36
|
//#define XVM_ANSI
|
37
37
|
|
38
38
|
/* MTH_ANSI:
|
39
|
-
*
|
40
|
-
*
|
39
|
+
* By uncommenting the following define, you can disable the use of POSIX
|
40
|
+
* threads in the multi-threading part of Wapiti, for non-POSIX systems.
|
41
41
|
*/
|
42
42
|
//#define MTH_ANSI
|
43
43
|
|
44
|
+
/* ATM_ANSI:
|
45
|
+
* By uncommenting the following define, you can disable the use of atomic
|
46
|
+
* operations to update the gradient. This implies that multi-threaded gradient
|
47
|
+
* computation will require more memory but is more portable.
|
48
|
+
*/
|
49
|
+
//#define ATM_ANSI
|
50
|
+
|
51
|
+
/* Without multi-threading we disable atomic updates as they are not needed and
|
52
|
+
* can only decrease performance in this case.
|
53
|
+
*/
|
54
|
+
#ifdef MTH_ANSI
|
55
|
+
#define ATM_ANSI
|
56
|
+
#endif
|
57
|
+
|
44
58
|
#endif
|
45
59
|
|
data/lib/wapiti.rb
CHANGED
@@ -5,20 +5,20 @@ require 'tempfile'
|
|
5
5
|
require 'wapiti/version'
|
6
6
|
|
7
7
|
module Wapiti
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
8
|
+
|
9
|
+
Logger = ::Logger.new(STDOUT)
|
10
|
+
Logger.level = ::Logger::WARN
|
11
|
+
|
12
|
+
class << self
|
13
|
+
def log
|
14
|
+
Logger
|
15
|
+
end
|
16
|
+
|
17
|
+
def debug!
|
18
|
+
log.level == ::Logger::DEBUG
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
22
|
end
|
23
23
|
|
24
24
|
require 'wapiti/errors'
|
@@ -27,4 +27,4 @@ require 'wapiti/native'
|
|
27
27
|
require 'wapiti/options'
|
28
28
|
require 'wapiti/model'
|
29
29
|
|
30
|
-
require 'wapiti/utility'
|
30
|
+
require 'wapiti/utility'
|
data/lib/wapiti/errors.rb
CHANGED
@@ -1,17 +1,17 @@
|
|
1
1
|
module Wapiti
|
2
|
-
|
3
|
-
class Error < StandardError
|
4
|
-
|
5
|
-
attr_accessor :original
|
6
|
-
|
7
|
-
def initialize(message = '', original = $!)
|
8
|
-
super(message)
|
9
|
-
@original = original
|
10
|
-
end
|
11
|
-
|
12
|
-
end
|
13
2
|
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
3
|
+
class Error < StandardError
|
4
|
+
|
5
|
+
attr_accessor :original
|
6
|
+
|
7
|
+
def initialize(message = '', original = $!)
|
8
|
+
super(message)
|
9
|
+
@original = original
|
10
|
+
end
|
11
|
+
|
12
|
+
end
|
13
|
+
|
14
|
+
class NativeError < Error; end
|
15
|
+
class ConfigurationError < Error; end
|
16
|
+
|
17
|
+
end
|
data/lib/wapiti/model.rb
CHANGED
@@ -1,85 +1,93 @@
|
|
1
1
|
module Wapiti
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
end
|
2
|
+
|
3
|
+
class Model
|
4
|
+
|
5
|
+
class << self
|
6
|
+
|
7
|
+
def train(data, options, &block)
|
8
|
+
config = Options.new(options, &block)
|
9
|
+
|
10
|
+
# check configuration
|
11
|
+
# if config.pattern.empty?
|
12
|
+
# raise ConfigurationError, 'invalid options: no pattern specified'
|
13
|
+
# end
|
14
|
+
|
15
|
+
unless config.valid?
|
16
|
+
raise ConfigurationError, "invalid options: #{ config.validate.join('; ') }"
|
17
|
+
end
|
18
|
+
|
19
|
+
new(config).train(data)
|
20
|
+
end
|
21
|
+
|
22
|
+
def load(filename)
|
23
|
+
m = new
|
24
|
+
m.path = filename
|
25
|
+
m.load
|
26
|
+
m
|
27
|
+
end
|
28
|
+
|
29
|
+
end
|
30
|
+
|
31
|
+
attr_accessor :path
|
32
|
+
|
33
|
+
attr_reader :token_count, :token_errors, :sequence_count, :sequence_errors
|
34
|
+
|
35
|
+
def pattern
|
36
|
+
options.pattern
|
37
|
+
end
|
38
|
+
|
39
|
+
def pattern=(filename)
|
40
|
+
options.pattern = filename
|
41
|
+
end
|
42
|
+
|
43
|
+
alias native_label label
|
44
|
+
|
45
|
+
def label(input, opts = nil)
|
46
|
+
options.update(opts) unless opts.nil?
|
47
|
+
block_given? ? native_label(input, &Proc.new) : native_label(input)
|
48
|
+
end
|
49
|
+
|
50
|
+
alias native_train train
|
51
|
+
|
52
|
+
def train(input, opts = nil)
|
53
|
+
options.update(opts) unless opts.nil?
|
54
|
+
block_given? ? native_train(input, &Proc.new) : native_train(input)
|
55
|
+
end
|
56
|
+
|
57
|
+
|
58
|
+
def statistics
|
59
|
+
s = {}
|
60
|
+
s[:tokens] = {
|
61
|
+
:total => token_count, :errors => token_errors, :rate => token_error_rate
|
62
|
+
}
|
63
|
+
s[:sequences] = {
|
64
|
+
:total => sequence_count, :errors => sequence_errors, :rate => sequence_error_rate
|
65
|
+
}
|
66
|
+
s
|
67
|
+
end
|
68
|
+
|
69
|
+
alias stats statistics
|
70
|
+
|
71
|
+
def clear_counters
|
72
|
+
@token_count = @token_errors = @sequence_count = @sequence_errors = 0
|
73
|
+
end
|
74
|
+
|
75
|
+
alias clear clear_counters
|
76
|
+
|
77
|
+
def token_error_rate
|
78
|
+
return 0 if token_errors.zero?
|
79
|
+
token_errors / token_count.to_f * 100.0
|
80
|
+
end
|
81
|
+
|
82
|
+
def sequence_error_rate
|
83
|
+
return 0 if sequence_errors.zero?
|
84
|
+
sequence_errors / sequence_count.to_f * 100.0
|
85
|
+
end
|
86
|
+
|
87
|
+
# alias native_save save
|
88
|
+
|
89
|
+
private :native_label, :native_train
|
90
|
+
|
91
|
+
end
|
92
|
+
|
93
|
+
end
|