wapiti 0.0.5 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.simplecov +3 -0
- data/Gemfile +25 -2
- data/HISTORY.md +5 -1
- data/LICENSE +14 -13
- data/README.md +9 -16
- data/Rakefile +38 -8
- data/ext/wapiti/bcd.c +126 -124
- data/ext/wapiti/decoder.c +203 -124
- data/ext/wapiti/decoder.h +6 -4
- data/ext/wapiti/extconf.rb +2 -2
- data/ext/wapiti/gradient.c +491 -320
- data/ext/wapiti/gradient.h +52 -34
- data/ext/wapiti/lbfgs.c +74 -33
- data/ext/wapiti/model.c +47 -37
- data/ext/wapiti/model.h +22 -20
- data/ext/wapiti/native.c +850 -839
- data/ext/wapiti/native.h +1 -1
- data/ext/wapiti/options.c +52 -20
- data/ext/wapiti/options.h +37 -30
- data/ext/wapiti/pattern.c +35 -33
- data/ext/wapiti/pattern.h +12 -11
- data/ext/wapiti/progress.c +14 -13
- data/ext/wapiti/progress.h +3 -2
- data/ext/wapiti/quark.c +14 -16
- data/ext/wapiti/quark.h +6 -5
- data/ext/wapiti/reader.c +83 -69
- data/ext/wapiti/reader.h +11 -9
- data/ext/wapiti/rprop.c +84 -43
- data/ext/wapiti/sequence.h +18 -16
- data/ext/wapiti/sgdl1.c +45 -43
- data/ext/wapiti/thread.c +19 -17
- data/ext/wapiti/thread.h +5 -4
- data/ext/wapiti/tools.c +7 -7
- data/ext/wapiti/tools.h +3 -4
- data/ext/wapiti/trainers.h +1 -1
- data/ext/wapiti/vmath.c +40 -38
- data/ext/wapiti/vmath.h +12 -11
- data/ext/wapiti/wapiti.c +159 -37
- data/ext/wapiti/wapiti.h +18 -4
- data/lib/wapiti.rb +15 -15
- data/lib/wapiti/errors.rb +15 -15
- data/lib/wapiti/model.rb +92 -84
- data/lib/wapiti/options.rb +123 -124
- data/lib/wapiti/utility.rb +14 -14
- data/lib/wapiti/version.rb +2 -2
- data/spec/spec_helper.rb +29 -9
- data/spec/wapiti/model_spec.rb +230 -194
- data/spec/wapiti/native_spec.rb +7 -8
- data/spec/wapiti/options_spec.rb +184 -174
- data/wapiti.gemspec +22 -8
- metadata +38 -42
- data/.gitignore +0 -5
data/ext/wapiti/model.h
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
/*
|
2
2
|
* Wapiti - A linear-chain CRF tool
|
3
3
|
*
|
4
|
-
* Copyright (c) 2009-
|
4
|
+
* Copyright (c) 2009-2013 CNRS
|
5
5
|
* All rights reserved.
|
6
6
|
*
|
7
7
|
* Redistribution and use in source and binary forms, with or without
|
@@ -29,14 +29,15 @@
|
|
29
29
|
#define model_h
|
30
30
|
|
31
31
|
#include <stddef.h>
|
32
|
-
#include <
|
32
|
+
#include <stdint.h>
|
33
|
+
#include <sys/time.h>
|
33
34
|
|
34
|
-
#include "wapiti.h"
|
35
35
|
#include "options.h"
|
36
36
|
#include "sequence.h"
|
37
37
|
#include "reader.h"
|
38
|
+
#include "wapiti.h"
|
38
39
|
|
39
|
-
typedef struct
|
40
|
+
typedef struct timeval tms_t;
|
40
41
|
|
41
42
|
/* mdl_t:
|
42
43
|
* Represent a linear-chain CRF model. The model contains both unigram and
|
@@ -60,34 +61,35 @@ typedef struct tms tms_t;
|
|
60
61
|
*/
|
61
62
|
typedef struct mdl_s mdl_t;
|
62
63
|
struct mdl_s {
|
63
|
-
opt_t
|
64
|
+
opt_t *opt; // options for training
|
65
|
+
int type; // model type
|
64
66
|
|
65
67
|
// Size of various model parameters
|
66
|
-
|
67
|
-
|
68
|
-
|
68
|
+
uint32_t nlbl; // Y number of labels
|
69
|
+
uint64_t nobs; // O number of observations
|
70
|
+
uint64_t nftr; // F number of features
|
69
71
|
|
70
72
|
// Informations about observations
|
71
|
-
char
|
72
|
-
|
73
|
-
|
73
|
+
char *kind; // [O] observations type
|
74
|
+
uint64_t *uoff; // [O] unigram weights offset
|
75
|
+
uint64_t *boff; // [O] bigram weights offset
|
74
76
|
|
75
77
|
// The model itself
|
76
|
-
double
|
78
|
+
double *theta; // [F] features weights
|
77
79
|
|
78
80
|
// Datasets
|
79
|
-
dat_t
|
80
|
-
dat_t
|
81
|
-
rdr_t
|
81
|
+
dat_t *train; // training dataset
|
82
|
+
dat_t *devel; // development dataset
|
83
|
+
rdr_t *reader;
|
82
84
|
|
83
85
|
// Stoping criterion
|
84
|
-
double
|
85
|
-
|
86
|
-
|
86
|
+
double *werr; // Window of error rate of last iters
|
87
|
+
uint32_t wcnt; // Number of iters in the window
|
88
|
+
uint32_t wpos; // Position for the next iter
|
87
89
|
|
88
90
|
// Timing
|
89
|
-
tms_t
|
90
|
-
double
|
91
|
+
tms_t timer; // start time of last iter
|
92
|
+
double total; // total training time
|
91
93
|
};
|
92
94
|
|
93
95
|
mdl_t *mdl_new(rdr_t *rdr);
|
data/ext/wapiti/native.c
CHANGED
@@ -1,13 +1,15 @@
|
|
1
1
|
#include <stdio.h>
|
2
2
|
#include <string.h>
|
3
3
|
|
4
|
-
#include "wapiti.h"
|
5
4
|
#include "options.h"
|
6
5
|
#include "reader.h"
|
6
|
+
#include "decoder.h"
|
7
7
|
#include "model.h"
|
8
8
|
#include "trainers.h"
|
9
|
+
#include "progress.h"
|
9
10
|
#include "quark.h"
|
10
11
|
#include "tools.h"
|
12
|
+
#include "wapiti.h"
|
11
13
|
|
12
14
|
#include "native.h"
|
13
15
|
|
@@ -22,27 +24,36 @@ VALUE cConfigurationError;
|
|
22
24
|
VALUE cLogger;
|
23
25
|
|
24
26
|
|
27
|
+
/* --- Forward declarations --- */
|
28
|
+
|
29
|
+
int wapiti_main(int argc, char *argv[argc]);
|
30
|
+
|
31
|
+
void dolabel(mdl_t *mdl);
|
32
|
+
void dotrain(mdl_t *mdl);
|
33
|
+
void doupdt(mdl_t *mdl);
|
34
|
+
|
35
|
+
|
25
36
|
/* --- Utilities --- */
|
26
37
|
|
27
38
|
// "auto" trainer: runs SGD-L1 with the iteration limit temporarily forced to
// 3, then restores the configured limit and runs L-BFGS.
static void trn_auto(mdl_t *mdl) {
  const int configured_limit = mdl->opt->maxiter;

  // Phase 1: SGD-L1 capped at three iterations.
  mdl->opt->maxiter = 3;
  trn_sgdl1(mdl);

  // Phase 2: L-BFGS with the caller's original iteration limit.
  mdl->opt->maxiter = configured_limit;
  trn_lbfgs(mdl);
}
|
34
45
|
|
35
46
|
static const struct {
|
36
|
-
|
37
|
-
|
47
|
+
const char *name;
|
48
|
+
void (* train)(mdl_t *mdl);
|
38
49
|
} trn_lst[] = {
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
50
|
+
{"l-bfgs", trn_lbfgs},
|
51
|
+
{"sgd-l1", trn_sgdl1},
|
52
|
+
{"bcd", trn_bcd },
|
53
|
+
{"rprop", trn_rprop},
|
54
|
+
{"rprop+", trn_rprop},
|
55
|
+
{"rprop-", trn_rprop},
|
56
|
+
{"auto", trn_auto }
|
46
57
|
};
|
47
58
|
static const int trn_cnt = sizeof(trn_lst) / sizeof(trn_lst[0]);
|
48
59
|
|
@@ -52,78 +63,78 @@ static const int trn_cnt = sizeof(trn_lst) / sizeof(trn_lst[0]);
|
|
52
63
|
// Auxiliary Methods
|
53
64
|
|
54
65
|
// Unwraps the opt_t pointer stored inside a Ruby object created by
// Data_Wrap_Struct.
static opt_t *get_options(VALUE self) {
  opt_t *unwrapped;

  Data_Get_Struct(self, opt_t, unwrapped);

  return unwrapped;
}
|
59
70
|
|
60
71
|
// Copies a Ruby string to the heap and stores it in a pointer.
|
61
72
|
// Frees the pointer before assigning the new value.
|
62
73
|
static void copy_string(char **dst, VALUE rb_string) {
|
63
|
-
|
74
|
+
Check_Type(rb_string, T_STRING);
|
64
75
|
|
65
|
-
|
66
|
-
|
76
|
+
if (*dst) { free(*dst); *dst = (char*)0; }
|
77
|
+
*dst = calloc(RSTRING_LEN(rb_string) + 1, sizeof(char));
|
67
78
|
|
68
|
-
|
79
|
+
memcpy(*dst, StringValuePtr(rb_string), RSTRING_LEN(rb_string) + 1);
|
69
80
|
}
|
70
81
|
|
71
82
|
|
72
83
|
// Constructor / Destructor
|
73
84
|
|
74
85
|
// GC mark callback handed to Data_Wrap_Struct for options objects.
static void mark_options(opt_t *options __attribute__((__unused__))) {
  /* intentionally empty: nothing is marked here */
}
|
77
88
|
|
78
89
|
// Free callback handed to Data_Wrap_Struct for options objects: releases the
// heap-allocated string options and then the opt_t itself.
static void deallocate_options(opt_t *options) {
  if (!options) {
    return;  // nothing was wrapped
  }

  // free string options; free(NULL) is a no-op, so unset options need no guard
  free(options->input);
  free(options->output);
  free((void *)options->algo);
  free(options->devel);
  free((void *)options->pattern);

  // `model` is filled by copy_string() in options_set_model and was never
  // released before.
  // NOTE(review): assumes nothing else assigns a non-heap pointer to it.
  free(options->model);

  free(options);
}
|
90
101
|
|
91
102
|
// Allocation function for the options class: wraps a fresh opt_t in a Ruby
// object whose free callback is deallocate_options.
//
// self - the class being instantiated (passed by rb_define_alloc_func).
//
// The struct is zero-filled so that every string pointer starts out NULL;
// this keeps deallocate_options safe even if `initialize` never runs.
// Raises NoMemoryError when the allocation fails.
static VALUE allocate_options(VALUE self) {
  opt_t *options = calloc(1, sizeof *options);

  if (!options) {
    rb_memerror();
  }

  return Data_Wrap_Struct(self, mark_options, deallocate_options, options);
}
|
106
|
+
|
107
|
+
// Options#initialize([hash]) { |options| ... }
//
// Resets the wrapped opt_t to opt_defaults, then applies the optional Hash
// argument through #update and finally yields self if a block is given.
//
// Raises ArgumentError for more than one argument, TypeError when the
// argument is not a Hash, and NoMemoryError if the algorithm name cannot be
// copied. Returns self.
static VALUE initialize_options(int argc, VALUE *argv, VALUE self) {
  opt_t *options = get_options(self);
  *options = opt_defaults;

  if (options->maxiter == 0) {
    options->maxiter = INT_MAX;
  }

  // copy the default algorithm name to the heap so that all options strings
  // are on the heap (deallocate_options free()s them)
  if (options->algo) {
    const size_t size = strlen(options->algo) + 1;  // include the terminator
    char *copy = malloc(size);

    if (!copy) {
      // never leave the default (non-heap) pointer behind for free()
      options->algo = NULL;
      rb_memerror();
    }

    memcpy(copy, options->algo, size);
    options->algo = copy;
  }

  if (argc > 1) {
    rb_raise(rb_eArgError, "wrong number of arguments (%d for 0..1)", argc);
  }

  // apply user supplied settings on top of the defaults
  if (argc) {
    Check_Type(argv[0], T_HASH);
    (void)rb_funcall(self, rb_intern("update"), 1, argv[0]);
  }

  // yield self if block_given?
  if (rb_block_given_p()) {
    rb_yield(self);
  }

  return self;
}
|
128
139
|
|
129
140
|
|
@@ -133,192 +144,192 @@ static VALUE initialize_options(int argc, VALUE *argv, VALUE self) {
|
|
133
144
|
// Fixnum Accessors
|
134
145
|
|
135
146
|
// Reader for `nbest`: returns the stored value as a Fixnum.
static VALUE options_nbest(VALUE self) {
  const opt_t *opts = get_options(self);
  return INT2FIX(opts->nbest);
}
|
138
149
|
|
139
150
|
// Writer for `nbest`: accepts a Fixnum only (TypeError otherwise) and
// returns the argument.
static VALUE options_set_nbest(VALUE self, VALUE rb_fixnum) {
  opt_t *opts;

  Check_Type(rb_fixnum, T_FIXNUM);
  opts = get_options(self);
  opts->nbest = FIX2INT(rb_fixnum);

  return rb_fixnum;
}
|
145
156
|
|
146
157
|
|
147
158
|
// Reader for `stopwin`: returns the stored value as a Fixnum.
static VALUE options_stopwin(VALUE self) {
  const opt_t *opts = get_options(self);
  return INT2FIX(opts->stopwin);
}
|
150
161
|
|
151
162
|
// Writer for `stopwin`: accepts a Fixnum only (TypeError otherwise) and
// returns the argument.
static VALUE options_set_stopwin(VALUE self, VALUE rb_fixnum) {
  opt_t *opts;

  Check_Type(rb_fixnum, T_FIXNUM);
  opts = get_options(self);
  opts->stopwin = FIX2INT(rb_fixnum);

  return rb_fixnum;
}
|
157
168
|
|
158
169
|
// Reader for `objwin`: returns the stored value as a Fixnum.
static VALUE options_objwin(VALUE self) {
  const opt_t *opts = get_options(self);
  return INT2FIX(opts->objwin);
}
|
161
172
|
|
162
173
|
// Writer for `objwin`: accepts a Fixnum only (TypeError otherwise) and
// returns the argument.
static VALUE options_set_objwin(VALUE self, VALUE rb_fixnum) {
  opt_t *opts;

  Check_Type(rb_fixnum, T_FIXNUM);
  opts = get_options(self);
  opts->objwin = FIX2INT(rb_fixnum);

  return rb_fixnum;
}
|
168
179
|
|
169
180
|
|
170
181
|
// Reader for `maxiter`: returns the stored value as a Fixnum.
static VALUE options_maxiter(VALUE self) {
  const opt_t *opts = get_options(self);
  return INT2FIX(opts->maxiter);
}
|
173
184
|
|
174
185
|
// Writer for `maxiter`: accepts a Fixnum only (TypeError otherwise) and
// returns the argument.
static VALUE options_set_maxiter(VALUE self, VALUE rb_fixnum) {
  opt_t *const opts = get_options(self);

  Check_Type(rb_fixnum, T_FIXNUM);
  opts->maxiter = FIX2INT(rb_fixnum);

  return rb_fixnum;
}
|
182
193
|
|
183
194
|
// Reader for `jobsize`: returns the stored value as a Fixnum.
static VALUE options_jobsize(VALUE self) {
  const opt_t *opts = get_options(self);
  return INT2FIX(opts->jobsize);
}
|
186
197
|
|
187
198
|
// Writer for `jobsize`: accepts a Fixnum only (TypeError otherwise) and
// returns the argument.
static VALUE options_set_jobsize(VALUE self, VALUE rb_fixnum) {
  opt_t *const opts = get_options(self);

  Check_Type(rb_fixnum, T_FIXNUM);
  opts->jobsize = FIX2INT(rb_fixnum);

  return rb_fixnum;
}
|
195
206
|
|
196
207
|
// Reader for `nthread`: returns the stored value as a Fixnum.
static VALUE options_nthread(VALUE self) {
  const opt_t *opts = get_options(self);
  return INT2FIX(opts->nthread);
}
|
199
210
|
|
200
211
|
// Writer for `nthread`: accepts a Fixnum only (TypeError otherwise) and
// returns the argument.
static VALUE options_set_nthread(VALUE self, VALUE rb_fixnum) {
  opt_t *const opts = get_options(self);

  Check_Type(rb_fixnum, T_FIXNUM);
  opts->nthread = FIX2INT(rb_fixnum);

  return rb_fixnum;
}
|
208
219
|
|
209
220
|
// Reader for `lbfgs.histsz`: returns the stored value as a Fixnum.
static VALUE options_histsz(VALUE self) {
  const opt_t *opts = get_options(self);
  return INT2FIX(opts->lbfgs.histsz);
}
|
212
223
|
|
213
224
|
// Writer for `lbfgs.histsz`: accepts a Fixnum only (TypeError otherwise) and
// returns the argument.
static VALUE options_set_histsz(VALUE self, VALUE rb_fixnum) {
  opt_t *opts;

  Check_Type(rb_fixnum, T_FIXNUM);
  opts = get_options(self);
  opts->lbfgs.histsz = FIX2INT(rb_fixnum);

  return rb_fixnum;
}
|
219
230
|
|
220
231
|
// Reader for `lbfgs.maxls`: returns the stored value as a Fixnum.
static VALUE options_maxls(VALUE self) {
  const opt_t *opts = get_options(self);
  return INT2FIX(opts->lbfgs.maxls);
}
|
223
234
|
|
224
235
|
// Writer for `lbfgs.maxls`: accepts a Fixnum only (TypeError otherwise) and
// returns the argument.
static VALUE options_set_maxls(VALUE self, VALUE rb_fixnum) {
  opt_t *opts;

  Check_Type(rb_fixnum, T_FIXNUM);
  opts = get_options(self);
  opts->lbfgs.maxls = FIX2INT(rb_fixnum);

  return rb_fixnum;
}
|
230
241
|
|
231
242
|
|
232
243
|
// Float Accessors
|
233
244
|
|
234
245
|
// Reader for `rho1`: returns the stored value as a Ruby Float.
static VALUE options_rho1(VALUE self) {
  const double value = get_options(self)->rho1;
  return rb_float_new(value);
}
|
237
248
|
|
238
249
|
// Writer for `rho1`: converts the argument with NUM2DBL, stores it and
// returns the argument.
static VALUE options_set_rho1(VALUE self, VALUE rb_numeric) {
  const double value = NUM2DBL(rb_numeric);

  get_options(self)->rho1 = value;

  return rb_numeric;
}
|
242
253
|
|
243
254
|
// Reader for `rho2`: returns the stored value as a Ruby Float.
static VALUE options_rho2(VALUE self) {
  const double value = get_options(self)->rho2;
  return rb_float_new(value);
}
|
246
257
|
|
247
258
|
// Writer for `rho2`: converts the argument with NUM2DBL, stores it and
// returns the argument.
static VALUE options_set_rho2(VALUE self, VALUE rb_numeric) {
  const double value = NUM2DBL(rb_numeric);

  get_options(self)->rho2 = value;

  return rb_numeric;
}
|
251
262
|
|
252
263
|
// Reader for `stopeps`: returns the stored value as a Ruby Float.
static VALUE options_stopeps(VALUE self) {
  const double value = get_options(self)->stopeps;
  return rb_float_new(value);
}
|
255
266
|
|
256
267
|
// Writer for `stopeps`: converts the argument with NUM2DBL, stores it and
// returns the argument.
static VALUE options_set_stopeps(VALUE self, VALUE rb_numeric) {
  const double value = NUM2DBL(rb_numeric);

  get_options(self)->stopeps = value;

  return rb_numeric;
}
|
260
271
|
|
261
272
|
// Reader for `sgdl1.eta0`: returns the stored value as a Ruby Float.
static VALUE options_eta0(VALUE self) {
  const double value = get_options(self)->sgdl1.eta0;
  return rb_float_new(value);
}
|
264
275
|
|
265
276
|
// Writer for `sgdl1.eta0`: converts the argument with NUM2DBL, stores it and
// returns the argument.
static VALUE options_set_eta0(VALUE self, VALUE rb_numeric) {
  const double value = NUM2DBL(rb_numeric);

  get_options(self)->sgdl1.eta0 = value;

  return rb_numeric;
}
|
269
280
|
|
270
281
|
// Reader for `sgdl1.alpha`: returns the stored value as a Ruby Float.
static VALUE options_alpha(VALUE self) {
  const double value = get_options(self)->sgdl1.alpha;
  return rb_float_new(value);
}
|
273
284
|
|
274
285
|
// Writer for `sgdl1.alpha`: converts the argument with NUM2DBL, stores it and
// returns the argument.
static VALUE options_set_alpha(VALUE self, VALUE rb_numeric) {
  const double value = NUM2DBL(rb_numeric);

  get_options(self)->sgdl1.alpha = value;

  return rb_numeric;
}
|
278
289
|
|
279
290
|
// Reader for `bcd.kappa`: returns the stored value as a Ruby Float.
static VALUE options_kappa(VALUE self) {
  const double value = get_options(self)->bcd.kappa;
  return rb_float_new(value);
}
|
282
293
|
|
283
294
|
// Writer for `bcd.kappa`: converts the argument with NUM2DBL, stores it and
// returns the argument.
static VALUE options_set_kappa(VALUE self, VALUE rb_numeric) {
  const double value = NUM2DBL(rb_numeric);

  get_options(self)->bcd.kappa = value;

  return rb_numeric;
}
|
287
298
|
|
288
299
|
// Reader for `rprop.stpmin`: returns the stored value as a Ruby Float.
static VALUE options_stpmin(VALUE self) {
  const double value = get_options(self)->rprop.stpmin;
  return rb_float_new(value);
}
|
291
302
|
|
292
303
|
// Writer for `rprop.stpmin`: converts the argument with NUM2DBL, stores it
// and returns the argument.
static VALUE options_set_stpmin(VALUE self, VALUE rb_numeric) {
  const double value = NUM2DBL(rb_numeric);

  get_options(self)->rprop.stpmin = value;

  return rb_numeric;
}
|
296
307
|
|
297
308
|
// Reader for `rprop.stpmax`: returns the stored value as a Ruby Float.
static VALUE options_stpmax(VALUE self) {
  const double value = get_options(self)->rprop.stpmax;
  return rb_float_new(value);
}
|
300
311
|
|
301
312
|
// Writer for `rprop.stpmax`: converts the argument with NUM2DBL, stores it
// and returns the argument.
static VALUE options_set_stpmax(VALUE self, VALUE rb_numeric) {
  const double value = NUM2DBL(rb_numeric);

  get_options(self)->rprop.stpmax = value;

  return rb_numeric;
}
|
305
316
|
|
306
317
|
// Reader for `rprop.stpinc`: returns the stored value as a Ruby Float.
static VALUE options_stpinc(VALUE self) {
  const double value = get_options(self)->rprop.stpinc;
  return rb_float_new(value);
}
|
309
320
|
|
310
321
|
// Writer for `rprop.stpinc`: converts the argument with NUM2DBL, stores it
// and returns the argument.
static VALUE options_set_stpinc(VALUE self, VALUE rb_numeric) {
  const double value = NUM2DBL(rb_numeric);

  get_options(self)->rprop.stpinc = value;

  return rb_numeric;
}
|
314
325
|
|
315
326
|
// Reader for `rprop.stpdec`: returns the stored value as a Ruby Float.
static VALUE options_stpdec(VALUE self) {
  const double value = get_options(self)->rprop.stpdec;
  return rb_float_new(value);
}
|
318
329
|
|
319
330
|
// Writer for `rprop.stpdec`: converts the argument with NUM2DBL, stores it
// and returns the argument.
static VALUE options_set_stpdec(VALUE self, VALUE rb_numeric) {
  const double value = NUM2DBL(rb_numeric);

  get_options(self)->rprop.stpdec = value;

  return rb_numeric;
}
|
323
334
|
|
324
335
|
|
@@ -326,84 +337,84 @@ static VALUE options_set_stpdec(VALUE self, VALUE rb_numeric) {
|
|
326
337
|
// Boolean Accessors
|
327
338
|
|
328
339
|
// Reader for the `maxent` flag: Qtrue when set, Qfalse otherwise.
static VALUE options_maxent(VALUE self) {
  if (get_options(self)->maxent) {
    return Qtrue;
  }
  return Qfalse;
}
|
331
342
|
|
332
343
|
// Writer for the `maxent` flag: nil and false clear it, any other value sets
// it. Returns the argument.
static VALUE options_set_maxent(VALUE self, VALUE rb_boolean) {
  get_options(self)->maxent = RTEST(rb_boolean) ? 1 : 0;
  return rb_boolean;
}
|
336
347
|
|
337
348
|
// Reader for the `compact` flag: Qtrue when set, Qfalse otherwise.
static VALUE options_compact(VALUE self) {
  if (get_options(self)->compact) {
    return Qtrue;
  }
  return Qfalse;
}
|
340
351
|
|
341
352
|
// Writer for the `compact` flag: nil and false clear it, any other value
// sets it. Returns the argument.
static VALUE options_set_compact(VALUE self, VALUE rb_boolean) {
  get_options(self)->compact = RTEST(rb_boolean) ? 1 : 0;
  return rb_boolean;
}
|
345
356
|
|
346
357
|
// Reader for the `sparse` flag: Qtrue when set, Qfalse otherwise.
static VALUE options_sparse(VALUE self) {
  if (get_options(self)->sparse) {
    return Qtrue;
  }
  return Qfalse;
}
|
349
360
|
|
350
361
|
// Writer for the `sparse` flag: nil and false clear it, any other value sets
// it. Returns the argument.
static VALUE options_set_sparse(VALUE self, VALUE rb_boolean) {
  get_options(self)->sparse = RTEST(rb_boolean) ? 1 : 0;
  return rb_boolean;
}
|
354
365
|
|
355
366
|
// Reader for the `check` flag: Qtrue when set, Qfalse otherwise.
static VALUE options_check(VALUE self) {
  if (get_options(self)->check) {
    return Qtrue;
  }
  return Qfalse;
}
|
358
369
|
|
359
370
|
// Writer for the `check` flag: nil and false clear it, any other value sets
// it. Returns the argument.
static VALUE options_set_check(VALUE self, VALUE rb_boolean) {
  get_options(self)->check = RTEST(rb_boolean) ? 1 : 0;
  return rb_boolean;
}
|
363
374
|
|
364
375
|
// Reader for the `label` flag (registered in Ruby as `skip_tokens`): Qtrue
// when set, Qfalse otherwise.
static VALUE options_label(VALUE self) {
  if (get_options(self)->label) {
    return Qtrue;
  }
  return Qfalse;
}
|
367
378
|
|
368
379
|
// Writer for the `label` flag (registered in Ruby as `skip_tokens=`): nil and
// false clear it, any other value sets it. Returns the argument.
static VALUE options_set_label(VALUE self, VALUE rb_boolean) {
  get_options(self)->label = RTEST(rb_boolean) ? 1 : 0;
  return rb_boolean;
}
|
372
383
|
|
373
384
|
// Reader for the `outsc` flag: Qtrue when set, Qfalse otherwise.
static VALUE options_outsc(VALUE self) {
  if (get_options(self)->outsc) {
    return Qtrue;
  }
  return Qfalse;
}
|
376
387
|
|
377
388
|
// Writer for the `outsc` flag: nil and false clear it, any other value sets
// it. Returns the argument.
static VALUE options_set_outsc(VALUE self, VALUE rb_boolean) {
  get_options(self)->outsc = RTEST(rb_boolean) ? 1 : 0;
  return rb_boolean;
}
|
381
392
|
|
382
393
|
// Reader for the `lblpost` flag: Qtrue when set, Qfalse otherwise.
static VALUE options_lblpost(VALUE self) {
  if (get_options(self)->lblpost) {
    return Qtrue;
  }
  return Qfalse;
}
|
385
396
|
|
386
397
|
// Writer for the `lblpost` flag: nil and false clear it, any other value
// sets it. Returns the argument.
static VALUE options_set_lblpost(VALUE self, VALUE rb_boolean) {
  get_options(self)->lblpost = RTEST(rb_boolean) ? 1 : 0;
  return rb_boolean;
}
|
390
401
|
|
391
402
|
// Reader for the `lbfgs.clip` flag: Qtrue when set, Qfalse otherwise.
static VALUE options_clip(VALUE self) {
  if (get_options(self)->lbfgs.clip) {
    return Qtrue;
  }
  return Qfalse;
}
|
394
405
|
|
395
406
|
// Writer for the `lbfgs.clip` flag: nil and false clear it, any other value
// sets it. Returns the argument.
static VALUE options_set_clip(VALUE self, VALUE rb_boolean) {
  get_options(self)->lbfgs.clip = RTEST(rb_boolean) ? 1 : 0;
  return rb_boolean;
}
|
399
410
|
|
400
411
|
// Reader for the `rprop.cutoff` flag: Qtrue when set, Qfalse otherwise.
static VALUE options_cutoff(VALUE self) {
  if (get_options(self)->rprop.cutoff) {
    return Qtrue;
  }
  return Qfalse;
}
|
403
414
|
|
404
415
|
// Writer for the `rprop.cutoff` flag: nil and false clear it, any other
// value sets it. Returns the argument.
static VALUE options_set_cutoff(VALUE self, VALUE rb_boolean) {
  get_options(self)->rprop.cutoff = RTEST(rb_boolean) ? 1 : 0;
  return rb_boolean;
}
|
408
419
|
|
409
420
|
|
@@ -412,201 +423,201 @@ static VALUE options_set_cutoff(VALUE self, VALUE rb_boolean) {
|
|
412
423
|
// String Accessors
|
413
424
|
|
414
425
|
// Reader for `pattern`: returns a new Ruby String, empty when the option is
// unset (NULL).
static VALUE options_pattern(VALUE self) {
  const char *text = get_options(self)->pattern;

  if (!text) {
    text = "";
  }

  return rb_str_new2(text);
}
|
418
429
|
|
419
430
|
// Writer for `pattern`: replaces the stored string with a heap copy of the
// argument (via copy_string) and returns the argument.
static VALUE options_set_pattern(VALUE self, VALUE rb_string) {
  // the field is declared const, hence the cast before handing it over
  char **slot = (char **)&(get_options(self)->pattern);

  copy_string(slot, rb_string);

  return rb_string;
}
|
425
436
|
|
426
437
|
// Reader for `model`: returns a new Ruby String, empty when the option is
// unset (NULL).
static VALUE options_model(VALUE self) {
  const char *text = get_options(self)->model;

  if (!text) {
    text = "";
  }

  return rb_str_new2(text);
}
|
430
441
|
|
431
442
|
// Writer for `model`: replaces the stored string with a heap copy of the
// argument (via copy_string) and returns the argument.
static VALUE options_set_model(VALUE self, VALUE rb_string) {
  char **slot = &(get_options(self)->model);

  copy_string(slot, rb_string);

  return rb_string;
}
|
437
448
|
|
438
449
|
// Reader for `algo`: returns a new Ruby String, empty when the option is
// unset (NULL).
static VALUE options_algorithm(VALUE self) {
  const char *text = get_options(self)->algo;

  if (!text) {
    text = "";
  }

  return rb_str_new2(text);
}
|
442
453
|
|
443
454
|
// Writer for `algo`: replaces the stored string with a heap copy of the
// argument (via copy_string) and returns the argument.
static VALUE options_set_algorithm(VALUE self, VALUE rb_string) {
  // the field is declared const, hence the cast before handing it over
  char **slot = (char **)&(get_options(self)->algo);

  copy_string(slot, rb_string);

  return rb_string;
}
|
449
460
|
|
450
461
|
// Reader for `devel`: returns a new Ruby String, empty when the option is
// unset (NULL).
static VALUE options_development_data(VALUE self) {
  const char *text = get_options(self)->devel;

  if (!text) {
    text = "";
  }

  return rb_str_new2(text);
}
|
454
465
|
|
455
466
|
// Writer for `devel`: replaces the stored string with a heap copy of the
// argument (via copy_string) and returns the argument.
static VALUE options_set_development_data(VALUE self, VALUE rb_string) {
  char **slot = &(get_options(self)->devel);

  copy_string(slot, rb_string);

  return rb_string;
}
|
461
472
|
|
462
473
|
|
463
474
|
void Init_options() {
|
464
|
-
|
465
|
-
|
466
|
-
|
467
|
-
|
475
|
+
cOptions = rb_define_class_under(mWapiti, "Options", rb_cObject);
|
476
|
+
rb_define_alloc_func(cOptions, allocate_options);
|
477
|
+
|
478
|
+
rb_define_method(cOptions, "initialize", initialize_options, -1);
|
479
|
+
|
480
|
+
// Option Accessors
|
481
|
+
|
482
|
+
rb_define_method(cOptions, "stopwin", options_stopwin, 0);
|
483
|
+
rb_define_method(cOptions, "stopwin=", options_set_stopwin, 1);
|
484
|
+
|
485
|
+
rb_define_alias(cOptions, "stop_window", "stopwin");
|
486
|
+
rb_define_alias(cOptions, "stop_window=", "stopwin=");
|
468
487
|
|
469
|
-
|
488
|
+
rb_define_method(cOptions, "objwin", options_objwin, 0);
|
489
|
+
rb_define_method(cOptions, "objwin=", options_set_objwin, 1);
|
470
490
|
|
471
|
-
|
472
|
-
|
491
|
+
rb_define_alias(cOptions, "convergence_window", "objwin");
|
492
|
+
rb_define_alias(cOptions, "convergence_window=", "objwin=");
|
473
493
|
|
474
|
-
|
475
|
-
|
494
|
+
rb_define_method(cOptions, "maxiter", options_maxiter, 0);
|
495
|
+
rb_define_method(cOptions, "maxiter=", options_set_maxiter, 1);
|
476
496
|
|
477
|
-
|
478
|
-
|
497
|
+
rb_define_alias(cOptions, "max_iterations", "maxiter");
|
498
|
+
rb_define_alias(cOptions, "max_iterations=", "maxiter=");
|
479
499
|
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
rb_define_method(cOptions, "maxiter", options_maxiter, 0);
|
484
|
-
rb_define_method(cOptions, "maxiter=", options_set_maxiter, 1);
|
500
|
+
rb_define_method(cOptions, "jobsize", options_jobsize, 0);
|
501
|
+
rb_define_method(cOptions, "jobsize=", options_set_jobsize, 1);
|
485
502
|
|
486
|
-
|
487
|
-
|
488
|
-
|
489
|
-
rb_define_method(cOptions, "jobsize", options_jobsize, 0);
|
490
|
-
rb_define_method(cOptions, "jobsize=", options_set_jobsize, 1);
|
503
|
+
rb_define_method(cOptions, "nthread", options_nthread, 0);
|
504
|
+
rb_define_method(cOptions, "nthread=", options_set_nthread, 1);
|
491
505
|
|
492
|
-
|
493
|
-
|
506
|
+
rb_define_alias(cOptions, "threads", "nthread");
|
507
|
+
rb_define_alias(cOptions, "threads=", "nthread=");
|
494
508
|
|
495
|
-
|
496
|
-
|
509
|
+
rb_define_method(cOptions, "rho1", options_rho1, 0);
|
510
|
+
rb_define_method(cOptions, "rho1=", options_set_rho1, 1);
|
497
511
|
|
498
|
-
|
499
|
-
|
512
|
+
rb_define_method(cOptions, "rho2", options_rho2, 0);
|
513
|
+
rb_define_method(cOptions, "rho2=", options_set_rho2, 1);
|
500
514
|
|
501
|
-
|
502
|
-
|
515
|
+
rb_define_method(cOptions, "stopeps", options_stopeps, 0);
|
516
|
+
rb_define_method(cOptions, "stopeps=", options_set_stopeps, 1);
|
503
517
|
|
504
|
-
|
505
|
-
|
518
|
+
rb_define_alias(cOptions, "stop_epsilon", "stopeps");
|
519
|
+
rb_define_alias(cOptions, "stop_epsilon=", "stopeps=");
|
506
520
|
|
507
|
-
|
508
|
-
|
521
|
+
rb_define_method(cOptions, "maxent", options_maxent, 0);
|
522
|
+
rb_define_method(cOptions, "maxent=", options_set_maxent, 1);
|
509
523
|
|
510
|
-
|
511
|
-
rb_define_method(cOptions, "maxent=", options_set_maxent, 1);
|
524
|
+
rb_define_alias(cOptions, "maxent?", "maxent");
|
512
525
|
|
513
|
-
|
526
|
+
rb_define_method(cOptions, "compact", options_compact, 0);
|
527
|
+
rb_define_method(cOptions, "compact=", options_set_compact, 1);
|
514
528
|
|
515
|
-
|
516
|
-
rb_define_method(cOptions, "compact=", options_set_compact, 1);
|
529
|
+
rb_define_alias(cOptions, "compact?", "compact");
|
517
530
|
|
518
|
-
|
531
|
+
rb_define_method(cOptions, "sparse", options_sparse, 0);
|
532
|
+
rb_define_method(cOptions, "sparse=", options_set_sparse, 1);
|
519
533
|
|
520
|
-
|
521
|
-
rb_define_method(cOptions, "sparse=", options_set_sparse, 1);
|
534
|
+
rb_define_alias(cOptions, "sparse?", "sparse");
|
522
535
|
|
523
|
-
|
536
|
+
rb_define_method(cOptions, "skip_tokens", options_label, 0);
|
537
|
+
rb_define_method(cOptions, "skip_tokens=", options_set_label, 1);
|
524
538
|
|
525
|
-
|
526
|
-
rb_define_method(cOptions, "skip_tokens=", options_set_label, 1);
|
539
|
+
rb_define_alias(cOptions, "skip_tokens?", "skip_tokens");
|
527
540
|
|
528
|
-
|
541
|
+
rb_define_method(cOptions, "check", options_check, 0);
|
542
|
+
rb_define_method(cOptions, "check=", options_set_check, 1);
|
529
543
|
|
530
|
-
|
531
|
-
rb_define_method(cOptions, "check=", options_set_check, 1);
|
544
|
+
rb_define_alias(cOptions, "check?", "check");
|
532
545
|
|
533
|
-
|
546
|
+
rb_define_method(cOptions, "lblpost", options_lblpost, 0);
|
547
|
+
rb_define_method(cOptions, "lblpost=", options_set_lblpost, 1);
|
534
548
|
|
535
|
-
|
536
|
-
rb_define_method(cOptions, "lblpost=", options_set_lblpost, 1);
|
549
|
+
rb_define_alias(cOptions, "lblpost?", "lblpost");
|
537
550
|
|
538
|
-
|
551
|
+
rb_define_alias(cOptions, "posterior", "lblpost");
|
552
|
+
rb_define_alias(cOptions, "posterior?", "lblpost");
|
553
|
+
rb_define_alias(cOptions, "posterior=", "lblpost=");
|
539
554
|
|
540
|
-
|
541
|
-
|
542
|
-
rb_define_alias(cOptions, "posterior=", "lblpost=");
|
555
|
+
rb_define_method(cOptions, "outsc", options_outsc, 0);
|
556
|
+
rb_define_method(cOptions, "outsc=", options_set_outsc, 1);
|
543
557
|
|
544
|
-
|
545
|
-
rb_define_method(cOptions, "outsc=", options_set_outsc, 1);
|
558
|
+
rb_define_alias(cOptions, "outsc?", "outsc");
|
546
559
|
|
547
|
-
|
560
|
+
rb_define_alias(cOptions, "score", "outsc");
|
561
|
+
rb_define_alias(cOptions, "score?", "outsc");
|
562
|
+
rb_define_alias(cOptions, "score=", "outsc=");
|
548
563
|
|
549
|
-
|
550
|
-
|
551
|
-
rb_define_alias(cOptions, "score=", "outsc=");
|
564
|
+
rb_define_method(cOptions, "pattern", options_pattern, 0);
|
565
|
+
rb_define_method(cOptions, "pattern=", options_set_pattern, 1);
|
552
566
|
|
553
|
-
|
554
|
-
|
567
|
+
rb_define_alias(cOptions, "template", "pattern");
|
568
|
+
rb_define_alias(cOptions, "template=", "pattern=");
|
555
569
|
|
556
|
-
|
557
|
-
|
570
|
+
rb_define_method(cOptions, "model", options_model, 0);
|
571
|
+
rb_define_method(cOptions, "model=", options_set_model, 1);
|
558
572
|
|
559
|
-
|
560
|
-
|
573
|
+
rb_define_method(cOptions, "algorithm", options_algorithm, 0);
|
574
|
+
rb_define_method(cOptions, "algorithm=", options_set_algorithm, 1);
|
561
575
|
|
562
|
-
|
563
|
-
|
576
|
+
rb_define_alias(cOptions, "algo", "algorithm");
|
577
|
+
rb_define_alias(cOptions, "algo=", "algorithm=");
|
564
578
|
|
565
|
-
|
566
|
-
|
579
|
+
rb_define_method(cOptions, "development_data", options_development_data, 0);
|
580
|
+
rb_define_method(cOptions, "development_data=", options_set_development_data, 1);
|
567
581
|
|
568
|
-
|
569
|
-
|
582
|
+
rb_define_alias(cOptions, "devel", "development_data");
|
583
|
+
rb_define_alias(cOptions, "devel=", "development_data=");
|
570
584
|
|
571
|
-
|
572
|
-
|
585
|
+
rb_define_method(cOptions, "clip", options_clip, 0);
|
586
|
+
rb_define_method(cOptions, "clip=", options_set_clip, 1);
|
573
587
|
|
574
|
-
|
575
|
-
|
588
|
+
rb_define_method(cOptions, "histsz", options_histsz, 0);
|
589
|
+
rb_define_method(cOptions, "histsz=", options_set_histsz, 1);
|
576
590
|
|
577
|
-
|
578
|
-
|
591
|
+
rb_define_method(cOptions, "maxls", options_maxls, 0);
|
592
|
+
rb_define_method(cOptions, "maxls=", options_set_maxls, 1);
|
579
593
|
|
580
|
-
|
581
|
-
|
594
|
+
rb_define_method(cOptions, "eta0", options_eta0, 0);
|
595
|
+
rb_define_method(cOptions, "eta0=", options_set_eta0, 1);
|
582
596
|
|
583
|
-
|
584
|
-
|
597
|
+
rb_define_method(cOptions, "alpha", options_alpha, 0);
|
598
|
+
rb_define_method(cOptions, "alpha=", options_set_alpha, 1);
|
585
599
|
|
586
|
-
|
587
|
-
|
600
|
+
rb_define_method(cOptions, "kappa", options_kappa, 0);
|
601
|
+
rb_define_method(cOptions, "kappa=", options_set_kappa, 1);
|
588
602
|
|
589
|
-
|
590
|
-
|
603
|
+
rb_define_method(cOptions, "stpmin", options_stpmin, 0);
|
604
|
+
rb_define_method(cOptions, "stpmin=", options_set_stpmin, 1);
|
591
605
|
|
592
|
-
|
593
|
-
|
606
|
+
rb_define_method(cOptions, "stpmax", options_stpmax, 0);
|
607
|
+
rb_define_method(cOptions, "stpmax=", options_set_stpmax, 1);
|
594
608
|
|
595
|
-
|
596
|
-
|
609
|
+
rb_define_method(cOptions, "stpinc", options_stpinc, 0);
|
610
|
+
rb_define_method(cOptions, "stpinc=", options_set_stpinc, 1);
|
597
611
|
|
598
|
-
|
599
|
-
|
612
|
+
rb_define_method(cOptions, "stpdec", options_stpdec, 0);
|
613
|
+
rb_define_method(cOptions, "stpdec=", options_set_stpdec, 1);
|
600
614
|
|
601
|
-
|
602
|
-
|
615
|
+
rb_define_method(cOptions, "cutoff", options_cutoff, 0);
|
616
|
+
rb_define_method(cOptions, "cutoff=", options_set_cutoff, 1);
|
617
|
+
|
618
|
+
rb_define_method(cOptions, "nbest", options_nbest, 0);
|
619
|
+
rb_define_method(cOptions, "nbest=", options_set_nbest, 1);
|
603
620
|
|
604
|
-
rb_define_method(cOptions, "cutoff", options_cutoff, 0);
|
605
|
-
rb_define_method(cOptions, "cutoff=", options_set_cutoff, 1);
|
606
|
-
|
607
|
-
rb_define_method(cOptions, "nbest", options_nbest, 0);
|
608
|
-
rb_define_method(cOptions, "nbest=", options_set_nbest, 1);
|
609
|
-
|
610
621
|
}
|
611
622
|
|
612
623
|
|
@@ -615,119 +626,119 @@ void Init_options() {
|
|
615
626
|
// Auxiliary Methods
|
616
627
|
|
617
628
|
// Unwraps the mdl_t pointer stored inside a Ruby object created by
// Data_Wrap_Struct.
static mdl_t *get_model(VALUE self) {
  mdl_t *unwrapped;

  Data_Get_Struct(self, mdl_t, unwrapped);

  return unwrapped;
}
|
622
633
|
|
623
634
|
// Constructor / Destructor
|
624
635
|
|
625
636
|
// Mark callback handed to Data_Wrap_Struct for wrapped models.
static void mark_model(mdl_t *model __attribute__((__unused__))) {
  // Intentionally empty: no marking is performed for the wrapped model.
  return;
}
|
628
639
|
|
629
640
|
// Free callback handed to Data_Wrap_Struct: releases the native model through
// mdl_free. A null pointer is ignored.
static void deallocate_model(mdl_t *model) {
  if (!model) {
    return;
  }

  mdl_free(model);
}
|
635
646
|
|
636
647
|
// Allocator for Wapiti::Model: builds a native model on top of a new reader
// (created with `false`) and wraps it in a Ruby object of the given class,
// registering the mark and free callbacks defined above.
static VALUE allocate_model(VALUE self) {
  rdr_t *reader = rdr_new(false);
  mdl_t *native = mdl_new(reader);

  return Data_Wrap_Struct(self, mark_model, deallocate_model, native);
}
|
640
651
|
|
641
|
-
static VALUE model_set_options(VALUE self, VALUE rb_options) {
|
642
|
-
|
643
|
-
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
649
|
-
|
650
|
-
|
651
|
-
|
652
|
-
|
653
|
-
|
654
|
-
|
655
|
-
|
652
|
+
// Attaches an options object to the model.
//
// rb_options must be an instance of Wapiti::Options (or of a subclass);
// otherwise a NativeError is raised. The native option struct is referenced
// by the model (not copied), the reader's `autouni` flag is refreshed from
// the `maxent` option, and the Ruby object is stored in @options.
//
// Returns rb_options.
static VALUE model_set_options(VALUE self, VALUE rb_options) {
  // Check the real class instead of comparing a prefix of the class name: a
  // prefix match would accept any class whose name merely starts with
  // "Wapiti::Options" and then hand a foreign object to get_options.
  if (!RTEST(rb_obj_is_kind_of(rb_options, cOptions))) {
    rb_raise(cNativeError, "argument must be a Wapiti::Options instance");
  }

  mdl_t *model = get_model(self);

  // Store reference to options in model struct
  model->opt = get_options(rb_options);

  // Update reader
  model->reader->autouni = model->opt->maxent;

  // Save instance variable (the model object now references the options)
  rb_ivar_set(self, rb_intern("@options"), rb_options);

  return rb_options;
}
|
659
670
|
|
660
671
|
// Model#initialize(options = nil) { |options| ... }
//
// Accepts no argument (default options are created), a Hash (passed on to
// Wapiti::Options.new) or a Wapiti::Options instance. Raises ArgumentError
// for more than one argument and NativeError for any other argument type.
// The options are yielded to the block, if one is given, before they are
// attached to the model. If the options name a model file, `load` is called;
// finally the counters are reset through `clear_counters`.
//
// Returns self.
static VALUE initialize_model(int argc, VALUE *argv, VALUE self) {
  VALUE options;

  if (argc > 1) {
    rb_raise(rb_eArgError, "wrong number of arguments (%d for 0..1)", argc);
  }

  if (argc == 0) {
    options = rb_funcall(cOptions, rb_intern("new"), 0);
  }
  else if (TYPE(argv[0]) == T_HASH) {
    options = rb_funcall(cOptions, rb_intern("new"), 1, argv[0]);
  }
  else {
    // Check the real class instead of a prefix of the class name, which
    // would also accept unrelated classes named "Wapiti::Options...".
    if (!RTEST(rb_obj_is_kind_of(argv[0], cOptions))) {
      rb_raise(cNativeError, "argument must be a hash or an options instance");
    }
    options = argv[0];
  }

  // yield options if block_given?
  if (rb_block_given_p()) {
    rb_yield(options);
  }

  model_set_options(self, options);

  // Load a previous model if specified by options
  if (get_options(options)->model) {
    rb_funcall(self, rb_intern("load"), 0);
  }

  // initialize counters
  rb_funcall(self, rb_intern("clear_counters"), 0);

  return self;
}
|
700
711
|
|
701
712
|
|
702
713
|
// Native accessors
|
703
714
|
|
704
715
|
// Returns the native model's `nlbl` field as a Ruby Fixnum.
static VALUE model_nlbl(VALUE self) {
  const mdl_t *mdl = get_model(self);

  return INT2FIX(mdl->nlbl);
}
|
707
718
|
|
708
719
|
// Returns the native model's `nobs` field as a Ruby Fixnum.
static VALUE model_nobs(VALUE self) {
  const mdl_t *mdl = get_model(self);

  return INT2FIX(mdl->nobs);
}
|
711
722
|
|
712
723
|
// Returns the native model's `nftr` field as a Ruby Fixnum.
static VALUE model_nftr(VALUE self) {
  const mdl_t *mdl = get_model(self);

  return INT2FIX(mdl->nftr);
}
|
715
726
|
|
716
727
|
// Returns the native model's `total` field as a Ruby Float.
static VALUE model_total(VALUE self) {
  const mdl_t *mdl = get_model(self);

  return rb_float_new(mdl->total);
}
|
719
730
|
|
720
731
|
|
721
732
|
// Instance methods
|
722
733
|
|
723
734
|
// Runs mdl_sync on the wrapped native model. Returns self.
static VALUE model_sync(VALUE self) {
  mdl_t *mdl = get_model(self);

  mdl_sync(mdl);

  return self;
}
|
727
738
|
|
728
739
|
// Runs mdl_compact on the wrapped native model. Returns self.
static VALUE model_compact(VALUE self) {
  mdl_t *mdl = get_model(self);

  mdl_compact(mdl);

  return self;
}
|
732
743
|
|
733
744
|
// call-seq:
|
@@ -737,400 +748,398 @@ static VALUE model_compact(VALUE self) {
|
|
737
748
|
// Saves the model to a file. Uses the Model's path if no argument given,
|
738
749
|
// otherwise uses the passed-in argument as the Model's path.
|
739
750
|
// Model#save(path = nil)
//
// Writes the model to a file. When a path is given it must be a String and
// is stored in @path first; otherwise the current @path is used.
//
// Raises ArgumentError for more than one argument and NativeError when no
// path is available, when the file cannot be opened for writing, or when
// closing (and thereby flushing) the file fails.
//
// Returns self.
static VALUE model_save(int argc, VALUE *argv, VALUE self) {
  if (argc > 1) {
    rb_raise(rb_eArgError, "wrong number of arguments (%d for 0..1)", argc);
  }

  mdl_t *model = get_model(self);

  // save passed-in path in options
  if (argc) {
    Check_Type(argv[0], T_STRING);
    rb_ivar_set(self, rb_intern("@path"), argv[0]);
  }

  // open the output file
  FILE *file = 0;
  VALUE path = rb_ivar_get(self, rb_intern("@path"));

  if (NIL_P(path)) {
    rb_raise(cNativeError, "failed to save model: no path given");
  }

  if (!(file = fopen(StringValueCStr(path), "w"))) {
    rb_raise(cNativeError, "failed to save model: failed to open model file");
  }

  mdl_save(model, file);

  // fclose flushes buffered output; a failure here means the model file may
  // be incomplete, so it must not be reported as a successful save.
  if (fclose(file) != 0) {
    rb_raise(cNativeError, "failed to save model: failed to write model file");
  }

  return self;
}
|
770
781
|
|
771
782
|
// Model#load(path = nil)
//
// Reads the model from a file. When a path is given it must be a String and
// is stored in @path first; otherwise the current @path is used. Raises
// ArgumentError for more than one argument and NativeError when no path is
// available or the file cannot be opened. Returns self.
static VALUE model_load(int argc, VALUE *argv, VALUE self) {
  if (argc > 1) {
    rb_raise(rb_const_get(rb_mKernel, rb_intern("ArgumentError")),
      "wrong number of arguments (%d for 0..1)", argc);
  }

  mdl_t *target = get_model(self);
  const ID path_ivar = rb_intern("@path");

  // remember an explicitly passed path on the model
  if (argc == 1) {
    Check_Type(argv[0], T_STRING);
    rb_ivar_set(self, path_ivar, argv[0]);
  }

  VALUE path = rb_ivar_get(self, path_ivar);

  if (NIL_P(path)) {
    rb_raise(cNativeError, "failed to load model: no path given");
  }

  // open the model file
  FILE *source = fopen(StringValueCStr(path), "r");

  if (source == NULL) {
    rb_raise(cNativeError, "failed to load model: failed to open model file");
  }

  mdl_load(target, source);
  fclose(source);

  return self;
}
|
802
813
|
|
803
814
|
// Raises unless `data` is an Array whose entries are Arrays of Strings that
// can be handed out as C strings (StringValueCStr rejects embedded null
// bytes). Allocates nothing, so raising from here cannot leak.
static void to_dat_check(VALUE data) {
  Check_Type(data, T_ARRAY);

  const long n = RARRAY_LEN(data);

  for (long i = 0; i < n; ++i) {
    VALUE sequence = rb_ary_entry(data, i);
    Check_Type(sequence, T_ARRAY);

    const long k = RARRAY_LEN(sequence);

    for (long j = 0; j < k; ++j) {
      VALUE line = rb_ary_entry(sequence, j);
      Check_Type(line, T_STRING);
      (void)StringValueCStr(line);
    }
  }
}

// Converts a Ruby Array of sequences (each an Array of Strings) into a
// native dat_t using the given reader.
//
// The input is validated up front so that a type error is raised before any
// native memory is allocated; previously such an error unwound past the
// partially built dataset and leaked it. Conversion stops at the first
// sequence for which rdr_raw2seq returns 0.
//
// Returns the new dataset, or 0 when no sequence could be converted. The
// caller owns the result.
static dat_t *to_dat(rdr_t *reader, VALUE data, bool labelled) {
  to_dat_check(data);

  const unsigned int n = RARRAY_LEN(data);
  unsigned int i, j, k;

  dat_t *dat = wapiti_xmalloc(sizeof(dat_t));
  dat->nseq = 0;
  dat->mlen = 0;
  dat->lbl = labelled;
  dat->seq = wapiti_xmalloc(sizeof(seq_t*) * n);

  for (i = 0; i < n; ++i) {
    VALUE sequence = rb_ary_entry(data, i);

    k = RARRAY_LEN(sequence);
    raw_t *raw = wapiti_xmalloc(sizeof(raw_t) + sizeof(char*) * k);

    // The raw lines point into the Ruby strings; they are only borrowed for
    // the duration of the rdr_raw2seq call below.
    for (j = 0; j < k; ++j) {
      VALUE line = rb_ary_entry(sequence, j);
      raw->lines[j] = StringValueCStr(line);
    }

    raw->len = k;

    seq_t *seq = rdr_raw2seq(reader, raw, labelled);
    xfree(raw);

    if (seq == 0) { break; }

    // and store the sequence
    dat->seq[dat->nseq++] = seq;
    dat->mlen = max(dat->mlen, seq->len);
  }

  // if no sequence was read, free memory
  if (dat->nseq == 0) {
    xfree(dat->seq);
    xfree(dat);

    return 0;
  }

  return dat;
}
|
851
862
|
|
852
863
|
|
853
864
|
// Model#train(data)
//
// Trains the model with the algorithm named by the `algo` option. `data` is
// either a String (path of a training file) or an Array of sequences (see
// to_dat). The pattern and development files named in the options are
// loaded when present.
//
// Raises NativeError when the algorithm is unknown, when a file cannot be
// opened, when `data` has an unsupported type, or when no training data
// could be loaded.
//
// Returns self.
static VALUE model_train(VALUE self, VALUE data) {

  mdl_t* model = get_model(self);

  // Look up the requested training algorithm in the trainer table.
  int trn;
  for (trn = 0; trn < trn_cnt; trn++) {
    if (!strcmp(model->opt->algo, trn_lst[trn].name)) break;
  }

  if (trn == trn_cnt) {
    rb_raise(cNativeError, "failed to train model: unknown algorithm '%s'", model->opt->algo);
  }

  FILE *file;

  // Load the pattern file. This will unlock the database if previously
  // locked by loading a model. Training without a pattern file is allowed.
  if (model->opt->pattern) {
    file = fopen(model->opt->pattern, "r");

    if (!file) {
      rb_raise(cNativeError, "failed to train model: failed to load pattern file '%s'", model->opt->pattern);
    }

    rdr_loadpat(model->reader, file);
    fclose(file);
  }

  qrk_lock(model->reader->obs, false);


  // Load the training data. When this is done we lock the quarks, as we
  // do not want to put information into the model that is present only in
  // the development set.

  switch (TYPE(data)) {
    case T_STRING:
      if (!(file = fopen(StringValuePtr(data), "r"))) {
        rb_raise(cNativeError, "failed to train model: failed to open training data '%s'", StringValuePtr(data));
      }

      model->train = rdr_readdat(model->reader, file, true);
      fclose(file);

      break;
    case T_ARRAY:
      model->train = to_dat(model->reader, data, true);

      break;
    default:
      rb_raise(cNativeError, "failed to train model: invalid training data type (expected instance of String or Array)");
  }

  qrk_lock(model->reader->lbl, true);
  qrk_lock(model->reader->obs, true);

  if (!model->train || model->train->nseq == 0) {
    rb_raise(cNativeError, "failed to train model: no training data loaded");
  }

  // If present, load the development set in the model. If not specified,
  // the training dataset will be used instead.
  if (model->opt->devel) {
    if (!(file = fopen(model->opt->devel, "r"))) {
      rb_raise(cNativeError, "failed to train model: cannot open development file '%s'", model->opt->devel);
    }

    model->devel = rdr_readdat(model->reader, file, true);
    fclose(file);
  }

  // Initialize the model. If a previous model was loaded, this will be
  // just a resync, else the model structure will be created.
  rb_funcall(self, rb_intern("sync"), 0);

  // Train the model.
  uit_setup(model);
  trn_lst[trn].train(model);
  uit_cleanup(model);

  // If requested compact the model.
  if (model->opt->compact) {
    rb_funcall(self, rb_intern("compact"), 0);
  }

  return self;
}
|
945
954
|
|
946
955
|
// Returns a sorted list of all labels in the Model's label database.
|
947
956
|
// Collects the string of every label id below the model's `nlbl` from the
// reader's label database and returns them as a sorted Ruby Array.
static VALUE model_labels(VALUE self) {
  mdl_t *mdl = get_model(self);
  qrk_t *label_db = mdl->reader->lbl;
  const uint32_t count = mdl->nlbl;

  VALUE names = rb_ary_new2(count);
  unsigned int id = 0;

  while (id < count) {
    rb_ary_push(names, rb_str_new2(qrk_id2str(label_db, id)));
    ++id;
  }

  // sort in place on the Ruby side
  rb_funcall(names, rb_intern("sort!"), 0);

  return names;
}
|
963
972
|
|
964
973
|
// Labels a single raw sequence and returns a Ruby Array with one entry per
// token. Each entry is an Array holding the raw input line (unless the
// `label` option is set), followed by the `nbest` predicted labels, each
// followed by its score when the `outsc` option is set. When a block is
// given, every token entry is yielded and replaced by the block's result.
//
// With the `check` option set, the @token_count, @token_errors,
// @sequence_count and @sequence_errors counters on self are updated by
// comparing the first predicted label with the label read from the input.
//
// Raises NativeError if the raw sequence cannot be converted.
//
// NOTE(review): an exception raised from the block (rb_yield) unwinds past
// the cleanup at the end of this function - confirm whether that matters.
static VALUE decode_sequence(VALUE self, mdl_t *model, raw_t *raw) {
  qrk_t *lbls = model->reader->lbl;

  const unsigned int N = model->opt->nbest;

  seq_t *seq = rdr_raw2seq(model->reader, raw, model->opt->check);

  // to_dat treats a null result of rdr_raw2seq as possible, so guard against
  // it here as well instead of dereferencing a null pointer.
  if (seq == 0) {
    rb_raise(cNativeError, "failed to label data: could not convert sequence");
  }

  const unsigned int T = seq->len;
  unsigned int n, t, tcnt = 0, terr = 0, scnt = 0, serr = 0;

  uint32_t *out = wapiti_xmalloc(sizeof(uint32_t) * T * N);
  double *psc = wapiti_xmalloc(sizeof(double) * T * N);
  double *scs = wapiti_xmalloc(sizeof(double) * N);

  VALUE sequence, tokens;

  if (N == 1) {
    tag_viterbi(model, seq, out, scs, psc);
  }
  else {
    tag_nbviterbi(model, seq, N, (void*)out, scs, (void*)psc);
  }

  sequence = rb_ary_new();

  for (t = 0; t < T; ++t) {
    tokens = rb_ary_new();

    if (!model->opt->label) {
      VALUE token = rb_str_new2(raw->lines[t]);

#ifdef HAVE_RUBY_ENCODING_H
      int enc = rb_enc_find_index("UTF-8");
      rb_enc_associate_index(token, enc);
#endif

      rb_ary_push(tokens, token);
    }

    for (n = 0; n < N; ++n) {

      uint64_t lbl = out[t * N + n];
      rb_ary_push(tokens, rb_str_new2(qrk_id2str(lbls, lbl)));

      // output individual score
      if (model->opt->outsc) {
        rb_ary_push(tokens, rb_float_new(psc[t * N + n]));
      }

    }

    // yield token/label pair to block if given
    if (rb_block_given_p()) {
      tokens = rb_yield(tokens);
    }

    rb_ary_push(sequence, tokens);


    // TODO output sequence score: scs[n] (float)

  }

  // Statistics
  //
  // Only the error counters are maintained. The former per-label table was an
  // uninitialized variable-length array that was written to but never read,
  // and it was indexed with label ids that were not range-checked.
  if (model->opt->check) {
    int err = 0;

    for (t = 0; t < T; ++t) {
      if (seq->pos[t].lbl != out[t * N]) {
        terr++;
        err = 1;
      }
    }

    tcnt = FIX2INT(rb_ivar_get(self, rb_intern("@token_count")));
    rb_ivar_set(self, rb_intern("@token_count"), INT2FIX(tcnt + (unsigned int)T));

    terr += FIX2INT(rb_ivar_get(self, rb_intern("@token_errors")));
    rb_ivar_set(self, rb_intern("@token_errors"), INT2FIX(terr));

    scnt = FIX2INT(rb_ivar_get(self, rb_intern("@sequence_count")));
    rb_ivar_set(self, rb_intern("@sequence_count"), INT2FIX(++scnt));

    serr = FIX2INT(rb_ivar_get(self, rb_intern("@sequence_errors")));
    rb_ivar_set(self, rb_intern("@sequence_errors"), INT2FIX(serr + err));

  }


  // Cleanup memory used for this sequence
  xfree(scs);
  xfree(psc);
  xfree(out);

  rdr_freeseq(seq);

  return sequence;
}
|
1069
1078
|
|
1070
1079
|
// Labels every sequence of a Ruby Array of sequences (each an Array of
// Strings) and returns an Array holding one decode_sequence result per
// input sequence.
//
// Each sequence is fully validated before its raw buffer is allocated, so a
// type error no longer leaks the buffer. Validation happens per sequence
// (not once up front) because decode_sequence may yield to a Ruby block
// between sequences.
//
// NOTE(review): if decode_sequence itself raises, the raw buffer of the
// current sequence is still not released.
static VALUE decode_sequence_array(VALUE self, VALUE array) {
  Check_Type(array, T_ARRAY);
  const unsigned int n = RARRAY_LEN(array);

  mdl_t *model = get_model(self);
  raw_t *raw;

  unsigned int i, j;

  // one result entry per input sequence
  VALUE result = rb_ary_new2(n), sequence;

  for (i = 0; i < n; ++i) {
    sequence = rb_ary_entry(array, i);
    Check_Type(sequence, T_ARRAY);

    const unsigned int k = RARRAY_LEN(sequence);

    // Validate all lines first; nothing is allocated yet, so raising is safe.
    for (j = 0; j < k; ++j) {
      VALUE line = rb_ary_entry(sequence, j);
      Check_Type(line, T_STRING);
      (void)StringValueCStr(line);
    }

    raw = wapiti_xmalloc(sizeof(raw_t) + sizeof(char*) * k);
    raw->len = k;

    // The raw lines borrow the Ruby strings' buffers.
    for (j = 0; j < k; ++j) {
      VALUE line = rb_ary_entry(sequence, j);
      raw->lines[j] = StringValueCStr(line);
    }

    rb_ary_push(result, decode_sequence(self, model, raw));

    xfree(raw);
  }

  return result;
}
|
1104
1113
|
|
1105
1114
|
// Labels every sequence found in the file at `path` (a String) and returns
// an Array holding one decode_sequence result per sequence read.
//
// Raises NativeError if the file cannot be opened.
//
// NOTE(review): if decode_sequence raises, the file and the current raw
// sequence are not released.
static VALUE decode_sequence_file(VALUE self, VALUE path) {
  Check_Type(path, T_STRING);
  FILE *file;

  if (!(file = fopen(StringValueCStr(path), "r"))) {
    rb_raise(cNativeError, "failed to label data: could not open file '%s'", StringValueCStr(path));
  }

  mdl_t *model = get_model(self);
  raw_t *raw;

  VALUE result = rb_ary_new();

  // Next read the input file sequence by sequence and label them, we have
  // to take care of not discarding the raw input as we want to send it
  // back to the output with the additional predicted labels.
  while (!feof(file)) {

    // So, first read an input sequence keeping the raw_t object
    // available, and label it with Viterbi.
    if ((raw = rdr_readraw(model->reader, file)) == 0) {
      break;
    }

    rb_ary_push(result, decode_sequence(self, model, raw));
    rdr_freeraw(raw);
  }

  // The input file was previously never closed, leaking one stream per call.
  fclose(file);

  return result;
}
|
1135
1144
|
|
1136
1145
|
// call-seq:
|
@@ -1138,144 +1147,146 @@ static VALUE decode_sequence_file(VALUE self, VALUE path) {
|
|
1138
1147
|
// m.label(filename, options = {}) # => array of labelled tokens
|
1139
1148
|
//
|
1140
1149
|
// Dispatches on the type of `data`: a String is labelled as a file path, an
// Array as in-memory sequences. Any other type raises NativeError.
static VALUE model_label(VALUE self, VALUE data) {
  const int type = TYPE(data);

  if (type == T_STRING) {
    return decode_sequence_file(self, data);
  }

  if (type == T_ARRAY) {
    return decode_sequence_array(self, data);
  }

  rb_raise(cNativeError, "failed to label data: invalid data (expected type String or Array)");

  return Qnil; // not reached
}
|
1156
1165
|
|
1157
1166
|
static void Init_model() {
|
1158
|
-
|
1159
|
-
|
1160
|
-
|
1161
|
-
|
1167
|
+
cModel = rb_define_class_under(mWapiti, "Model", rb_cObject);
|
1168
|
+
rb_define_alloc_func(cModel, allocate_model);
|
1169
|
+
|
1170
|
+
rb_define_method(cModel, "initialize", initialize_model, -1);
|
1171
|
+
|
1172
|
+
rb_define_attr(cModel, "options", 1, 0);
|
1173
|
+
|
1162
1174
|
|
1163
|
-
|
1175
|
+
rb_define_method(cModel, "nlbl", model_nlbl, 0);
|
1176
|
+
rb_define_method(cModel, "labels", model_labels, 0);
|
1164
1177
|
|
1165
|
-
|
1166
|
-
|
1167
|
-
rb_define_method(cModel, "labels", model_labels, 0);
|
1168
|
-
|
1169
|
-
rb_define_method(cModel, "nobs", model_nobs, 0);
|
1170
|
-
rb_define_alias(cModel, "observations", "nobs");
|
1178
|
+
rb_define_method(cModel, "nobs", model_nobs, 0);
|
1179
|
+
rb_define_alias(cModel, "observations", "nobs");
|
1171
1180
|
|
1172
|
-
|
1173
|
-
|
1181
|
+
rb_define_method(cModel, "nftr", model_nftr, 0);
|
1182
|
+
rb_define_alias(cModel, "features", "nftr");
|
1174
1183
|
|
1175
|
-
|
1184
|
+
rb_define_method(cModel, "total", model_total, 0);
|
1176
1185
|
|
1177
|
-
|
1178
|
-
|
1179
|
-
|
1180
|
-
|
1186
|
+
rb_define_method(cModel, "sync", model_sync, 0);
|
1187
|
+
rb_define_method(cModel, "compact", model_compact, 0);
|
1188
|
+
rb_define_method(cModel, "save", model_save, -1);
|
1189
|
+
rb_define_method(cModel, "load", model_load, -1);
|
1181
1190
|
|
1182
|
-
|
1183
|
-
|
1191
|
+
rb_define_method(cModel, "train", model_train, 1);
|
1192
|
+
rb_define_method(cModel, "label", model_label, 1);
|
1184
1193
|
}
|
1185
1194
|
|
1186
1195
|
/* --- Top-Level Utility Methods --- */
|
1187
1196
|
|
1188
1197
|
|
1189
1198
|
// Wapiti::Native.label(options)
//
// Runs dolabel on a temporary model configured by the given Wapiti::Options
// instance, whose mode must be 1. Raises NativeError for any other argument
// or mode. Returns nil.
static VALUE label(VALUE self __attribute__((__unused__)), VALUE rb_options) {
  // Check the real class instead of a prefix of the class name, which would
  // also accept unrelated classes named "Wapiti::Options...".
  if (!RTEST(rb_obj_is_kind_of(rb_options, cOptions))) {
    rb_raise(cNativeError, "argument must be a native options instance");
  }

  opt_t *options = get_options(rb_options);

  if (options->mode != 1) {
    rb_raise(cNativeError, "invalid options argument: mode should be set to 1 for labelling");
  }

  mdl_t *model = mdl_new(rdr_new(options->maxent));
  model->opt = options;

  dolabel(model);

  mdl_free(model);

  return Qnil;
}
|
1209
1218
|
|
1219
|
+
#if defined EXTRA
|
1210
1220
|
// Wapiti::Native.dump(options) - only compiled when EXTRA is defined.
//
// Runs dodump on a temporary model configured by the given Wapiti::Options
// instance, whose mode must be 2. Raises NativeError for any other argument
// or mode. Returns nil.
static VALUE dump(VALUE self __attribute__((__unused__)), VALUE rb_options) {
  // Check the real class instead of a prefix of the class name, which would
  // also accept unrelated classes named "Wapiti::Options...".
  if (!RTEST(rb_obj_is_kind_of(rb_options, cOptions))) {
    rb_raise(cNativeError, "argument must be a native options instance");
  }

  opt_t *options = get_options(rb_options);

  // The message used to say "for training", copied from another entry point.
  if (options->mode != 2) {
    rb_raise(cNativeError, "invalid options argument: mode should be set to 2 for dumping");
  }

  mdl_t *model = mdl_new(rdr_new(options->maxent));
  model->opt = options;

  dodump(model);

  mdl_free(model);

  return Qnil;
}
|
1230
1240
|
|
1231
1241
|
// This function is a proxy for Wapiti's main entry point.
|
1232
1242
|
// Wapiti::Native.wapiti(arguments) - only compiled when EXTRA is defined.
//
// Proxy for wapiti_main: `arguments` is a String holding a command line
// without the program name. It is split on spaces and tabs into an argument
// vector with "wapiti" prepended; empty tokens are skipped and only the
// first seventeen tokens (program name included) are used.
//
// Returns the result of wapiti_main as a Fixnum. Raises NoMemoryError if the
// argument buffer cannot be allocated.
static VALUE wapiti(VALUE self __attribute__((__unused__)), VALUE arguments) {
  enum { MAX_ARGS = 17 };
  static const char command[] = "wapiti ";

  char *argv[MAX_ARGS + 1];
  int argc = 0;

  Check_Type(arguments, T_STRING);
  const char *args = StringValueCStr(arguments);

  // sizeof command already accounts for the terminating null byte
  const size_t prefix_len = sizeof command - 1;
  const size_t args_len = strlen(args);

  char *buffer = malloc(prefix_len + args_len + 1);

  if (buffer == NULL) {
    rb_raise(rb_eNoMemError, "failed to allocate argument vector");
  }

  // prepend command name; the lengths are known, so copy exactly. The old
  // strncat bound (strlen(input) - 8) underflowed and bounded nothing.
  memcpy(buffer, command, prefix_len);
  memcpy(buffer + prefix_len, args, args_len + 1);

  // turn input string into argument vector; strsep advances `cursor`, so the
  // allocation pointer stays in `buffer`. Only stored tokens are counted,
  // keeping argc in step with argv even when the input has repeated blanks.
  char *cursor = buffer;
  char *token;

  while (argc < MAX_ARGS && (token = strsep(&cursor, " \t")) != NULL) {
    if (*token != '\0') {
      argv[argc++] = token;
    }
  }

  // argv[argc] must be a null pointer, as for a regular main()
  argv[argc] = NULL;

  // call main entry point
  int result = wapiti_main(argc, argv);

  // free allocated memory
  free(buffer);

  return INT2FIX(result);
}
|
1273
|
+
#endif
|
1263
1274
|
|
1264
1275
|
/* --- Wapiti Extension Entry Point --- */
|
1265
1276
|
|
1266
1277
|
void Init_native() {
|
1267
|
-
|
1268
|
-
|
1269
|
-
|
1270
|
-
|
1271
|
-
|
1272
|
-
|
1273
|
-
|
1274
|
-
|
1275
|
-
|
1276
|
-
|
1277
|
-
|
1278
|
-
|
1279
|
-
|
1280
|
-
|
1281
|
-
}
|
1278
|
+
mWapiti = rb_const_get(rb_mKernel, rb_intern("Wapiti"));
|
1279
|
+
mNative = rb_define_module_under(mWapiti, "Native");
|
1280
|
+
|
1281
|
+
cNativeError = rb_const_get(mWapiti, rb_intern("NativeError"));
|
1282
|
+
cConfigurationError = rb_const_get(mWapiti, rb_intern("ConfigurationError"));
|
1283
|
+
cLogger = rb_funcall(mWapiti, rb_intern("log"), 0);
|
1284
|
+
|
1285
|
+
rb_define_singleton_method(mNative, "label", label, 1);
|
1286
|
+
// rb_define_singleton_method(mNative, "wapiti", wapiti, 1);
|
1287
|
+
|
1288
|
+
rb_define_const(mNative, "VERSION", rb_str_new2(VERSION));
|
1289
|
+
|
1290
|
+
Init_options();
|
1291
|
+
Init_model();
|
1292
|
+
}
|