wapiti 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.autotest +13 -0
- data/.gitignore +5 -0
- data/.rspec +3 -0
- data/Gemfile +6 -0
- data/LICENSE +30 -0
- data/README.md +153 -0
- data/Rakefile +33 -0
- data/ext/wapiti/bcd.c +392 -0
- data/ext/wapiti/decoder.c +535 -0
- data/ext/wapiti/decoder.h +46 -0
- data/ext/wapiti/extconf.rb +8 -0
- data/ext/wapiti/gradient.c +818 -0
- data/ext/wapiti/gradient.h +81 -0
- data/ext/wapiti/lbfgs.c +294 -0
- data/ext/wapiti/model.c +296 -0
- data/ext/wapiti/model.h +100 -0
- data/ext/wapiti/native.c +1238 -0
- data/ext/wapiti/native.h +15 -0
- data/ext/wapiti/options.c +278 -0
- data/ext/wapiti/options.h +91 -0
- data/ext/wapiti/pattern.c +395 -0
- data/ext/wapiti/pattern.h +56 -0
- data/ext/wapiti/progress.c +167 -0
- data/ext/wapiti/progress.h +43 -0
- data/ext/wapiti/quark.c +272 -0
- data/ext/wapiti/quark.h +46 -0
- data/ext/wapiti/reader.c +553 -0
- data/ext/wapiti/reader.h +73 -0
- data/ext/wapiti/rprop.c +191 -0
- data/ext/wapiti/sequence.h +148 -0
- data/ext/wapiti/sgdl1.c +218 -0
- data/ext/wapiti/thread.c +171 -0
- data/ext/wapiti/thread.h +42 -0
- data/ext/wapiti/tools.c +202 -0
- data/ext/wapiti/tools.h +54 -0
- data/ext/wapiti/trainers.h +39 -0
- data/ext/wapiti/vmath.c +372 -0
- data/ext/wapiti/vmath.h +51 -0
- data/ext/wapiti/wapiti.c +288 -0
- data/ext/wapiti/wapiti.h +45 -0
- data/lib/wapiti.rb +30 -0
- data/lib/wapiti/errors.rb +17 -0
- data/lib/wapiti/model.rb +49 -0
- data/lib/wapiti/options.rb +113 -0
- data/lib/wapiti/utility.rb +15 -0
- data/lib/wapiti/version.rb +3 -0
- data/spec/fixtures/ch.mod +18550 -0
- data/spec/fixtures/chpattern.txt +52 -0
- data/spec/fixtures/chtest.txt +1973 -0
- data/spec/fixtures/chtrain.txt +19995 -0
- data/spec/fixtures/nppattern.txt +52 -0
- data/spec/fixtures/nptest.txt +1973 -0
- data/spec/fixtures/nptrain.txt +19995 -0
- data/spec/fixtures/pattern.txt +14 -0
- data/spec/fixtures/test.txt +60000 -0
- data/spec/fixtures/train.txt +1200 -0
- data/spec/spec_helper.rb +21 -0
- data/spec/wapiti/model_spec.rb +173 -0
- data/spec/wapiti/native_spec.rb +12 -0
- data/spec/wapiti/options_spec.rb +175 -0
- data/spec/wapiti/utility_spec.rb +22 -0
- data/wapiti.gemspec +35 -0
- metadata +178 -0
data/ext/wapiti/model.h
ADDED
@@ -0,0 +1,100 @@
|
|
1
|
+
/*
|
2
|
+
* Wapiti - A linear-chain CRF tool
|
3
|
+
*
|
4
|
+
* Copyright (c) 2009-2011 CNRS
|
5
|
+
* All rights reserved.
|
6
|
+
*
|
7
|
+
* Redistribution and use in source and binary forms, with or without
|
8
|
+
* modification, are permitted provided that the following conditions are met:
|
9
|
+
* * Redistributions of source code must retain the above copyright
|
10
|
+
* notice, this list of conditions and the following disclaimer.
|
11
|
+
* * Redistributions in binary form must reproduce the above copyright
|
12
|
+
* notice, this list of conditions and the following disclaimer in the
|
13
|
+
* documentation and/or other materials provided with the distribution.
|
14
|
+
*
|
15
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
16
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
18
|
+
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
19
|
+
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
20
|
+
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
21
|
+
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
22
|
+
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
23
|
+
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
24
|
+
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
25
|
+
* POSSIBILITY OF SUCH DAMAGE.
|
26
|
+
*/
|
27
|
+
|
28
|
+
#ifndef model_h
|
29
|
+
#define model_h
|
30
|
+
|
31
|
+
#include <stddef.h>
|
32
|
+
#include <sys/times.h>
|
33
|
+
|
34
|
+
#include "wapiti.h"
|
35
|
+
#include "options.h"
|
36
|
+
#include "sequence.h"
|
37
|
+
#include "reader.h"
|
38
|
+
|
39
|
+
typedef struct tms tms_t;
|
40
|
+
|
41
|
+
/* mdl_t:
|
42
|
+
* Represents a linear-chain CRF model. The model contains both unigram and
|
43
|
+
* bigram features. It is characterized by <nlbl> the number of labels, <nobs>
|
44
|
+
* the number of observations, and <nftr> the number of features.
|
45
|
+
*
|
46
|
+
* Each observation has a corresponding entry in <kind> whose first bit is
|
47
|
+
* set if the observation is unigram and second one if it is bigram. Note that
|
48
|
+
* an observation can be both. A unigram observation produces Y features and a
|
49
|
+
* bigram one produces Y * Y features.
|
50
|
+
* The <theta> array keeps all feature weights. The <*off> arrays give for each
|
51
|
+
* observation the offset in the <theta> array where the features of the
|
52
|
+
* observation are stored.
|
53
|
+
*
|
54
|
+
* The <*off> and <theta> array are initialized only when the model is
|
55
|
+
* synchronized. As you can add new labels and observations after a sync, we
|
56
|
+
* keep track of the old counts in <olbl> and <oblk> to detect inconsistency
|
57
|
+
* and resynchronize the model if needed. In this case, if the number of
|
58
|
+
* labels have not changed, the previously trained weights are kept, else they
|
59
|
+
* are now meaningless so discarded.
|
60
|
+
*/
|
61
|
+
typedef struct mdl_s mdl_t;
|
62
|
+
struct mdl_s {
|
63
|
+
opt_t *opt; // options for training
|
64
|
+
|
65
|
+
// Size of various model parameters
|
66
|
+
size_t nlbl; // Y number of labels
|
67
|
+
size_t nobs; // O number of observations
|
68
|
+
size_t nftr; // F number of features
|
69
|
+
|
70
|
+
// Informations about observations
|
71
|
+
char *kind; // [O] observations type
|
72
|
+
size_t *uoff; // [O] unigram weights offset
|
73
|
+
size_t *boff; // [O] bigram weights offset
|
74
|
+
|
75
|
+
// The model itself
|
76
|
+
double *theta; // [F] features weights
|
77
|
+
|
78
|
+
// Datasets
|
79
|
+
dat_t *train; // training dataset
|
80
|
+
dat_t *devel; // development dataset
|
81
|
+
rdr_t *reader;
|
82
|
+
|
83
|
+
// Stoping criterion
|
84
|
+
double *werr; // Window of error rate of last iters
|
85
|
+
int wcnt; // Number of iters in the window
|
86
|
+
int wpos; // Position for the next iter
|
87
|
+
|
88
|
+
// Timing
|
89
|
+
tms_t timer; // start time of last iter
|
90
|
+
double total; // total training time
|
91
|
+
};
|
92
|
+
|
93
|
+
mdl_t *mdl_new(rdr_t *rdr);
|
94
|
+
void mdl_free(mdl_t *mdl);
|
95
|
+
void mdl_sync(mdl_t *mdl);
|
96
|
+
void mdl_compact(mdl_t *mdl);
|
97
|
+
void mdl_save(mdl_t *mdl, FILE *file);
|
98
|
+
void mdl_load(mdl_t *mdl, FILE *file);
|
99
|
+
|
100
|
+
#endif
|
data/ext/wapiti/native.c
ADDED
@@ -0,0 +1,1238 @@
|
|
1
|
+
#include <stdio.h>
|
2
|
+
#include <string.h>
|
3
|
+
|
4
|
+
#include "wapiti.h"
|
5
|
+
#include "options.h"
|
6
|
+
#include "reader.h"
|
7
|
+
#include "model.h"
|
8
|
+
#include "trainers.h"
|
9
|
+
#include "quark.h"
|
10
|
+
#include "tools.h"
|
11
|
+
|
12
|
+
#include "native.h"
|
13
|
+
|
14
|
+
VALUE mWapiti;
|
15
|
+
VALUE mNative;
|
16
|
+
|
17
|
+
VALUE cOptions;
|
18
|
+
VALUE cModel;
|
19
|
+
|
20
|
+
VALUE cNativeError;
|
21
|
+
VALUE cConfigurationError;
|
22
|
+
VALUE cLogger;
|
23
|
+
|
24
|
+
|
25
|
+
/* --- Utilities --- */
|
26
|
+
|
27
|
+
// "auto" trainer: runs SGD-L1 with the iteration limit temporarily forced
// to 3, then restores the caller's limit and continues with L-BFGS.
static void trn_auto(mdl_t *mdl) {
  const int user_limit = mdl->opt->maxiter;

  // short SGD-L1 phase
  mdl->opt->maxiter = 3;
  trn_sgdl1(mdl);

  // main L-BFGS phase with the original limit
  mdl->opt->maxiter = user_limit;
  trn_lbfgs(mdl);
}
|
34
|
+
|
35
|
+
static const struct {
|
36
|
+
char *name;
|
37
|
+
void (* train)(mdl_t *mdl);
|
38
|
+
} trn_lst[] = {
|
39
|
+
{"l-bfgs", trn_lbfgs},
|
40
|
+
{"sgd-l1", trn_sgdl1},
|
41
|
+
{"bcd", trn_bcd },
|
42
|
+
{"rprop", trn_rprop},
|
43
|
+
{"rprop+", trn_rprop},
|
44
|
+
{"rprop-", trn_rprop},
|
45
|
+
{"auto", trn_auto }
|
46
|
+
};
|
47
|
+
static const int trn_cnt = sizeof(trn_lst) / sizeof(trn_lst[0]);
|
48
|
+
|
49
|
+
|
50
|
+
/* --- Options Class --- */
|
51
|
+
|
52
|
+
// Auxiliary Methods
|
53
|
+
|
54
|
+
// Unwraps the opt_t stored inside a Ruby Options object.
static opt_t *get_options(VALUE self) {
  opt_t *unwrapped;

  Data_Get_Struct(self, opt_t, unwrapped);

  return unwrapped;
}
|
59
|
+
|
60
|
+
// Copies a Ruby string to the heap and stores it in a pointer.
|
61
|
+
// Frees the pointer before assigning the new value.
|
62
|
+
static void copy_string(char **dst, VALUE rb_string) {
|
63
|
+
Check_Type(rb_string, T_STRING);
|
64
|
+
|
65
|
+
if (*dst) { free(*dst); *dst = (char*)0; }
|
66
|
+
*dst = calloc(RSTRING_LEN(rb_string) + 1, sizeof(char));
|
67
|
+
|
68
|
+
memcpy(*dst, StringValuePtr(rb_string), RSTRING_LEN(rb_string) + 1);
|
69
|
+
}
|
70
|
+
|
71
|
+
|
72
|
+
// Constructor / Destructor
|
73
|
+
|
74
|
+
// GC mark callback registered for Options objects; intentionally a no-op.
static void mark_options(opt_t* options __attribute__((__unused__))) {
  /* nothing to mark */
}
|
77
|
+
|
78
|
+
// Free callback registered for Options objects: releases every string owned
// by the opt_t and then the struct itself.
static void deallocate_options(opt_t* options) {
  // free(NULL) is a no-op, so unset strings need no guard
  free(options->input);
  free(options->output);
  free(options->algo);
  free(options->devel);
  free(options->pattern);
  // the model path is heap-allocated by options_set_model and was
  // previously leaked here
  free(options->model);

  free(options);
}
|
90
|
+
|
91
|
+
// Allocator for Wapiti::Options: wraps a fresh opt_t in a Ruby object.
//
// The struct is zero-filled so that deallocate_options only ever sees NULL
// string pointers if the object is collected before initialize has run.
// Raises NoMemoryError when the allocation fails.
static VALUE allocate_options(VALUE self) {
  opt_t* options = calloc(1, sizeof *options);
  if (!options) {
    rb_memerror();
  }
  return Data_Wrap_Struct(self, mark_options, deallocate_options, options);
}
|
95
|
+
|
96
|
+
// Options#initialize([hash]) { |options| ... }
//
// Resets the wrapped opt_t to opt_defaults, applies an optional Hash of
// settings through #update and yields self when a block is given.
// Raises ArgumentError when more than one argument is passed and TypeError
// when the argument is not a Hash. Returns self.
static VALUE initialize_options(int argc, VALUE *argv, VALUE self) {
  opt_t* options = get_options(self);
  *options = opt_defaults;

  // a default limit of 0 is replaced by INT_MAX
  if (options->maxiter == 0) {
    options->maxiter = INT_MAX;
  }

  // copy the default algorithm name to the heap so that all options strings
  // are on the heap; the copy includes the terminating NUL (the previous
  // code allocated and copied strlen() bytes only, leaving the string
  // unterminated)
  const size_t algo_size = strlen(options->algo) + 1;
  char* tmp = malloc(algo_size);
  if (!tmp) {
    // do not leave the non-heap default behind for deallocate_options
    options->algo = NULL;
    rb_memerror();
  }
  memcpy(tmp, options->algo, algo_size);
  options->algo = tmp;

  if (argc > 1) {
    rb_raise(rb_const_get(rb_mKernel, rb_intern("ArgumentError")),
      "wrong number of arguments (%d for 0..1)", argc);
  }

  // apply the user supplied settings on top of the defaults
  if (argc) {
    Check_Type(argv[0], T_HASH);
    (void)rb_funcall(self, rb_intern("update"), 1, argv[0]);
  }

  // yield self if block_given?
  if (rb_block_given_p()) {
    rb_yield(self);
  }

  return self;
}
|
128
|
+
|
129
|
+
|
130
|
+
// Instance Methods
|
131
|
+
|
132
|
+
|
133
|
+
// Fixnum Accessors
|
134
|
+
|
135
|
+
// Reader for the nbest option, returned as a Fixnum.
static VALUE options_nbest(VALUE self) {
  const opt_t *opts = get_options(self);
  return INT2FIX(opts->nbest);
}
|
138
|
+
|
139
|
+
// Writer for the nbest option; the argument must be a Fixnum.
static VALUE options_set_nbest(VALUE self, VALUE rb_fixnum) {
  opt_t *opts;

  Check_Type(rb_fixnum, T_FIXNUM);
  opts = get_options(self);
  opts->nbest = FIX2INT(rb_fixnum);
  return rb_fixnum;
}
|
145
|
+
|
146
|
+
|
147
|
+
// Reader for the stopwin option, returned as a Fixnum.
static VALUE options_stopwin(VALUE self) {
  const opt_t *opts = get_options(self);
  return INT2FIX(opts->stopwin);
}
|
150
|
+
|
151
|
+
// Writer for the stopwin option; the argument must be a Fixnum.
static VALUE options_set_stopwin(VALUE self, VALUE rb_fixnum) {
  opt_t *opts;

  Check_Type(rb_fixnum, T_FIXNUM);
  opts = get_options(self);
  opts->stopwin = FIX2INT(rb_fixnum);
  return rb_fixnum;
}
|
157
|
+
|
158
|
+
// Reader for the objwin option, returned as a Fixnum.
static VALUE options_objwin(VALUE self) {
  const opt_t *opts = get_options(self);
  return INT2FIX(opts->objwin);
}
|
161
|
+
|
162
|
+
// Writer for the objwin option; the argument must be a Fixnum.
static VALUE options_set_objwin(VALUE self, VALUE rb_fixnum) {
  opt_t *opts;

  Check_Type(rb_fixnum, T_FIXNUM);
  opts = get_options(self);
  opts->objwin = FIX2INT(rb_fixnum);
  return rb_fixnum;
}
|
168
|
+
|
169
|
+
|
170
|
+
// Reader for the maxiter option.
//
// INT2NUM is used instead of INT2FIX because initialize_options may set
// maxiter to INT_MAX, which does not fit into a Fixnum on platforms where
// long is 32 bits wide; INT2NUM falls back to a Bignum in that case and
// yields the same Fixnum everywhere else.
static VALUE options_maxiter(VALUE self) {
  return INT2NUM(get_options(self)->maxiter);
}
|
173
|
+
|
174
|
+
// Writer for the maxiter option; the argument must be a Fixnum.
static VALUE options_set_maxiter(VALUE self, VALUE rb_fixnum) {
  opt_t *const opts = get_options(self);

  Check_Type(rb_fixnum, T_FIXNUM);

  const int limit = FIX2INT(rb_fixnum);
  opts->maxiter = limit;
  return rb_fixnum;
}
|
182
|
+
|
183
|
+
// Reader for the jobsize option, returned as a Fixnum.
static VALUE options_jobsize(VALUE self) {
  const opt_t *opts = get_options(self);
  return INT2FIX(opts->jobsize);
}
|
186
|
+
|
187
|
+
// Writer for the jobsize option; the argument must be a Fixnum.
static VALUE options_set_jobsize(VALUE self, VALUE rb_fixnum) {
  opt_t *const opts = get_options(self);

  Check_Type(rb_fixnum, T_FIXNUM);

  const int size = FIX2INT(rb_fixnum);
  opts->jobsize = size;
  return rb_fixnum;
}
|
195
|
+
|
196
|
+
// Reader for the nthread option, returned as a Fixnum.
static VALUE options_nthread(VALUE self) {
  const opt_t *opts = get_options(self);
  return INT2FIX(opts->nthread);
}
|
199
|
+
|
200
|
+
// Writer for the nthread option; the argument must be a Fixnum.
static VALUE options_set_nthread(VALUE self, VALUE rb_fixnum) {
  opt_t *const opts = get_options(self);

  Check_Type(rb_fixnum, T_FIXNUM);

  const int count = FIX2INT(rb_fixnum);
  opts->nthread = count;
  return rb_fixnum;
}
|
208
|
+
|
209
|
+
// Reader for the L-BFGS histsz option, returned as a Fixnum.
static VALUE options_histsz(VALUE self) {
  const opt_t *opts = get_options(self);
  return INT2FIX(opts->lbfgs.histsz);
}
|
212
|
+
|
213
|
+
// Writer for the L-BFGS histsz option; the argument must be a Fixnum.
static VALUE options_set_histsz(VALUE self, VALUE rb_fixnum) {
  opt_t *opts;

  Check_Type(rb_fixnum, T_FIXNUM);
  opts = get_options(self);
  opts->lbfgs.histsz = FIX2INT(rb_fixnum);
  return rb_fixnum;
}
|
219
|
+
|
220
|
+
// Reader for the L-BFGS maxls option, returned as a Fixnum.
static VALUE options_maxls(VALUE self) {
  const opt_t *opts = get_options(self);
  return INT2FIX(opts->lbfgs.maxls);
}
|
223
|
+
|
224
|
+
// Writer for the L-BFGS maxls option; the argument must be a Fixnum.
static VALUE options_set_maxls(VALUE self, VALUE rb_fixnum) {
  opt_t *opts;

  Check_Type(rb_fixnum, T_FIXNUM);
  opts = get_options(self);
  opts->lbfgs.maxls = FIX2INT(rb_fixnum);
  return rb_fixnum;
}
|
230
|
+
|
231
|
+
|
232
|
+
// Float Accessors
|
233
|
+
|
234
|
+
// Reader for the rho1 option, returned as a Float.
static VALUE options_rho1(VALUE self) {
  const opt_t *opts = get_options(self);
  return rb_float_new(opts->rho1);
}
|
237
|
+
|
238
|
+
// Writer for the rho1 option; the argument is converted with NUM2DBL.
static VALUE options_set_rho1(VALUE self, VALUE rb_numeric) {
  opt_t *opts = get_options(self);
  opts->rho1 = NUM2DBL(rb_numeric);
  return rb_numeric;
}
|
242
|
+
|
243
|
+
// Reader for the rho2 option, returned as a Float.
static VALUE options_rho2(VALUE self) {
  const opt_t *opts = get_options(self);
  return rb_float_new(opts->rho2);
}
|
246
|
+
|
247
|
+
// Writer for the rho2 option; the argument is converted with NUM2DBL.
static VALUE options_set_rho2(VALUE self, VALUE rb_numeric) {
  opt_t *opts = get_options(self);
  opts->rho2 = NUM2DBL(rb_numeric);
  return rb_numeric;
}
|
251
|
+
|
252
|
+
// Reader for the stopeps option, returned as a Float.
static VALUE options_stopeps(VALUE self) {
  const opt_t *opts = get_options(self);
  return rb_float_new(opts->stopeps);
}
|
255
|
+
|
256
|
+
// Writer for the stopeps option; the argument is converted with NUM2DBL.
static VALUE options_set_stopeps(VALUE self, VALUE rb_numeric) {
  opt_t *opts = get_options(self);
  opts->stopeps = NUM2DBL(rb_numeric);
  return rb_numeric;
}
|
260
|
+
|
261
|
+
// Reader for the SGD-L1 eta0 option, returned as a Float.
static VALUE options_eta0(VALUE self) {
  const opt_t *opts = get_options(self);
  return rb_float_new(opts->sgdl1.eta0);
}
|
264
|
+
|
265
|
+
// Writer for the SGD-L1 eta0 option; the argument is converted with NUM2DBL.
static VALUE options_set_eta0(VALUE self, VALUE rb_numeric) {
  opt_t *opts = get_options(self);
  opts->sgdl1.eta0 = NUM2DBL(rb_numeric);
  return rb_numeric;
}
|
269
|
+
|
270
|
+
// Reader for the SGD-L1 alpha option, returned as a Float.
static VALUE options_alpha(VALUE self) {
  const opt_t *opts = get_options(self);
  return rb_float_new(opts->sgdl1.alpha);
}
|
273
|
+
|
274
|
+
// Writer for the SGD-L1 alpha option; the argument is converted with NUM2DBL.
static VALUE options_set_alpha(VALUE self, VALUE rb_numeric) {
  opt_t *opts = get_options(self);
  opts->sgdl1.alpha = NUM2DBL(rb_numeric);
  return rb_numeric;
}
|
278
|
+
|
279
|
+
// Reader for the BCD kappa option, returned as a Float.
static VALUE options_kappa(VALUE self) {
  const opt_t *opts = get_options(self);
  return rb_float_new(opts->bcd.kappa);
}
|
282
|
+
|
283
|
+
// Writer for the BCD kappa option; the argument is converted with NUM2DBL.
static VALUE options_set_kappa(VALUE self, VALUE rb_numeric) {
  opt_t *opts = get_options(self);
  opts->bcd.kappa = NUM2DBL(rb_numeric);
  return rb_numeric;
}
|
287
|
+
|
288
|
+
// Reader for the RPROP stpmin option, returned as a Float.
static VALUE options_stpmin(VALUE self) {
  const opt_t *opts = get_options(self);
  return rb_float_new(opts->rprop.stpmin);
}
|
291
|
+
|
292
|
+
// Writer for the RPROP stpmin option; the argument is converted with NUM2DBL.
static VALUE options_set_stpmin(VALUE self, VALUE rb_numeric) {
  opt_t *opts = get_options(self);
  opts->rprop.stpmin = NUM2DBL(rb_numeric);
  return rb_numeric;
}
|
296
|
+
|
297
|
+
// Reader for the RPROP stpmax option, returned as a Float.
static VALUE options_stpmax(VALUE self) {
  const opt_t *opts = get_options(self);
  return rb_float_new(opts->rprop.stpmax);
}
|
300
|
+
|
301
|
+
// Writer for the RPROP stpmax option; the argument is converted with NUM2DBL.
static VALUE options_set_stpmax(VALUE self, VALUE rb_numeric) {
  opt_t *opts = get_options(self);
  opts->rprop.stpmax = NUM2DBL(rb_numeric);
  return rb_numeric;
}
|
305
|
+
|
306
|
+
// Reader for the RPROP stpinc option, returned as a Float.
static VALUE options_stpinc(VALUE self) {
  const opt_t *opts = get_options(self);
  return rb_float_new(opts->rprop.stpinc);
}
|
309
|
+
|
310
|
+
// Writer for the RPROP stpinc option; the argument is converted with NUM2DBL.
static VALUE options_set_stpinc(VALUE self, VALUE rb_numeric) {
  opt_t *opts = get_options(self);
  opts->rprop.stpinc = NUM2DBL(rb_numeric);
  return rb_numeric;
}
|
314
|
+
|
315
|
+
// Reader for the RPROP stpdec option, returned as a Float.
static VALUE options_stpdec(VALUE self) {
  const opt_t *opts = get_options(self);
  return rb_float_new(opts->rprop.stpdec);
}
|
318
|
+
|
319
|
+
// Writer for the RPROP stpdec option; the argument is converted with NUM2DBL.
static VALUE options_set_stpdec(VALUE self, VALUE rb_numeric) {
  opt_t *opts = get_options(self);
  opts->rprop.stpdec = NUM2DBL(rb_numeric);
  return rb_numeric;
}
|
323
|
+
|
324
|
+
|
325
|
+
|
326
|
+
// Boolean Accessors
|
327
|
+
|
328
|
+
// Reader for the maxent flag; answers true or false.
static VALUE options_maxent(VALUE self) {
  if (get_options(self)->maxent) {
    return Qtrue;
  }
  return Qfalse;
}
|
331
|
+
|
332
|
+
// Writer for the maxent flag; nil and false clear it, anything else sets it.
static VALUE options_set_maxent(VALUE self, VALUE rb_boolean) {
  opt_t *opts = get_options(self);
  opts->maxent = RTEST(rb_boolean);
  return rb_boolean;
}
|
336
|
+
|
337
|
+
// Reader for the compact flag; answers true or false.
static VALUE options_compact(VALUE self) {
  if (get_options(self)->compact) {
    return Qtrue;
  }
  return Qfalse;
}
|
340
|
+
|
341
|
+
// Writer for the compact flag; nil and false clear it, anything else sets it.
static VALUE options_set_compact(VALUE self, VALUE rb_boolean) {
  opt_t *opts = get_options(self);
  opts->compact = RTEST(rb_boolean);
  return rb_boolean;
}
|
345
|
+
|
346
|
+
// Reader for the sparse flag; answers true or false.
static VALUE options_sparse(VALUE self) {
  if (get_options(self)->sparse) {
    return Qtrue;
  }
  return Qfalse;
}
|
349
|
+
|
350
|
+
// Writer for the sparse flag; nil and false clear it, anything else sets it.
static VALUE options_set_sparse(VALUE self, VALUE rb_boolean) {
  opt_t *opts = get_options(self);
  opts->sparse = RTEST(rb_boolean);
  return rb_boolean;
}
|
354
|
+
|
355
|
+
// Reader for the check flag; answers true or false.
static VALUE options_check(VALUE self) {
  if (get_options(self)->check) {
    return Qtrue;
  }
  return Qfalse;
}
|
358
|
+
|
359
|
+
// Writer for the check flag; nil and false clear it, anything else sets it.
static VALUE options_set_check(VALUE self, VALUE rb_boolean) {
  opt_t *opts = get_options(self);
  opts->check = RTEST(rb_boolean);
  return rb_boolean;
}
|
363
|
+
|
364
|
+
// Reader for the label flag; answers true or false.
static VALUE options_label(VALUE self) {
  if (get_options(self)->label) {
    return Qtrue;
  }
  return Qfalse;
}
|
367
|
+
|
368
|
+
// Writer for the label flag; nil and false clear it, anything else sets it.
static VALUE options_set_label(VALUE self, VALUE rb_boolean) {
  opt_t *opts = get_options(self);
  opts->label = RTEST(rb_boolean);
  return rb_boolean;
}
|
372
|
+
|
373
|
+
// Reader for the outsc flag; answers true or false.
static VALUE options_outsc(VALUE self) {
  if (get_options(self)->outsc) {
    return Qtrue;
  }
  return Qfalse;
}
|
376
|
+
|
377
|
+
// Writer for the outsc flag; nil and false clear it, anything else sets it.
static VALUE options_set_outsc(VALUE self, VALUE rb_boolean) {
  opt_t *opts = get_options(self);
  opts->outsc = RTEST(rb_boolean);
  return rb_boolean;
}
|
381
|
+
|
382
|
+
// Reader for the lblpost flag; answers true or false.
static VALUE options_lblpost(VALUE self) {
  if (get_options(self)->lblpost) {
    return Qtrue;
  }
  return Qfalse;
}
|
385
|
+
|
386
|
+
// Writer for the lblpost flag; nil and false clear it, anything else sets it.
static VALUE options_set_lblpost(VALUE self, VALUE rb_boolean) {
  opt_t *opts = get_options(self);
  opts->lblpost = RTEST(rb_boolean);
  return rb_boolean;
}
|
390
|
+
|
391
|
+
// Reader for the L-BFGS clip flag; answers true or false.
static VALUE options_clip(VALUE self) {
  if (get_options(self)->lbfgs.clip) {
    return Qtrue;
  }
  return Qfalse;
}
|
394
|
+
|
395
|
+
// Writer for the L-BFGS clip flag; nil and false clear it, anything else sets it.
static VALUE options_set_clip(VALUE self, VALUE rb_boolean) {
  opt_t *opts = get_options(self);
  opts->lbfgs.clip = RTEST(rb_boolean);
  return rb_boolean;
}
|
399
|
+
|
400
|
+
// Reader for the RPROP cutoff flag; answers true or false.
static VALUE options_cutoff(VALUE self) {
  if (get_options(self)->rprop.cutoff) {
    return Qtrue;
  }
  return Qfalse;
}
|
403
|
+
|
404
|
+
// Writer for the RPROP cutoff flag; nil and false clear it, anything else sets it.
static VALUE options_set_cutoff(VALUE self, VALUE rb_boolean) {
  opt_t *opts = get_options(self);
  opts->rprop.cutoff = RTEST(rb_boolean);
  return rb_boolean;
}
|
408
|
+
|
409
|
+
|
410
|
+
|
411
|
+
|
412
|
+
// String Accessors
|
413
|
+
|
414
|
+
// Reader for the pattern option; an unset pattern reads as "".
static VALUE options_pattern(VALUE self) {
  const char *text = get_options(self)->pattern;
  if (!text) {
    text = "";
  }
  return rb_str_new2(text);
}
|
418
|
+
|
419
|
+
// Writer for the pattern option; stores a private copy of the Ruby string.
static VALUE options_set_pattern(VALUE self, VALUE rb_string) {
  char **slot = &get_options(self)->pattern;

  copy_string(slot, rb_string);
  return rb_string;
}
|
425
|
+
|
426
|
+
// Reader for the model option; an unset value reads as "".
static VALUE options_model(VALUE self) {
  const char *text = get_options(self)->model;
  if (!text) {
    text = "";
  }
  return rb_str_new2(text);
}
|
430
|
+
|
431
|
+
// Writer for the model option; stores a private copy of the Ruby string.
static VALUE options_set_model(VALUE self, VALUE rb_string) {
  char **slot = &get_options(self)->model;

  copy_string(slot, rb_string);
  return rb_string;
}
|
437
|
+
|
438
|
+
// Reader for the algorithm name (opt_t field algo); unset reads as "".
static VALUE options_algorithm(VALUE self) {
  const char *text = get_options(self)->algo;
  if (!text) {
    text = "";
  }
  return rb_str_new2(text);
}
|
442
|
+
|
443
|
+
// Writer for the algorithm name; stores a private copy of the Ruby string.
static VALUE options_set_algorithm(VALUE self, VALUE rb_string) {
  char **slot = &get_options(self)->algo;

  copy_string(slot, rb_string);
  return rb_string;
}
|
449
|
+
|
450
|
+
// Reader for the development data option (opt_t field devel); unset reads as "".
static VALUE options_development_data(VALUE self) {
  const char *text = get_options(self)->devel;
  if (!text) {
    text = "";
  }
  return rb_str_new2(text);
}
|
454
|
+
|
455
|
+
// Writer for the development data option; stores a private copy of the Ruby string.
static VALUE options_set_development_data(VALUE self, VALUE rb_string) {
  char **slot = &get_options(self)->devel;

  copy_string(slot, rb_string);
  return rb_string;
}
|
461
|
+
|
462
|
+
|
463
|
+
void Init_options() {
|
464
|
+
cOptions = rb_define_class_under(mWapiti, "Options", rb_cObject);
|
465
|
+
rb_define_alloc_func(cOptions, allocate_options);
|
466
|
+
|
467
|
+
rb_define_method(cOptions, "initialize", initialize_options, -1);
|
468
|
+
|
469
|
+
// Option Accessors
|
470
|
+
|
471
|
+
rb_define_method(cOptions, "stopwin", options_stopwin, 0);
|
472
|
+
rb_define_method(cOptions, "stopwin=", options_set_stopwin, 1);
|
473
|
+
|
474
|
+
rb_define_alias(cOptions, "stop_window", "stopwin");
|
475
|
+
rb_define_alias(cOptions, "stop_window=", "stopwin=");
|
476
|
+
|
477
|
+
rb_define_method(cOptions, "objwin", options_objwin, 0);
|
478
|
+
rb_define_method(cOptions, "objwin=", options_set_objwin, 1);
|
479
|
+
|
480
|
+
rb_define_alias(cOptions, "convergence_window", "objwin");
|
481
|
+
rb_define_alias(cOptions, "convergence_window=", "objwin=");
|
482
|
+
|
483
|
+
rb_define_method(cOptions, "maxiter", options_maxiter, 0);
|
484
|
+
rb_define_method(cOptions, "maxiter=", options_set_maxiter, 1);
|
485
|
+
|
486
|
+
rb_define_alias(cOptions, "max_iterations", "maxiter");
|
487
|
+
rb_define_alias(cOptions, "max_iterations=", "maxiter=");
|
488
|
+
|
489
|
+
rb_define_method(cOptions, "jobsize", options_jobsize, 0);
|
490
|
+
rb_define_method(cOptions, "jobsize=", options_set_jobsize, 1);
|
491
|
+
|
492
|
+
rb_define_method(cOptions, "nthread", options_nthread, 0);
|
493
|
+
rb_define_method(cOptions, "nthread=", options_set_nthread, 1);
|
494
|
+
|
495
|
+
rb_define_alias(cOptions, "threads", "nthread");
|
496
|
+
rb_define_alias(cOptions, "threads=", "nthread=");
|
497
|
+
|
498
|
+
rb_define_method(cOptions, "rho1", options_rho1, 0);
|
499
|
+
rb_define_method(cOptions, "rho1=", options_set_rho1, 1);
|
500
|
+
|
501
|
+
rb_define_method(cOptions, "rho2", options_rho2, 0);
|
502
|
+
rb_define_method(cOptions, "rho2=", options_set_rho2, 1);
|
503
|
+
|
504
|
+
rb_define_method(cOptions, "stopeps", options_stopeps, 0);
|
505
|
+
rb_define_method(cOptions, "stopeps=", options_set_stopeps, 1);
|
506
|
+
|
507
|
+
rb_define_alias(cOptions, "stop_epsilon", "stopeps");
|
508
|
+
rb_define_alias(cOptions, "stop_epsilon=", "stopeps=");
|
509
|
+
|
510
|
+
rb_define_method(cOptions, "maxent", options_maxent, 0);
|
511
|
+
rb_define_method(cOptions, "maxent=", options_set_maxent, 1);
|
512
|
+
|
513
|
+
rb_define_alias(cOptions, "maxent?", "maxent");
|
514
|
+
|
515
|
+
rb_define_method(cOptions, "compact", options_compact, 0);
|
516
|
+
rb_define_method(cOptions, "compact=", options_set_compact, 1);
|
517
|
+
|
518
|
+
rb_define_alias(cOptions, "compact?", "compact");
|
519
|
+
|
520
|
+
rb_define_method(cOptions, "sparse", options_sparse, 0);
|
521
|
+
rb_define_method(cOptions, "sparse=", options_set_sparse, 1);
|
522
|
+
|
523
|
+
rb_define_alias(cOptions, "sparse?", "sparse");
|
524
|
+
|
525
|
+
rb_define_method(cOptions, "label", options_label, 0);
|
526
|
+
rb_define_method(cOptions, "label=", options_set_label, 1);
|
527
|
+
|
528
|
+
rb_define_alias(cOptions, "label?", "label");
|
529
|
+
|
530
|
+
rb_define_method(cOptions, "check", options_check, 0);
|
531
|
+
rb_define_method(cOptions, "check=", options_set_check, 1);
|
532
|
+
|
533
|
+
rb_define_alias(cOptions, "check?", "check");
|
534
|
+
|
535
|
+
rb_define_method(cOptions, "lblpost", options_lblpost, 0);
|
536
|
+
rb_define_method(cOptions, "lblpost=", options_set_lblpost, 1);
|
537
|
+
|
538
|
+
rb_define_alias(cOptions, "lblpost?", "lblpost");
|
539
|
+
|
540
|
+
rb_define_alias(cOptions, "posterior", "lblpost");
|
541
|
+
rb_define_alias(cOptions, "posterior?", "lblpost");
|
542
|
+
rb_define_alias(cOptions, "posterior=", "lblpost=");
|
543
|
+
|
544
|
+
rb_define_method(cOptions, "outsc", options_outsc, 0);
|
545
|
+
rb_define_method(cOptions, "outsc=", options_set_outsc, 1);
|
546
|
+
|
547
|
+
rb_define_alias(cOptions, "outsc?", "outsc");
|
548
|
+
|
549
|
+
rb_define_alias(cOptions, "score", "outsc");
|
550
|
+
rb_define_alias(cOptions, "score?", "outsc");
|
551
|
+
rb_define_alias(cOptions, "score=", "outsc=");
|
552
|
+
|
553
|
+
rb_define_method(cOptions, "pattern", options_pattern, 0);
|
554
|
+
rb_define_method(cOptions, "pattern=", options_set_pattern, 1);
|
555
|
+
|
556
|
+
rb_define_alias(cOptions, "template", "pattern");
|
557
|
+
rb_define_alias(cOptions, "template=", "pattern=");
|
558
|
+
|
559
|
+
rb_define_method(cOptions, "model", options_model, 0);
|
560
|
+
rb_define_method(cOptions, "model=", options_set_model, 1);
|
561
|
+
|
562
|
+
rb_define_method(cOptions, "algorithm", options_algorithm, 0);
|
563
|
+
rb_define_method(cOptions, "algorithm=", options_set_algorithm, 1);
|
564
|
+
|
565
|
+
rb_define_alias(cOptions, "algo", "algorithm");
|
566
|
+
rb_define_alias(cOptions, "algo=", "algorithm=");
|
567
|
+
|
568
|
+
rb_define_method(cOptions, "development_data", options_development_data, 0);
|
569
|
+
rb_define_method(cOptions, "development_data=", options_set_development_data, 1);
|
570
|
+
|
571
|
+
rb_define_alias(cOptions, "devel", "development_data");
|
572
|
+
rb_define_alias(cOptions, "devel=", "development_data=");
|
573
|
+
|
574
|
+
rb_define_method(cOptions, "clip", options_clip, 0);
|
575
|
+
rb_define_method(cOptions, "clip=", options_set_clip, 1);
|
576
|
+
|
577
|
+
rb_define_method(cOptions, "histsz", options_histsz, 0);
|
578
|
+
rb_define_method(cOptions, "histsz=", options_set_histsz, 1);
|
579
|
+
|
580
|
+
rb_define_method(cOptions, "maxls", options_maxls, 0);
|
581
|
+
rb_define_method(cOptions, "maxls=", options_set_maxls, 1);
|
582
|
+
|
583
|
+
rb_define_method(cOptions, "eta0", options_eta0, 0);
|
584
|
+
rb_define_method(cOptions, "eta0=", options_set_eta0, 1);
|
585
|
+
|
586
|
+
rb_define_method(cOptions, "alpha", options_alpha, 0);
|
587
|
+
rb_define_method(cOptions, "alpha=", options_set_alpha, 1);
|
588
|
+
|
589
|
+
rb_define_method(cOptions, "kappa", options_kappa, 0);
|
590
|
+
rb_define_method(cOptions, "kappa=", options_set_kappa, 1);
|
591
|
+
|
592
|
+
rb_define_method(cOptions, "stpmin", options_stpmin, 0);
|
593
|
+
rb_define_method(cOptions, "stpmin=", options_set_stpmin, 1);
|
594
|
+
|
595
|
+
rb_define_method(cOptions, "stpmax", options_stpmax, 0);
|
596
|
+
rb_define_method(cOptions, "stpmax=", options_set_stpmax, 1);
|
597
|
+
|
598
|
+
rb_define_method(cOptions, "stpinc", options_stpinc, 0);
|
599
|
+
rb_define_method(cOptions, "stpinc=", options_set_stpinc, 1);
|
600
|
+
|
601
|
+
rb_define_method(cOptions, "stpdec", options_stpdec, 0);
|
602
|
+
rb_define_method(cOptions, "stpdec=", options_set_stpdec, 1);
|
603
|
+
|
604
|
+
rb_define_method(cOptions, "cutoff", options_cutoff, 0);
|
605
|
+
rb_define_method(cOptions, "cutoff=", options_set_cutoff, 1);
|
606
|
+
|
607
|
+
rb_define_method(cOptions, "nbest", options_nbest, 0);
|
608
|
+
rb_define_method(cOptions, "nbest=", options_set_nbest, 1);
|
609
|
+
|
610
|
+
}
|
611
|
+
|
612
|
+
|
613
|
+
/* --- Model Class --- */
|
614
|
+
|
615
|
+
// Auxiliary Methods
|
616
|
+
|
617
|
+
// Unwraps the mdl_t stored inside a Ruby Model object.
static mdl_t *get_model(VALUE self) {
  mdl_t *unwrapped;

  Data_Get_Struct(self, mdl_t, unwrapped);

  return unwrapped;
}
|
622
|
+
|
623
|
+
// Constructor / Destructor
|
624
|
+
|
625
|
+
// GC mark callback registered for Model objects; intentionally a no-op.
static void mark_model(mdl_t *model __attribute__((__unused__))) {
  /* nothing to mark */
}
|
628
|
+
|
629
|
+
// Free callback registered for Model objects: hands a non-NULL model to
// mdl_free.
static void deallocate_model(mdl_t *model) {
  if (!model) {
    return;
  }
  mdl_free(model);
}
|
635
|
+
|
636
|
+
// Allocator for the Model class: builds a model around a new reader created
// with rdr_new(false) and wraps it in a Ruby object.
static VALUE allocate_model(VALUE self) {
  rdr_t *reader = rdr_new(false);
  mdl_t *wrapped = mdl_new(reader);

  return Data_Wrap_Struct(self, mark_model, deallocate_model, wrapped);
}
|
640
|
+
|
641
|
+
// Attaches an Options object to this model.
//
// rb_options must be an instance of Wapiti::Options (or of a subclass);
// anything else raises cNativeError. The model's opt pointer is set to the
// opt_t wrapped by rb_options, the reader's maxent flag is refreshed from it
// and rb_options is stored in @options. Returns rb_options.
static VALUE model_set_options(VALUE self, VALUE rb_options) {
  // Test the real class instead of comparing a 15 character class-name
  // prefix: the prefix test also accepted unrelated classes whose name
  // merely starts with "Wapiti::Options", which are then unwrapped as opt_t.
  if (!RTEST(rb_obj_is_kind_of(rb_options, cOptions))) {
    rb_raise(cNativeError, "argument must be a Wapiti::Options instance");
  }

  mdl_t *model = get_model(self);

  // Store reference to options in model struct; the opt_t itself stays
  // owned by rb_options, which @options (set below) references.
  model->opt = get_options(rb_options);

  // Update reader
  model->reader->maxent = model->opt->maxent;

  // Save instance variable
  rb_ivar_set(self, rb_intern("@options"), rb_options);

  return rb_options;
}
|
659
|
+
|
660
|
+
// Model#initialize([hash_or_options]) { |options| ... }
//
// Accepts nothing, a Hash (used to build a new Options object) or an Options
// instance. The resulting options are yielded to the block when one is
// given, attached to the model, and a previous model is loaded through #load
// when the options name a model file.
// Raises ArgumentError for more than one argument and cNativeError when the
// argument is neither a Hash nor an Options instance. Returns self.
static VALUE initialize_model(int argc, VALUE *argv, VALUE self) {
  VALUE options;

  if (argc > 1) {
    rb_raise(rb_const_get(rb_mKernel, rb_intern("ArgumentError")),
      "wrong number of arguments (%d for 0..1)", argc);
  }

  if (argc == 0) {
    options = rb_funcall(cOptions, rb_intern("new"), 0);
  }
  else if (TYPE(argv[0]) == T_HASH) {
    options = rb_funcall(cOptions, rb_intern("new"), 1, argv[0]);
  }
  else {
    // Test the real class instead of a class-name prefix, which also
    // matched unrelated classes named "Wapiti::Options...".
    if (!RTEST(rb_obj_is_kind_of(argv[0], cOptions))) {
      rb_raise(cNativeError, "argument must be a hash or an options instance");
    }
    options = argv[0];
  }

  // yield the options (not self) if block_given?
  if (rb_block_given_p()) {
    rb_yield(options);
  }

  model_set_options(self, options);

  // Load a previous model if specified by options
  if (get_options(options)->model) {
    rb_funcall(self, rb_intern("load"), 0);
  }

  return self;
}
|
697
|
+
|
698
|
+
|
699
|
+
// Native accessors
|
700
|
+
|
701
|
+
// Returns the model's nlbl field (the count model_labels iterates over)
// as a Fixnum.
static VALUE model_nlbl(VALUE self) {
  const mdl_t *model = get_model(self);

  return INT2FIX(model->nlbl);
}
|
704
|
+
|
705
|
+
// Returns the model's nobs field as a Fixnum (aliased as #observations).
static VALUE model_nobs(VALUE self) {
  const mdl_t *model = get_model(self);

  return INT2FIX(model->nobs);
}
|
708
|
+
|
709
|
+
// Returns the model's nftr field as a Fixnum (aliased as #features).
static VALUE model_nftr(VALUE self) {
  const mdl_t *model = get_model(self);

  return INT2FIX(model->nftr);
}
|
712
|
+
|
713
|
+
// Returns the model's total field as a Ruby Float.
static VALUE model_total(VALUE self) {
  const mdl_t *model = get_model(self);

  return rb_float_new(model->total);
}
|
716
|
+
|
717
|
+
|
718
|
+
// Instance methods
|
719
|
+
|
720
|
+
// Runs mdl_sync on the wrapped native model. Returns self.
static VALUE model_sync(VALUE self) {
  mdl_t *model = get_model(self);

  mdl_sync(model);

  return self;
}
|
724
|
+
|
725
|
+
// Runs mdl_compact on the wrapped native model. Returns self.
static VALUE model_compact(VALUE self) {
  mdl_t *model = get_model(self);

  mdl_compact(model);

  return self;
}
|
729
|
+
|
730
|
+
// call-seq:
|
731
|
+
// m.save # => saves the model to the file defined in m.path
|
732
|
+
// m.save(path) # => sets m.path and saves the model to the file <path>
|
733
|
+
//
|
734
|
+
// Saves the model to a file. Uses the Model's path if no argument given,
|
735
|
+
// otherwise uses the passed-in argument as the Model's path.
|
736
|
+
static VALUE model_save(int argc, VALUE *argv, VALUE self) {
|
737
|
+
if (argc > 1) {
|
738
|
+
rb_raise(rb_const_get(rb_mKernel, rb_intern("ArgumentError")),
|
739
|
+
"wrong number of arguments (%d for 0..1)", argc);
|
740
|
+
}
|
741
|
+
|
742
|
+
mdl_t *model = get_model(self);
|
743
|
+
|
744
|
+
// save passed-in path in options
|
745
|
+
if (argc) {
|
746
|
+
Check_Type(argv[0], T_STRING);
|
747
|
+
rb_ivar_set(self, rb_intern("@path"), argv[0]);
|
748
|
+
}
|
749
|
+
|
750
|
+
// open the output file
|
751
|
+
FILE *file = 0;
|
752
|
+
VALUE path = rb_ivar_get(self, rb_intern("@path"));
|
753
|
+
|
754
|
+
if (NIL_P(path)) {
|
755
|
+
rb_raise(cNativeError, "failed to save model: no path given");
|
756
|
+
}
|
757
|
+
|
758
|
+
if (!(file = fopen(StringValueCStr(path), "w"))) {
|
759
|
+
rb_raise(cNativeError, "failed to save model: failed to open model file");
|
760
|
+
}
|
761
|
+
|
762
|
+
mdl_save(model, file);
|
763
|
+
fclose(file);
|
764
|
+
|
765
|
+
return self;
|
766
|
+
}
|
767
|
+
|
768
|
+
// call-seq:
//   m.load       # => loads the model from the file defined in m.path
//   m.load(path) # => sets m.path and loads the model from the file <path>
//
// Reads a model with mdl_load from the file named by @path; an optional
// String argument replaces @path first. Raises ArgumentError for more than
// one argument and NativeError if no path is set or the file cannot be
// opened. Returns self.
static VALUE model_load(int argc, VALUE *argv, VALUE self) {
  const ID path_ivar = rb_intern("@path");

  if (argc > 1) {
    rb_raise(rb_const_get(rb_mKernel, rb_intern("ArgumentError")),
      "wrong number of arguments (%d for 0..1)", argc);
  }

  mdl_t *model = get_model(self);

  // remember an explicitly passed path
  if (argc != 0) {
    Check_Type(argv[0], T_STRING);
    rb_ivar_set(self, path_ivar, argv[0]);
  }

  VALUE path = rb_ivar_get(self, path_ivar);

  if (NIL_P(path)) {
    rb_raise(cNativeError, "failed to load model: no path given");
  }

  // open the model file
  FILE *input = fopen(StringValueCStr(path), "r");

  if (input == 0) {
    rb_raise(cNativeError, "failed to load model: failed to open model file");
  }

  mdl_load(model, input);
  fclose(input);

  return self;
}
|
799
|
+
|
800
|
+
// Converts a Ruby Array of sequences (each an Array of String lines) into a
// newly allocated dat_t, turning every sequence into a seq_t through
// rdr_raw2seq.
//
// reader   - reader passed on to rdr_raw2seq
// data     - Array of Arrays of Strings
// labelled - stored in dat->lbl and passed on to rdr_raw2seq
//
// Conversion stops at the first sequence for which rdr_raw2seq returns 0.
// Returns the dat_t, or 0 if no sequence was converted. Raises TypeError
// when data does not have the expected shape, and whatever StringValueCStr
// raises for a String it cannot expose as a C string.
static dat_t *to_dat(rdr_t *reader, VALUE data, bool labelled) {
  Check_Type(data, T_ARRAY);

  const unsigned int n = RARRAY_LEN(data);
  unsigned int i, j, k;

  // Validate the complete input before allocating anything. Check_Type and
  // StringValueCStr raise (and never return) on bad input; doing this up
  // front keeps such an exception from leaking dat, dat->seq and raw.
  for (i = 0; i < n; ++i) {
    VALUE sequence = rb_ary_entry(data, i);
    Check_Type(sequence, T_ARRAY);

    k = RARRAY_LEN(sequence);

    for (j = 0; j < k; ++j) {
      VALUE line = rb_ary_entry(sequence, j);
      Check_Type(line, T_STRING);
      (void)StringValueCStr(line);
    }
  }

  dat_t *dat = xmalloc(sizeof(dat_t));
  dat->nseq = 0;
  dat->mlen = 0;
  dat->lbl = labelled;
  dat->seq = xmalloc(sizeof(seq_t*) * n);

  for (i = 0; i < n; ++i) {
    VALUE sequence = rb_ary_entry(data, i);
    // Already validated above; repeated so that RARRAY_LEN is never applied
    // to a non-Array should data have been modified in the meantime.
    Check_Type(sequence, T_ARRAY);

    k = RARRAY_LEN(sequence);
    raw_t *raw = xmalloc(sizeof(raw_t) + sizeof(char*) * k);

    // raw only borrows the character data of the Ruby strings
    for (j = 0; j < k; ++j) {
      VALUE line = rb_ary_entry(sequence, j);
      Check_Type(line, T_STRING);
      raw->lines[j] = StringValueCStr(line);
    }

    raw->len = k;

    seq_t *seq = rdr_raw2seq(reader, raw, labelled);
    xfree(raw);

    if (seq == 0) { break; }

    // and store the sequence
    dat->seq[dat->nseq++] = seq;
    dat->mlen = max(dat->mlen, seq->len);
  }

  // if no sequence was read, free memory
  if (dat->nseq == 0) {
    xfree(dat->seq);
    xfree(dat);

    return 0;
  }

  return dat;
}
|
848
|
+
|
849
|
+
|
850
|
+
// call-seq:
//   m.train(filename) # => trains on the data read from the file <filename>
//   m.train(data)     # => trains on an Array of sequences
//
// Trains the model with the algorithm named in the options (looked up in
// trn_lst). The pattern file named in the options is loaded first; the
// training data is read with rdr_readdat (String argument) or converted
// with to_dat (Array argument). A development set is read as well if the
// options name one. After training the model's #compact method is called if
// the compact option is set.
//
// Raises NativeError for an unknown algorithm, a missing or unreadable
// pattern file, unreadable data files, an unsupported data type, or when no
// training sequence could be loaded. Returns self.
static VALUE model_train(VALUE self, VALUE data) {

  mdl_t* model = get_model(self);

  // Look up the trainer by name; trn == trn_cnt means "not found".
  int trn;
  for (trn = 0; trn < trn_cnt; trn++) {
    if (!strcmp(model->opt->algo, trn_lst[trn].name)) break;
  }

  if (trn == trn_cnt) {
    rb_raise(cNativeError, "failed to train model: unknown algorithm '%s'", model->opt->algo);
  }

  FILE *file;

  // Load the pattern file. This will unlock the database if previously
  // locked by loading a model.
  if (model->opt->pattern) {
    file = fopen(model->opt->pattern, "r");

    if (!file) {
      rb_raise(cNativeError, "failed to train model: failed to load pattern file '%s'", model->opt->pattern);
    }

    rdr_loadpat(model->reader, file);
    fclose(file);
    qrk_lock(model->reader->obs, false);
  }
  else {
    rb_raise(cNativeError, "failed to train model: no pattern given");
  }

  // Load the training data. When this is done we lock the quarks as we
  // don't want to put in the model information present only in the
  // development set.
  switch (TYPE(data)) {
    case T_STRING:
      if (!(file = fopen(StringValuePtr(data), "r"))) {
        rb_raise(cNativeError, "failed to train model: failed to open training data '%s'", StringValuePtr(data));
      }

      model->train = rdr_readdat(model->reader, file, true);
      fclose(file);

      break;
    case T_ARRAY:
      model->train = to_dat(model->reader, data, true);

      break;
    default:
      rb_raise(cNativeError, "failed to train model: invalid training data type (expected instance of String or Array)");
  }

  qrk_lock(model->reader->lbl, true);
  qrk_lock(model->reader->obs, true);

  if (!model->train || model->train->nseq == 0) {
    rb_raise(cNativeError, "failed to train model: no training data loaded");
  }

  // If present, load the development set in the model. If not specified,
  // the training dataset will be used instead.
  if (model->opt->devel) {
    if (!(file = fopen(model->opt->devel, "r"))) {
      rb_raise(cNativeError, "failed to train model: cannot open development file '%s'", model->opt->devel);
    }

    model->devel = rdr_readdat(model->reader, file, true);
    fclose(file);
  }

  // Initialize the model. If a previous model was loaded, this will be
  // just a resync, else the model structure will be created.
  rb_funcall(self, rb_intern("sync"), 0);

  // Train the model.
  uit_setup(model);
  trn_lst[trn].train(model);
  uit_cleanup(model);

  // If requested compact the model.
  if (model->opt->compact) {
    rb_funcall(self, rb_intern("compact"), 0);
  }

  return self;
}
|
941
|
+
|
942
|
+
// Returns a sorted list of all labels in the Model's label database.
|
943
|
+
static VALUE model_labels(VALUE self) {
|
944
|
+
mdl_t *model = get_model(self);
|
945
|
+
const size_t Y = model->nlbl;
|
946
|
+
|
947
|
+
qrk_t *lp = model->reader->lbl;
|
948
|
+
|
949
|
+
VALUE labels = rb_ary_new2(Y);
|
950
|
+
|
951
|
+
for (unsigned int i = 0; i < Y; ++i) {
|
952
|
+
rb_ary_push(labels, rb_str_new2(qrk_id2str(lp, i)));
|
953
|
+
}
|
954
|
+
|
955
|
+
rb_funcall(labels, rb_intern("sort!"), 0);
|
956
|
+
|
957
|
+
return labels;
|
958
|
+
}
|
959
|
+
|
960
|
+
// Labels one raw sequence and returns the result as Ruby arrays.
//
// The sequence is decoded with tag_viterbi (nbest == 1) or tag_nbviterbi
// (otherwise). The returned Array holds one Array per n-best candidate;
// each candidate holds one entry per token of the decoded sequence. An
// entry is the Array [line, label], or just [label] when the label option
// is set. If a block is given, every entry is yielded to it and replaced by
// the block's return value.
//
// Returns an empty Array if rdr_raw2seq yields no sequence.
static VALUE decode_sequence(mdl_t *model, raw_t *raw) {
  qrk_t *lbls = model->reader->lbl;

  const size_t N = model->opt->nbest;

  seq_t *seq = rdr_raw2seq(model->reader, raw, model->opt->check);

  // to_dat guards against a null result of the same call; without this
  // check seq->len below would dereference a null pointer.
  if (seq == 0) {
    return rb_ary_new();
  }

  const int T = seq->len;

  size_t *out = xmalloc(sizeof(size_t) * T * N);
  double *psc = xmalloc(sizeof(double) * T * N);
  double *scs = xmalloc(sizeof(double) * N);

  VALUE result = rb_ary_new2(N), sequence, tokens;

  if (N == 1) {
    tag_viterbi(model, seq, (size_t*)out, scs, (double*)psc);
  }
  else {
    tag_nbviterbi(model, seq, N, (void*)out, scs, (void*)psc);
  }

  // Copy the raw lines and predicted labels into Ruby objects. Nothing is
  // yielded yet, so no block can unwind the stack while the native buffers
  // are still allocated. out is indexed as [t][n].
  for (size_t n = 0; n < N; n++) {
    sequence = rb_ary_new();

    for (int t = 0; t < T; t++) {
      tokens = rb_ary_new();

      if (!model->opt->label) {
        rb_ary_push(tokens, rb_str_new2(raw->lines[t]));
      }

      size_t lbl = out[t * N + n];
      rb_ary_push(tokens, rb_str_new2(qrk_id2str(lbls, lbl)));

      rb_ary_push(sequence, tokens);
    }

    rb_ary_push(result, sequence);
  }

  // Cleanup memory used for this sequence
  xfree(scs);
  xfree(psc);
  xfree(out);

  rdr_freeseq(seq);

  // yield token/label pairs to the block if given, in the same order as
  // they were built (candidate by candidate, token by token); a block that
  // raises or breaks can no longer leak the buffers freed above.
  if (rb_block_given_p()) {
    for (long n = 0; n < RARRAY_LEN(result); n++) {
      sequence = rb_ary_entry(result, n);

      for (long t = 0; t < RARRAY_LEN(sequence); t++) {
        tokens = rb_yield(rb_ary_entry(sequence, t));
        rb_ary_store(sequence, t, tokens);
      }
    }
  }

  return result;
}
|
1027
|
+
|
1028
|
+
// Labels every sequence of a Ruby Array (each sequence being an Array of
// String lines) and returns the concatenated results of decode_sequence.
//
// Raises TypeError if array, one of its elements, or one of their lines has
// the wrong type.
static VALUE decode_sequence_array(VALUE self, VALUE array) {
  Check_Type(array, T_ARRAY);
  const unsigned int n = RARRAY_LEN(array);

  mdl_t *model = get_model(self);

  const unsigned int N = model->opt->nbest;
  unsigned int i, j;

  VALUE result = rb_ary_new2(n * N);

  for (i = 0; i < n; ++i) {
    VALUE sequence = rb_ary_entry(array, i);
    Check_Type(sequence, T_ARRAY);

    const unsigned int k = RARRAY_LEN(sequence);

    // Validate all lines before allocating raw: Check_Type and
    // StringValueCStr raise on bad input, which would otherwise leak raw.
    for (j = 0; j < k; ++j) {
      VALUE line = rb_ary_entry(sequence, j);
      Check_Type(line, T_STRING);
      (void)StringValueCStr(line);
    }

    raw_t *raw = xmalloc(sizeof(raw_t) + sizeof(char*) * k);
    raw->len = k;

    // raw only borrows the character data of the Ruby strings
    for (j = 0; j < k; ++j) {
      VALUE line = rb_ary_entry(sequence, j);
      raw->lines[j] = StringValueCStr(line);
    }

    rb_funcall(result, rb_intern("concat"), 1, decode_sequence(model, raw));

    xfree(raw);
  }

  return result;
}
|
1062
|
+
|
1063
|
+
// Labels every sequence read from the file at path and returns the
// concatenated results of decode_sequence.
//
// Raises TypeError if path is not a String and NativeError if the file
// cannot be opened.
static VALUE decode_sequence_file(VALUE self, VALUE path) {
  Check_Type(path, T_STRING);
  FILE *file;

  if (!(file = fopen(StringValueCStr(path), "r"))) {
    rb_raise(cNativeError, "failed to label data: could not open file '%s'", StringValueCStr(path));
  }

  mdl_t *model = get_model(self);
  raw_t *raw;

  VALUE result = rb_ary_new();

  // Next read the input file sequence by sequence and label them, we have
  // to take care of not discarding the raw input as we want to send it
  // back to the output with the additional predicted labels.
  while (!feof(file)) {

    // So, first read an input sequence keeping the raw_t object
    // available, and label it with Viterbi.
    if ((raw = rdr_readraw(model->reader, file)) == 0) {
      break;
    }

    rb_funcall(result, rb_intern("concat"), 1, decode_sequence(model, raw));
    rdr_freeraw(raw);
  }

  // Release the input file; it used to stay open after every call.
  fclose(file);

  return result;
}
|
1093
|
+
|
1094
|
+
// cal-seq:
|
1095
|
+
// m.label(tokens) # => array of labelled tokens
|
1096
|
+
// m.label(filename) # => array of labelled tokens
|
1097
|
+
//
|
1098
|
+
static VALUE model_label(VALUE self, VALUE data) {
|
1099
|
+
VALUE result;
|
1100
|
+
|
1101
|
+
switch (TYPE(data)) {
|
1102
|
+
case T_STRING:
|
1103
|
+
result = decode_sequence_file(self, data);
|
1104
|
+
break;
|
1105
|
+
case T_ARRAY:
|
1106
|
+
result = decode_sequence_array(self, data);
|
1107
|
+
break;
|
1108
|
+
default:
|
1109
|
+
rb_raise(cNativeError, "failed to label data: invalid data (expected type String or Array)");
|
1110
|
+
}
|
1111
|
+
|
1112
|
+
return result;
|
1113
|
+
}
|
1114
|
+
|
1115
|
+
static void Init_model() {
|
1116
|
+
cModel = rb_define_class_under(mWapiti, "Model", rb_cObject);
|
1117
|
+
rb_define_alloc_func(cModel, allocate_model);
|
1118
|
+
|
1119
|
+
rb_define_method(cModel, "initialize", initialize_model, -1);
|
1120
|
+
|
1121
|
+
rb_define_attr(cModel, "options", 1, 0);
|
1122
|
+
|
1123
|
+
rb_define_method(cModel, "nlbl", model_nlbl, 0);
|
1124
|
+
rb_define_method(cModel, "labels", model_labels, 0);
|
1125
|
+
|
1126
|
+
rb_define_method(cModel, "nobs", model_nobs, 0);
|
1127
|
+
rb_define_alias(cModel, "observations", "nobs");
|
1128
|
+
|
1129
|
+
rb_define_method(cModel, "nftr", model_nftr, 0);
|
1130
|
+
rb_define_alias(cModel, "features", "nftr");
|
1131
|
+
|
1132
|
+
rb_define_method(cModel, "total", model_total, 0);
|
1133
|
+
|
1134
|
+
rb_define_method(cModel, "sync", model_sync, 0);
|
1135
|
+
rb_define_method(cModel, "compact", model_compact, 0);
|
1136
|
+
rb_define_method(cModel, "save", model_save, -1);
|
1137
|
+
rb_define_method(cModel, "load", model_load, -1);
|
1138
|
+
|
1139
|
+
rb_define_method(cModel, "train", model_train, 1);
|
1140
|
+
rb_define_method(cModel, "label", model_label, 1);
|
1141
|
+
}
|
1142
|
+
|
1143
|
+
/* --- Top-Level Utility Methods --- */
|
1144
|
+
|
1145
|
+
|
1146
|
+
// Runs dolabel on a temporary model configured by the given options.
//
// rb_options must be an options instance whose mode is 1. A new model is
// created with mdl_new, given the native options, passed to dolabel and
// freed again with mdl_free.
//
// Raises NativeError for any other argument or mode. Returns nil.
static VALUE label(VALUE self __attribute__((__unused__)), VALUE rb_options) {
  // Test the class itself rather than a prefix of its name: a name prefix
  // also matches unrelated classes such as "Wapiti::OptionsFoo", whose
  // instances do not wrap a native option struct.
  if (!RTEST(rb_obj_is_kind_of(rb_options, cOptions))) {
    rb_raise(cNativeError, "argument must be a native options instance");
  }

  opt_t *options = get_options(rb_options);

  if (options->mode != 1) {
    rb_raise(cNativeError, "invalid options argument: mode should be set to 1 for labelling");
  }

  mdl_t *model = mdl_new(rdr_new(options->maxent));
  model->opt = options;

  dolabel(model);

  mdl_free(model);

  return Qnil;
}
|
1166
|
+
|
1167
|
+
// Runs dodump on a temporary model configured by the given options.
//
// rb_options must be an options instance whose mode is 2. A new model is
// created with mdl_new, given the native options, passed to dodump and
// freed again with mdl_free.
//
// Raises NativeError for any other argument or mode. Returns nil.
static VALUE dump(VALUE self __attribute__((__unused__)), VALUE rb_options) {
  // Test the class itself rather than a prefix of its name: a name prefix
  // also matches unrelated classes such as "Wapiti::OptionsFoo", whose
  // instances do not wrap a native option struct.
  if (!RTEST(rb_obj_is_kind_of(rb_options, cOptions))) {
    rb_raise(cNativeError, "argument must be a native options instance");
  }

  opt_t *options = get_options(rb_options);

  // The message used to say "for training", copied from elsewhere; this
  // function dumps a model.
  if (options->mode != 2) {
    rb_raise(cNativeError, "invalid options argument: mode should be set to 2 for dumping");
  }

  mdl_t *model = mdl_new(rdr_new(options->maxent));
  model->opt = options;

  dodump(model);

  mdl_free(model);

  return Qnil;
}
|
1187
|
+
|
1188
|
+
// This function is a proxy for Wapiti's main entry point.
|
1189
|
+
static VALUE wapiti(VALUE self __attribute__((__unused__)), VALUE arguments) {
|
1190
|
+
int result = -1, argc = 0;
|
1191
|
+
char **ap, *argv[18], *input, *tmp;
|
1192
|
+
|
1193
|
+
Check_Type(arguments, T_STRING);
|
1194
|
+
tmp = StringValueCStr(arguments);
|
1195
|
+
|
1196
|
+
// allocate space for argument vector
|
1197
|
+
input = (char*)malloc(strlen(tmp) + 8);
|
1198
|
+
|
1199
|
+
// prepend command name
|
1200
|
+
strncpy(input, "wapiti ", 8);
|
1201
|
+
strncat(input, tmp, strlen(input) - 8);
|
1202
|
+
|
1203
|
+
// remember allocation pointer
|
1204
|
+
tmp = input;
|
1205
|
+
|
1206
|
+
// turn input string into argument vector (using
|
1207
|
+
// only the first seventeen tokens from input)
|
1208
|
+
for (ap = argv; (*ap = strsep(&input, " \t")) != (char*)0; ++argc) {
|
1209
|
+
if ((**ap != '\0') && (++ap >= &argv[18])) break;
|
1210
|
+
}
|
1211
|
+
|
1212
|
+
// call main entry point
|
1213
|
+
result = wapiti_main(argc, argv);
|
1214
|
+
|
1215
|
+
// free allocated memory
|
1216
|
+
free(tmp);
|
1217
|
+
|
1218
|
+
return INT2FIX(result);
|
1219
|
+
}
|
1220
|
+
|
1221
|
+
/* --- Wapiti Extension Entry Point --- */
|
1222
|
+
|
1223
|
+
void Init_native() {
|
1224
|
+
mWapiti = rb_const_get(rb_mKernel, rb_intern("Wapiti"));
|
1225
|
+
mNative = rb_define_module_under(mWapiti, "Native");
|
1226
|
+
|
1227
|
+
cNativeError = rb_const_get(mWapiti, rb_intern("NativeError"));
|
1228
|
+
cConfigurationError = rb_const_get(mWapiti, rb_intern("ConfigurationError"));
|
1229
|
+
cLogger = rb_funcall(mWapiti, rb_intern("log"), 0);
|
1230
|
+
|
1231
|
+
rb_define_singleton_method(mNative, "label", label, 1);
|
1232
|
+
rb_define_singleton_method(mNative, "wapiti", wapiti, 1);
|
1233
|
+
|
1234
|
+
rb_define_const(mNative, "VERSION", rb_str_new2(VERSION));
|
1235
|
+
|
1236
|
+
Init_options();
|
1237
|
+
Init_model();
|
1238
|
+
}
|