wapiti 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (63) hide show
  1. data/.autotest +13 -0
  2. data/.gitignore +5 -0
  3. data/.rspec +3 -0
  4. data/Gemfile +6 -0
  5. data/LICENSE +30 -0
  6. data/README.md +153 -0
  7. data/Rakefile +33 -0
  8. data/ext/wapiti/bcd.c +392 -0
  9. data/ext/wapiti/decoder.c +535 -0
  10. data/ext/wapiti/decoder.h +46 -0
  11. data/ext/wapiti/extconf.rb +8 -0
  12. data/ext/wapiti/gradient.c +818 -0
  13. data/ext/wapiti/gradient.h +81 -0
  14. data/ext/wapiti/lbfgs.c +294 -0
  15. data/ext/wapiti/model.c +296 -0
  16. data/ext/wapiti/model.h +100 -0
  17. data/ext/wapiti/native.c +1238 -0
  18. data/ext/wapiti/native.h +15 -0
  19. data/ext/wapiti/options.c +278 -0
  20. data/ext/wapiti/options.h +91 -0
  21. data/ext/wapiti/pattern.c +395 -0
  22. data/ext/wapiti/pattern.h +56 -0
  23. data/ext/wapiti/progress.c +167 -0
  24. data/ext/wapiti/progress.h +43 -0
  25. data/ext/wapiti/quark.c +272 -0
  26. data/ext/wapiti/quark.h +46 -0
  27. data/ext/wapiti/reader.c +553 -0
  28. data/ext/wapiti/reader.h +73 -0
  29. data/ext/wapiti/rprop.c +191 -0
  30. data/ext/wapiti/sequence.h +148 -0
  31. data/ext/wapiti/sgdl1.c +218 -0
  32. data/ext/wapiti/thread.c +171 -0
  33. data/ext/wapiti/thread.h +42 -0
  34. data/ext/wapiti/tools.c +202 -0
  35. data/ext/wapiti/tools.h +54 -0
  36. data/ext/wapiti/trainers.h +39 -0
  37. data/ext/wapiti/vmath.c +372 -0
  38. data/ext/wapiti/vmath.h +51 -0
  39. data/ext/wapiti/wapiti.c +288 -0
  40. data/ext/wapiti/wapiti.h +45 -0
  41. data/lib/wapiti.rb +30 -0
  42. data/lib/wapiti/errors.rb +17 -0
  43. data/lib/wapiti/model.rb +49 -0
  44. data/lib/wapiti/options.rb +113 -0
  45. data/lib/wapiti/utility.rb +15 -0
  46. data/lib/wapiti/version.rb +3 -0
  47. data/spec/fixtures/ch.mod +18550 -0
  48. data/spec/fixtures/chpattern.txt +52 -0
  49. data/spec/fixtures/chtest.txt +1973 -0
  50. data/spec/fixtures/chtrain.txt +19995 -0
  51. data/spec/fixtures/nppattern.txt +52 -0
  52. data/spec/fixtures/nptest.txt +1973 -0
  53. data/spec/fixtures/nptrain.txt +19995 -0
  54. data/spec/fixtures/pattern.txt +14 -0
  55. data/spec/fixtures/test.txt +60000 -0
  56. data/spec/fixtures/train.txt +1200 -0
  57. data/spec/spec_helper.rb +21 -0
  58. data/spec/wapiti/model_spec.rb +173 -0
  59. data/spec/wapiti/native_spec.rb +12 -0
  60. data/spec/wapiti/options_spec.rb +175 -0
  61. data/spec/wapiti/utility_spec.rb +22 -0
  62. data/wapiti.gemspec +35 -0
  63. metadata +178 -0
@@ -0,0 +1,100 @@
1
+ /*
2
+ * Wapiti - A linear-chain CRF tool
3
+ *
4
+ * Copyright (c) 2009-2011 CNRS
5
+ * All rights reserved.
6
+ *
7
+ * Redistribution and use in source and binary forms, with or without
8
+ * modification, are permitted provided that the following conditions are met:
9
+ * * Redistributions of source code must retain the above copyright
10
+ * notice, this list of conditions and the following disclaimer.
11
+ * * Redistributions in binary form must reproduce the above copyright
12
+ * notice, this list of conditions and the following disclaimer in the
13
+ * documentation and/or other materials provided with the distribution.
14
+ *
15
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25
+ * POSSIBILITY OF SUCH DAMAGE.
26
+ */
27
+
28
+ #ifndef model_h
29
+ #define model_h
30
+
31
+ #include <stddef.h>
32
+ #include <sys/times.h>
33
+
34
+ #include "wapiti.h"
35
+ #include "options.h"
36
+ #include "sequence.h"
37
+ #include "reader.h"
38
+
39
+ typedef struct tms tms_t;
40
+
41
+ /* mdl_t:
42
+ * Represents a linear-chain CRF model. The model contains both unigram and
43
+ * bigram features. It is characterized by <nlbl> the number of labels, <nobs>
44
+ * the number of observations, and <nftr> the number of features.
45
+ *
46
+ * Each observation has a corresponding entry in <kind> whose first bit is
47
+ * set if the observation is unigram and second one if it is bigram. Note that
48
+ * an observation can be both. An unigram observation produce Y features and a
49
+ * bigram one produce Y * Y features.
50
+ * The <theta> array keep all features weights. The <*off> array give for each
51
+ * observations the offset in the <theta> array where the features of the
52
+ * observation are stored.
53
+ *
54
+ * The <*off> and <theta> array are initialized only when the model is
55
+ * synchronized. As you can add new labels and observations after a sync, we
56
+ * keep track of the old counts in <olbl> and <oblk> to detect inconsistency
57
+ * and resynchronize the model if needed. In this case, if the number of
58
+ * labels have not changed, the previously trained weights are kept, else they
59
+ * are now meaningless so discarded.
60
+ */
61
+ typedef struct mdl_s mdl_t;
62
+ struct mdl_s {
63
+ opt_t *opt; // Options used for training
64
+
65
+ // Sizes of the model's main dimensions
66
+ size_t nlbl; // Y: number of labels
67
+ size_t nobs; // O: number of observations
68
+ size_t nftr; // F: number of features
69
+
70
+ // Information about observations (each array has O entries)
71
+ char *kind; // [O] observation type (unigram and/or bigram bits)
72
+ size_t *uoff; // [O] offset of the unigram weights in <theta>
73
+ size_t *boff; // [O] offset of the bigram weights in <theta>
74
+
75
+ // The model itself
76
+ double *theta; // [F] feature weights
77
+
78
+ // Datasets
79
+ dat_t *train; // training dataset
80
+ dat_t *devel; // development dataset
81
+ rdr_t *reader; // data reader; presumably the one given to mdl_new() - confirm in model.c
82
+
83
+ // Stopping criterion
84
+ double *werr; // Window of the error rates of the last iterations
85
+ int wcnt; // Number of iterations in the window
86
+ int wpos; // Position for the next iteration
87
+
88
+ // Timing
89
+ tms_t timer; // start time of the last iteration
90
+ double total; // total training time
91
+ };
92
+
93
+ mdl_t *mdl_new(rdr_t *rdr);
94
+ void mdl_free(mdl_t *mdl);
95
+ void mdl_sync(mdl_t *mdl);
96
+ void mdl_compact(mdl_t *mdl);
97
+ void mdl_save(mdl_t *mdl, FILE *file);
98
+ void mdl_load(mdl_t *mdl, FILE *file);
99
+
100
+ #endif
@@ -0,0 +1,1238 @@
1
+ #include <stdio.h>
2
+ #include <string.h>
3
+
4
+ #include "wapiti.h"
5
+ #include "options.h"
6
+ #include "reader.h"
7
+ #include "model.h"
8
+ #include "trainers.h"
9
+ #include "quark.h"
10
+ #include "tools.h"
11
+
12
+ #include "native.h"
13
+
14
+ VALUE mWapiti;
15
+ VALUE mNative;
16
+
17
+ VALUE cOptions;
18
+ VALUE cModel;
19
+
20
+ VALUE cNativeError;
21
+ VALUE cConfigurationError;
22
+ VALUE cLogger;
23
+
24
+
25
+ /* --- Utilities --- */
26
+
27
+ static void trn_auto(mdl_t *mdl) {
28
+ const int maxiter = mdl->opt->maxiter;
29
+ mdl->opt->maxiter = 3;
30
+ trn_sgdl1(mdl);
31
+ mdl->opt->maxiter = maxiter;
32
+ trn_lbfgs(mdl);
33
+ }
34
+
35
+ static const struct {
36
+ char *name;
37
+ void (* train)(mdl_t *mdl);
38
+ } trn_lst[] = {
39
+ {"l-bfgs", trn_lbfgs},
40
+ {"sgd-l1", trn_sgdl1},
41
+ {"bcd", trn_bcd },
42
+ {"rprop", trn_rprop},
43
+ {"rprop+", trn_rprop},
44
+ {"rprop-", trn_rprop},
45
+ {"auto", trn_auto }
46
+ };
47
+ static const int trn_cnt = sizeof(trn_lst) / sizeof(trn_lst[0]);
48
+
49
+
50
+ /* --- Options Class --- */
51
+
52
+ // Auxiliary Methods
53
+
54
+ static opt_t *get_options(VALUE self) {
55
+ opt_t *options;
56
+ Data_Get_Struct(self, opt_t, options);
57
+ return options;
58
+ }
59
+
60
+ // Copies a Ruby string to the heap and stores it in a pointer.
61
+ // Frees the pointer before assigning the new value.
62
+ static void copy_string(char **dst, VALUE rb_string) {
63
+ Check_Type(rb_string, T_STRING);
64
+
65
+ if (*dst) { free(*dst); *dst = (char*)0; }
66
+ *dst = calloc(RSTRING_LEN(rb_string) + 1, sizeof(char));
67
+
68
+ memcpy(*dst, StringValuePtr(rb_string), RSTRING_LEN(rb_string) + 1);
69
+ }
70
+
71
+
72
+ // Constructor / Desctructor
73
+
74
+ static void mark_options(opt_t* options __attribute__((__unused__))) {
75
+ // nothing
76
+ }
77
+
78
+ static void deallocate_options(opt_t* options) {
79
+
80
+ // free string options
81
+ if (options->input) { free(options->input); }
82
+ if (options->output) { free(options->output); }
83
+ if (options->algo) { free(options->algo); }
84
+ if (options->devel) { free(options->devel); }
85
+ if (options->pattern) { free(options->pattern); }
86
+
87
+ free(options);
88
+ options = (opt_t*)0;
89
+ }
90
+
91
+ static VALUE allocate_options(VALUE self) {
92
+ opt_t* options = malloc(sizeof(opt_t));
93
+ return Data_Wrap_Struct(self, mark_options, deallocate_options, options);
94
+ }
95
+
96
+ static VALUE initialize_options(int argc, VALUE *argv, VALUE self) {
97
+ opt_t* options = get_options(self);
98
+ *options = opt_defaults;
99
+
100
+ if (options->maxiter == 0) {
101
+ options->maxiter = INT_MAX;
102
+ }
103
+
104
+ // copy the default algorithm name to the heap so that all options strings
105
+ // are on the heap
106
+ char* tmp = calloc(strlen(options->algo), sizeof(char));
107
+ memcpy(tmp, options->algo, strlen(options->algo));
108
+ options->algo = tmp;
109
+
110
+ if (argc > 1) {
111
+ rb_raise(rb_const_get(rb_mKernel, rb_intern("ArgumentError")),
112
+ "wrong number of arguments (%d for 0..1)", argc);
113
+ }
114
+
115
+ // set defaults
116
+ if (argc) {
117
+ Check_Type(argv[0], T_HASH);
118
+ (void)rb_funcall(self, rb_intern("update"), 1, argv[0]);
119
+ }
120
+
121
+ // yield self if block_given?
122
+ if (rb_block_given_p()) {
123
+ rb_yield(self);
124
+ }
125
+
126
+ return self;
127
+ }
128
+
129
+
130
+ // Instance Methods
131
+
132
+
133
+ // Fixnum Accessors
134
+
135
+ static VALUE options_nbest(VALUE self) {
136
+ return INT2FIX(get_options(self)->nbest);
137
+ }
138
+
139
+ static VALUE options_set_nbest(VALUE self, VALUE rb_fixnum) {
140
+ Check_Type(rb_fixnum, T_FIXNUM);
141
+ get_options(self)->nbest = FIX2INT(rb_fixnum);
142
+
143
+ return rb_fixnum;
144
+ }
145
+
146
+
147
+ static VALUE options_stopwin(VALUE self) {
148
+ return INT2FIX(get_options(self)->stopwin);
149
+ }
150
+
151
+ static VALUE options_set_stopwin(VALUE self, VALUE rb_fixnum) {
152
+ Check_Type(rb_fixnum, T_FIXNUM);
153
+ get_options(self)->stopwin = FIX2INT(rb_fixnum);
154
+
155
+ return rb_fixnum;
156
+ }
157
+
158
+ static VALUE options_objwin(VALUE self) {
159
+ return INT2FIX(get_options(self)->objwin);
160
+ }
161
+
162
+ static VALUE options_set_objwin(VALUE self, VALUE rb_fixnum) {
163
+ Check_Type(rb_fixnum, T_FIXNUM);
164
+ get_options(self)->objwin = FIX2INT(rb_fixnum);
165
+
166
+ return rb_fixnum;
167
+ }
168
+
169
+
170
+ static VALUE options_maxiter(VALUE self) {
171
+ return INT2FIX(get_options(self)->maxiter);
172
+ }
173
+
174
+ static VALUE options_set_maxiter(VALUE self, VALUE rb_fixnum) {
175
+ opt_t *options = get_options(self);
176
+
177
+ Check_Type(rb_fixnum, T_FIXNUM);
178
+ options->maxiter = FIX2INT(rb_fixnum);
179
+
180
+ return rb_fixnum;
181
+ }
182
+
183
+ static VALUE options_jobsize(VALUE self) {
184
+ return INT2FIX(get_options(self)->jobsize);
185
+ }
186
+
187
+ static VALUE options_set_jobsize(VALUE self, VALUE rb_fixnum) {
188
+ opt_t *options = get_options(self);
189
+
190
+ Check_Type(rb_fixnum, T_FIXNUM);
191
+ options->jobsize = FIX2INT(rb_fixnum);
192
+
193
+ return rb_fixnum;
194
+ }
195
+
196
+ static VALUE options_nthread(VALUE self) {
197
+ return INT2FIX(get_options(self)->nthread);
198
+ }
199
+
200
+ static VALUE options_set_nthread(VALUE self, VALUE rb_fixnum) {
201
+ opt_t *options = get_options(self);
202
+
203
+ Check_Type(rb_fixnum, T_FIXNUM);
204
+ options->nthread = FIX2INT(rb_fixnum);
205
+
206
+ return rb_fixnum;
207
+ }
208
+
209
+ static VALUE options_histsz(VALUE self) {
210
+ return INT2FIX(get_options(self)->lbfgs.histsz);
211
+ }
212
+
213
+ static VALUE options_set_histsz(VALUE self, VALUE rb_fixnum) {
214
+ Check_Type(rb_fixnum, T_FIXNUM);
215
+ get_options(self)->lbfgs.histsz = FIX2INT(rb_fixnum);
216
+
217
+ return rb_fixnum;
218
+ }
219
+
220
+ static VALUE options_maxls(VALUE self) {
221
+ return INT2FIX(get_options(self)->lbfgs.maxls);
222
+ }
223
+
224
+ static VALUE options_set_maxls(VALUE self, VALUE rb_fixnum) {
225
+ Check_Type(rb_fixnum, T_FIXNUM);
226
+ get_options(self)->lbfgs.maxls = FIX2INT(rb_fixnum);
227
+
228
+ return rb_fixnum;
229
+ }
230
+
231
+
232
+ // Float Accessors
233
+
234
+ static VALUE options_rho1(VALUE self) {
235
+ return rb_float_new(get_options(self)->rho1);
236
+ }
237
+
238
+ static VALUE options_set_rho1(VALUE self, VALUE rb_numeric) {
239
+ get_options(self)->rho1 = NUM2DBL(rb_numeric);
240
+ return rb_numeric;
241
+ }
242
+
243
+ static VALUE options_rho2(VALUE self) {
244
+ return rb_float_new(get_options(self)->rho2);
245
+ }
246
+
247
+ static VALUE options_set_rho2(VALUE self, VALUE rb_numeric) {
248
+ get_options(self)->rho2 = NUM2DBL(rb_numeric);
249
+ return rb_numeric;
250
+ }
251
+
252
+ static VALUE options_stopeps(VALUE self) {
253
+ return rb_float_new(get_options(self)->stopeps);
254
+ }
255
+
256
+ static VALUE options_set_stopeps(VALUE self, VALUE rb_numeric) {
257
+ get_options(self)->stopeps = NUM2DBL(rb_numeric);
258
+ return rb_numeric;
259
+ }
260
+
261
+ static VALUE options_eta0(VALUE self) {
262
+ return rb_float_new(get_options(self)->sgdl1.eta0);
263
+ }
264
+
265
+ static VALUE options_set_eta0(VALUE self, VALUE rb_numeric) {
266
+ get_options(self)->sgdl1.eta0 = NUM2DBL(rb_numeric);
267
+ return rb_numeric;
268
+ }
269
+
270
+ static VALUE options_alpha(VALUE self) {
271
+ return rb_float_new(get_options(self)->sgdl1.alpha);
272
+ }
273
+
274
+ static VALUE options_set_alpha(VALUE self, VALUE rb_numeric) {
275
+ get_options(self)->sgdl1.alpha = NUM2DBL(rb_numeric);
276
+ return rb_numeric;
277
+ }
278
+
279
+ static VALUE options_kappa(VALUE self) {
280
+ return rb_float_new(get_options(self)->bcd.kappa);
281
+ }
282
+
283
+ static VALUE options_set_kappa(VALUE self, VALUE rb_numeric) {
284
+ get_options(self)->bcd.kappa = NUM2DBL(rb_numeric);
285
+ return rb_numeric;
286
+ }
287
+
288
+ static VALUE options_stpmin(VALUE self) {
289
+ return rb_float_new(get_options(self)->rprop.stpmin);
290
+ }
291
+
292
+ static VALUE options_set_stpmin(VALUE self, VALUE rb_numeric) {
293
+ get_options(self)->rprop.stpmin = NUM2DBL(rb_numeric);
294
+ return rb_numeric;
295
+ }
296
+
297
+ static VALUE options_stpmax(VALUE self) {
298
+ return rb_float_new(get_options(self)->rprop.stpmax);
299
+ }
300
+
301
+ static VALUE options_set_stpmax(VALUE self, VALUE rb_numeric) {
302
+ get_options(self)->rprop.stpmax = NUM2DBL(rb_numeric);
303
+ return rb_numeric;
304
+ }
305
+
306
+ static VALUE options_stpinc(VALUE self) {
307
+ return rb_float_new(get_options(self)->rprop.stpinc);
308
+ }
309
+
310
+ static VALUE options_set_stpinc(VALUE self, VALUE rb_numeric) {
311
+ get_options(self)->rprop.stpinc = NUM2DBL(rb_numeric);
312
+ return rb_numeric;
313
+ }
314
+
315
+ static VALUE options_stpdec(VALUE self) {
316
+ return rb_float_new(get_options(self)->rprop.stpdec);
317
+ }
318
+
319
+ static VALUE options_set_stpdec(VALUE self, VALUE rb_numeric) {
320
+ get_options(self)->rprop.stpdec = NUM2DBL(rb_numeric);
321
+ return rb_numeric;
322
+ }
323
+
324
+
325
+
326
+ // Boolean Accessors
327
+
328
+ static VALUE options_maxent(VALUE self) {
329
+ return get_options(self)->maxent ? Qtrue : Qfalse;
330
+ }
331
+
332
+ static VALUE options_set_maxent(VALUE self, VALUE rb_boolean) {
333
+ get_options(self)->maxent = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
334
+ return rb_boolean;
335
+ }
336
+
337
+ static VALUE options_compact(VALUE self) {
338
+ return get_options(self)->compact ? Qtrue : Qfalse;
339
+ }
340
+
341
+ static VALUE options_set_compact(VALUE self, VALUE rb_boolean) {
342
+ get_options(self)->compact = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
343
+ return rb_boolean;
344
+ }
345
+
346
+ static VALUE options_sparse(VALUE self) {
347
+ return get_options(self)->sparse ? Qtrue : Qfalse;
348
+ }
349
+
350
+ static VALUE options_set_sparse(VALUE self, VALUE rb_boolean) {
351
+ get_options(self)->sparse = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
352
+ return rb_boolean;
353
+ }
354
+
355
+ static VALUE options_check(VALUE self) {
356
+ return get_options(self)->check ? Qtrue : Qfalse;
357
+ }
358
+
359
+ static VALUE options_set_check(VALUE self, VALUE rb_boolean) {
360
+ get_options(self)->check = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
361
+ return rb_boolean;
362
+ }
363
+
364
+ static VALUE options_label(VALUE self) {
365
+ return get_options(self)->label ? Qtrue : Qfalse;
366
+ }
367
+
368
+ static VALUE options_set_label(VALUE self, VALUE rb_boolean) {
369
+ get_options(self)->label = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
370
+ return rb_boolean;
371
+ }
372
+
373
+ static VALUE options_outsc(VALUE self) {
374
+ return get_options(self)->outsc ? Qtrue : Qfalse;
375
+ }
376
+
377
+ static VALUE options_set_outsc(VALUE self, VALUE rb_boolean) {
378
+ get_options(self)->outsc = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
379
+ return rb_boolean;
380
+ }
381
+
382
+ static VALUE options_lblpost(VALUE self) {
383
+ return get_options(self)->lblpost ? Qtrue : Qfalse;
384
+ }
385
+
386
+ static VALUE options_set_lblpost(VALUE self, VALUE rb_boolean) {
387
+ get_options(self)->lblpost = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
388
+ return rb_boolean;
389
+ }
390
+
391
+ static VALUE options_clip(VALUE self) {
392
+ return get_options(self)->lbfgs.clip ? Qtrue : Qfalse;
393
+ }
394
+
395
+ static VALUE options_set_clip(VALUE self, VALUE rb_boolean) {
396
+ get_options(self)->lbfgs.clip = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
397
+ return rb_boolean;
398
+ }
399
+
400
+ static VALUE options_cutoff(VALUE self) {
401
+ return get_options(self)->rprop.cutoff ? Qtrue : Qfalse;
402
+ }
403
+
404
+ static VALUE options_set_cutoff(VALUE self, VALUE rb_boolean) {
405
+ get_options(self)->rprop.cutoff = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
406
+ return rb_boolean;
407
+ }
408
+
409
+
410
+
411
+
412
+ // String Accessors
413
+
414
+ static VALUE options_pattern(VALUE self) {
415
+ char *pattern = get_options(self)->pattern;
416
+ return rb_str_new2(pattern ? pattern : "");
417
+ }
418
+
419
+ static VALUE options_set_pattern(VALUE self, VALUE rb_string) {
420
+ opt_t *options = get_options(self);
421
+ copy_string(&(options->pattern), rb_string);
422
+
423
+ return rb_string;
424
+ }
425
+
426
+ static VALUE options_model(VALUE self) {
427
+ char *model = get_options(self)->model;
428
+ return rb_str_new2(model ? model : "");
429
+ }
430
+
431
+ static VALUE options_set_model(VALUE self, VALUE rb_string) {
432
+ opt_t *options = get_options(self);
433
+ copy_string(&(options->model), rb_string);
434
+
435
+ return rb_string;
436
+ }
437
+
438
+ static VALUE options_algorithm(VALUE self) {
439
+ char *algorithm = get_options(self)->algo;
440
+ return rb_str_new2(algorithm ? algorithm : "");
441
+ }
442
+
443
+ static VALUE options_set_algorithm(VALUE self, VALUE rb_string) {
444
+ opt_t *options = get_options(self);
445
+ copy_string(&(options->algo), rb_string);
446
+
447
+ return rb_string;
448
+ }
449
+
450
+ static VALUE options_development_data(VALUE self) {
451
+ char *development_data = get_options(self)->devel;
452
+ return rb_str_new2(development_data ? development_data : "");
453
+ }
454
+
455
+ static VALUE options_set_development_data(VALUE self, VALUE rb_string) {
456
+ opt_t *options = get_options(self);
457
+ copy_string(&(options->devel), rb_string);
458
+
459
+ return rb_string;
460
+ }
461
+
462
+
463
+ void Init_options() {
464
+ cOptions = rb_define_class_under(mWapiti, "Options", rb_cObject);
465
+ rb_define_alloc_func(cOptions, allocate_options);
466
+
467
+ rb_define_method(cOptions, "initialize", initialize_options, -1);
468
+
469
+ // Option Accessors
470
+
471
+ rb_define_method(cOptions, "stopwin", options_stopwin, 0);
472
+ rb_define_method(cOptions, "stopwin=", options_set_stopwin, 1);
473
+
474
+ rb_define_alias(cOptions, "stop_window", "stopwin");
475
+ rb_define_alias(cOptions, "stop_window=", "stopwin=");
476
+
477
+ rb_define_method(cOptions, "objwin", options_objwin, 0);
478
+ rb_define_method(cOptions, "objwin=", options_set_objwin, 1);
479
+
480
+ rb_define_alias(cOptions, "convergence_window", "objwin");
481
+ rb_define_alias(cOptions, "convergence_window=", "objwin=");
482
+
483
+ rb_define_method(cOptions, "maxiter", options_maxiter, 0);
484
+ rb_define_method(cOptions, "maxiter=", options_set_maxiter, 1);
485
+
486
+ rb_define_alias(cOptions, "max_iterations", "maxiter");
487
+ rb_define_alias(cOptions, "max_iterations=", "maxiter=");
488
+
489
+ rb_define_method(cOptions, "jobsize", options_jobsize, 0);
490
+ rb_define_method(cOptions, "jobsize=", options_set_jobsize, 1);
491
+
492
+ rb_define_method(cOptions, "nthread", options_nthread, 0);
493
+ rb_define_method(cOptions, "nthread=", options_set_nthread, 1);
494
+
495
+ rb_define_alias(cOptions, "threads", "nthread");
496
+ rb_define_alias(cOptions, "threads=", "nthread=");
497
+
498
+ rb_define_method(cOptions, "rho1", options_rho1, 0);
499
+ rb_define_method(cOptions, "rho1=", options_set_rho1, 1);
500
+
501
+ rb_define_method(cOptions, "rho2", options_rho2, 0);
502
+ rb_define_method(cOptions, "rho2=", options_set_rho2, 1);
503
+
504
+ rb_define_method(cOptions, "stopeps", options_stopeps, 0);
505
+ rb_define_method(cOptions, "stopeps=", options_set_stopeps, 1);
506
+
507
+ rb_define_alias(cOptions, "stop_epsilon", "stopeps");
508
+ rb_define_alias(cOptions, "stop_epsilon=", "stopeps=");
509
+
510
+ rb_define_method(cOptions, "maxent", options_maxent, 0);
511
+ rb_define_method(cOptions, "maxent=", options_set_maxent, 1);
512
+
513
+ rb_define_alias(cOptions, "maxent?", "maxent");
514
+
515
+ rb_define_method(cOptions, "compact", options_compact, 0);
516
+ rb_define_method(cOptions, "compact=", options_set_compact, 1);
517
+
518
+ rb_define_alias(cOptions, "compact?", "compact");
519
+
520
+ rb_define_method(cOptions, "sparse", options_sparse, 0);
521
+ rb_define_method(cOptions, "sparse=", options_set_sparse, 1);
522
+
523
+ rb_define_alias(cOptions, "sparse?", "sparse");
524
+
525
+ rb_define_method(cOptions, "label", options_label, 0);
526
+ rb_define_method(cOptions, "label=", options_set_label, 1);
527
+
528
+ rb_define_alias(cOptions, "label?", "label");
529
+
530
+ rb_define_method(cOptions, "check", options_check, 0);
531
+ rb_define_method(cOptions, "check=", options_set_check, 1);
532
+
533
+ rb_define_alias(cOptions, "check?", "check");
534
+
535
+ rb_define_method(cOptions, "lblpost", options_lblpost, 0);
536
+ rb_define_method(cOptions, "lblpost=", options_set_lblpost, 1);
537
+
538
+ rb_define_alias(cOptions, "lblpost?", "lblpost");
539
+
540
+ rb_define_alias(cOptions, "posterior", "lblpost");
541
+ rb_define_alias(cOptions, "posterior?", "lblpost");
542
+ rb_define_alias(cOptions, "posterior=", "lblpost=");
543
+
544
+ rb_define_method(cOptions, "outsc", options_outsc, 0);
545
+ rb_define_method(cOptions, "outsc=", options_set_outsc, 1);
546
+
547
+ rb_define_alias(cOptions, "outsc?", "outsc");
548
+
549
+ rb_define_alias(cOptions, "score", "outsc");
550
+ rb_define_alias(cOptions, "score?", "outsc");
551
+ rb_define_alias(cOptions, "score=", "outsc=");
552
+
553
+ rb_define_method(cOptions, "pattern", options_pattern, 0);
554
+ rb_define_method(cOptions, "pattern=", options_set_pattern, 1);
555
+
556
+ rb_define_alias(cOptions, "template", "pattern");
557
+ rb_define_alias(cOptions, "template=", "pattern=");
558
+
559
+ rb_define_method(cOptions, "model", options_model, 0);
560
+ rb_define_method(cOptions, "model=", options_set_model, 1);
561
+
562
+ rb_define_method(cOptions, "algorithm", options_algorithm, 0);
563
+ rb_define_method(cOptions, "algorithm=", options_set_algorithm, 1);
564
+
565
+ rb_define_alias(cOptions, "algo", "algorithm");
566
+ rb_define_alias(cOptions, "algo=", "algorithm=");
567
+
568
+ rb_define_method(cOptions, "development_data", options_development_data, 0);
569
+ rb_define_method(cOptions, "development_data=", options_set_development_data, 1);
570
+
571
+ rb_define_alias(cOptions, "devel", "development_data");
572
+ rb_define_alias(cOptions, "devel=", "development_data=");
573
+
574
+ rb_define_method(cOptions, "clip", options_clip, 0);
575
+ rb_define_method(cOptions, "clip=", options_set_clip, 1);
576
+
577
+ rb_define_method(cOptions, "histsz", options_histsz, 0);
578
+ rb_define_method(cOptions, "histsz=", options_set_histsz, 1);
579
+
580
+ rb_define_method(cOptions, "maxls", options_maxls, 0);
581
+ rb_define_method(cOptions, "maxls=", options_set_maxls, 1);
582
+
583
+ rb_define_method(cOptions, "eta0", options_eta0, 0);
584
+ rb_define_method(cOptions, "eta0=", options_set_eta0, 1);
585
+
586
+ rb_define_method(cOptions, "alpha", options_alpha, 0);
587
+ rb_define_method(cOptions, "alpha=", options_set_alpha, 1);
588
+
589
+ rb_define_method(cOptions, "kappa", options_kappa, 0);
590
+ rb_define_method(cOptions, "kappa=", options_set_kappa, 1);
591
+
592
+ rb_define_method(cOptions, "stpmin", options_stpmin, 0);
593
+ rb_define_method(cOptions, "stpmin=", options_set_stpmin, 1);
594
+
595
+ rb_define_method(cOptions, "stpmax", options_stpmax, 0);
596
+ rb_define_method(cOptions, "stpmax=", options_set_stpmax, 1);
597
+
598
+ rb_define_method(cOptions, "stpinc", options_stpinc, 0);
599
+ rb_define_method(cOptions, "stpinc=", options_set_stpinc, 1);
600
+
601
+ rb_define_method(cOptions, "stpdec", options_stpdec, 0);
602
+ rb_define_method(cOptions, "stpdec=", options_set_stpdec, 1);
603
+
604
+ rb_define_method(cOptions, "cutoff", options_cutoff, 0);
605
+ rb_define_method(cOptions, "cutoff=", options_set_cutoff, 1);
606
+
607
+ rb_define_method(cOptions, "nbest", options_nbest, 0);
608
+ rb_define_method(cOptions, "nbest=", options_set_nbest, 1);
609
+
610
+ }
611
+
612
+
613
+ /* --- Model Class --- */
614
+
615
+ // Auxiliary Methods
616
+
617
+ static mdl_t *get_model(VALUE self) {
618
+ mdl_t *model;
619
+ Data_Get_Struct(self, mdl_t, model);
620
+ return model;
621
+ }
622
+
623
+ // Constructor / Desctructor
624
+
625
+ static void mark_model(mdl_t *model __attribute__((__unused__))) {
626
+ // nothing
627
+ }
628
+
629
+ static void deallocate_model(mdl_t *model) {
630
+ if (model) {
631
+ mdl_free(model);
632
+ model = (mdl_t*)0;
633
+ }
634
+ }
635
+
636
+ static VALUE allocate_model(VALUE self) {
637
+ mdl_t *model = mdl_new(rdr_new(false));
638
+ return Data_Wrap_Struct(self, mark_model, deallocate_model, model);
639
+ }
640
+
641
+ static VALUE model_set_options(VALUE self, VALUE rb_options) {
642
+ if (strncmp("Wapiti::Options", rb_obj_classname(rb_options), 15) != 0) {
643
+ rb_raise(cNativeError, "argument must be a Wapiti::Options instance");
644
+ }
645
+
646
+ mdl_t *model = get_model(self);
647
+
648
+ // Store reference to options in model struct
649
+ model->opt = get_options(rb_options);
650
+
651
+ // Update reader
652
+ model->reader->maxent = model->opt->maxent;
653
+
654
+ // Save instance variable
655
+ rb_ivar_set(self, rb_intern("@options"), rb_options);
656
+
657
+ return rb_options;
658
+ }
659
+
660
+ static VALUE initialize_model(int argc, VALUE *argv, VALUE self) {
661
+ VALUE options;
662
+
663
+ if (argc > 1) {
664
+ rb_raise(rb_const_get(rb_mKernel, rb_intern("ArgumentError")),
665
+ "wrong number of arguments (%d for 0..1)", argc);
666
+ }
667
+
668
+ if (argc) {
669
+ if (TYPE(argv[0]) == T_HASH) {
670
+ options = rb_funcall(cOptions, rb_intern("new"), 1, argv[0]);
671
+ }
672
+ else {
673
+ if (strncmp("Wapiti::Options", rb_obj_classname(argv[0]), 15) != 0) {
674
+ rb_raise(cNativeError, "argument must be a hash or an options instance");
675
+ }
676
+ options = argv[0];
677
+ }
678
+ }
679
+ else {
680
+ options = rb_funcall(cOptions, rb_intern("new"), 0);
681
+ }
682
+
683
+ // yield self if block_given?
684
+ if (rb_block_given_p()) {
685
+ rb_yield(options);
686
+ }
687
+
688
+ model_set_options(self, options);
689
+
690
+ // Load a previous model if specified by options
691
+ if (get_options(options)->model) {
692
+ rb_funcall(self, rb_intern("load"), 0);
693
+ }
694
+
695
+ return self;
696
+ }
697
+
698
+
699
+ // Native accessors
700
+
701
+ static VALUE model_nlbl(VALUE self) {
702
+ return INT2FIX(get_model(self)->nlbl);
703
+ }
704
+
705
+ static VALUE model_nobs(VALUE self) {
706
+ return INT2FIX(get_model(self)->nobs);
707
+ }
708
+
709
+ static VALUE model_nftr(VALUE self) {
710
+ return INT2FIX(get_model(self)->nftr);
711
+ }
712
+
713
+ static VALUE model_total(VALUE self) {
714
+ return rb_float_new(get_model(self)->total);
715
+ }
716
+
717
+
718
+ // Instance methods
719
+
720
+ static VALUE model_sync(VALUE self) {
721
+ mdl_sync(get_model(self));
722
+ return self;
723
+ }
724
+
725
+ static VALUE model_compact(VALUE self) {
726
+ mdl_compact(get_model(self));
727
+ return self;
728
+ }
729
+
730
+ // call-seq:
731
+ // m.save # => saves the model to the file defined in m.path
732
+ // m.save(path) # => sets m.path and saves the model to the file <path>
733
+ //
734
+ // Saves the model to a file. Uses the Model's path if no argument given,
735
+ // otherwise uses the passed-in argument as the Model's path.
736
+ static VALUE model_save(int argc, VALUE *argv, VALUE self) {
737
+ if (argc > 1) {
738
+ rb_raise(rb_const_get(rb_mKernel, rb_intern("ArgumentError")),
739
+ "wrong number of arguments (%d for 0..1)", argc);
740
+ }
741
+
742
+ mdl_t *model = get_model(self);
743
+
744
+ // save passed-in path in options
745
+ if (argc) {
746
+ Check_Type(argv[0], T_STRING);
747
+ rb_ivar_set(self, rb_intern("@path"), argv[0]);
748
+ }
749
+
750
+ // open the output file
751
+ FILE *file = 0;
752
+ VALUE path = rb_ivar_get(self, rb_intern("@path"));
753
+
754
+ if (NIL_P(path)) {
755
+ rb_raise(cNativeError, "failed to save model: no path given");
756
+ }
757
+
758
+ if (!(file = fopen(StringValueCStr(path), "w"))) {
759
+ rb_raise(cNativeError, "failed to save model: failed to open model file");
760
+ }
761
+
762
+ mdl_save(model, file);
763
+ fclose(file);
764
+
765
+ return self;
766
+ }
767
+
768
+ static VALUE model_load(int argc, VALUE *argv, VALUE self) {
769
+ if (argc > 1) {
770
+ rb_raise(rb_const_get(rb_mKernel, rb_intern("ArgumentError")),
771
+ "wrong number of arguments (%d for 0..1)", argc);
772
+ }
773
+
774
+ mdl_t *model = get_model(self);
775
+
776
+ // save passed-in argument in options
777
+ if (argc) {
778
+ Check_Type(argv[0], T_STRING);
779
+ rb_ivar_set(self, rb_intern("@path"), argv[0]);
780
+ }
781
+
782
+ // open the model file
783
+ FILE *file = 0;
784
+ VALUE path = rb_ivar_get(self, rb_intern("@path"));
785
+
786
+ if (NIL_P(path)) {
787
+ rb_raise(cNativeError, "failed to load model: no path given");
788
+ }
789
+
790
+ if (!(file = fopen(StringValueCStr(path), "r"))) {
791
+ rb_raise(cNativeError, "failed to load model: failed to open model file");
792
+ }
793
+
794
+ mdl_load(model, file);
795
+ fclose(file);
796
+
797
+ return self;
798
+ }
799
+
800
+ static dat_t *to_dat(rdr_t *reader, VALUE data, bool labelled) {
801
+ Check_Type(data, T_ARRAY);
802
+
803
+ const unsigned int n = RARRAY_LEN(data);
804
+ unsigned int i, j, k;
805
+
806
+ dat_t *dat = xmalloc(sizeof(dat_t));
807
+ dat->nseq = 0;
808
+ dat->mlen = 0;
809
+ dat->lbl = labelled;
810
+ dat->seq = xmalloc(sizeof(seq_t*) * n);
811
+
812
+ for (i = 0; i < n; ++i) {
813
+ VALUE sequence = rb_ary_entry(data, i);
814
+ Check_Type(sequence, T_ARRAY);
815
+
816
+ k = RARRAY_LEN(sequence);
817
+ raw_t *raw = xmalloc(sizeof(raw_t) + sizeof(char*) * k);
818
+
819
+ for (j = 0; j < k; ++j) {
820
+ VALUE line = rb_ary_entry(sequence, j);
821
+ Check_Type(line, T_STRING);
822
+ raw->lines[j] = StringValueCStr(line);
823
+ }
824
+
825
+ raw->len = k;
826
+
827
+ seq_t *seq = rdr_raw2seq(reader, raw, labelled);
828
+ xfree(raw);
829
+
830
+ if (seq == 0) { break; }
831
+
832
+ // and store the sequence
833
+ dat->seq[dat->nseq++] = seq;
834
+ dat->mlen = max(dat->mlen, seq->len);
835
+
836
+ }
837
+
838
+ // if no sequence was read, free memory
839
+ if (dat->nseq == 0) {
840
+ xfree(dat->seq);
841
+ xfree(dat);
842
+
843
+ return 0;
844
+ }
845
+
846
+ return dat;
847
+ }
848
+
849
+
850
+ static VALUE model_train(VALUE self, VALUE data) {
851
+
852
+ mdl_t* model = get_model(self);
853
+
854
+ int trn;
855
+ for (trn = 0; trn < trn_cnt; trn++) {
856
+ if (!strcmp(model->opt->algo, trn_lst[trn].name)) break;
857
+ }
858
+
859
+ if (trn == trn_cnt) {
860
+ rb_raise(cNativeError, "failed to train model: unknown algorithm '%s'", model->opt->algo);
861
+ }
862
+
863
+ FILE *file;
864
+
865
+ // Load the pattern file. This will unlock the database if previously
866
+ // locked by loading a model.
867
+ if (model->opt->pattern) {
868
+ file = fopen(model->opt->pattern, "r");
869
+
870
+ if (!file) {
871
+ rb_raise(cNativeError, "failed to train model: failed to load pattern file '%s'", model->opt->pattern);
872
+ }
873
+
874
+ rdr_loadpat(model->reader, file);
875
+ fclose(file);
876
+ qrk_lock(model->reader->obs, false);
877
+ }
878
+ else {
879
+ rb_raise(cNativeError, "failed to train model: no pattern given");
880
+ }
881
+
882
+
883
+ // Load the training data. When this is done we lock the quarks as we
884
+ // don't want to put in the model, informations present only in the
885
+ // devlopment set.
886
+
887
+ switch (TYPE(data)) {
888
+ case T_STRING:
889
+ if (!(file = fopen(StringValuePtr(data), "r"))) {
890
+ rb_raise(cNativeError, "failed to train model: failed to open training data '%s", StringValuePtr(data));
891
+ }
892
+
893
+ model->train = rdr_readdat(model->reader, file, true);
894
+ fclose(file);
895
+
896
+ break;
897
+ case T_ARRAY:
898
+ model->train = to_dat(model->reader, data, true);
899
+
900
+ break;
901
+ default:
902
+ rb_raise(cNativeError, "failed to train model: invalid training data type (expected instance of String or Array)");
903
+ }
904
+
905
+ qrk_lock(model->reader->lbl, true);
906
+ qrk_lock(model->reader->obs, true);
907
+
908
+ if (!model->train || model->train->nseq == 0) {
909
+ rb_raise(cNativeError, "failed to train model: no training data loaded");
910
+ }
911
+
912
+ // If present, load the development set in the model. If not specified,
913
+ // the training dataset will be used instead.
914
+ if (model->opt->devel) {
915
+ if (!(file = fopen(model->opt->devel, "r"))) {
916
+ rb_raise(cNativeError, "failed to train model: cannot open development file '%s'", model->opt->devel);
917
+ }
918
+
919
+ model->devel = rdr_readdat(model->reader, file, true);
920
+ fclose(file);
921
+ }
922
+
923
+ // Initialize the model. If a previous model was loaded, this will be
924
+ // just a resync, else the model structure will be created.
925
+ rb_funcall(self, rb_intern("sync"), 0);
926
+
927
+ // Train the model.
928
+ uit_setup(model);
929
+ trn_lst[trn].train(model);
930
+ uit_cleanup(model);
931
+
932
+ // If requested compact the model.
933
+ if (model->opt->compact) {
934
+ const size_t O = model->nobs;
935
+ const size_t F = model->nftr;
936
+ rb_funcall(self, rb_intern("compact"), 0);
937
+ }
938
+
939
+ return self;
940
+ }
941
+
942
+ // Returns a sorted list of all labels in the Model's label database.
943
+ static VALUE model_labels(VALUE self) {
944
+ mdl_t *model = get_model(self);
945
+ const size_t Y = model->nlbl;
946
+
947
+ qrk_t *lp = model->reader->lbl;
948
+
949
+ VALUE labels = rb_ary_new2(Y);
950
+
951
+ for (unsigned int i = 0; i < Y; ++i) {
952
+ rb_ary_push(labels, rb_str_new2(qrk_id2str(lp, i)));
953
+ }
954
+
955
+ rb_funcall(labels, rb_intern("sort!"), 0);
956
+
957
+ return labels;
958
+ }
959
+
960
+ static VALUE decode_sequence(mdl_t *model, raw_t *raw) {
961
+ qrk_t *lbls = model->reader->lbl;
962
+
963
+ const size_t Y = model->nlbl;
964
+ const size_t N = model->opt->nbest;
965
+
966
+ seq_t *seq = rdr_raw2seq(model->reader, raw, model->opt->check);
967
+
968
+ const int T = seq->len;
969
+
970
+ size_t *out = xmalloc(sizeof(size_t) * T * N);
971
+ double *psc = xmalloc(sizeof(double) * T * N);
972
+ double *scs = xmalloc(sizeof(double) * N);
973
+
974
+ VALUE result = rb_ary_new2(N), sequence, tokens;
975
+
976
+ if (N == 1) {
977
+ tag_viterbi(model, seq, (size_t*)out, scs, (double*)psc);
978
+ }
979
+ else {
980
+ tag_nbviterbi(model, seq, N, (void*)out, scs, (void*)psc);
981
+ }
982
+
983
+ // Next we output the raw sequence with an aditional column for
984
+ // the predicted labels
985
+ for (size_t n = 0; n < N; n++) {
986
+
987
+ sequence = rb_ary_new();
988
+
989
+ // if (model->opt->outsc)
990
+ // fprintf(fout, "# %d %f\n", (int)n, scs[n]);
991
+
992
+ for (int t = 0; t < T; t++) {
993
+ tokens = rb_ary_new();
994
+
995
+ if (!model->opt->label) {
996
+ rb_ary_push(tokens, rb_str_new2(raw->lines[t]));
997
+ }
998
+
999
+ size_t lbl = out[t * N + n];
1000
+ rb_ary_push(tokens, rb_str_new2(qrk_id2str(lbls, lbl)));
1001
+
1002
+ // if (model->opt->outsc) {
1003
+ // fprintf(fout, "\t%s", lblstr);
1004
+ // fprintf(fout, "/%f", psc[t * N + n]);
1005
+ // }
1006
+
1007
+ // yield token/label pair to block if given
1008
+ if (rb_block_given_p()) {
1009
+ tokens = rb_yield(tokens);
1010
+ }
1011
+
1012
+ rb_ary_push(sequence, tokens);
1013
+ }
1014
+
1015
+ rb_ary_push(result, sequence);
1016
+ }
1017
+
1018
+ // Cleanup memory used for this sequence
1019
+ xfree(scs);
1020
+ xfree(psc);
1021
+ xfree(out);
1022
+
1023
+ rdr_freeseq(seq);
1024
+
1025
+ return result;
1026
+ }
1027
+
1028
+ static VALUE decode_sequence_array(VALUE self, VALUE array) {
1029
+ Check_Type(array, T_ARRAY);
1030
+ const unsigned int n = RARRAY_LEN(array);
1031
+
1032
+ mdl_t *model = get_model(self);
1033
+ raw_t *raw;
1034
+
1035
+ const unsigned int N = model->opt->nbest;
1036
+ unsigned int i, j;
1037
+
1038
+ VALUE result = rb_ary_new2(n * N), sequence;
1039
+
1040
+ for (i = 0; i < n; ++i) {
1041
+ sequence = rb_ary_entry(array, i);
1042
+ Check_Type(sequence, T_ARRAY);
1043
+
1044
+ const unsigned int k = RARRAY_LEN(sequence);
1045
+ raw = xmalloc(sizeof(raw_t) + sizeof(char*) * k);
1046
+ raw->len = k;
1047
+
1048
+ for (j = 0; j < k; ++j) {
1049
+ VALUE line = rb_ary_entry(sequence, j);
1050
+ Check_Type(line, T_STRING);
1051
+
1052
+ raw->lines[j] = StringValueCStr(line);
1053
+ }
1054
+
1055
+ rb_funcall(result, rb_intern("concat"), 1, decode_sequence(model, raw));
1056
+
1057
+ xfree(raw);
1058
+ }
1059
+
1060
+ return result;
1061
+ }
1062
+
1063
+ static VALUE decode_sequence_file(VALUE self, VALUE path) {
1064
+ Check_Type(path, T_STRING);
1065
+ FILE *file;
1066
+
1067
+ if (!(file = fopen(StringValueCStr(path), "r"))) {
1068
+ rb_raise(cNativeError, "failed to label data: could not open file '%s'", StringValueCStr(path));
1069
+ }
1070
+
1071
+ mdl_t *model = get_model(self);
1072
+ raw_t *raw;
1073
+
1074
+ VALUE result = rb_ary_new();
1075
+
1076
+ // Next read the input file sequence by sequence and label them, we have
1077
+ // to take care of not discarding the raw input as we want to send it
1078
+ // back to the output with the additional predicted labels.
1079
+ while (!feof(file)) {
1080
+
1081
+ // So, first read an input sequence keeping the raw_t object
1082
+ // available, and label it with Viterbi.
1083
+ if ((raw = rdr_readraw(model->reader, file)) == 0) {
1084
+ break;
1085
+ }
1086
+
1087
+ rb_funcall(result, rb_intern("concat"), 1, decode_sequence(model, raw));
1088
+ rdr_freeraw(raw);
1089
+ }
1090
+
1091
+ return result;
1092
+ }
1093
+
1094
+ // call-seq:
1095
+ // m.label(tokens) # => array of labelled tokens
1096
+ // m.label(filename) # => array of labelled tokens
1097
+ //
1098
+ static VALUE model_label(VALUE self, VALUE data) {
1099
+ VALUE result;
1100
+
1101
+ switch (TYPE(data)) {
1102
+ case T_STRING:
1103
+ result = decode_sequence_file(self, data);
1104
+ break;
1105
+ case T_ARRAY:
1106
+ result = decode_sequence_array(self, data);
1107
+ break;
1108
+ default:
1109
+ rb_raise(cNativeError, "failed to label data: invalid data (expected type String or Array)");
1110
+ }
1111
+
1112
+ return result;
1113
+ }
1114
+
1115
+ static void Init_model() {
1116
+ cModel = rb_define_class_under(mWapiti, "Model", rb_cObject);
1117
+ rb_define_alloc_func(cModel, allocate_model);
1118
+
1119
+ rb_define_method(cModel, "initialize", initialize_model, -1);
1120
+
1121
+ rb_define_attr(cModel, "options", 1, 0);
1122
+
1123
+ rb_define_method(cModel, "nlbl", model_nlbl, 0);
1124
+ rb_define_method(cModel, "labels", model_labels, 0);
1125
+
1126
+ rb_define_method(cModel, "nobs", model_nobs, 0);
1127
+ rb_define_alias(cModel, "observations", "nobs");
1128
+
1129
+ rb_define_method(cModel, "nftr", model_nftr, 0);
1130
+ rb_define_alias(cModel, "features", "nftr");
1131
+
1132
+ rb_define_method(cModel, "total", model_total, 0);
1133
+
1134
+ rb_define_method(cModel, "sync", model_sync, 0);
1135
+ rb_define_method(cModel, "compact", model_compact, 0);
1136
+ rb_define_method(cModel, "save", model_save, -1);
1137
+ rb_define_method(cModel, "load", model_load, -1);
1138
+
1139
+ rb_define_method(cModel, "train", model_train, 1);
1140
+ rb_define_method(cModel, "label", model_label, 1);
1141
+ }
1142
+
1143
+ /* --- Top-Level Utility Methods --- */
1144
+
1145
+
1146
+ static VALUE label(VALUE self __attribute__((__unused__)), VALUE rb_options) {
1147
+ if (strncmp("Wapiti::Options", rb_obj_classname(rb_options), 15) != 0) {
1148
+ rb_raise(cNativeError, "argument must be a native options instance");
1149
+ }
1150
+
1151
+ opt_t *options = get_options(rb_options);
1152
+
1153
+ if (options->mode != 1) {
1154
+ rb_raise(cNativeError, "invalid options argument: mode should be set to 1 for labelling");
1155
+ }
1156
+
1157
+ mdl_t *model = mdl_new(rdr_new(options->maxent));
1158
+ model->opt = options;
1159
+
1160
+ dolabel(model);
1161
+
1162
+ mdl_free(model);
1163
+
1164
+ return Qnil;
1165
+ }
1166
+
1167
+ static VALUE dump(VALUE self __attribute__((__unused__)), VALUE rb_options) {
1168
+ if (strncmp("Wapiti::Options", rb_obj_classname(rb_options), 15) != 0) {
1169
+ rb_raise(cNativeError, "argument must be a native options instance");
1170
+ }
1171
+
1172
+ opt_t *options = get_options(rb_options);
1173
+
1174
+ if (options->mode != 2) {
1175
+ rb_raise(cNativeError, "invalid options argument: mode should be set to 2 for training");
1176
+ }
1177
+
1178
+ mdl_t *model = mdl_new(rdr_new(options->maxent));
1179
+ model->opt = options;
1180
+
1181
+ dodump(model);
1182
+
1183
+ mdl_free(model);
1184
+
1185
+ return Qnil;
1186
+ }
1187
+
1188
+ // This function is a proxy for Wapiti's main entry point.
1189
+ static VALUE wapiti(VALUE self __attribute__((__unused__)), VALUE arguments) {
1190
+ int result = -1, argc = 0;
1191
+ char **ap, *argv[18], *input, *tmp;
1192
+
1193
+ Check_Type(arguments, T_STRING);
1194
+ tmp = StringValueCStr(arguments);
1195
+
1196
+ // allocate space for argument vector
1197
+ input = (char*)malloc(strlen(tmp) + 8);
1198
+
1199
+ // prepend command name
1200
+ strncpy(input, "wapiti ", 8);
1201
+ strncat(input, tmp, strlen(input) - 8);
1202
+
1203
+ // remember allocation pointer
1204
+ tmp = input;
1205
+
1206
+ // turn input string into argument vector (using
1207
+ // only the first seventeen tokens from input)
1208
+ for (ap = argv; (*ap = strsep(&input, " \t")) != (char*)0; ++argc) {
1209
+ if ((**ap != '\0') && (++ap >= &argv[18])) break;
1210
+ }
1211
+
1212
+ // call main entry point
1213
+ result = wapiti_main(argc, argv);
1214
+
1215
+ // free allocated memory
1216
+ free(tmp);
1217
+
1218
+ return INT2FIX(result);
1219
+ }
1220
+
1221
+ /* --- Wapiti Extension Entry Point --- */
1222
+
1223
+ void Init_native() {
1224
+ mWapiti = rb_const_get(rb_mKernel, rb_intern("Wapiti"));
1225
+ mNative = rb_define_module_under(mWapiti, "Native");
1226
+
1227
+ cNativeError = rb_const_get(mWapiti, rb_intern("NativeError"));
1228
+ cConfigurationError = rb_const_get(mWapiti, rb_intern("ConfigurationError"));
1229
+ cLogger = rb_funcall(mWapiti, rb_intern("log"), 0);
1230
+
1231
+ rb_define_singleton_method(mNative, "label", label, 1);
1232
+ rb_define_singleton_method(mNative, "wapiti", wapiti, 1);
1233
+
1234
+ rb_define_const(mNative, "VERSION", rb_str_new2(VERSION));
1235
+
1236
+ Init_options();
1237
+ Init_model();
1238
+ }