wapiti 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. data/.autotest +13 -0
  2. data/.gitignore +5 -0
  3. data/.rspec +3 -0
  4. data/Gemfile +6 -0
  5. data/LICENSE +30 -0
  6. data/README.md +153 -0
  7. data/Rakefile +33 -0
  8. data/ext/wapiti/bcd.c +392 -0
  9. data/ext/wapiti/decoder.c +535 -0
  10. data/ext/wapiti/decoder.h +46 -0
  11. data/ext/wapiti/extconf.rb +8 -0
  12. data/ext/wapiti/gradient.c +818 -0
  13. data/ext/wapiti/gradient.h +81 -0
  14. data/ext/wapiti/lbfgs.c +294 -0
  15. data/ext/wapiti/model.c +296 -0
  16. data/ext/wapiti/model.h +100 -0
  17. data/ext/wapiti/native.c +1238 -0
  18. data/ext/wapiti/native.h +15 -0
  19. data/ext/wapiti/options.c +278 -0
  20. data/ext/wapiti/options.h +91 -0
  21. data/ext/wapiti/pattern.c +395 -0
  22. data/ext/wapiti/pattern.h +56 -0
  23. data/ext/wapiti/progress.c +167 -0
  24. data/ext/wapiti/progress.h +43 -0
  25. data/ext/wapiti/quark.c +272 -0
  26. data/ext/wapiti/quark.h +46 -0
  27. data/ext/wapiti/reader.c +553 -0
  28. data/ext/wapiti/reader.h +73 -0
  29. data/ext/wapiti/rprop.c +191 -0
  30. data/ext/wapiti/sequence.h +148 -0
  31. data/ext/wapiti/sgdl1.c +218 -0
  32. data/ext/wapiti/thread.c +171 -0
  33. data/ext/wapiti/thread.h +42 -0
  34. data/ext/wapiti/tools.c +202 -0
  35. data/ext/wapiti/tools.h +54 -0
  36. data/ext/wapiti/trainers.h +39 -0
  37. data/ext/wapiti/vmath.c +372 -0
  38. data/ext/wapiti/vmath.h +51 -0
  39. data/ext/wapiti/wapiti.c +288 -0
  40. data/ext/wapiti/wapiti.h +45 -0
  41. data/lib/wapiti.rb +30 -0
  42. data/lib/wapiti/errors.rb +17 -0
  43. data/lib/wapiti/model.rb +49 -0
  44. data/lib/wapiti/options.rb +113 -0
  45. data/lib/wapiti/utility.rb +15 -0
  46. data/lib/wapiti/version.rb +3 -0
  47. data/spec/fixtures/ch.mod +18550 -0
  48. data/spec/fixtures/chpattern.txt +52 -0
  49. data/spec/fixtures/chtest.txt +1973 -0
  50. data/spec/fixtures/chtrain.txt +19995 -0
  51. data/spec/fixtures/nppattern.txt +52 -0
  52. data/spec/fixtures/nptest.txt +1973 -0
  53. data/spec/fixtures/nptrain.txt +19995 -0
  54. data/spec/fixtures/pattern.txt +14 -0
  55. data/spec/fixtures/test.txt +60000 -0
  56. data/spec/fixtures/train.txt +1200 -0
  57. data/spec/spec_helper.rb +21 -0
  58. data/spec/wapiti/model_spec.rb +173 -0
  59. data/spec/wapiti/native_spec.rb +12 -0
  60. data/spec/wapiti/options_spec.rb +175 -0
  61. data/spec/wapiti/utility_spec.rb +22 -0
  62. data/wapiti.gemspec +35 -0
  63. metadata +178 -0
@@ -0,0 +1,100 @@
1
+ /*
2
+ * Wapiti - A linear-chain CRF tool
3
+ *
4
+ * Copyright (c) 2009-2011 CNRS
5
+ * All rights reserved.
6
+ *
7
+ * Redistribution and use in source and binary forms, with or without
8
+ * modification, are permitted provided that the following conditions are met:
9
+ * * Redistributions of source code must retain the above copyright
10
+ * notice, this list of conditions and the following disclaimer.
11
+ * * Redistributions in binary form must reproduce the above copyright
12
+ * notice, this list of conditions and the following disclaimer in the
13
+ * documentation and/or other materials provided with the distribution.
14
+ *
15
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25
+ * POSSIBILITY OF SUCH DAMAGE.
26
+ */
27
+
28
+ #ifndef model_h
29
+ #define model_h
30
+
31
+ #include <stddef.h>
32
+ #include <sys/times.h>
33
+
34
+ #include "wapiti.h"
35
+ #include "options.h"
36
+ #include "sequence.h"
37
+ #include "reader.h"
38
+
39
+ typedef struct tms tms_t; // shorthand for struct tms (see <sys/times.h> included above)
40
+
41
+ /* mdl_t:
42
+ * Represent a linear-chain CRF model. The model contains both unigram and
43
+ * bigram features. It is characterized by <nlbl> the number of labels, <nobs>
44
+ * the number of observations, and <nftr> the number of features.
45
+ *
46
+ * Each observation has a corresponding entry in <kind> whose first bit is
47
+ * set if the observation is unigram and second one if it is bigram. Note that
48
+ * an observation can be both. A unigram observation produces Y features and a
49
+ * bigram one produces Y * Y features.
50
+ * The <theta> array keeps all feature weights. The <*off> arrays give, for each
51
+ * observation, the offset in the <theta> array where the features of the
52
+ * observation are stored.
53
+ *
54
+ * The <*off> and <theta> arrays are initialized only when the model is
55
+ * synchronized. As you can add new labels and observations after a sync, we
56
+ * keep track of the old counts in <olbl> and <oblk> to detect inconsistency
57
+ * and resynchronize the model if needed. In this case, if the number of
58
+ * labels has not changed, the previously trained weights are kept; otherwise
59
+ * they are meaningless and are discarded.
60
+ */
61
+ typedef struct mdl_s mdl_t;
62
+ struct mdl_s {
63
+ opt_t *opt; // options for training
64
+
65
+ // Sizes of the various model parameters
66
+ size_t nlbl; // Y number of labels
67
+ size_t nobs; // O number of observations
68
+ size_t nftr; // F number of features
69
+
70
+ // Information about observations
71
+ char *kind; // [O] observation type (bit flags: unigram and/or bigram)
72
+ size_t *uoff; // [O] unigram weights offset into theta
73
+ size_t *boff; // [O] bigram weights offset into theta
74
+
75
+ // The model itself
76
+ double *theta; // [F] feature weights
77
+
78
+ // Datasets
79
+ dat_t *train; // training dataset
80
+ dat_t *devel; // development dataset
81
+ rdr_t *reader; // reader handed to mdl_new()
82
+
83
+ // Stopping criterion
84
+ double *werr; // Window of error rates of the last iterations
85
+ int wcnt; // Number of iterations in the window
86
+ int wpos; // Position for the next iteration
87
+
88
+ // Timing
89
+ tms_t timer; // start time of last iteration
90
+ double total; // total training time
91
+ };
92
+
93
+ mdl_t *mdl_new(rdr_t *rdr); // create a model bound to reader <rdr>
94
+ void mdl_free(mdl_t *mdl); // release a model obtained from mdl_new()
95
+ void mdl_sync(mdl_t *mdl); // (re)synchronize the <*off> and <theta> arrays, see mdl_t above
96
+ void mdl_compact(mdl_t *mdl);
97
+ // NOTE(review): FILE is used below but this header does not include
// <stdio.h> itself -- presumably it arrives via one of the project headers
// included above; confirm.
+ void mdl_save(mdl_t *mdl, FILE *file);
98
+ void mdl_load(mdl_t *mdl, FILE *file);
99
+
100
+ #endif
@@ -0,0 +1,1238 @@
1
+ #include <stdio.h>
2
+ #include <string.h>
3
+
4
+ #include "wapiti.h"
5
+ #include "options.h"
6
+ #include "reader.h"
7
+ #include "model.h"
8
+ #include "trainers.h"
9
+ #include "quark.h"
10
+ #include "tools.h"
11
+
12
+ #include "native.h"
13
+
14
+ VALUE mWapiti; // namespace under which Init_options() defines the Options class
15
+ VALUE mNative; // NOTE(review): presumably the Wapiti::Native module; set outside this view
16
+
17
+ VALUE cOptions; // Options class, created in Init_options()
18
+ VALUE cModel; // NOTE(review): presumably the Model class; set outside this view
19
+
20
+ VALUE cNativeError; // exception class raised by the native code in this file
21
+ VALUE cConfigurationError; // NOTE(review): not used in the visible part of the file
22
+ VALUE cLogger; // NOTE(review): not used in the visible part of the file
23
+
24
+
25
+ /* --- Utilities --- */
26
+
27
+ static void trn_auto(mdl_t *mdl) {
28
+ const int maxiter = mdl->opt->maxiter;
29
+ mdl->opt->maxiter = 3;
30
+ trn_sgdl1(mdl);
31
+ mdl->opt->maxiter = maxiter;
32
+ trn_lbfgs(mdl);
33
+ }
34
+
35
+ static const struct {
36
+ char *name;
37
+ void (* train)(mdl_t *mdl);
38
+ } trn_lst[] = {
39
+ {"l-bfgs", trn_lbfgs},
40
+ {"sgd-l1", trn_sgdl1},
41
+ {"bcd", trn_bcd },
42
+ {"rprop", trn_rprop},
43
+ {"rprop+", trn_rprop},
44
+ {"rprop-", trn_rprop},
45
+ {"auto", trn_auto }
46
+ };
47
+ static const int trn_cnt = sizeof(trn_lst) / sizeof(trn_lst[0]);
48
+
49
+
50
+ /* --- Options Class --- */
51
+
52
+ // Auxiliary Methods
53
+
54
+ static opt_t *get_options(VALUE self) {
55
+ opt_t *options;
56
+ Data_Get_Struct(self, opt_t, options);
57
+ return options;
58
+ }
59
+
60
+ // Copies a Ruby string to the heap and stores it in a pointer.
61
+ // Frees the pointer before assigning the new value.
62
+ static void copy_string(char **dst, VALUE rb_string) {
63
+ Check_Type(rb_string, T_STRING);
64
+
65
+ if (*dst) { free(*dst); *dst = (char*)0; }
66
+ *dst = calloc(RSTRING_LEN(rb_string) + 1, sizeof(char));
67
+
68
+ memcpy(*dst, StringValuePtr(rb_string), RSTRING_LEN(rb_string) + 1);
69
+ }
70
+
71
+
72
+ // Constructor / Destructor
73
+
74
+ static void mark_options(opt_t* options __attribute__((__unused__))) {
75
+ // nothing
76
+ }
77
+
78
+ static void deallocate_options(opt_t* options) {
79
+
80
+ // free string options
81
+ if (options->input) { free(options->input); }
82
+ if (options->output) { free(options->output); }
83
+ if (options->algo) { free(options->algo); }
84
+ if (options->devel) { free(options->devel); }
85
+ if (options->pattern) { free(options->pattern); }
86
+
87
+ free(options);
88
+ options = (opt_t*)0;
89
+ }
90
+
91
+ static VALUE allocate_options(VALUE self) {
92
+ opt_t* options = malloc(sizeof(opt_t));
93
+ return Data_Wrap_Struct(self, mark_options, deallocate_options, options);
94
+ }
95
+
96
+ static VALUE initialize_options(int argc, VALUE *argv, VALUE self) {
97
+ opt_t* options = get_options(self);
98
+ *options = opt_defaults;
99
+
100
+ if (options->maxiter == 0) {
101
+ options->maxiter = INT_MAX;
102
+ }
103
+
104
+ // copy the default algorithm name to the heap so that all options strings
105
+ // are on the heap
106
+ char* tmp = calloc(strlen(options->algo), sizeof(char));
107
+ memcpy(tmp, options->algo, strlen(options->algo));
108
+ options->algo = tmp;
109
+
110
+ if (argc > 1) {
111
+ rb_raise(rb_const_get(rb_mKernel, rb_intern("ArgumentError")),
112
+ "wrong number of arguments (%d for 0..1)", argc);
113
+ }
114
+
115
+ // set defaults
116
+ if (argc) {
117
+ Check_Type(argv[0], T_HASH);
118
+ (void)rb_funcall(self, rb_intern("update"), 1, argv[0]);
119
+ }
120
+
121
+ // yield self if block_given?
122
+ if (rb_block_given_p()) {
123
+ rb_yield(self);
124
+ }
125
+
126
+ return self;
127
+ }
128
+
129
+
130
+ // Instance Methods
131
+
132
+
133
+ // Fixnum Accessors
134
+
135
+ static VALUE options_nbest(VALUE self) {
136
+ return INT2FIX(get_options(self)->nbest);
137
+ }
138
+
139
+ static VALUE options_set_nbest(VALUE self, VALUE rb_fixnum) {
140
+ Check_Type(rb_fixnum, T_FIXNUM);
141
+ get_options(self)->nbest = FIX2INT(rb_fixnum);
142
+
143
+ return rb_fixnum;
144
+ }
145
+
146
+
147
+ static VALUE options_stopwin(VALUE self) {
148
+ return INT2FIX(get_options(self)->stopwin);
149
+ }
150
+
151
+ static VALUE options_set_stopwin(VALUE self, VALUE rb_fixnum) {
152
+ Check_Type(rb_fixnum, T_FIXNUM);
153
+ get_options(self)->stopwin = FIX2INT(rb_fixnum);
154
+
155
+ return rb_fixnum;
156
+ }
157
+
158
+ static VALUE options_objwin(VALUE self) {
159
+ return INT2FIX(get_options(self)->objwin);
160
+ }
161
+
162
+ static VALUE options_set_objwin(VALUE self, VALUE rb_fixnum) {
163
+ Check_Type(rb_fixnum, T_FIXNUM);
164
+ get_options(self)->objwin = FIX2INT(rb_fixnum);
165
+
166
+ return rb_fixnum;
167
+ }
168
+
169
+
170
+ static VALUE options_maxiter(VALUE self) {
171
+ return INT2FIX(get_options(self)->maxiter);
172
+ }
173
+
174
+ static VALUE options_set_maxiter(VALUE self, VALUE rb_fixnum) {
175
+ opt_t *options = get_options(self);
176
+
177
+ Check_Type(rb_fixnum, T_FIXNUM);
178
+ options->maxiter = FIX2INT(rb_fixnum);
179
+
180
+ return rb_fixnum;
181
+ }
182
+
183
+ static VALUE options_jobsize(VALUE self) {
184
+ return INT2FIX(get_options(self)->jobsize);
185
+ }
186
+
187
+ static VALUE options_set_jobsize(VALUE self, VALUE rb_fixnum) {
188
+ opt_t *options = get_options(self);
189
+
190
+ Check_Type(rb_fixnum, T_FIXNUM);
191
+ options->jobsize = FIX2INT(rb_fixnum);
192
+
193
+ return rb_fixnum;
194
+ }
195
+
196
+ static VALUE options_nthread(VALUE self) {
197
+ return INT2FIX(get_options(self)->nthread);
198
+ }
199
+
200
+ static VALUE options_set_nthread(VALUE self, VALUE rb_fixnum) {
201
+ opt_t *options = get_options(self);
202
+
203
+ Check_Type(rb_fixnum, T_FIXNUM);
204
+ options->nthread = FIX2INT(rb_fixnum);
205
+
206
+ return rb_fixnum;
207
+ }
208
+
209
+ static VALUE options_histsz(VALUE self) {
210
+ return INT2FIX(get_options(self)->lbfgs.histsz);
211
+ }
212
+
213
+ static VALUE options_set_histsz(VALUE self, VALUE rb_fixnum) {
214
+ Check_Type(rb_fixnum, T_FIXNUM);
215
+ get_options(self)->lbfgs.histsz = FIX2INT(rb_fixnum);
216
+
217
+ return rb_fixnum;
218
+ }
219
+
220
+ static VALUE options_maxls(VALUE self) {
221
+ return INT2FIX(get_options(self)->lbfgs.maxls);
222
+ }
223
+
224
+ static VALUE options_set_maxls(VALUE self, VALUE rb_fixnum) {
225
+ Check_Type(rb_fixnum, T_FIXNUM);
226
+ get_options(self)->lbfgs.maxls = FIX2INT(rb_fixnum);
227
+
228
+ return rb_fixnum;
229
+ }
230
+
231
+
232
+ // Float Accessors
233
+
234
+ static VALUE options_rho1(VALUE self) {
235
+ return rb_float_new(get_options(self)->rho1);
236
+ }
237
+
238
+ static VALUE options_set_rho1(VALUE self, VALUE rb_numeric) {
239
+ get_options(self)->rho1 = NUM2DBL(rb_numeric);
240
+ return rb_numeric;
241
+ }
242
+
243
+ static VALUE options_rho2(VALUE self) {
244
+ return rb_float_new(get_options(self)->rho2);
245
+ }
246
+
247
+ static VALUE options_set_rho2(VALUE self, VALUE rb_numeric) {
248
+ get_options(self)->rho2 = NUM2DBL(rb_numeric);
249
+ return rb_numeric;
250
+ }
251
+
252
+ static VALUE options_stopeps(VALUE self) {
253
+ return rb_float_new(get_options(self)->stopeps);
254
+ }
255
+
256
+ static VALUE options_set_stopeps(VALUE self, VALUE rb_numeric) {
257
+ get_options(self)->stopeps = NUM2DBL(rb_numeric);
258
+ return rb_numeric;
259
+ }
260
+
261
+ static VALUE options_eta0(VALUE self) {
262
+ return rb_float_new(get_options(self)->sgdl1.eta0);
263
+ }
264
+
265
+ static VALUE options_set_eta0(VALUE self, VALUE rb_numeric) {
266
+ get_options(self)->sgdl1.eta0 = NUM2DBL(rb_numeric);
267
+ return rb_numeric;
268
+ }
269
+
270
+ static VALUE options_alpha(VALUE self) {
271
+ return rb_float_new(get_options(self)->sgdl1.alpha);
272
+ }
273
+
274
+ static VALUE options_set_alpha(VALUE self, VALUE rb_numeric) {
275
+ get_options(self)->sgdl1.alpha = NUM2DBL(rb_numeric);
276
+ return rb_numeric;
277
+ }
278
+
279
+ static VALUE options_kappa(VALUE self) {
280
+ return rb_float_new(get_options(self)->bcd.kappa);
281
+ }
282
+
283
+ static VALUE options_set_kappa(VALUE self, VALUE rb_numeric) {
284
+ get_options(self)->bcd.kappa = NUM2DBL(rb_numeric);
285
+ return rb_numeric;
286
+ }
287
+
288
+ static VALUE options_stpmin(VALUE self) {
289
+ return rb_float_new(get_options(self)->rprop.stpmin);
290
+ }
291
+
292
+ static VALUE options_set_stpmin(VALUE self, VALUE rb_numeric) {
293
+ get_options(self)->rprop.stpmin = NUM2DBL(rb_numeric);
294
+ return rb_numeric;
295
+ }
296
+
297
+ static VALUE options_stpmax(VALUE self) {
298
+ return rb_float_new(get_options(self)->rprop.stpmax);
299
+ }
300
+
301
+ static VALUE options_set_stpmax(VALUE self, VALUE rb_numeric) {
302
+ get_options(self)->rprop.stpmax = NUM2DBL(rb_numeric);
303
+ return rb_numeric;
304
+ }
305
+
306
+ static VALUE options_stpinc(VALUE self) {
307
+ return rb_float_new(get_options(self)->rprop.stpinc);
308
+ }
309
+
310
+ static VALUE options_set_stpinc(VALUE self, VALUE rb_numeric) {
311
+ get_options(self)->rprop.stpinc = NUM2DBL(rb_numeric);
312
+ return rb_numeric;
313
+ }
314
+
315
+ static VALUE options_stpdec(VALUE self) {
316
+ return rb_float_new(get_options(self)->rprop.stpdec);
317
+ }
318
+
319
+ static VALUE options_set_stpdec(VALUE self, VALUE rb_numeric) {
320
+ get_options(self)->rprop.stpdec = NUM2DBL(rb_numeric);
321
+ return rb_numeric;
322
+ }
323
+
324
+
325
+
326
+ // Boolean Accessors
327
+
328
+ static VALUE options_maxent(VALUE self) {
329
+ return get_options(self)->maxent ? Qtrue : Qfalse;
330
+ }
331
+
332
+ static VALUE options_set_maxent(VALUE self, VALUE rb_boolean) {
333
+ get_options(self)->maxent = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
334
+ return rb_boolean;
335
+ }
336
+
337
+ static VALUE options_compact(VALUE self) {
338
+ return get_options(self)->compact ? Qtrue : Qfalse;
339
+ }
340
+
341
+ static VALUE options_set_compact(VALUE self, VALUE rb_boolean) {
342
+ get_options(self)->compact = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
343
+ return rb_boolean;
344
+ }
345
+
346
+ static VALUE options_sparse(VALUE self) {
347
+ return get_options(self)->sparse ? Qtrue : Qfalse;
348
+ }
349
+
350
+ static VALUE options_set_sparse(VALUE self, VALUE rb_boolean) {
351
+ get_options(self)->sparse = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
352
+ return rb_boolean;
353
+ }
354
+
355
+ static VALUE options_check(VALUE self) {
356
+ return get_options(self)->check ? Qtrue : Qfalse;
357
+ }
358
+
359
+ static VALUE options_set_check(VALUE self, VALUE rb_boolean) {
360
+ get_options(self)->check = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
361
+ return rb_boolean;
362
+ }
363
+
364
+ static VALUE options_label(VALUE self) {
365
+ return get_options(self)->label ? Qtrue : Qfalse;
366
+ }
367
+
368
+ static VALUE options_set_label(VALUE self, VALUE rb_boolean) {
369
+ get_options(self)->label = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
370
+ return rb_boolean;
371
+ }
372
+
373
+ static VALUE options_outsc(VALUE self) {
374
+ return get_options(self)->outsc ? Qtrue : Qfalse;
375
+ }
376
+
377
+ static VALUE options_set_outsc(VALUE self, VALUE rb_boolean) {
378
+ get_options(self)->outsc = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
379
+ return rb_boolean;
380
+ }
381
+
382
+ static VALUE options_lblpost(VALUE self) {
383
+ return get_options(self)->lblpost ? Qtrue : Qfalse;
384
+ }
385
+
386
+ static VALUE options_set_lblpost(VALUE self, VALUE rb_boolean) {
387
+ get_options(self)->lblpost = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
388
+ return rb_boolean;
389
+ }
390
+
391
+ static VALUE options_clip(VALUE self) {
392
+ return get_options(self)->lbfgs.clip ? Qtrue : Qfalse;
393
+ }
394
+
395
+ static VALUE options_set_clip(VALUE self, VALUE rb_boolean) {
396
+ get_options(self)->lbfgs.clip = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
397
+ return rb_boolean;
398
+ }
399
+
400
+ static VALUE options_cutoff(VALUE self) {
401
+ return get_options(self)->rprop.cutoff ? Qtrue : Qfalse;
402
+ }
403
+
404
+ static VALUE options_set_cutoff(VALUE self, VALUE rb_boolean) {
405
+ get_options(self)->rprop.cutoff = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
406
+ return rb_boolean;
407
+ }
408
+
409
+
410
+
411
+
412
+ // String Accessors
413
+
414
+ static VALUE options_pattern(VALUE self) {
415
+ char *pattern = get_options(self)->pattern;
416
+ return rb_str_new2(pattern ? pattern : "");
417
+ }
418
+
419
+ static VALUE options_set_pattern(VALUE self, VALUE rb_string) {
420
+ opt_t *options = get_options(self);
421
+ copy_string(&(options->pattern), rb_string);
422
+
423
+ return rb_string;
424
+ }
425
+
426
+ static VALUE options_model(VALUE self) {
427
+ char *model = get_options(self)->model;
428
+ return rb_str_new2(model ? model : "");
429
+ }
430
+
431
+ static VALUE options_set_model(VALUE self, VALUE rb_string) {
432
+ opt_t *options = get_options(self);
433
+ copy_string(&(options->model), rb_string);
434
+
435
+ return rb_string;
436
+ }
437
+
438
+ static VALUE options_algorithm(VALUE self) {
439
+ char *algorithm = get_options(self)->algo;
440
+ return rb_str_new2(algorithm ? algorithm : "");
441
+ }
442
+
443
+ static VALUE options_set_algorithm(VALUE self, VALUE rb_string) {
444
+ opt_t *options = get_options(self);
445
+ copy_string(&(options->algo), rb_string);
446
+
447
+ return rb_string;
448
+ }
449
+
450
+ static VALUE options_development_data(VALUE self) {
451
+ char *development_data = get_options(self)->devel;
452
+ return rb_str_new2(development_data ? development_data : "");
453
+ }
454
+
455
+ static VALUE options_set_development_data(VALUE self, VALUE rb_string) {
456
+ opt_t *options = get_options(self);
457
+ copy_string(&(options->devel), rb_string);
458
+
459
+ return rb_string;
460
+ }
461
+
462
+
463
+ void Init_options() {
464
+ cOptions = rb_define_class_under(mWapiti, "Options", rb_cObject);
465
+ rb_define_alloc_func(cOptions, allocate_options);
466
+
467
+ rb_define_method(cOptions, "initialize", initialize_options, -1);
468
+
469
+ // Option Accessors
470
+
471
+ rb_define_method(cOptions, "stopwin", options_stopwin, 0);
472
+ rb_define_method(cOptions, "stopwin=", options_set_stopwin, 1);
473
+
474
+ rb_define_alias(cOptions, "stop_window", "stopwin");
475
+ rb_define_alias(cOptions, "stop_window=", "stopwin=");
476
+
477
+ rb_define_method(cOptions, "objwin", options_objwin, 0);
478
+ rb_define_method(cOptions, "objwin=", options_set_objwin, 1);
479
+
480
+ rb_define_alias(cOptions, "convergence_window", "objwin");
481
+ rb_define_alias(cOptions, "convergence_window=", "objwin=");
482
+
483
+ rb_define_method(cOptions, "maxiter", options_maxiter, 0);
484
+ rb_define_method(cOptions, "maxiter=", options_set_maxiter, 1);
485
+
486
+ rb_define_alias(cOptions, "max_iterations", "maxiter");
487
+ rb_define_alias(cOptions, "max_iterations=", "maxiter=");
488
+
489
+ rb_define_method(cOptions, "jobsize", options_jobsize, 0);
490
+ rb_define_method(cOptions, "jobsize=", options_set_jobsize, 1);
491
+
492
+ rb_define_method(cOptions, "nthread", options_nthread, 0);
493
+ rb_define_method(cOptions, "nthread=", options_set_nthread, 1);
494
+
495
+ rb_define_alias(cOptions, "threads", "nthread");
496
+ rb_define_alias(cOptions, "threads=", "nthread=");
497
+
498
+ rb_define_method(cOptions, "rho1", options_rho1, 0);
499
+ rb_define_method(cOptions, "rho1=", options_set_rho1, 1);
500
+
501
+ rb_define_method(cOptions, "rho2", options_rho2, 0);
502
+ rb_define_method(cOptions, "rho2=", options_set_rho2, 1);
503
+
504
+ rb_define_method(cOptions, "stopeps", options_stopeps, 0);
505
+ rb_define_method(cOptions, "stopeps=", options_set_stopeps, 1);
506
+
507
+ rb_define_alias(cOptions, "stop_epsilon", "stopeps");
508
+ rb_define_alias(cOptions, "stop_epsilon=", "stopeps=");
509
+
510
+ rb_define_method(cOptions, "maxent", options_maxent, 0);
511
+ rb_define_method(cOptions, "maxent=", options_set_maxent, 1);
512
+
513
+ rb_define_alias(cOptions, "maxent?", "maxent");
514
+
515
+ rb_define_method(cOptions, "compact", options_compact, 0);
516
+ rb_define_method(cOptions, "compact=", options_set_compact, 1);
517
+
518
+ rb_define_alias(cOptions, "compact?", "compact");
519
+
520
+ rb_define_method(cOptions, "sparse", options_sparse, 0);
521
+ rb_define_method(cOptions, "sparse=", options_set_sparse, 1);
522
+
523
+ rb_define_alias(cOptions, "sparse?", "sparse");
524
+
525
+ rb_define_method(cOptions, "label", options_label, 0);
526
+ rb_define_method(cOptions, "label=", options_set_label, 1);
527
+
528
+ rb_define_alias(cOptions, "label?", "label");
529
+
530
+ rb_define_method(cOptions, "check", options_check, 0);
531
+ rb_define_method(cOptions, "check=", options_set_check, 1);
532
+
533
+ rb_define_alias(cOptions, "check?", "check");
534
+
535
+ rb_define_method(cOptions, "lblpost", options_lblpost, 0);
536
+ rb_define_method(cOptions, "lblpost=", options_set_lblpost, 1);
537
+
538
+ rb_define_alias(cOptions, "lblpost?", "lblpost");
539
+
540
+ rb_define_alias(cOptions, "posterior", "lblpost");
541
+ rb_define_alias(cOptions, "posterior?", "lblpost");
542
+ rb_define_alias(cOptions, "posterior=", "lblpost=");
543
+
544
+ rb_define_method(cOptions, "outsc", options_outsc, 0);
545
+ rb_define_method(cOptions, "outsc=", options_set_outsc, 1);
546
+
547
+ rb_define_alias(cOptions, "outsc?", "outsc");
548
+
549
+ rb_define_alias(cOptions, "score", "outsc");
550
+ rb_define_alias(cOptions, "score?", "outsc");
551
+ rb_define_alias(cOptions, "score=", "outsc=");
552
+
553
+ rb_define_method(cOptions, "pattern", options_pattern, 0);
554
+ rb_define_method(cOptions, "pattern=", options_set_pattern, 1);
555
+
556
+ rb_define_alias(cOptions, "template", "pattern");
557
+ rb_define_alias(cOptions, "template=", "pattern=");
558
+
559
+ rb_define_method(cOptions, "model", options_model, 0);
560
+ rb_define_method(cOptions, "model=", options_set_model, 1);
561
+
562
+ rb_define_method(cOptions, "algorithm", options_algorithm, 0);
563
+ rb_define_method(cOptions, "algorithm=", options_set_algorithm, 1);
564
+
565
+ rb_define_alias(cOptions, "algo", "algorithm");
566
+ rb_define_alias(cOptions, "algo=", "algorithm=");
567
+
568
+ rb_define_method(cOptions, "development_data", options_development_data, 0);
569
+ rb_define_method(cOptions, "development_data=", options_set_development_data, 1);
570
+
571
+ rb_define_alias(cOptions, "devel", "development_data");
572
+ rb_define_alias(cOptions, "devel=", "development_data=");
573
+
574
+ rb_define_method(cOptions, "clip", options_clip, 0);
575
+ rb_define_method(cOptions, "clip=", options_set_clip, 1);
576
+
577
+ rb_define_method(cOptions, "histsz", options_histsz, 0);
578
+ rb_define_method(cOptions, "histsz=", options_set_histsz, 1);
579
+
580
+ rb_define_method(cOptions, "maxls", options_maxls, 0);
581
+ rb_define_method(cOptions, "maxls=", options_set_maxls, 1);
582
+
583
+ rb_define_method(cOptions, "eta0", options_eta0, 0);
584
+ rb_define_method(cOptions, "eta0=", options_set_eta0, 1);
585
+
586
+ rb_define_method(cOptions, "alpha", options_alpha, 0);
587
+ rb_define_method(cOptions, "alpha=", options_set_alpha, 1);
588
+
589
+ rb_define_method(cOptions, "kappa", options_kappa, 0);
590
+ rb_define_method(cOptions, "kappa=", options_set_kappa, 1);
591
+
592
+ rb_define_method(cOptions, "stpmin", options_stpmin, 0);
593
+ rb_define_method(cOptions, "stpmin=", options_set_stpmin, 1);
594
+
595
+ rb_define_method(cOptions, "stpmax", options_stpmax, 0);
596
+ rb_define_method(cOptions, "stpmax=", options_set_stpmax, 1);
597
+
598
+ rb_define_method(cOptions, "stpinc", options_stpinc, 0);
599
+ rb_define_method(cOptions, "stpinc=", options_set_stpinc, 1);
600
+
601
+ rb_define_method(cOptions, "stpdec", options_stpdec, 0);
602
+ rb_define_method(cOptions, "stpdec=", options_set_stpdec, 1);
603
+
604
+ rb_define_method(cOptions, "cutoff", options_cutoff, 0);
605
+ rb_define_method(cOptions, "cutoff=", options_set_cutoff, 1);
606
+
607
+ rb_define_method(cOptions, "nbest", options_nbest, 0);
608
+ rb_define_method(cOptions, "nbest=", options_set_nbest, 1);
609
+
610
+ }
611
+
612
+
613
+ /* --- Model Class --- */
614
+
615
+ // Auxiliary Methods
616
+
617
+ static mdl_t *get_model(VALUE self) {
618
+ mdl_t *model;
619
+ Data_Get_Struct(self, mdl_t, model);
620
+ return model;
621
+ }
622
+
623
+ // Constructor / Destructor
624
+
625
+ static void mark_model(mdl_t *model __attribute__((__unused__))) {
626
+ // nothing
627
+ }
628
+
629
+ static void deallocate_model(mdl_t *model) {
630
+ if (model) {
631
+ mdl_free(model);
632
+ model = (mdl_t*)0;
633
+ }
634
+ }
635
+
636
+ static VALUE allocate_model(VALUE self) {
637
+ mdl_t *model = mdl_new(rdr_new(false));
638
+ return Data_Wrap_Struct(self, mark_model, deallocate_model, model);
639
+ }
640
+
641
+ static VALUE model_set_options(VALUE self, VALUE rb_options) {
642
+ if (strncmp("Wapiti::Options", rb_obj_classname(rb_options), 15) != 0) {
643
+ rb_raise(cNativeError, "argument must be a Wapiti::Options instance");
644
+ }
645
+
646
+ mdl_t *model = get_model(self);
647
+
648
+ // Store reference to options in model struct
649
+ model->opt = get_options(rb_options);
650
+
651
+ // Update reader
652
+ model->reader->maxent = model->opt->maxent;
653
+
654
+ // Save instance variable
655
+ rb_ivar_set(self, rb_intern("@options"), rb_options);
656
+
657
+ return rb_options;
658
+ }
659
+
660
+ static VALUE initialize_model(int argc, VALUE *argv, VALUE self) {
661
+ VALUE options;
662
+
663
+ if (argc > 1) {
664
+ rb_raise(rb_const_get(rb_mKernel, rb_intern("ArgumentError")),
665
+ "wrong number of arguments (%d for 0..1)", argc);
666
+ }
667
+
668
+ if (argc) {
669
+ if (TYPE(argv[0]) == T_HASH) {
670
+ options = rb_funcall(cOptions, rb_intern("new"), 1, argv[0]);
671
+ }
672
+ else {
673
+ if (strncmp("Wapiti::Options", rb_obj_classname(argv[0]), 15) != 0) {
674
+ rb_raise(cNativeError, "argument must be a hash or an options instance");
675
+ }
676
+ options = argv[0];
677
+ }
678
+ }
679
+ else {
680
+ options = rb_funcall(cOptions, rb_intern("new"), 0);
681
+ }
682
+
683
+ // yield self if block_given?
684
+ if (rb_block_given_p()) {
685
+ rb_yield(options);
686
+ }
687
+
688
+ model_set_options(self, options);
689
+
690
+ // Load a previous model if specified by options
691
+ if (get_options(options)->model) {
692
+ rb_funcall(self, rb_intern("load"), 0);
693
+ }
694
+
695
+ return self;
696
+ }
697
+
698
+
699
+ // Native accessors
700
+
701
+ static VALUE model_nlbl(VALUE self) {
702
+ return INT2FIX(get_model(self)->nlbl);
703
+ }
704
+
705
+ static VALUE model_nobs(VALUE self) {
706
+ return INT2FIX(get_model(self)->nobs);
707
+ }
708
+
709
+ static VALUE model_nftr(VALUE self) {
710
+ return INT2FIX(get_model(self)->nftr);
711
+ }
712
+
713
+ static VALUE model_total(VALUE self) {
714
+ return rb_float_new(get_model(self)->total);
715
+ }
716
+
717
+
718
+ // Instance methods
719
+
720
+ static VALUE model_sync(VALUE self) {
721
+ mdl_sync(get_model(self));
722
+ return self;
723
+ }
724
+
725
+ static VALUE model_compact(VALUE self) {
726
+ mdl_compact(get_model(self));
727
+ return self;
728
+ }
729
+
730
+ // call-seq:
731
+ // m.save # => saves the model to the file defined in m.path
732
+ // m.save(path) # => sets m.path and saves the model to the file <path>
733
+ //
734
+ // Saves the model to a file. Uses the Model's path if no argument given,
735
+ // otherwise uses the passed-in argument as the Model's path.
736
+ static VALUE model_save(int argc, VALUE *argv, VALUE self) {
737
+ if (argc > 1) {
738
+ rb_raise(rb_const_get(rb_mKernel, rb_intern("ArgumentError")),
739
+ "wrong number of arguments (%d for 0..1)", argc);
740
+ }
741
+
742
+ mdl_t *model = get_model(self);
743
+
744
+ // save passed-in path in options
745
+ if (argc) {
746
+ Check_Type(argv[0], T_STRING);
747
+ rb_ivar_set(self, rb_intern("@path"), argv[0]);
748
+ }
749
+
750
+ // open the output file
751
+ FILE *file = 0;
752
+ VALUE path = rb_ivar_get(self, rb_intern("@path"));
753
+
754
+ if (NIL_P(path)) {
755
+ rb_raise(cNativeError, "failed to save model: no path given");
756
+ }
757
+
758
+ if (!(file = fopen(StringValueCStr(path), "w"))) {
759
+ rb_raise(cNativeError, "failed to save model: failed to open model file");
760
+ }
761
+
762
+ mdl_save(model, file);
763
+ fclose(file);
764
+
765
+ return self;
766
+ }
767
+
768
+ static VALUE model_load(int argc, VALUE *argv, VALUE self) {
769
+ if (argc > 1) {
770
+ rb_raise(rb_const_get(rb_mKernel, rb_intern("ArgumentError")),
771
+ "wrong number of arguments (%d for 0..1)", argc);
772
+ }
773
+
774
+ mdl_t *model = get_model(self);
775
+
776
+ // save passed-in argument in options
777
+ if (argc) {
778
+ Check_Type(argv[0], T_STRING);
779
+ rb_ivar_set(self, rb_intern("@path"), argv[0]);
780
+ }
781
+
782
+ // open the model file
783
+ FILE *file = 0;
784
+ VALUE path = rb_ivar_get(self, rb_intern("@path"));
785
+
786
+ if (NIL_P(path)) {
787
+ rb_raise(cNativeError, "failed to load model: no path given");
788
+ }
789
+
790
+ if (!(file = fopen(StringValueCStr(path), "r"))) {
791
+ rb_raise(cNativeError, "failed to load model: failed to open model file");
792
+ }
793
+
794
+ mdl_load(model, file);
795
+ fclose(file);
796
+
797
+ return self;
798
+ }
799
+
800
+ static dat_t *to_dat(rdr_t *reader, VALUE data, bool labelled) {
801
+ Check_Type(data, T_ARRAY);
802
+
803
+ const unsigned int n = RARRAY_LEN(data);
804
+ unsigned int i, j, k;
805
+
806
+ dat_t *dat = xmalloc(sizeof(dat_t));
807
+ dat->nseq = 0;
808
+ dat->mlen = 0;
809
+ dat->lbl = labelled;
810
+ dat->seq = xmalloc(sizeof(seq_t*) * n);
811
+
812
+ for (i = 0; i < n; ++i) {
813
+ VALUE sequence = rb_ary_entry(data, i);
814
+ Check_Type(sequence, T_ARRAY);
815
+
816
+ k = RARRAY_LEN(sequence);
817
+ raw_t *raw = xmalloc(sizeof(raw_t) + sizeof(char*) * k);
818
+
819
+ for (j = 0; j < k; ++j) {
820
+ VALUE line = rb_ary_entry(sequence, j);
821
+ Check_Type(line, T_STRING);
822
+ raw->lines[j] = StringValueCStr(line);
823
+ }
824
+
825
+ raw->len = k;
826
+
827
+ seq_t *seq = rdr_raw2seq(reader, raw, labelled);
828
+ xfree(raw);
829
+
830
+ if (seq == 0) { break; }
831
+
832
+ // and store the sequence
833
+ dat->seq[dat->nseq++] = seq;
834
+ dat->mlen = max(dat->mlen, seq->len);
835
+
836
+ }
837
+
838
+ // if no sequence was read, free memory
839
+ if (dat->nseq == 0) {
840
+ xfree(dat->seq);
841
+ xfree(dat);
842
+
843
+ return 0;
844
+ }
845
+
846
+ return dat;
847
+ }
848
+
849
+
850
+ static VALUE model_train(VALUE self, VALUE data) {
851
+
852
+ mdl_t* model = get_model(self);
853
+
854
+ int trn;
855
+ for (trn = 0; trn < trn_cnt; trn++) {
856
+ if (!strcmp(model->opt->algo, trn_lst[trn].name)) break;
857
+ }
858
+
859
+ if (trn == trn_cnt) {
860
+ rb_raise(cNativeError, "failed to train model: unknown algorithm '%s'", model->opt->algo);
861
+ }
862
+
863
+ FILE *file;
864
+
865
+ // Load the pattern file. This will unlock the database if previously
866
+ // locked by loading a model.
867
+ if (model->opt->pattern) {
868
+ file = fopen(model->opt->pattern, "r");
869
+
870
+ if (!file) {
871
+ rb_raise(cNativeError, "failed to train model: failed to load pattern file '%s'", model->opt->pattern);
872
+ }
873
+
874
+ rdr_loadpat(model->reader, file);
875
+ fclose(file);
876
+ qrk_lock(model->reader->obs, false);
877
+ }
878
+ else {
879
+ rb_raise(cNativeError, "failed to train model: no pattern given");
880
+ }
881
+
882
+
883
+ // Load the training data. When this is done we lock the quarks as we
884
+ // don't want to put in the model, informations present only in the
885
+ // devlopment set.
886
+
887
+ switch (TYPE(data)) {
888
+ case T_STRING:
889
+ if (!(file = fopen(StringValuePtr(data), "r"))) {
890
+ rb_raise(cNativeError, "failed to train model: failed to open training data '%s", StringValuePtr(data));
891
+ }
892
+
893
+ model->train = rdr_readdat(model->reader, file, true);
894
+ fclose(file);
895
+
896
+ break;
897
+ case T_ARRAY:
898
+ model->train = to_dat(model->reader, data, true);
899
+
900
+ break;
901
+ default:
902
+ rb_raise(cNativeError, "failed to train model: invalid training data type (expected instance of String or Array)");
903
+ }
904
+
905
+ qrk_lock(model->reader->lbl, true);
906
+ qrk_lock(model->reader->obs, true);
907
+
908
+ if (!model->train || model->train->nseq == 0) {
909
+ rb_raise(cNativeError, "failed to train model: no training data loaded");
910
+ }
911
+
912
+ // If present, load the development set in the model. If not specified,
913
+ // the training dataset will be used instead.
914
+ if (model->opt->devel) {
915
+ if (!(file = fopen(model->opt->devel, "r"))) {
916
+ rb_raise(cNativeError, "failed to train model: cannot open development file '%s'", model->opt->devel);
917
+ }
918
+
919
+ model->devel = rdr_readdat(model->reader, file, true);
920
+ fclose(file);
921
+ }
922
+
923
+ // Initialize the model. If a previous model was loaded, this will be
924
+ // just a resync, else the model structure will be created.
925
+ rb_funcall(self, rb_intern("sync"), 0);
926
+
927
+ // Train the model.
928
+ uit_setup(model);
929
+ trn_lst[trn].train(model);
930
+ uit_cleanup(model);
931
+
932
+ // If requested compact the model.
933
+ if (model->opt->compact) {
934
+ const size_t O = model->nobs;
935
+ const size_t F = model->nftr;
936
+ rb_funcall(self, rb_intern("compact"), 0);
937
+ }
938
+
939
+ return self;
940
+ }
941
+
942
+ // Returns a sorted list of all labels in the Model's label database.
943
+ static VALUE model_labels(VALUE self) {
944
+ mdl_t *model = get_model(self);
945
+ const size_t Y = model->nlbl;
946
+
947
+ qrk_t *lp = model->reader->lbl;
948
+
949
+ VALUE labels = rb_ary_new2(Y);
950
+
951
+ for (unsigned int i = 0; i < Y; ++i) {
952
+ rb_ary_push(labels, rb_str_new2(qrk_id2str(lp, i)));
953
+ }
954
+
955
+ rb_funcall(labels, rb_intern("sort!"), 0);
956
+
957
+ return labels;
958
+ }
959
+
960
+ static VALUE decode_sequence(mdl_t *model, raw_t *raw) {
961
+ qrk_t *lbls = model->reader->lbl;
962
+
963
+ const size_t Y = model->nlbl;
964
+ const size_t N = model->opt->nbest;
965
+
966
+ seq_t *seq = rdr_raw2seq(model->reader, raw, model->opt->check);
967
+
968
+ const int T = seq->len;
969
+
970
+ size_t *out = xmalloc(sizeof(size_t) * T * N);
971
+ double *psc = xmalloc(sizeof(double) * T * N);
972
+ double *scs = xmalloc(sizeof(double) * N);
973
+
974
+ VALUE result = rb_ary_new2(N), sequence, tokens;
975
+
976
+ if (N == 1) {
977
+ tag_viterbi(model, seq, (size_t*)out, scs, (double*)psc);
978
+ }
979
+ else {
980
+ tag_nbviterbi(model, seq, N, (void*)out, scs, (void*)psc);
981
+ }
982
+
983
+ // Next we output the raw sequence with an aditional column for
984
+ // the predicted labels
985
+ for (size_t n = 0; n < N; n++) {
986
+
987
+ sequence = rb_ary_new();
988
+
989
+ // if (model->opt->outsc)
990
+ // fprintf(fout, "# %d %f\n", (int)n, scs[n]);
991
+
992
+ for (int t = 0; t < T; t++) {
993
+ tokens = rb_ary_new();
994
+
995
+ if (!model->opt->label) {
996
+ rb_ary_push(tokens, rb_str_new2(raw->lines[t]));
997
+ }
998
+
999
+ size_t lbl = out[t * N + n];
1000
+ rb_ary_push(tokens, rb_str_new2(qrk_id2str(lbls, lbl)));
1001
+
1002
+ // if (model->opt->outsc) {
1003
+ // fprintf(fout, "\t%s", lblstr);
1004
+ // fprintf(fout, "/%f", psc[t * N + n]);
1005
+ // }
1006
+
1007
+ // yield token/label pair to block if given
1008
+ if (rb_block_given_p()) {
1009
+ tokens = rb_yield(tokens);
1010
+ }
1011
+
1012
+ rb_ary_push(sequence, tokens);
1013
+ }
1014
+
1015
+ rb_ary_push(result, sequence);
1016
+ }
1017
+
1018
+ // Cleanup memory used for this sequence
1019
+ xfree(scs);
1020
+ xfree(psc);
1021
+ xfree(out);
1022
+
1023
+ rdr_freeseq(seq);
1024
+
1025
+ return result;
1026
+ }
1027
+
1028
+ static VALUE decode_sequence_array(VALUE self, VALUE array) {
1029
+ Check_Type(array, T_ARRAY);
1030
+ const unsigned int n = RARRAY_LEN(array);
1031
+
1032
+ mdl_t *model = get_model(self);
1033
+ raw_t *raw;
1034
+
1035
+ const unsigned int N = model->opt->nbest;
1036
+ unsigned int i, j;
1037
+
1038
+ VALUE result = rb_ary_new2(n * N), sequence;
1039
+
1040
+ for (i = 0; i < n; ++i) {
1041
+ sequence = rb_ary_entry(array, i);
1042
+ Check_Type(sequence, T_ARRAY);
1043
+
1044
+ const unsigned int k = RARRAY_LEN(sequence);
1045
+ raw = xmalloc(sizeof(raw_t) + sizeof(char*) * k);
1046
+ raw->len = k;
1047
+
1048
+ for (j = 0; j < k; ++j) {
1049
+ VALUE line = rb_ary_entry(sequence, j);
1050
+ Check_Type(line, T_STRING);
1051
+
1052
+ raw->lines[j] = StringValueCStr(line);
1053
+ }
1054
+
1055
+ rb_funcall(result, rb_intern("concat"), 1, decode_sequence(model, raw));
1056
+
1057
+ xfree(raw);
1058
+ }
1059
+
1060
+ return result;
1061
+ }
1062
+
1063
+ static VALUE decode_sequence_file(VALUE self, VALUE path) {
1064
+ Check_Type(path, T_STRING);
1065
+ FILE *file;
1066
+
1067
+ if (!(file = fopen(StringValueCStr(path), "r"))) {
1068
+ rb_raise(cNativeError, "failed to label data: could not open file '%s'", StringValueCStr(path));
1069
+ }
1070
+
1071
+ mdl_t *model = get_model(self);
1072
+ raw_t *raw;
1073
+
1074
+ VALUE result = rb_ary_new();
1075
+
1076
+ // Next read the input file sequence by sequence and label them, we have
1077
+ // to take care of not discarding the raw input as we want to send it
1078
+ // back to the output with the additional predicted labels.
1079
+ while (!feof(file)) {
1080
+
1081
+ // So, first read an input sequence keeping the raw_t object
1082
+ // available, and label it with Viterbi.
1083
+ if ((raw = rdr_readraw(model->reader, file)) == 0) {
1084
+ break;
1085
+ }
1086
+
1087
+ rb_funcall(result, rb_intern("concat"), 1, decode_sequence(model, raw));
1088
+ rdr_freeraw(raw);
1089
+ }
1090
+
1091
+ return result;
1092
+ }
1093
+
1094
+ // call-seq:
1095
+ // m.label(tokens) # => array of labelled tokens
1096
+ // m.label(filename) # => array of labelled tokens
1097
+ //
1098
+ static VALUE model_label(VALUE self, VALUE data) {
1099
+ VALUE result;
1100
+
1101
+ switch (TYPE(data)) {
1102
+ case T_STRING:
1103
+ result = decode_sequence_file(self, data);
1104
+ break;
1105
+ case T_ARRAY:
1106
+ result = decode_sequence_array(self, data);
1107
+ break;
1108
+ default:
1109
+ rb_raise(cNativeError, "failed to label data: invalid data (expected type String or Array)");
1110
+ }
1111
+
1112
+ return result;
1113
+ }
1114
+
1115
+ static void Init_model() {
1116
+ cModel = rb_define_class_under(mWapiti, "Model", rb_cObject);
1117
+ rb_define_alloc_func(cModel, allocate_model);
1118
+
1119
+ rb_define_method(cModel, "initialize", initialize_model, -1);
1120
+
1121
+ rb_define_attr(cModel, "options", 1, 0);
1122
+
1123
+ rb_define_method(cModel, "nlbl", model_nlbl, 0);
1124
+ rb_define_method(cModel, "labels", model_labels, 0);
1125
+
1126
+ rb_define_method(cModel, "nobs", model_nobs, 0);
1127
+ rb_define_alias(cModel, "observations", "nobs");
1128
+
1129
+ rb_define_method(cModel, "nftr", model_nftr, 0);
1130
+ rb_define_alias(cModel, "features", "nftr");
1131
+
1132
+ rb_define_method(cModel, "total", model_total, 0);
1133
+
1134
+ rb_define_method(cModel, "sync", model_sync, 0);
1135
+ rb_define_method(cModel, "compact", model_compact, 0);
1136
+ rb_define_method(cModel, "save", model_save, -1);
1137
+ rb_define_method(cModel, "load", model_load, -1);
1138
+
1139
+ rb_define_method(cModel, "train", model_train, 1);
1140
+ rb_define_method(cModel, "label", model_label, 1);
1141
+ }
1142
+
1143
+ /* --- Top-Level Utility Methods --- */
1144
+
1145
+
1146
+ static VALUE label(VALUE self __attribute__((__unused__)), VALUE rb_options) {
1147
+ if (strncmp("Wapiti::Options", rb_obj_classname(rb_options), 15) != 0) {
1148
+ rb_raise(cNativeError, "argument must be a native options instance");
1149
+ }
1150
+
1151
+ opt_t *options = get_options(rb_options);
1152
+
1153
+ if (options->mode != 1) {
1154
+ rb_raise(cNativeError, "invalid options argument: mode should be set to 1 for labelling");
1155
+ }
1156
+
1157
+ mdl_t *model = mdl_new(rdr_new(options->maxent));
1158
+ model->opt = options;
1159
+
1160
+ dolabel(model);
1161
+
1162
+ mdl_free(model);
1163
+
1164
+ return Qnil;
1165
+ }
1166
+
1167
+ static VALUE dump(VALUE self __attribute__((__unused__)), VALUE rb_options) {
1168
+ if (strncmp("Wapiti::Options", rb_obj_classname(rb_options), 15) != 0) {
1169
+ rb_raise(cNativeError, "argument must be a native options instance");
1170
+ }
1171
+
1172
+ opt_t *options = get_options(rb_options);
1173
+
1174
+ if (options->mode != 2) {
1175
+ rb_raise(cNativeError, "invalid options argument: mode should be set to 2 for training");
1176
+ }
1177
+
1178
+ mdl_t *model = mdl_new(rdr_new(options->maxent));
1179
+ model->opt = options;
1180
+
1181
+ dodump(model);
1182
+
1183
+ mdl_free(model);
1184
+
1185
+ return Qnil;
1186
+ }
1187
+
1188
+ // This function is a proxy for Wapiti's main entry point.
1189
+ static VALUE wapiti(VALUE self __attribute__((__unused__)), VALUE arguments) {
1190
+ int result = -1, argc = 0;
1191
+ char **ap, *argv[18], *input, *tmp;
1192
+
1193
+ Check_Type(arguments, T_STRING);
1194
+ tmp = StringValueCStr(arguments);
1195
+
1196
+ // allocate space for argument vector
1197
+ input = (char*)malloc(strlen(tmp) + 8);
1198
+
1199
+ // prepend command name
1200
+ strncpy(input, "wapiti ", 8);
1201
+ strncat(input, tmp, strlen(input) - 8);
1202
+
1203
+ // remember allocation pointer
1204
+ tmp = input;
1205
+
1206
+ // turn input string into argument vector (using
1207
+ // only the first seventeen tokens from input)
1208
+ for (ap = argv; (*ap = strsep(&input, " \t")) != (char*)0; ++argc) {
1209
+ if ((**ap != '\0') && (++ap >= &argv[18])) break;
1210
+ }
1211
+
1212
+ // call main entry point
1213
+ result = wapiti_main(argc, argv);
1214
+
1215
+ // free allocated memory
1216
+ free(tmp);
1217
+
1218
+ return INT2FIX(result);
1219
+ }
1220
+
1221
+ /* --- Wapiti Extension Entry Point --- */
1222
+
1223
+ void Init_native() {
1224
+ mWapiti = rb_const_get(rb_mKernel, rb_intern("Wapiti"));
1225
+ mNative = rb_define_module_under(mWapiti, "Native");
1226
+
1227
+ cNativeError = rb_const_get(mWapiti, rb_intern("NativeError"));
1228
+ cConfigurationError = rb_const_get(mWapiti, rb_intern("ConfigurationError"));
1229
+ cLogger = rb_funcall(mWapiti, rb_intern("log"), 0);
1230
+
1231
+ rb_define_singleton_method(mNative, "label", label, 1);
1232
+ rb_define_singleton_method(mNative, "wapiti", wapiti, 1);
1233
+
1234
+ rb_define_const(mNative, "VERSION", rb_str_new2(VERSION));
1235
+
1236
+ Init_options();
1237
+ Init_model();
1238
+ }