wapiti 0.0.5 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53) hide show
  1. checksums.yaml +7 -0
  2. data/.simplecov +3 -0
  3. data/Gemfile +25 -2
  4. data/HISTORY.md +5 -1
  5. data/LICENSE +14 -13
  6. data/README.md +9 -16
  7. data/Rakefile +38 -8
  8. data/ext/wapiti/bcd.c +126 -124
  9. data/ext/wapiti/decoder.c +203 -124
  10. data/ext/wapiti/decoder.h +6 -4
  11. data/ext/wapiti/extconf.rb +2 -2
  12. data/ext/wapiti/gradient.c +491 -320
  13. data/ext/wapiti/gradient.h +52 -34
  14. data/ext/wapiti/lbfgs.c +74 -33
  15. data/ext/wapiti/model.c +47 -37
  16. data/ext/wapiti/model.h +22 -20
  17. data/ext/wapiti/native.c +850 -839
  18. data/ext/wapiti/native.h +1 -1
  19. data/ext/wapiti/options.c +52 -20
  20. data/ext/wapiti/options.h +37 -30
  21. data/ext/wapiti/pattern.c +35 -33
  22. data/ext/wapiti/pattern.h +12 -11
  23. data/ext/wapiti/progress.c +14 -13
  24. data/ext/wapiti/progress.h +3 -2
  25. data/ext/wapiti/quark.c +14 -16
  26. data/ext/wapiti/quark.h +6 -5
  27. data/ext/wapiti/reader.c +83 -69
  28. data/ext/wapiti/reader.h +11 -9
  29. data/ext/wapiti/rprop.c +84 -43
  30. data/ext/wapiti/sequence.h +18 -16
  31. data/ext/wapiti/sgdl1.c +45 -43
  32. data/ext/wapiti/thread.c +19 -17
  33. data/ext/wapiti/thread.h +5 -4
  34. data/ext/wapiti/tools.c +7 -7
  35. data/ext/wapiti/tools.h +3 -4
  36. data/ext/wapiti/trainers.h +1 -1
  37. data/ext/wapiti/vmath.c +40 -38
  38. data/ext/wapiti/vmath.h +12 -11
  39. data/ext/wapiti/wapiti.c +159 -37
  40. data/ext/wapiti/wapiti.h +18 -4
  41. data/lib/wapiti.rb +15 -15
  42. data/lib/wapiti/errors.rb +15 -15
  43. data/lib/wapiti/model.rb +92 -84
  44. data/lib/wapiti/options.rb +123 -124
  45. data/lib/wapiti/utility.rb +14 -14
  46. data/lib/wapiti/version.rb +2 -2
  47. data/spec/spec_helper.rb +29 -9
  48. data/spec/wapiti/model_spec.rb +230 -194
  49. data/spec/wapiti/native_spec.rb +7 -8
  50. data/spec/wapiti/options_spec.rb +184 -174
  51. data/wapiti.gemspec +22 -8
  52. metadata +38 -42
  53. data/.gitignore +0 -5
@@ -1,7 +1,7 @@
1
1
  /*
2
2
  * Wapiti - A linear-chain CRF tool
3
3
  *
4
- * Copyright (c) 2009-2011 CNRS
4
+ * Copyright (c) 2009-2013 CNRS
5
5
  * All rights reserved.
6
6
  *
7
7
  * Redistribution and use in source and binary forms, with or without
@@ -29,14 +29,15 @@
29
29
  #define model_h
30
30
 
31
31
  #include <stddef.h>
32
- #include <sys/times.h>
32
+ #include <stdint.h>
33
+ #include <sys/time.h>
33
34
 
34
- #include "wapiti.h"
35
35
  #include "options.h"
36
36
  #include "sequence.h"
37
37
  #include "reader.h"
38
+ #include "wapiti.h"
38
39
 
39
- typedef struct tms tms_t;
40
+ typedef struct timeval tms_t;
40
41
 
41
42
  /* mdl_t:
42
43
  * Represents a linear-chain CRF model. The model contains both unigram and
@@ -60,34 +61,35 @@ typedef struct tms tms_t;
60
61
  */
61
62
  typedef struct mdl_s mdl_t;
62
63
  struct mdl_s {
63
- opt_t *opt; // options for training
64
+ opt_t *opt; // options for training
65
+ int type; // model type
64
66
 
65
67
  // Size of various model parameters
66
- size_t nlbl; // Y number of labels
67
- size_t nobs; // O number of observations
68
- size_t nftr; // F number of features
68
+ uint32_t nlbl; // Y number of labels
69
+ uint64_t nobs; // O number of observations
70
+ uint64_t nftr; // F number of features
69
71
 
70
72
  // Information about observations
71
- char *kind; // [O] observations type
72
- size_t *uoff; // [O] unigram weights offset
73
- size_t *boff; // [O] bigram weights offset
73
+ char *kind; // [O] observations type
74
+ uint64_t *uoff; // [O] unigram weights offset
75
+ uint64_t *boff; // [O] bigram weights offset
74
76
 
75
77
  // The model itself
76
- double *theta; // [F] features weights
78
+ double *theta; // [F] features weights
77
79
 
78
80
  // Datasets
79
- dat_t *train; // training dataset
80
- dat_t *devel; // development dataset
81
- rdr_t *reader;
81
+ dat_t *train; // training dataset
82
+ dat_t *devel; // development dataset
83
+ rdr_t *reader;
82
84
 
83
85
  // Stopping criterion
84
- double *werr; // Window of error rate of last iters
85
- int wcnt; // Number of iters in the window
86
- int wpos; // Position for the next iter
86
+ double *werr; // Window of error rate of last iters
87
+ uint32_t wcnt; // Number of iters in the window
88
+ uint32_t wpos; // Position for the next iter
87
89
 
88
90
  // Timing
89
- tms_t timer; // start time of last iter
90
- double total; // total training time
91
+ tms_t timer; // start time of last iter
92
+ double total; // total training time
91
93
  };
92
94
 
93
95
  mdl_t *mdl_new(rdr_t *rdr);
@@ -1,13 +1,15 @@
1
1
  #include <stdio.h>
2
2
  #include <string.h>
3
3
 
4
- #include "wapiti.h"
5
4
  #include "options.h"
6
5
  #include "reader.h"
6
+ #include "decoder.h"
7
7
  #include "model.h"
8
8
  #include "trainers.h"
9
+ #include "progress.h"
9
10
  #include "quark.h"
10
11
  #include "tools.h"
12
+ #include "wapiti.h"
11
13
 
12
14
  #include "native.h"
13
15
 
@@ -22,27 +24,36 @@ VALUE cConfigurationError;
22
24
  VALUE cLogger;
23
25
 
24
26
 
27
+ /* --- Forward declarations --- */
28
+
29
+ int wapiti_main(int argc, char *argv[argc]);
30
+
31
+ void dolabel(mdl_t *mdl);
32
+ void dotrain(mdl_t *mdl);
33
+ void doupdt(mdl_t *mdl);
34
+
35
+
25
36
  /* --- Utilities --- */
26
37
 
27
38
  static void trn_auto(mdl_t *mdl) {
28
- const int maxiter = mdl->opt->maxiter;
29
- mdl->opt->maxiter = 3;
30
- trn_sgdl1(mdl);
31
- mdl->opt->maxiter = maxiter;
32
- trn_lbfgs(mdl);
39
+ const int maxiter = mdl->opt->maxiter;
40
+ mdl->opt->maxiter = 3;
41
+ trn_sgdl1(mdl);
42
+ mdl->opt->maxiter = maxiter;
43
+ trn_lbfgs(mdl);
33
44
  }
34
45
 
35
46
  static const struct {
36
- char *name;
37
- void (* train)(mdl_t *mdl);
47
+ const char *name;
48
+ void (* train)(mdl_t *mdl);
38
49
  } trn_lst[] = {
39
- {"l-bfgs", trn_lbfgs},
40
- {"sgd-l1", trn_sgdl1},
41
- {"bcd", trn_bcd },
42
- {"rprop", trn_rprop},
43
- {"rprop+", trn_rprop},
44
- {"rprop-", trn_rprop},
45
- {"auto", trn_auto }
50
+ {"l-bfgs", trn_lbfgs},
51
+ {"sgd-l1", trn_sgdl1},
52
+ {"bcd", trn_bcd },
53
+ {"rprop", trn_rprop},
54
+ {"rprop+", trn_rprop},
55
+ {"rprop-", trn_rprop},
56
+ {"auto", trn_auto }
46
57
  };
47
58
  static const int trn_cnt = sizeof(trn_lst) / sizeof(trn_lst[0]);
48
59
 
@@ -52,78 +63,78 @@ static const int trn_cnt = sizeof(trn_lst) / sizeof(trn_lst[0]);
52
63
  // Auxiliary Methods
53
64
 
54
65
  static opt_t *get_options(VALUE self) {
55
- opt_t *options;
56
- Data_Get_Struct(self, opt_t, options);
57
- return options;
66
+ opt_t *options;
67
+ Data_Get_Struct(self, opt_t, options);
68
+ return options;
58
69
  }
59
70
 
60
71
  // Copies a Ruby string to the heap and stores it in a pointer.
61
72
  // Frees the pointer before assigning the new value.
62
73
  static void copy_string(char **dst, VALUE rb_string) {
63
- Check_Type(rb_string, T_STRING);
74
+ Check_Type(rb_string, T_STRING);
64
75
 
65
- if (*dst) { free(*dst); *dst = (char*)0; }
66
- *dst = calloc(RSTRING_LEN(rb_string) + 1, sizeof(char));
76
+ if (*dst) { free(*dst); *dst = (char*)0; }
77
+ *dst = calloc(RSTRING_LEN(rb_string) + 1, sizeof(char));
67
78
 
68
- memcpy(*dst, StringValuePtr(rb_string), RSTRING_LEN(rb_string) + 1);
79
+ memcpy(*dst, StringValuePtr(rb_string), RSTRING_LEN(rb_string) + 1);
69
80
  }
70
81
 
71
82
 
72
83
  // Constructor / Desctructor
73
84
 
74
85
  static void mark_options(opt_t* options __attribute__((__unused__))) {
75
- // nothing
86
+ // nothing
76
87
  }
77
88
 
78
89
  static void deallocate_options(opt_t* options) {
79
-
80
- // free string options
81
- if (options->input) { free(options->input); }
82
- if (options->output) { free(options->output); }
83
- if (options->algo) { free(options->algo); }
84
- if (options->devel) { free(options->devel); }
85
- if (options->pattern) { free(options->pattern); }
86
-
87
- free(options);
88
- options = (opt_t*)0;
90
+
91
+ // free string options
92
+ if (options->input) { free(options->input); }
93
+ if (options->output) { free(options->output); }
94
+ if (options->algo) { free((void*)options->algo); }
95
+ if (options->devel) { free(options->devel); }
96
+ if (options->pattern) { free((void*)options->pattern); }
97
+
98
+ free(options);
99
+ options = (opt_t*)0;
89
100
  }
90
101
 
91
102
  static VALUE allocate_options(VALUE self) {
92
- opt_t* options = malloc(sizeof(opt_t));
93
- return Data_Wrap_Struct(self, mark_options, deallocate_options, options);
94
- }
95
-
96
- static VALUE initialize_options(int argc, VALUE *argv, VALUE self) {
97
- opt_t* options = get_options(self);
98
- *options = opt_defaults;
99
-
100
- if (options->maxiter == 0) {
101
- options->maxiter = INT_MAX;
102
- }
103
-
104
- // copy the default algorithm name to the heap so that all options strings
105
- // are on the heap
106
- char* tmp = calloc(strlen(options->algo), sizeof(char));
107
- memcpy(tmp, options->algo, strlen(options->algo));
108
- options->algo = tmp;
109
-
110
- if (argc > 1) {
111
- rb_raise(rb_const_get(rb_mKernel, rb_intern("ArgumentError")),
112
- "wrong number of arguments (%d for 0..1)", argc);
113
- }
114
-
115
- // set defaults
116
- if (argc) {
117
- Check_Type(argv[0], T_HASH);
118
- (void)rb_funcall(self, rb_intern("update"), 1, argv[0]);
119
- }
120
-
121
- // yield self if block_given?
122
- if (rb_block_given_p()) {
123
- rb_yield(self);
124
- }
125
-
126
- return self;
103
+ opt_t* options = malloc(sizeof(opt_t));
104
+ return Data_Wrap_Struct(self, mark_options, deallocate_options, options);
105
+ }
106
+
107
+ static VALUE initialize_options(int argc, VALUE *argv, VALUE self) {
108
+ opt_t* options = get_options(self);
109
+ *options = opt_defaults;
110
+
111
+ if (options->maxiter == 0) {
112
+ options->maxiter = INT_MAX;
113
+ }
114
+
115
+ // copy the default algorithm name to the heap so that all options strings
116
+ // are on the heap
117
+ char* tmp = calloc(strlen(options->algo), sizeof(char));
118
+ memcpy(tmp, options->algo, strlen(options->algo));
119
+ options->algo = tmp;
120
+
121
+ if (argc > 1) {
122
+ rb_raise(rb_const_get(rb_mKernel, rb_intern("ArgumentError")),
123
+ "wrong number of arguments (%d for 0..1)", argc);
124
+ }
125
+
126
+ // set defaults
127
+ if (argc) {
128
+ Check_Type(argv[0], T_HASH);
129
+ (void)rb_funcall(self, rb_intern("update"), 1, argv[0]);
130
+ }
131
+
132
+ // yield self if block_given?
133
+ if (rb_block_given_p()) {
134
+ rb_yield(self);
135
+ }
136
+
137
+ return self;
127
138
  }
128
139
 
129
140
 
@@ -133,192 +144,192 @@ static VALUE initialize_options(int argc, VALUE *argv, VALUE self) {
133
144
  // Fixnum Accessors
134
145
 
135
146
  static VALUE options_nbest(VALUE self) {
136
- return INT2FIX(get_options(self)->nbest);
147
+ return INT2FIX(get_options(self)->nbest);
137
148
  }
138
149
 
139
150
  static VALUE options_set_nbest(VALUE self, VALUE rb_fixnum) {
140
- Check_Type(rb_fixnum, T_FIXNUM);
141
- get_options(self)->nbest = FIX2INT(rb_fixnum);
142
-
143
- return rb_fixnum;
151
+ Check_Type(rb_fixnum, T_FIXNUM);
152
+ get_options(self)->nbest = FIX2INT(rb_fixnum);
153
+
154
+ return rb_fixnum;
144
155
  }
145
156
 
146
157
 
147
158
  static VALUE options_stopwin(VALUE self) {
148
- return INT2FIX(get_options(self)->stopwin);
159
+ return INT2FIX(get_options(self)->stopwin);
149
160
  }
150
161
 
151
162
  static VALUE options_set_stopwin(VALUE self, VALUE rb_fixnum) {
152
- Check_Type(rb_fixnum, T_FIXNUM);
153
- get_options(self)->stopwin = FIX2INT(rb_fixnum);
154
-
155
- return rb_fixnum;
163
+ Check_Type(rb_fixnum, T_FIXNUM);
164
+ get_options(self)->stopwin = FIX2INT(rb_fixnum);
165
+
166
+ return rb_fixnum;
156
167
  }
157
168
 
158
169
  static VALUE options_objwin(VALUE self) {
159
- return INT2FIX(get_options(self)->objwin);
170
+ return INT2FIX(get_options(self)->objwin);
160
171
  }
161
172
 
162
173
  static VALUE options_set_objwin(VALUE self, VALUE rb_fixnum) {
163
- Check_Type(rb_fixnum, T_FIXNUM);
164
- get_options(self)->objwin = FIX2INT(rb_fixnum);
165
-
166
- return rb_fixnum;
174
+ Check_Type(rb_fixnum, T_FIXNUM);
175
+ get_options(self)->objwin = FIX2INT(rb_fixnum);
176
+
177
+ return rb_fixnum;
167
178
  }
168
179
 
169
180
 
170
181
  static VALUE options_maxiter(VALUE self) {
171
- return INT2FIX(get_options(self)->maxiter);
182
+ return INT2FIX(get_options(self)->maxiter);
172
183
  }
173
184
 
174
185
  static VALUE options_set_maxiter(VALUE self, VALUE rb_fixnum) {
175
- opt_t *options = get_options(self);
186
+ opt_t *options = get_options(self);
176
187
 
177
- Check_Type(rb_fixnum, T_FIXNUM);
178
- options->maxiter = FIX2INT(rb_fixnum);
179
-
180
- return rb_fixnum;
188
+ Check_Type(rb_fixnum, T_FIXNUM);
189
+ options->maxiter = FIX2INT(rb_fixnum);
190
+
191
+ return rb_fixnum;
181
192
  }
182
193
 
183
194
  static VALUE options_jobsize(VALUE self) {
184
- return INT2FIX(get_options(self)->jobsize);
195
+ return INT2FIX(get_options(self)->jobsize);
185
196
  }
186
197
 
187
198
  static VALUE options_set_jobsize(VALUE self, VALUE rb_fixnum) {
188
- opt_t *options = get_options(self);
199
+ opt_t *options = get_options(self);
200
+
201
+ Check_Type(rb_fixnum, T_FIXNUM);
202
+ options->jobsize = FIX2INT(rb_fixnum);
189
203
 
190
- Check_Type(rb_fixnum, T_FIXNUM);
191
- options->jobsize = FIX2INT(rb_fixnum);
192
-
193
- return rb_fixnum;
204
+ return rb_fixnum;
194
205
  }
195
206
 
196
207
  static VALUE options_nthread(VALUE self) {
197
- return INT2FIX(get_options(self)->nthread);
208
+ return INT2FIX(get_options(self)->nthread);
198
209
  }
199
210
 
200
211
  static VALUE options_set_nthread(VALUE self, VALUE rb_fixnum) {
201
- opt_t *options = get_options(self);
212
+ opt_t *options = get_options(self);
202
213
 
203
- Check_Type(rb_fixnum, T_FIXNUM);
204
- options->nthread = FIX2INT(rb_fixnum);
205
-
206
- return rb_fixnum;
214
+ Check_Type(rb_fixnum, T_FIXNUM);
215
+ options->nthread = FIX2INT(rb_fixnum);
216
+
217
+ return rb_fixnum;
207
218
  }
208
219
 
209
220
  static VALUE options_histsz(VALUE self) {
210
- return INT2FIX(get_options(self)->lbfgs.histsz);
221
+ return INT2FIX(get_options(self)->lbfgs.histsz);
211
222
  }
212
223
 
213
224
  static VALUE options_set_histsz(VALUE self, VALUE rb_fixnum) {
214
- Check_Type(rb_fixnum, T_FIXNUM);
215
- get_options(self)->lbfgs.histsz = FIX2INT(rb_fixnum);
216
-
217
- return rb_fixnum;
225
+ Check_Type(rb_fixnum, T_FIXNUM);
226
+ get_options(self)->lbfgs.histsz = FIX2INT(rb_fixnum);
227
+
228
+ return rb_fixnum;
218
229
  }
219
230
 
220
231
  static VALUE options_maxls(VALUE self) {
221
- return INT2FIX(get_options(self)->lbfgs.maxls);
232
+ return INT2FIX(get_options(self)->lbfgs.maxls);
222
233
  }
223
234
 
224
235
  static VALUE options_set_maxls(VALUE self, VALUE rb_fixnum) {
225
- Check_Type(rb_fixnum, T_FIXNUM);
226
- get_options(self)->lbfgs.maxls = FIX2INT(rb_fixnum);
227
-
228
- return rb_fixnum;
236
+ Check_Type(rb_fixnum, T_FIXNUM);
237
+ get_options(self)->lbfgs.maxls = FIX2INT(rb_fixnum);
238
+
239
+ return rb_fixnum;
229
240
  }
230
241
 
231
242
 
232
243
  // Float Accessors
233
244
 
234
245
  static VALUE options_rho1(VALUE self) {
235
- return rb_float_new(get_options(self)->rho1);
246
+ return rb_float_new(get_options(self)->rho1);
236
247
  }
237
248
 
238
249
  static VALUE options_set_rho1(VALUE self, VALUE rb_numeric) {
239
- get_options(self)->rho1 = NUM2DBL(rb_numeric);
240
- return rb_numeric;
250
+ get_options(self)->rho1 = NUM2DBL(rb_numeric);
251
+ return rb_numeric;
241
252
  }
242
253
 
243
254
  static VALUE options_rho2(VALUE self) {
244
- return rb_float_new(get_options(self)->rho2);
255
+ return rb_float_new(get_options(self)->rho2);
245
256
  }
246
257
 
247
258
  static VALUE options_set_rho2(VALUE self, VALUE rb_numeric) {
248
- get_options(self)->rho2 = NUM2DBL(rb_numeric);
249
- return rb_numeric;
259
+ get_options(self)->rho2 = NUM2DBL(rb_numeric);
260
+ return rb_numeric;
250
261
  }
251
262
 
252
263
  static VALUE options_stopeps(VALUE self) {
253
- return rb_float_new(get_options(self)->stopeps);
264
+ return rb_float_new(get_options(self)->stopeps);
254
265
  }
255
266
 
256
267
  static VALUE options_set_stopeps(VALUE self, VALUE rb_numeric) {
257
- get_options(self)->stopeps = NUM2DBL(rb_numeric);
258
- return rb_numeric;
268
+ get_options(self)->stopeps = NUM2DBL(rb_numeric);
269
+ return rb_numeric;
259
270
  }
260
271
 
261
272
  static VALUE options_eta0(VALUE self) {
262
- return rb_float_new(get_options(self)->sgdl1.eta0);
273
+ return rb_float_new(get_options(self)->sgdl1.eta0);
263
274
  }
264
275
 
265
276
  static VALUE options_set_eta0(VALUE self, VALUE rb_numeric) {
266
- get_options(self)->sgdl1.eta0 = NUM2DBL(rb_numeric);
267
- return rb_numeric;
277
+ get_options(self)->sgdl1.eta0 = NUM2DBL(rb_numeric);
278
+ return rb_numeric;
268
279
  }
269
280
 
270
281
  static VALUE options_alpha(VALUE self) {
271
- return rb_float_new(get_options(self)->sgdl1.alpha);
282
+ return rb_float_new(get_options(self)->sgdl1.alpha);
272
283
  }
273
284
 
274
285
  static VALUE options_set_alpha(VALUE self, VALUE rb_numeric) {
275
- get_options(self)->sgdl1.alpha = NUM2DBL(rb_numeric);
276
- return rb_numeric;
286
+ get_options(self)->sgdl1.alpha = NUM2DBL(rb_numeric);
287
+ return rb_numeric;
277
288
  }
278
289
 
279
290
  static VALUE options_kappa(VALUE self) {
280
- return rb_float_new(get_options(self)->bcd.kappa);
291
+ return rb_float_new(get_options(self)->bcd.kappa);
281
292
  }
282
293
 
283
294
  static VALUE options_set_kappa(VALUE self, VALUE rb_numeric) {
284
- get_options(self)->bcd.kappa = NUM2DBL(rb_numeric);
285
- return rb_numeric;
295
+ get_options(self)->bcd.kappa = NUM2DBL(rb_numeric);
296
+ return rb_numeric;
286
297
  }
287
298
 
288
299
  static VALUE options_stpmin(VALUE self) {
289
- return rb_float_new(get_options(self)->rprop.stpmin);
300
+ return rb_float_new(get_options(self)->rprop.stpmin);
290
301
  }
291
302
 
292
303
  static VALUE options_set_stpmin(VALUE self, VALUE rb_numeric) {
293
- get_options(self)->rprop.stpmin = NUM2DBL(rb_numeric);
294
- return rb_numeric;
304
+ get_options(self)->rprop.stpmin = NUM2DBL(rb_numeric);
305
+ return rb_numeric;
295
306
  }
296
307
 
297
308
  static VALUE options_stpmax(VALUE self) {
298
- return rb_float_new(get_options(self)->rprop.stpmax);
309
+ return rb_float_new(get_options(self)->rprop.stpmax);
299
310
  }
300
311
 
301
312
  static VALUE options_set_stpmax(VALUE self, VALUE rb_numeric) {
302
- get_options(self)->rprop.stpmax = NUM2DBL(rb_numeric);
303
- return rb_numeric;
313
+ get_options(self)->rprop.stpmax = NUM2DBL(rb_numeric);
314
+ return rb_numeric;
304
315
  }
305
316
 
306
317
  static VALUE options_stpinc(VALUE self) {
307
- return rb_float_new(get_options(self)->rprop.stpinc);
318
+ return rb_float_new(get_options(self)->rprop.stpinc);
308
319
  }
309
320
 
310
321
  static VALUE options_set_stpinc(VALUE self, VALUE rb_numeric) {
311
- get_options(self)->rprop.stpinc = NUM2DBL(rb_numeric);
312
- return rb_numeric;
322
+ get_options(self)->rprop.stpinc = NUM2DBL(rb_numeric);
323
+ return rb_numeric;
313
324
  }
314
325
 
315
326
  static VALUE options_stpdec(VALUE self) {
316
- return rb_float_new(get_options(self)->rprop.stpdec);
327
+ return rb_float_new(get_options(self)->rprop.stpdec);
317
328
  }
318
329
 
319
330
  static VALUE options_set_stpdec(VALUE self, VALUE rb_numeric) {
320
- get_options(self)->rprop.stpdec = NUM2DBL(rb_numeric);
321
- return rb_numeric;
331
+ get_options(self)->rprop.stpdec = NUM2DBL(rb_numeric);
332
+ return rb_numeric;
322
333
  }
323
334
 
324
335
 
@@ -326,84 +337,84 @@ static VALUE options_set_stpdec(VALUE self, VALUE rb_numeric) {
326
337
  // Boolean Accessors
327
338
 
328
339
  static VALUE options_maxent(VALUE self) {
329
- return get_options(self)->maxent ? Qtrue : Qfalse;
340
+ return get_options(self)->maxent ? Qtrue : Qfalse;
330
341
  }
331
342
 
332
343
  static VALUE options_set_maxent(VALUE self, VALUE rb_boolean) {
333
- get_options(self)->maxent = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
334
- return rb_boolean;
344
+ get_options(self)->maxent = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
345
+ return rb_boolean;
335
346
  }
336
347
 
337
348
  static VALUE options_compact(VALUE self) {
338
- return get_options(self)->compact ? Qtrue : Qfalse;
349
+ return get_options(self)->compact ? Qtrue : Qfalse;
339
350
  }
340
351
 
341
352
  static VALUE options_set_compact(VALUE self, VALUE rb_boolean) {
342
- get_options(self)->compact = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
343
- return rb_boolean;
353
+ get_options(self)->compact = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
354
+ return rb_boolean;
344
355
  }
345
356
 
346
357
  static VALUE options_sparse(VALUE self) {
347
- return get_options(self)->sparse ? Qtrue : Qfalse;
358
+ return get_options(self)->sparse ? Qtrue : Qfalse;
348
359
  }
349
360
 
350
361
  static VALUE options_set_sparse(VALUE self, VALUE rb_boolean) {
351
- get_options(self)->sparse = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
352
- return rb_boolean;
362
+ get_options(self)->sparse = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
363
+ return rb_boolean;
353
364
  }
354
365
 
355
366
  static VALUE options_check(VALUE self) {
356
- return get_options(self)->check ? Qtrue : Qfalse;
367
+ return get_options(self)->check ? Qtrue : Qfalse;
357
368
  }
358
369
 
359
370
  static VALUE options_set_check(VALUE self, VALUE rb_boolean) {
360
- get_options(self)->check = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
361
- return rb_boolean;
371
+ get_options(self)->check = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
372
+ return rb_boolean;
362
373
  }
363
374
 
364
375
  static VALUE options_label(VALUE self) {
365
- return get_options(self)->label ? Qtrue : Qfalse;
376
+ return get_options(self)->label ? Qtrue : Qfalse;
366
377
  }
367
378
 
368
379
  static VALUE options_set_label(VALUE self, VALUE rb_boolean) {
369
- get_options(self)->label = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
370
- return rb_boolean;
380
+ get_options(self)->label = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
381
+ return rb_boolean;
371
382
  }
372
383
 
373
384
  static VALUE options_outsc(VALUE self) {
374
- return get_options(self)->outsc ? Qtrue : Qfalse;
385
+ return get_options(self)->outsc ? Qtrue : Qfalse;
375
386
  }
376
387
 
377
388
  static VALUE options_set_outsc(VALUE self, VALUE rb_boolean) {
378
- get_options(self)->outsc = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
379
- return rb_boolean;
389
+ get_options(self)->outsc = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
390
+ return rb_boolean;
380
391
  }
381
392
 
382
393
  static VALUE options_lblpost(VALUE self) {
383
- return get_options(self)->lblpost ? Qtrue : Qfalse;
394
+ return get_options(self)->lblpost ? Qtrue : Qfalse;
384
395
  }
385
396
 
386
397
  static VALUE options_set_lblpost(VALUE self, VALUE rb_boolean) {
387
- get_options(self)->lblpost = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
388
- return rb_boolean;
398
+ get_options(self)->lblpost = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
399
+ return rb_boolean;
389
400
  }
390
401
 
391
402
  static VALUE options_clip(VALUE self) {
392
- return get_options(self)->lbfgs.clip ? Qtrue : Qfalse;
403
+ return get_options(self)->lbfgs.clip ? Qtrue : Qfalse;
393
404
  }
394
405
 
395
406
  static VALUE options_set_clip(VALUE self, VALUE rb_boolean) {
396
- get_options(self)->lbfgs.clip = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
397
- return rb_boolean;
407
+ get_options(self)->lbfgs.clip = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
408
+ return rb_boolean;
398
409
  }
399
410
 
400
411
  static VALUE options_cutoff(VALUE self) {
401
- return get_options(self)->rprop.cutoff ? Qtrue : Qfalse;
412
+ return get_options(self)->rprop.cutoff ? Qtrue : Qfalse;
402
413
  }
403
414
 
404
415
  static VALUE options_set_cutoff(VALUE self, VALUE rb_boolean) {
405
- get_options(self)->rprop.cutoff = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
406
- return rb_boolean;
416
+ get_options(self)->rprop.cutoff = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
417
+ return rb_boolean;
407
418
  }
408
419
 
409
420
 
@@ -412,201 +423,201 @@ static VALUE options_set_cutoff(VALUE self, VALUE rb_boolean) {
412
423
  // String Accessors
413
424
 
414
425
  static VALUE options_pattern(VALUE self) {
415
- char *pattern = get_options(self)->pattern;
416
- return rb_str_new2(pattern ? pattern : "");
426
+ const char *pattern = get_options(self)->pattern;
427
+ return rb_str_new2(pattern ? pattern : "");
417
428
  }
418
429
 
419
430
  static VALUE options_set_pattern(VALUE self, VALUE rb_string) {
420
- opt_t *options = get_options(self);
421
- copy_string(&(options->pattern), rb_string);
422
-
423
- return rb_string;
431
+ opt_t *options = get_options(self);
432
+ copy_string((char**)&(options->pattern), rb_string);
433
+
434
+ return rb_string;
424
435
  }
425
436
 
426
437
  static VALUE options_model(VALUE self) {
427
- char *model = get_options(self)->model;
428
- return rb_str_new2(model ? model : "");
438
+ const char *model = get_options(self)->model;
439
+ return rb_str_new2(model ? model : "");
429
440
  }
430
441
 
431
442
  static VALUE options_set_model(VALUE self, VALUE rb_string) {
432
- opt_t *options = get_options(self);
433
- copy_string(&(options->model), rb_string);
434
-
435
- return rb_string;
443
+ opt_t *options = get_options(self);
444
+ copy_string(&(options->model), rb_string);
445
+
446
+ return rb_string;
436
447
  }
437
448
 
438
449
  static VALUE options_algorithm(VALUE self) {
439
- char *algorithm = get_options(self)->algo;
440
- return rb_str_new2(algorithm ? algorithm : "");
450
+ const char *algorithm = get_options(self)->algo;
451
+ return rb_str_new2(algorithm ? algorithm : "");
441
452
  }
442
453
 
443
454
  static VALUE options_set_algorithm(VALUE self, VALUE rb_string) {
444
- opt_t *options = get_options(self);
445
- copy_string(&(options->algo), rb_string);
446
-
447
- return rb_string;
455
+ opt_t *options = get_options(self);
456
+ copy_string((char**)&(options->algo), rb_string);
457
+
458
+ return rb_string;
448
459
  }
449
460
 
450
461
  static VALUE options_development_data(VALUE self) {
451
- char *development_data = get_options(self)->devel;
452
- return rb_str_new2(development_data ? development_data : "");
462
+ char *development_data = get_options(self)->devel;
463
+ return rb_str_new2(development_data ? development_data : "");
453
464
  }
454
465
 
455
466
  static VALUE options_set_development_data(VALUE self, VALUE rb_string) {
456
- opt_t *options = get_options(self);
457
- copy_string(&(options->devel), rb_string);
458
-
459
- return rb_string;
467
+ opt_t *options = get_options(self);
468
+ copy_string(&(options->devel), rb_string);
469
+
470
+ return rb_string;
460
471
  }
461
472
 
462
473
 
463
474
  void Init_options() {
464
- cOptions = rb_define_class_under(mWapiti, "Options", rb_cObject);
465
- rb_define_alloc_func(cOptions, allocate_options);
466
-
467
- rb_define_method(cOptions, "initialize", initialize_options, -1);
475
+ cOptions = rb_define_class_under(mWapiti, "Options", rb_cObject);
476
+ rb_define_alloc_func(cOptions, allocate_options);
477
+
478
+ rb_define_method(cOptions, "initialize", initialize_options, -1);
479
+
480
+ // Option Accessors
481
+
482
+ rb_define_method(cOptions, "stopwin", options_stopwin, 0);
483
+ rb_define_method(cOptions, "stopwin=", options_set_stopwin, 1);
484
+
485
+ rb_define_alias(cOptions, "stop_window", "stopwin");
486
+ rb_define_alias(cOptions, "stop_window=", "stopwin=");
468
487
 
469
- // Option Accessors
488
+ rb_define_method(cOptions, "objwin", options_objwin, 0);
489
+ rb_define_method(cOptions, "objwin=", options_set_objwin, 1);
470
490
 
471
- rb_define_method(cOptions, "stopwin", options_stopwin, 0);
472
- rb_define_method(cOptions, "stopwin=", options_set_stopwin, 1);
491
+ rb_define_alias(cOptions, "convergence_window", "objwin");
492
+ rb_define_alias(cOptions, "convergence_window=", "objwin=");
473
493
 
474
- rb_define_alias(cOptions, "stop_window", "stopwin");
475
- rb_define_alias(cOptions, "stop_window=", "stopwin=");
494
+ rb_define_method(cOptions, "maxiter", options_maxiter, 0);
495
+ rb_define_method(cOptions, "maxiter=", options_set_maxiter, 1);
476
496
 
477
- rb_define_method(cOptions, "objwin", options_objwin, 0);
478
- rb_define_method(cOptions, "objwin=", options_set_objwin, 1);
497
+ rb_define_alias(cOptions, "max_iterations", "maxiter");
498
+ rb_define_alias(cOptions, "max_iterations=", "maxiter=");
479
499
 
480
- rb_define_alias(cOptions, "convergence_window", "objwin");
481
- rb_define_alias(cOptions, "convergence_window=", "objwin=");
482
-
483
- rb_define_method(cOptions, "maxiter", options_maxiter, 0);
484
- rb_define_method(cOptions, "maxiter=", options_set_maxiter, 1);
500
+ rb_define_method(cOptions, "jobsize", options_jobsize, 0);
501
+ rb_define_method(cOptions, "jobsize=", options_set_jobsize, 1);
485
502
 
486
- rb_define_alias(cOptions, "max_iterations", "maxiter");
487
- rb_define_alias(cOptions, "max_iterations=", "maxiter=");
488
-
489
- rb_define_method(cOptions, "jobsize", options_jobsize, 0);
490
- rb_define_method(cOptions, "jobsize=", options_set_jobsize, 1);
503
+ rb_define_method(cOptions, "nthread", options_nthread, 0);
504
+ rb_define_method(cOptions, "nthread=", options_set_nthread, 1);
491
505
 
492
- rb_define_method(cOptions, "nthread", options_nthread, 0);
493
- rb_define_method(cOptions, "nthread=", options_set_nthread, 1);
506
+ rb_define_alias(cOptions, "threads", "nthread");
507
+ rb_define_alias(cOptions, "threads=", "nthread=");
494
508
 
495
- rb_define_alias(cOptions, "threads", "nthread");
496
- rb_define_alias(cOptions, "threads=", "nthread=");
509
+ rb_define_method(cOptions, "rho1", options_rho1, 0);
510
+ rb_define_method(cOptions, "rho1=", options_set_rho1, 1);
497
511
 
498
- rb_define_method(cOptions, "rho1", options_rho1, 0);
499
- rb_define_method(cOptions, "rho1=", options_set_rho1, 1);
512
+ rb_define_method(cOptions, "rho2", options_rho2, 0);
513
+ rb_define_method(cOptions, "rho2=", options_set_rho2, 1);
500
514
 
501
- rb_define_method(cOptions, "rho2", options_rho2, 0);
502
- rb_define_method(cOptions, "rho2=", options_set_rho2, 1);
515
+ rb_define_method(cOptions, "stopeps", options_stopeps, 0);
516
+ rb_define_method(cOptions, "stopeps=", options_set_stopeps, 1);
503
517
 
504
- rb_define_method(cOptions, "stopeps", options_stopeps, 0);
505
- rb_define_method(cOptions, "stopeps=", options_set_stopeps, 1);
518
+ rb_define_alias(cOptions, "stop_epsilon", "stopeps");
519
+ rb_define_alias(cOptions, "stop_epsilon=", "stopeps=");
506
520
 
507
- rb_define_alias(cOptions, "stop_epsilon", "stopeps");
508
- rb_define_alias(cOptions, "stop_epsilon=", "stopeps=");
521
+ rb_define_method(cOptions, "maxent", options_maxent, 0);
522
+ rb_define_method(cOptions, "maxent=", options_set_maxent, 1);
509
523
 
510
- rb_define_method(cOptions, "maxent", options_maxent, 0);
511
- rb_define_method(cOptions, "maxent=", options_set_maxent, 1);
524
+ rb_define_alias(cOptions, "maxent?", "maxent");
512
525
 
513
- rb_define_alias(cOptions, "maxent?", "maxent");
526
+ rb_define_method(cOptions, "compact", options_compact, 0);
527
+ rb_define_method(cOptions, "compact=", options_set_compact, 1);
514
528
 
515
- rb_define_method(cOptions, "compact", options_compact, 0);
516
- rb_define_method(cOptions, "compact=", options_set_compact, 1);
529
+ rb_define_alias(cOptions, "compact?", "compact");
517
530
 
518
- rb_define_alias(cOptions, "compact?", "compact");
531
+ rb_define_method(cOptions, "sparse", options_sparse, 0);
532
+ rb_define_method(cOptions, "sparse=", options_set_sparse, 1);
519
533
 
520
- rb_define_method(cOptions, "sparse", options_sparse, 0);
521
- rb_define_method(cOptions, "sparse=", options_set_sparse, 1);
534
+ rb_define_alias(cOptions, "sparse?", "sparse");
522
535
 
523
- rb_define_alias(cOptions, "sparse?", "sparse");
536
+ rb_define_method(cOptions, "skip_tokens", options_label, 0);
537
+ rb_define_method(cOptions, "skip_tokens=", options_set_label, 1);
524
538
 
525
- rb_define_method(cOptions, "skip_tokens", options_label, 0);
526
- rb_define_method(cOptions, "skip_tokens=", options_set_label, 1);
539
+ rb_define_alias(cOptions, "skip_tokens?", "skip_tokens");
527
540
 
528
- rb_define_alias(cOptions, "skip_tokens?", "skip_tokens");
541
+ rb_define_method(cOptions, "check", options_check, 0);
542
+ rb_define_method(cOptions, "check=", options_set_check, 1);
529
543
 
530
- rb_define_method(cOptions, "check", options_check, 0);
531
- rb_define_method(cOptions, "check=", options_set_check, 1);
544
+ rb_define_alias(cOptions, "check?", "check");
532
545
 
533
- rb_define_alias(cOptions, "check?", "check");
546
+ rb_define_method(cOptions, "lblpost", options_lblpost, 0);
547
+ rb_define_method(cOptions, "lblpost=", options_set_lblpost, 1);
534
548
 
535
- rb_define_method(cOptions, "lblpost", options_lblpost, 0);
536
- rb_define_method(cOptions, "lblpost=", options_set_lblpost, 1);
549
+ rb_define_alias(cOptions, "lblpost?", "lblpost");
537
550
 
538
- rb_define_alias(cOptions, "lblpost?", "lblpost");
551
+ rb_define_alias(cOptions, "posterior", "lblpost");
552
+ rb_define_alias(cOptions, "posterior?", "lblpost");
553
+ rb_define_alias(cOptions, "posterior=", "lblpost=");
539
554
 
540
- rb_define_alias(cOptions, "posterior", "lblpost");
541
- rb_define_alias(cOptions, "posterior?", "lblpost");
542
- rb_define_alias(cOptions, "posterior=", "lblpost=");
555
+ rb_define_method(cOptions, "outsc", options_outsc, 0);
556
+ rb_define_method(cOptions, "outsc=", options_set_outsc, 1);
543
557
 
544
- rb_define_method(cOptions, "outsc", options_outsc, 0);
545
- rb_define_method(cOptions, "outsc=", options_set_outsc, 1);
558
+ rb_define_alias(cOptions, "outsc?", "outsc");
546
559
 
547
- rb_define_alias(cOptions, "outsc?", "outsc");
560
+ rb_define_alias(cOptions, "score", "outsc");
561
+ rb_define_alias(cOptions, "score?", "outsc");
562
+ rb_define_alias(cOptions, "score=", "outsc=");
548
563
 
549
- rb_define_alias(cOptions, "score", "outsc");
550
- rb_define_alias(cOptions, "score?", "outsc");
551
- rb_define_alias(cOptions, "score=", "outsc=");
564
+ rb_define_method(cOptions, "pattern", options_pattern, 0);
565
+ rb_define_method(cOptions, "pattern=", options_set_pattern, 1);
552
566
 
553
- rb_define_method(cOptions, "pattern", options_pattern, 0);
554
- rb_define_method(cOptions, "pattern=", options_set_pattern, 1);
567
+ rb_define_alias(cOptions, "template", "pattern");
568
+ rb_define_alias(cOptions, "template=", "pattern=");
555
569
 
556
- rb_define_alias(cOptions, "template", "pattern");
557
- rb_define_alias(cOptions, "template=", "pattern=");
570
+ rb_define_method(cOptions, "model", options_model, 0);
571
+ rb_define_method(cOptions, "model=", options_set_model, 1);
558
572
 
559
- rb_define_method(cOptions, "model", options_model, 0);
560
- rb_define_method(cOptions, "model=", options_set_model, 1);
573
+ rb_define_method(cOptions, "algorithm", options_algorithm, 0);
574
+ rb_define_method(cOptions, "algorithm=", options_set_algorithm, 1);
561
575
 
562
- rb_define_method(cOptions, "algorithm", options_algorithm, 0);
563
- rb_define_method(cOptions, "algorithm=", options_set_algorithm, 1);
576
+ rb_define_alias(cOptions, "algo", "algorithm");
577
+ rb_define_alias(cOptions, "algo=", "algorithm=");
564
578
 
565
- rb_define_alias(cOptions, "algo", "algorithm");
566
- rb_define_alias(cOptions, "algo=", "algorithm=");
579
+ rb_define_method(cOptions, "development_data", options_development_data, 0);
580
+ rb_define_method(cOptions, "development_data=", options_set_development_data, 1);
567
581
 
568
- rb_define_method(cOptions, "development_data", options_development_data, 0);
569
- rb_define_method(cOptions, "development_data=", options_set_development_data, 1);
582
+ rb_define_alias(cOptions, "devel", "development_data");
583
+ rb_define_alias(cOptions, "devel=", "development_data=");
570
584
 
571
- rb_define_alias(cOptions, "devel", "development_data");
572
- rb_define_alias(cOptions, "devel=", "development_data=");
585
+ rb_define_method(cOptions, "clip", options_clip, 0);
586
+ rb_define_method(cOptions, "clip=", options_set_clip, 1);
573
587
 
574
- rb_define_method(cOptions, "clip", options_clip, 0);
575
- rb_define_method(cOptions, "clip=", options_set_clip, 1);
588
+ rb_define_method(cOptions, "histsz", options_histsz, 0);
589
+ rb_define_method(cOptions, "histsz=", options_set_histsz, 1);
576
590
 
577
- rb_define_method(cOptions, "histsz", options_histsz, 0);
578
- rb_define_method(cOptions, "histsz=", options_set_histsz, 1);
591
+ rb_define_method(cOptions, "maxls", options_maxls, 0);
592
+ rb_define_method(cOptions, "maxls=", options_set_maxls, 1);
579
593
 
580
- rb_define_method(cOptions, "maxls", options_maxls, 0);
581
- rb_define_method(cOptions, "maxls=", options_set_maxls, 1);
594
+ rb_define_method(cOptions, "eta0", options_eta0, 0);
595
+ rb_define_method(cOptions, "eta0=", options_set_eta0, 1);
582
596
 
583
- rb_define_method(cOptions, "eta0", options_eta0, 0);
584
- rb_define_method(cOptions, "eta0=", options_set_eta0, 1);
597
+ rb_define_method(cOptions, "alpha", options_alpha, 0);
598
+ rb_define_method(cOptions, "alpha=", options_set_alpha, 1);
585
599
 
586
- rb_define_method(cOptions, "alpha", options_alpha, 0);
587
- rb_define_method(cOptions, "alpha=", options_set_alpha, 1);
600
+ rb_define_method(cOptions, "kappa", options_kappa, 0);
601
+ rb_define_method(cOptions, "kappa=", options_set_kappa, 1);
588
602
 
589
- rb_define_method(cOptions, "kappa", options_kappa, 0);
590
- rb_define_method(cOptions, "kappa=", options_set_kappa, 1);
603
+ rb_define_method(cOptions, "stpmin", options_stpmin, 0);
604
+ rb_define_method(cOptions, "stpmin=", options_set_stpmin, 1);
591
605
 
592
- rb_define_method(cOptions, "stpmin", options_stpmin, 0);
593
- rb_define_method(cOptions, "stpmin=", options_set_stpmin, 1);
606
+ rb_define_method(cOptions, "stpmax", options_stpmax, 0);
607
+ rb_define_method(cOptions, "stpmax=", options_set_stpmax, 1);
594
608
 
595
- rb_define_method(cOptions, "stpmax", options_stpmax, 0);
596
- rb_define_method(cOptions, "stpmax=", options_set_stpmax, 1);
609
+ rb_define_method(cOptions, "stpinc", options_stpinc, 0);
610
+ rb_define_method(cOptions, "stpinc=", options_set_stpinc, 1);
597
611
 
598
- rb_define_method(cOptions, "stpinc", options_stpinc, 0);
599
- rb_define_method(cOptions, "stpinc=", options_set_stpinc, 1);
612
+ rb_define_method(cOptions, "stpdec", options_stpdec, 0);
613
+ rb_define_method(cOptions, "stpdec=", options_set_stpdec, 1);
600
614
 
601
- rb_define_method(cOptions, "stpdec", options_stpdec, 0);
602
- rb_define_method(cOptions, "stpdec=", options_set_stpdec, 1);
615
+ rb_define_method(cOptions, "cutoff", options_cutoff, 0);
616
+ rb_define_method(cOptions, "cutoff=", options_set_cutoff, 1);
617
+
618
+ rb_define_method(cOptions, "nbest", options_nbest, 0);
619
+ rb_define_method(cOptions, "nbest=", options_set_nbest, 1);
603
620
 
604
- rb_define_method(cOptions, "cutoff", options_cutoff, 0);
605
- rb_define_method(cOptions, "cutoff=", options_set_cutoff, 1);
606
-
607
- rb_define_method(cOptions, "nbest", options_nbest, 0);
608
- rb_define_method(cOptions, "nbest=", options_set_nbest, 1);
609
-
610
621
  }
611
622
 
612
623
 
@@ -615,119 +626,119 @@ void Init_options() {
615
626
  // Auxiliary Methods
616
627
 
617
628
  static mdl_t *get_model(VALUE self) {
618
- mdl_t *model;
619
- Data_Get_Struct(self, mdl_t, model);
620
- return model;
629
+ mdl_t *model;
630
+ Data_Get_Struct(self, mdl_t, model);
631
+ return model;
621
632
  }
622
633
 
623
634
  // Constructor / Desctructor
624
635
 
625
636
  static void mark_model(mdl_t *model __attribute__((__unused__))) {
626
- // nothing
637
+ // nothing
627
638
  }
628
639
 
629
640
  static void deallocate_model(mdl_t *model) {
630
- if (model) {
631
- mdl_free(model);
632
- model = (mdl_t*)0;
633
- }
641
+ if (model) {
642
+ mdl_free(model);
643
+ model = (mdl_t*)0;
644
+ }
634
645
  }
635
646
 
636
647
  static VALUE allocate_model(VALUE self) {
637
- mdl_t *model = mdl_new(rdr_new(false));
638
- return Data_Wrap_Struct(self, mark_model, deallocate_model, model);
648
+ mdl_t *model = mdl_new(rdr_new(false));
649
+ return Data_Wrap_Struct(self, mark_model, deallocate_model, model);
639
650
  }
640
651
 
641
- static VALUE model_set_options(VALUE self, VALUE rb_options) {
642
- if (strncmp("Wapiti::Options", rb_obj_classname(rb_options), 15) != 0) {
643
- rb_raise(cNativeError, "argument must be a Wapiti::Options instance");
644
- }
645
-
646
- mdl_t *model = get_model(self);
647
-
648
- // Store reference to options in model struct
649
- model->opt = get_options(rb_options);
650
-
651
- // Update reader
652
- model->reader->maxent = model->opt->maxent;
653
-
654
- // Save instance variable
655
- rb_ivar_set(self, rb_intern("@options"), rb_options);
652
+ static VALUE model_set_options(VALUE self, VALUE rb_options) {
653
+ if (strncmp("Wapiti::Options", rb_obj_classname(rb_options), 15) != 0) {
654
+ rb_raise(cNativeError, "argument must be a Wapiti::Options instance");
655
+ }
656
+
657
+ mdl_t *model = get_model(self);
658
+
659
+ // Store reference to options in model struct
660
+ model->opt = get_options(rb_options);
661
+
662
+ // Update reader
663
+ model->reader->autouni = model->opt->maxent;
664
+
665
+ // Save instance variable
666
+ rb_ivar_set(self, rb_intern("@options"), rb_options);
656
667
 
657
- return rb_options;
668
+ return rb_options;
658
669
  }
659
670
 
660
671
  static VALUE initialize_model(int argc, VALUE *argv, VALUE self) {
661
- VALUE options;
662
-
663
- if (argc > 1) {
664
- rb_raise(rb_const_get(rb_mKernel, rb_intern("ArgumentError")),
665
- "wrong number of arguments (%d for 0..1)", argc);
666
- }
667
-
668
- if (argc) {
669
- if (TYPE(argv[0]) == T_HASH) {
670
- options = rb_funcall(cOptions, rb_intern("new"), 1, argv[0]);
671
- }
672
- else {
673
- if (strncmp("Wapiti::Options", rb_obj_classname(argv[0]), 15) != 0) {
674
- rb_raise(cNativeError, "argument must be a hash or an options instance");
675
- }
676
- options = argv[0];
677
- }
678
- }
679
- else {
680
- options = rb_funcall(cOptions, rb_intern("new"), 0);
681
- }
682
-
683
- // yield options if block_given?
684
- if (rb_block_given_p()) {
685
- rb_yield(options);
686
- }
687
-
688
- model_set_options(self, options);
689
-
690
- // Load a previous model if specified by options
691
- if (get_options(options)->model) {
692
- rb_funcall(self, rb_intern("load"), 0);
693
- }
694
-
695
- // initialize counters
696
- rb_funcall(self, rb_intern("clear_counters"), 0);
697
-
698
- return self;
672
+ VALUE options;
673
+
674
+ if (argc > 1) {
675
+ rb_raise(rb_const_get(rb_mKernel, rb_intern("ArgumentError")),
676
+ "wrong number of arguments (%d for 0..1)", argc);
677
+ }
678
+
679
+ if (argc) {
680
+ if (TYPE(argv[0]) == T_HASH) {
681
+ options = rb_funcall(cOptions, rb_intern("new"), 1, argv[0]);
682
+ }
683
+ else {
684
+ if (strncmp("Wapiti::Options", rb_obj_classname(argv[0]), 15) != 0) {
685
+ rb_raise(cNativeError, "argument must be a hash or an options instance");
686
+ }
687
+ options = argv[0];
688
+ }
689
+ }
690
+ else {
691
+ options = rb_funcall(cOptions, rb_intern("new"), 0);
692
+ }
693
+
694
+ // yield options if block_given?
695
+ if (rb_block_given_p()) {
696
+ rb_yield(options);
697
+ }
698
+
699
+ model_set_options(self, options);
700
+
701
+ // Load a previous model if specified by options
702
+ if (get_options(options)->model) {
703
+ rb_funcall(self, rb_intern("load"), 0);
704
+ }
705
+
706
+ // initialize counters
707
+ rb_funcall(self, rb_intern("clear_counters"), 0);
708
+
709
+ return self;
699
710
  }
700
711
 
701
712
 
702
713
  // Native accessors
703
714
 
704
715
  static VALUE model_nlbl(VALUE self) {
705
- return INT2FIX(get_model(self)->nlbl);
716
+ return INT2FIX(get_model(self)->nlbl);
706
717
  }
707
718
 
708
719
  static VALUE model_nobs(VALUE self) {
709
- return INT2FIX(get_model(self)->nobs);
720
+ return INT2FIX(get_model(self)->nobs);
710
721
  }
711
722
 
712
723
  static VALUE model_nftr(VALUE self) {
713
- return INT2FIX(get_model(self)->nftr);
724
+ return INT2FIX(get_model(self)->nftr);
714
725
  }
715
726
 
716
727
  static VALUE model_total(VALUE self) {
717
- return rb_float_new(get_model(self)->total);
728
+ return rb_float_new(get_model(self)->total);
718
729
  }
719
730
 
720
731
 
721
732
  // Instance methods
722
733
 
723
734
  static VALUE model_sync(VALUE self) {
724
- mdl_sync(get_model(self));
725
- return self;
735
+ mdl_sync(get_model(self));
736
+ return self;
726
737
  }
727
738
 
728
739
  static VALUE model_compact(VALUE self) {
729
- mdl_compact(get_model(self));
730
- return self;
740
+ mdl_compact(get_model(self));
741
+ return self;
731
742
  }
732
743
 
733
744
  // call-seq:
@@ -737,400 +748,398 @@ static VALUE model_compact(VALUE self) {
737
748
  // Saves the model to a file. Uses the Model's path if no argument given,
738
749
  // otherwise uses the passed-in argument as the Model's path.
739
750
  static VALUE model_save(int argc, VALUE *argv, VALUE self) {
740
- if (argc > 1) {
741
- rb_raise(rb_const_get(rb_mKernel, rb_intern("ArgumentError")),
742
- "wrong number of arguments (%d for 0..1)", argc);
743
- }
744
-
745
- mdl_t *model = get_model(self);
746
-
747
- // save passed-in path in options
748
- if (argc) {
749
- Check_Type(argv[0], T_STRING);
750
- rb_ivar_set(self, rb_intern("@path"), argv[0]);
751
- }
752
-
753
- // open the output file
754
- FILE *file = 0;
755
- VALUE path = rb_ivar_get(self, rb_intern("@path"));
756
-
757
- if (NIL_P(path)) {
758
- rb_raise(cNativeError, "failed to save model: no path given");
759
- }
760
-
761
- if (!(file = fopen(StringValueCStr(path), "w"))) {
762
- rb_raise(cNativeError, "failed to save model: failed to open model file");
763
- }
764
-
765
- mdl_save(model, file);
766
- fclose(file);
767
-
768
- return self;
751
+ if (argc > 1) {
752
+ rb_raise(rb_const_get(rb_mKernel, rb_intern("ArgumentError")),
753
+ "wrong number of arguments (%d for 0..1)", argc);
754
+ }
755
+
756
+ mdl_t *model = get_model(self);
757
+
758
+ // save passed-in path in options
759
+ if (argc) {
760
+ Check_Type(argv[0], T_STRING);
761
+ rb_ivar_set(self, rb_intern("@path"), argv[0]);
762
+ }
763
+
764
+ // open the output file
765
+ FILE *file = 0;
766
+ VALUE path = rb_ivar_get(self, rb_intern("@path"));
767
+
768
+ if (NIL_P(path)) {
769
+ rb_raise(cNativeError, "failed to save model: no path given");
770
+ }
771
+
772
+ if (!(file = fopen(StringValueCStr(path), "w"))) {
773
+ rb_raise(cNativeError, "failed to save model: failed to open model file");
774
+ }
775
+
776
+ mdl_save(model, file);
777
+ fclose(file);
778
+
779
+ return self;
769
780
  }
770
781
 
771
782
  static VALUE model_load(int argc, VALUE *argv, VALUE self) {
772
- if (argc > 1) {
773
- rb_raise(rb_const_get(rb_mKernel, rb_intern("ArgumentError")),
774
- "wrong number of arguments (%d for 0..1)", argc);
775
- }
776
-
777
- mdl_t *model = get_model(self);
778
-
779
- // save passed-in argument in options
780
- if (argc) {
781
- Check_Type(argv[0], T_STRING);
782
- rb_ivar_set(self, rb_intern("@path"), argv[0]);
783
- }
784
-
785
- // open the model file
786
- FILE *file = 0;
787
- VALUE path = rb_ivar_get(self, rb_intern("@path"));
788
-
789
- if (NIL_P(path)) {
790
- rb_raise(cNativeError, "failed to load model: no path given");
791
- }
792
-
793
- if (!(file = fopen(StringValueCStr(path), "r"))) {
794
- rb_raise(cNativeError, "failed to load model: failed to open model file");
795
- }
796
-
797
- mdl_load(model, file);
798
- fclose(file);
799
-
800
- return self;
783
+ if (argc > 1) {
784
+ rb_raise(rb_const_get(rb_mKernel, rb_intern("ArgumentError")),
785
+ "wrong number of arguments (%d for 0..1)", argc);
786
+ }
787
+
788
+ mdl_t *model = get_model(self);
789
+
790
+ // save passed-in argument in options
791
+ if (argc) {
792
+ Check_Type(argv[0], T_STRING);
793
+ rb_ivar_set(self, rb_intern("@path"), argv[0]);
794
+ }
795
+
796
+ // open the model file
797
+ FILE *file = 0;
798
+ VALUE path = rb_ivar_get(self, rb_intern("@path"));
799
+
800
+ if (NIL_P(path)) {
801
+ rb_raise(cNativeError, "failed to load model: no path given");
802
+ }
803
+
804
+ if (!(file = fopen(StringValueCStr(path), "r"))) {
805
+ rb_raise(cNativeError, "failed to load model: failed to open model file");
806
+ }
807
+
808
+ mdl_load(model, file);
809
+ fclose(file);
810
+
811
+ return self;
801
812
  }
802
813
 
803
814
  static dat_t *to_dat(rdr_t *reader, VALUE data, bool labelled) {
804
- Check_Type(data, T_ARRAY);
805
-
806
- const unsigned int n = RARRAY_LEN(data);
807
- unsigned int i, j, k;
808
-
809
- dat_t *dat = xmalloc(sizeof(dat_t));
810
- dat->nseq = 0;
811
- dat->mlen = 0;
812
- dat->lbl = labelled;
813
- dat->seq = xmalloc(sizeof(seq_t*) * n);
814
-
815
- for (i = 0; i < n; ++i) {
816
- VALUE sequence = rb_ary_entry(data, i);
817
- Check_Type(sequence, T_ARRAY);
818
-
819
- k = RARRAY_LEN(sequence);
820
- raw_t *raw = xmalloc(sizeof(raw_t) + sizeof(char*) * k);
821
-
822
- for (j = 0; j < k; ++j) {
823
- VALUE line = rb_ary_entry(sequence, j);
824
- Check_Type(line, T_STRING);
825
- raw->lines[j] = StringValueCStr(line);
826
- }
827
-
828
- raw->len = k;
829
-
830
- seq_t *seq = rdr_raw2seq(reader, raw, labelled);
831
- xfree(raw);
832
-
833
- if (seq == 0) { break; }
834
-
835
- // and store the sequence
836
- dat->seq[dat->nseq++] = seq;
837
- dat->mlen = max(dat->mlen, seq->len);
838
-
839
- }
840
-
841
- // if no sequence was read, free memory
842
- if (dat->nseq == 0) {
843
- xfree(dat->seq);
844
- xfree(dat);
845
-
846
- return 0;
847
- }
848
-
849
- return dat;
815
+ Check_Type(data, T_ARRAY);
816
+
817
+ const unsigned int n = RARRAY_LEN(data);
818
+ unsigned int i, j, k;
819
+
820
+ dat_t *dat = wapiti_xmalloc(sizeof(dat_t));
821
+ dat->nseq = 0;
822
+ dat->mlen = 0;
823
+ dat->lbl = labelled;
824
+ dat->seq = wapiti_xmalloc(sizeof(seq_t*) * n);
825
+
826
+ for (i = 0; i < n; ++i) {
827
+ VALUE sequence = rb_ary_entry(data, i);
828
+ Check_Type(sequence, T_ARRAY);
829
+
830
+ k = RARRAY_LEN(sequence);
831
+ raw_t *raw = wapiti_xmalloc(sizeof(raw_t) + sizeof(char*) * k);
832
+
833
+ for (j = 0; j < k; ++j) {
834
+ VALUE line = rb_ary_entry(sequence, j);
835
+ Check_Type(line, T_STRING);
836
+ raw->lines[j] = StringValueCStr(line);
837
+ }
838
+
839
+ raw->len = k;
840
+
841
+ seq_t *seq = rdr_raw2seq(reader, raw, labelled);
842
+ xfree(raw);
843
+
844
+ if (seq == 0) { break; }
845
+
846
+ // and store the sequence
847
+ dat->seq[dat->nseq++] = seq;
848
+ dat->mlen = max(dat->mlen, seq->len);
849
+
850
+ }
851
+
852
+ // if no sequence was read, free memory
853
+ if (dat->nseq == 0) {
854
+ xfree(dat->seq);
855
+ xfree(dat);
856
+
857
+ return 0;
858
+ }
859
+
860
+ return dat;
850
861
  }
851
862
 
852
863
 
853
864
  static VALUE model_train(VALUE self, VALUE data) {
854
-
855
- mdl_t* model = get_model(self);
856
-
857
- int trn;
858
- for (trn = 0; trn < trn_cnt; trn++) {
859
- if (!strcmp(model->opt->algo, trn_lst[trn].name)) break;
860
- }
861
-
862
- if (trn == trn_cnt) {
863
- rb_raise(cNativeError, "failed to train model: unknown algorithm '%s'", model->opt->algo);
864
- }
865
-
866
- FILE *file;
867
-
868
- // Load the pattern file. This will unlock the database if previously
869
- // locked by loading a model.
870
- if (model->opt->pattern) {
871
- file = fopen(model->opt->pattern, "r");
872
-
873
- if (!file) {
874
- rb_raise(cNativeError, "failed to train model: failed to load pattern file '%s'", model->opt->pattern);
875
- }
876
-
877
- rdr_loadpat(model->reader, file);
878
- fclose(file);
879
- }
880
- else {
881
- // rb_raise(cNativeError, "failed to train model: no pattern given");
882
- }
883
-
884
- qrk_lock(model->reader->obs, false);
885
-
886
-
887
- // Load the training data. When this is done we lock the quarks as we
888
- // don't want to put in the model, informations present only in the
889
- // devlopment set.
890
-
891
- switch (TYPE(data)) {
892
- case T_STRING:
893
- if (!(file = fopen(StringValuePtr(data), "r"))) {
894
- rb_raise(cNativeError, "failed to train model: failed to open training data '%s", StringValuePtr(data));
895
- }
896
-
897
- model->train = rdr_readdat(model->reader, file, true);
898
- fclose(file);
899
-
900
- break;
901
- case T_ARRAY:
902
- model->train = to_dat(model->reader, data, true);
903
-
904
- break;
905
- default:
906
- rb_raise(cNativeError, "failed to train model: invalid training data type (expected instance of String or Array)");
907
- }
908
-
909
- qrk_lock(model->reader->lbl, true);
910
- qrk_lock(model->reader->obs, true);
911
-
912
- if (!model->train || model->train->nseq == 0) {
913
- rb_raise(cNativeError, "failed to train model: no training data loaded");
914
- }
915
-
916
- // If present, load the development set in the model. If not specified,
917
- // the training dataset will be used instead.
918
- if (model->opt->devel) {
919
- if (!(file = fopen(model->opt->devel, "r"))) {
920
- rb_raise(cNativeError, "failed to train model: cannot open development file '%s'", model->opt->devel);
921
- }
922
-
923
- model->devel = rdr_readdat(model->reader, file, true);
924
- fclose(file);
925
- }
926
-
927
- // Initialize the model. If a previous model was loaded, this will be
928
- // just a resync, else the model structure will be created.
929
- rb_funcall(self, rb_intern("sync"), 0);
930
-
931
- // Train the model.
932
- uit_setup(model);
933
- trn_lst[trn].train(model);
934
- uit_cleanup(model);
935
-
936
- // If requested compact the model.
937
- if (model->opt->compact) {
938
- const size_t O = model->nobs;
939
- const size_t F = model->nftr;
940
- rb_funcall(self, rb_intern("compact"), 0);
941
- }
942
-
943
- return self;
865
+
866
+ mdl_t* model = get_model(self);
867
+
868
+ int trn;
869
+ for (trn = 0; trn < trn_cnt; trn++) {
870
+ if (!strcmp(model->opt->algo, trn_lst[trn].name)) break;
871
+ }
872
+
873
+ if (trn == trn_cnt) {
874
+ rb_raise(cNativeError, "failed to train model: unknown algorithm '%s'", model->opt->algo);
875
+ }
876
+
877
+ FILE *file;
878
+
879
+ // Load the pattern file. This will unlock the database if previously
880
+ // locked by loading a model.
881
+ if (model->opt->pattern) {
882
+ file = fopen(model->opt->pattern, "r");
883
+
884
+ if (!file) {
885
+ rb_raise(cNativeError, "failed to train model: failed to load pattern file '%s'", model->opt->pattern);
886
+ }
887
+
888
+ rdr_loadpat(model->reader, file);
889
+ fclose(file);
890
+ }
891
+ else {
892
+ // rb_raise(cNativeError, "failed to train model: no pattern given");
893
+ }
894
+
895
+ qrk_lock(model->reader->obs, false);
896
+
897
+
898
+ // Load the training data. When this is done we lock the quarks as we
899
+ // don't want to put in the model, informations present only in the
900
+ // devlopment set.
901
+
902
+ switch (TYPE(data)) {
903
+ case T_STRING:
904
+ if (!(file = fopen(StringValuePtr(data), "r"))) {
905
+ rb_raise(cNativeError, "failed to train model: failed to open training data '%s", StringValuePtr(data));
906
+ }
907
+
908
+ model->train = rdr_readdat(model->reader, file, true);
909
+ fclose(file);
910
+
911
+ break;
912
+ case T_ARRAY:
913
+ model->train = to_dat(model->reader, data, true);
914
+
915
+ break;
916
+ default:
917
+ rb_raise(cNativeError, "failed to train model: invalid training data type (expected instance of String or Array)");
918
+ }
919
+
920
+ qrk_lock(model->reader->lbl, true);
921
+ qrk_lock(model->reader->obs, true);
922
+
923
+ if (!model->train || model->train->nseq == 0) {
924
+ rb_raise(cNativeError, "failed to train model: no training data loaded");
925
+ }
926
+
927
+ // If present, load the development set in the model. If not specified,
928
+ // the training dataset will be used instead.
929
+ if (model->opt->devel) {
930
+ if (!(file = fopen(model->opt->devel, "r"))) {
931
+ rb_raise(cNativeError, "failed to train model: cannot open development file '%s'", model->opt->devel);
932
+ }
933
+
934
+ model->devel = rdr_readdat(model->reader, file, true);
935
+ fclose(file);
936
+ }
937
+
938
+ // Initialize the model. If a previous model was loaded, this will be
939
+ // just a resync, else the model structure will be created.
940
+ rb_funcall(self, rb_intern("sync"), 0);
941
+
942
+ // Train the model.
943
+ uit_setup(model);
944
+ trn_lst[trn].train(model);
945
+ uit_cleanup(model);
946
+
947
+ // If requested compact the model.
948
+ if (model->opt->compact) {
949
+ rb_funcall(self, rb_intern("compact"), 0);
950
+ }
951
+
952
+ return self;
944
953
  }
945
954
 
946
955
  // Returns a sorted list of all labels in the Model's label database.
947
956
  static VALUE model_labels(VALUE self) {
948
- mdl_t *model = get_model(self);
949
- const size_t Y = model->nlbl;
950
-
951
- qrk_t *lp = model->reader->lbl;
952
-
953
- VALUE labels = rb_ary_new2(Y);
954
-
955
- for (unsigned int i = 0; i < Y; ++i) {
956
- rb_ary_push(labels, rb_str_new2(qrk_id2str(lp, i)));
957
- }
958
-
959
- rb_funcall(labels, rb_intern("sort!"), 0);
960
-
961
- return labels;
957
+ mdl_t *model = get_model(self);
958
+ const uint32_t Y = model->nlbl;
959
+
960
+ qrk_t *lp = model->reader->lbl;
961
+
962
+ VALUE labels = rb_ary_new2(Y);
963
+
964
+ for (unsigned int i = 0; i < Y; ++i) {
965
+ rb_ary_push(labels, rb_str_new2(qrk_id2str(lp, i)));
966
+ }
967
+
968
+ rb_funcall(labels, rb_intern("sort!"), 0);
969
+
970
+ return labels;
962
971
  }
963
972
 
964
973
  static VALUE decode_sequence(VALUE self, mdl_t *model, raw_t *raw) {
965
- qrk_t *lbls = model->reader->lbl;
966
-
967
- const unsigned int Y = model->nlbl;
968
- const unsigned int N = model->opt->nbest;
969
-
970
- seq_t *seq = rdr_raw2seq(model->reader, raw, model->opt->check);
971
-
972
- const unsigned int T = seq->len;
973
- unsigned int n, t, tcnt = 0, terr = 0, scnt = 0, serr = 0, stat[3][Y];
974
-
975
- size_t *out = xmalloc(sizeof(size_t) * T * N);
976
- double *psc = xmalloc(sizeof(double) * T * N);
977
- double *scs = xmalloc(sizeof(double) * N);
978
-
979
- VALUE sequence, tokens;
980
-
981
- if (N == 1) {
982
- tag_viterbi(model, seq, (size_t*)out, scs, (double*)psc);
983
- }
984
- else {
985
- tag_nbviterbi(model, seq, N, (size_t*)out, scs, (double*)psc);
986
- }
987
-
988
- sequence = rb_ary_new();
989
-
990
- for (t = 0; t < T; ++t) {
991
- tokens = rb_ary_new();
992
-
993
- if (!model->opt->label) {
994
- VALUE token = rb_str_new2(raw->lines[t]);
995
-
996
- #ifdef HAVE_RUBY_ENCODING_H
997
- int enc = rb_enc_find_index("UTF-8");
998
- rb_enc_associate_index(token, enc);
999
- #endif
1000
-
1001
- rb_ary_push(tokens, token);
1002
- }
1003
-
1004
- for (n = 0; n < N; ++n) {
1005
-
1006
- size_t lbl = out[t * N + n];
1007
- rb_ary_push(tokens, rb_str_new2(qrk_id2str(lbls, lbl)));
1008
-
1009
- // output individual score
1010
- if (model->opt->outsc) {
1011
- rb_ary_push(tokens, rb_float_new(psc[t * N + n]));
1012
- }
1013
-
1014
- }
1015
-
1016
- // yield token/label pair to block if given
1017
- if (rb_block_given_p()) {
1018
- tokens = rb_yield(tokens);
1019
- }
1020
-
1021
- rb_ary_push(sequence, tokens);
1022
-
1023
-
1024
- // TODO output sequence score: scs[n] (float)
1025
-
1026
- }
1027
-
1028
- // Statistics
1029
- if (model->opt->check) {
1030
- int err = 0;
1031
-
1032
- for (t = 0; t < T; ++t) {
1033
- stat[0][seq->pos[t].lbl]++;
1034
- stat[1][out[t * N]]++;
1035
-
1036
- if (seq->pos[t].lbl != out[t * N]) {
1037
- terr++;
1038
- err = 1;
1039
- }
1040
- else {
1041
- stat[2][out[t * N]]++;
1042
- }
1043
- }
1044
-
1045
- tcnt = FIX2INT(rb_ivar_get(self, rb_intern("@token_count")));
1046
- rb_ivar_set(self, rb_intern("@token_count"), INT2FIX(tcnt + (unsigned int)T));
1047
-
1048
- terr += FIX2INT(rb_ivar_get(self, rb_intern("@token_errors")));
1049
- rb_ivar_set(self, rb_intern("@token_errors"), INT2FIX(terr));
1050
-
1051
- scnt = FIX2INT(rb_ivar_get(self, rb_intern("@sequence_count")));
1052
- rb_ivar_set(self, rb_intern("@sequence_count"), INT2FIX(++scnt));
1053
-
1054
- serr = FIX2INT(rb_ivar_get(self, rb_intern("@sequence_errors")));
1055
- rb_ivar_set(self, rb_intern("@sequence_errors"), INT2FIX(serr + err));
1056
-
1057
- }
1058
-
1059
-
1060
- // Cleanup memory used for this sequence
1061
- xfree(scs);
1062
- xfree(psc);
1063
- xfree(out);
1064
-
1065
- rdr_freeseq(seq);
1066
-
1067
- return sequence;
974
+ qrk_t *lbls = model->reader->lbl;
975
+
976
+ const unsigned int Y = model->nlbl;
977
+ const unsigned int N = model->opt->nbest;
978
+
979
+ seq_t *seq = rdr_raw2seq(model->reader, raw, model->opt->check);
980
+
981
+ const unsigned int T = seq->len;
982
+ unsigned int n, t, tcnt = 0, terr = 0, scnt = 0, serr = 0, stat[3][Y];
983
+
984
+ uint32_t *out = wapiti_xmalloc(sizeof(uint32_t) * T * N);
985
+ double *psc = wapiti_xmalloc(sizeof(double) * T * N);
986
+ double *scs = wapiti_xmalloc(sizeof(double) * N);
987
+
988
+ VALUE sequence, tokens;
989
+
990
+ if (N == 1) {
991
+ tag_viterbi(model, seq, out, scs, psc);
992
+ }
993
+ else {
994
+ tag_nbviterbi(model, seq, N, (void*)out, scs, (void*)psc);
995
+ }
996
+
997
+ sequence = rb_ary_new();
998
+
999
+ for (t = 0; t < T; ++t) {
1000
+ tokens = rb_ary_new();
1001
+
1002
+ if (!model->opt->label) {
1003
+ VALUE token = rb_str_new2(raw->lines[t]);
1004
+
1005
+ #ifdef HAVE_RUBY_ENCODING_H
1006
+ int enc = rb_enc_find_index("UTF-8");
1007
+ rb_enc_associate_index(token, enc);
1008
+ #endif
1009
+
1010
+ rb_ary_push(tokens, token);
1011
+ }
1012
+
1013
+ for (n = 0; n < N; ++n) {
1014
+
1015
+ uint64_t lbl = out[t * N + n];
1016
+ rb_ary_push(tokens, rb_str_new2(qrk_id2str(lbls, lbl)));
1017
+
1018
+ // output individual score
1019
+ if (model->opt->outsc) {
1020
+ rb_ary_push(tokens, rb_float_new(psc[t * N + n]));
1021
+ }
1022
+
1023
+ }
1024
+
1025
+ // yield token/label pair to block if given
1026
+ if (rb_block_given_p()) {
1027
+ tokens = rb_yield(tokens);
1028
+ }
1029
+
1030
+ rb_ary_push(sequence, tokens);
1031
+
1032
+
1033
+ // TODO output sequence score: scs[n] (float)
1034
+
1035
+ }
1036
+
1037
+ // Statistics
1038
+ if (model->opt->check) {
1039
+ int err = 0;
1040
+
1041
+ for (t = 0; t < T; ++t) {
1042
+ stat[0][seq->pos[t].lbl]++;
1043
+ stat[1][out[t * N]]++;
1044
+
1045
+ if (seq->pos[t].lbl != out[t * N]) {
1046
+ terr++;
1047
+ err = 1;
1048
+ }
1049
+ else {
1050
+ stat[2][out[t * N]]++;
1051
+ }
1052
+ }
1053
+
1054
+ tcnt = FIX2INT(rb_ivar_get(self, rb_intern("@token_count")));
1055
+ rb_ivar_set(self, rb_intern("@token_count"), INT2FIX(tcnt + (unsigned int)T));
1056
+
1057
+ terr += FIX2INT(rb_ivar_get(self, rb_intern("@token_errors")));
1058
+ rb_ivar_set(self, rb_intern("@token_errors"), INT2FIX(terr));
1059
+
1060
+ scnt = FIX2INT(rb_ivar_get(self, rb_intern("@sequence_count")));
1061
+ rb_ivar_set(self, rb_intern("@sequence_count"), INT2FIX(++scnt));
1062
+
1063
+ serr = FIX2INT(rb_ivar_get(self, rb_intern("@sequence_errors")));
1064
+ rb_ivar_set(self, rb_intern("@sequence_errors"), INT2FIX(serr + err));
1065
+
1066
+ }
1067
+
1068
+
1069
+ // Cleanup memory used for this sequence
1070
+ xfree(scs);
1071
+ xfree(psc);
1072
+ xfree(out);
1073
+
1074
+ rdr_freeseq(seq);
1075
+
1076
+ return sequence;
1068
1077
  }
1069
1078
 
1070
1079
  static VALUE decode_sequence_array(VALUE self, VALUE array) {
1071
- Check_Type(array, T_ARRAY);
1072
- const unsigned int n = RARRAY_LEN(array);
1073
-
1074
- mdl_t *model = get_model(self);
1075
- raw_t *raw;
1076
-
1077
- const unsigned int N = model->opt->nbest;
1078
- unsigned int i, j;
1079
-
1080
- VALUE result = rb_ary_new2(n * N), sequence;
1081
-
1082
- for (i = 0; i < n; ++i) {
1083
- sequence = rb_ary_entry(array, i);
1084
- Check_Type(sequence, T_ARRAY);
1085
-
1086
- const unsigned int k = RARRAY_LEN(sequence);
1087
- raw = xmalloc(sizeof(raw_t) + sizeof(char*) * k);
1088
- raw->len = k;
1089
-
1090
- for (j = 0; j < k; ++j) {
1091
- VALUE line = rb_ary_entry(sequence, j);
1092
- Check_Type(line, T_STRING);
1093
-
1094
- raw->lines[j] = StringValueCStr(line);
1095
- }
1096
-
1097
- rb_ary_push(result, decode_sequence(self, model, raw));
1098
-
1099
- xfree(raw);
1100
- }
1101
-
1102
- return result;
1080
+ Check_Type(array, T_ARRAY);
1081
+ const unsigned int n = RARRAY_LEN(array);
1082
+
1083
+ mdl_t *model = get_model(self);
1084
+ raw_t *raw;
1085
+
1086
+ const unsigned int N = model->opt->nbest;
1087
+ unsigned int i, j;
1088
+
1089
+ VALUE result = rb_ary_new2(n * N), sequence;
1090
+
1091
+ for (i = 0; i < n; ++i) {
1092
+ sequence = rb_ary_entry(array, i);
1093
+ Check_Type(sequence, T_ARRAY);
1094
+
1095
+ const unsigned int k = RARRAY_LEN(sequence);
1096
+ raw = wapiti_xmalloc(sizeof(raw_t) + sizeof(char*) * k);
1097
+ raw->len = k;
1098
+
1099
+ for (j = 0; j < k; ++j) {
1100
+ VALUE line = rb_ary_entry(sequence, j);
1101
+ Check_Type(line, T_STRING);
1102
+
1103
+ raw->lines[j] = StringValueCStr(line);
1104
+ }
1105
+
1106
+ rb_ary_push(result, decode_sequence(self, model, raw));
1107
+
1108
+ xfree(raw);
1109
+ }
1110
+
1111
+ return result;
1103
1112
  }
1104
1113
 
1105
1114
  static VALUE decode_sequence_file(VALUE self, VALUE path) {
1106
- Check_Type(path, T_STRING);
1107
- FILE *file;
1108
-
1109
- if (!(file = fopen(StringValueCStr(path), "r"))) {
1110
- rb_raise(cNativeError, "failed to label data: could not open file '%s'", StringValueCStr(path));
1111
- }
1112
-
1113
- mdl_t *model = get_model(self);
1114
- raw_t *raw;
1115
-
1116
- VALUE result = rb_ary_new();
1117
-
1118
- // Next read the input file sequence by sequence and label them, we have
1119
- // to take care of not discarding the raw input as we want to send it
1120
- // back to the output with the additional predicted labels.
1121
- while (!feof(file)) {
1122
-
1123
- // So, first read an input sequence keeping the raw_t object
1124
- // available, and label it with Viterbi.
1125
- if ((raw = rdr_readraw(model->reader, file)) == 0) {
1126
- break;
1127
- }
1128
-
1129
- rb_ary_push(result, decode_sequence(self, model, raw));
1130
- rdr_freeraw(raw);
1131
- }
1132
-
1133
- return result;
1115
+ Check_Type(path, T_STRING);
1116
+ FILE *file;
1117
+
1118
+ if (!(file = fopen(StringValueCStr(path), "r"))) {
1119
+ rb_raise(cNativeError, "failed to label data: could not open file '%s'", StringValueCStr(path));
1120
+ }
1121
+
1122
+ mdl_t *model = get_model(self);
1123
+ raw_t *raw;
1124
+
1125
+ VALUE result = rb_ary_new();
1126
+
1127
+ // Next read the input file sequence by sequence and label them, we have
1128
+ // to take care of not discarding the raw input as we want to send it
1129
+ // back to the output with the additional predicted labels.
1130
+ while (!feof(file)) {
1131
+
1132
+ // So, first read an input sequence keeping the raw_t object
1133
+ // available, and label it with Viterbi.
1134
+ if ((raw = rdr_readraw(model->reader, file)) == 0) {
1135
+ break;
1136
+ }
1137
+
1138
+ rb_ary_push(result, decode_sequence(self, model, raw));
1139
+ rdr_freeraw(raw);
1140
+ }
1141
+
1142
+ return result;
1134
1143
  }
1135
1144
 
1136
1145
  // cal-seq:
@@ -1138,144 +1147,146 @@ static VALUE decode_sequence_file(VALUE self, VALUE path) {
1138
1147
  // m.label(filename, options = {}) # => array of labelled tokens
1139
1148
  //
1140
1149
  static VALUE model_label(VALUE self, VALUE data) {
1141
- VALUE result;
1142
-
1143
- switch (TYPE(data)) {
1144
- case T_STRING:
1145
- result = decode_sequence_file(self, data);
1146
- break;
1147
- case T_ARRAY:
1148
- result = decode_sequence_array(self, data);
1149
- break;
1150
- default:
1151
- rb_raise(cNativeError, "failed to label data: invalid data (expected type String or Array)");
1152
- }
1153
-
1154
- return result;
1150
+ VALUE result;
1151
+
1152
+ switch (TYPE(data)) {
1153
+ case T_STRING:
1154
+ result = decode_sequence_file(self, data);
1155
+ break;
1156
+ case T_ARRAY:
1157
+ result = decode_sequence_array(self, data);
1158
+ break;
1159
+ default:
1160
+ rb_raise(cNativeError, "failed to label data: invalid data (expected type String or Array)");
1161
+ }
1162
+
1163
+ return result;
1155
1164
  }
1156
1165
 
1157
1166
  static void Init_model() {
1158
- cModel = rb_define_class_under(mWapiti, "Model", rb_cObject);
1159
- rb_define_alloc_func(cModel, allocate_model);
1160
-
1161
- rb_define_method(cModel, "initialize", initialize_model, -1);
1167
+ cModel = rb_define_class_under(mWapiti, "Model", rb_cObject);
1168
+ rb_define_alloc_func(cModel, allocate_model);
1169
+
1170
+ rb_define_method(cModel, "initialize", initialize_model, -1);
1171
+
1172
+ rb_define_attr(cModel, "options", 1, 0);
1173
+
1162
1174
 
1163
- rb_define_attr(cModel, "options", 1, 0);
1175
+ rb_define_method(cModel, "nlbl", model_nlbl, 0);
1176
+ rb_define_method(cModel, "labels", model_labels, 0);
1164
1177
 
1165
-
1166
- rb_define_method(cModel, "nlbl", model_nlbl, 0);
1167
- rb_define_method(cModel, "labels", model_labels, 0);
1168
-
1169
- rb_define_method(cModel, "nobs", model_nobs, 0);
1170
- rb_define_alias(cModel, "observations", "nobs");
1178
+ rb_define_method(cModel, "nobs", model_nobs, 0);
1179
+ rb_define_alias(cModel, "observations", "nobs");
1171
1180
 
1172
- rb_define_method(cModel, "nftr", model_nftr, 0);
1173
- rb_define_alias(cModel, "features", "nftr");
1181
+ rb_define_method(cModel, "nftr", model_nftr, 0);
1182
+ rb_define_alias(cModel, "features", "nftr");
1174
1183
 
1175
- rb_define_method(cModel, "total", model_total, 0);
1184
+ rb_define_method(cModel, "total", model_total, 0);
1176
1185
 
1177
- rb_define_method(cModel, "sync", model_sync, 0);
1178
- rb_define_method(cModel, "compact", model_compact, 0);
1179
- rb_define_method(cModel, "save", model_save, -1);
1180
- rb_define_method(cModel, "load", model_load, -1);
1186
+ rb_define_method(cModel, "sync", model_sync, 0);
1187
+ rb_define_method(cModel, "compact", model_compact, 0);
1188
+ rb_define_method(cModel, "save", model_save, -1);
1189
+ rb_define_method(cModel, "load", model_load, -1);
1181
1190
 
1182
- rb_define_method(cModel, "train", model_train, 1);
1183
- rb_define_method(cModel, "label", model_label, 1);
1191
+ rb_define_method(cModel, "train", model_train, 1);
1192
+ rb_define_method(cModel, "label", model_label, 1);
1184
1193
  }
1185
1194
 
1186
1195
  /* --- Top-Level Utility Methods --- */
1187
1196
 
1188
1197
 
1189
1198
  static VALUE label(VALUE self __attribute__((__unused__)), VALUE rb_options) {
1190
- if (strncmp("Wapiti::Options", rb_obj_classname(rb_options), 15) != 0) {
1191
- rb_raise(cNativeError, "argument must be a native options instance");
1192
- }
1199
+ if (strncmp("Wapiti::Options", rb_obj_classname(rb_options), 15) != 0) {
1200
+ rb_raise(cNativeError, "argument must be a native options instance");
1201
+ }
1193
1202
 
1194
- opt_t *options = get_options(rb_options);
1203
+ opt_t *options = get_options(rb_options);
1195
1204
 
1196
- if (options->mode != 1) {
1197
- rb_raise(cNativeError, "invalid options argument: mode should be set to 1 for labelling");
1198
- }
1205
+ if (options->mode != 1) {
1206
+ rb_raise(cNativeError, "invalid options argument: mode should be set to 1 for labelling");
1207
+ }
1199
1208
 
1200
- mdl_t *model = mdl_new(rdr_new(options->maxent));
1201
- model->opt = options;
1209
+ mdl_t *model = mdl_new(rdr_new(options->maxent));
1210
+ model->opt = options;
1202
1211
 
1203
- dolabel(model);
1204
-
1205
- mdl_free(model);
1212
+ dolabel(model);
1206
1213
 
1207
- return Qnil;
1214
+ mdl_free(model);
1215
+
1216
+ return Qnil;
1208
1217
  }
1209
1218
 
1219
+ #if defined EXTRA
1210
1220
  static VALUE dump(VALUE self __attribute__((__unused__)), VALUE rb_options) {
1211
- if (strncmp("Wapiti::Options", rb_obj_classname(rb_options), 15) != 0) {
1212
- rb_raise(cNativeError, "argument must be a native options instance");
1213
- }
1221
+ if (strncmp("Wapiti::Options", rb_obj_classname(rb_options), 15) != 0) {
1222
+ rb_raise(cNativeError, "argument must be a native options instance");
1223
+ }
1224
+
1225
+ opt_t *options = get_options(rb_options);
1214
1226
 
1215
- opt_t *options = get_options(rb_options);
1227
+ if (options->mode != 2) {
1228
+ rb_raise(cNativeError, "invalid options argument: mode should be set to 2 for training");
1229
+ }
1216
1230
 
1217
- if (options->mode != 2) {
1218
- rb_raise(cNativeError, "invalid options argument: mode should be set to 2 for training");
1219
- }
1231
+ mdl_t *model = mdl_new(rdr_new(options->maxent));
1232
+ model->opt = options;
1220
1233
 
1221
- mdl_t *model = mdl_new(rdr_new(options->maxent));
1222
- model->opt = options;
1234
+ dodump(model);
1223
1235
 
1224
- dodump(model);
1225
-
1226
- mdl_free(model);
1236
+ mdl_free(model);
1227
1237
 
1228
- return Qnil;
1238
+ return Qnil;
1229
1239
  }
1230
1240
 
1231
1241
  // This function is a proxy for Wapiti's main entry point.
1232
1242
  static VALUE wapiti(VALUE self __attribute__((__unused__)), VALUE arguments) {
1233
- int result = -1, argc = 0;
1234
- char **ap, *argv[18], *input, *tmp;
1235
-
1236
- Check_Type(arguments, T_STRING);
1237
- tmp = StringValueCStr(arguments);
1238
-
1239
- // allocate space for argument vector
1240
- input = (char*)malloc(strlen(tmp) + 8);
1241
-
1242
- // prepend command name
1243
- strncpy(input, "wapiti ", 8);
1244
- strncat(input, tmp, strlen(input) - 8);
1245
-
1246
- // remember allocation pointer
1247
- tmp = input;
1248
-
1249
- // turn input string into argument vector (using
1250
- // only the first seventeen tokens from input)
1251
- for (ap = argv; (*ap = strsep(&input, " \t")) != (char*)0; ++argc) {
1252
- if ((**ap != '\0') && (++ap >= &argv[18])) break;
1253
- }
1254
-
1255
- // call main entry point
1256
- result = wapiti_main(argc, argv);
1257
-
1258
- // free allocated memory
1259
- free(tmp);
1260
-
1261
- return INT2FIX(result);
1243
+ int result = -1, argc = 0;
1244
+ char **ap, *argv[18], *input, *tmp;
1245
+
1246
+ Check_Type(arguments, T_STRING);
1247
+ tmp = StringValueCStr(arguments);
1248
+
1249
+ // allocate space for argument vector
1250
+ input = (char*)malloc(strlen(tmp) + 8);
1251
+
1252
+ // prepend command name
1253
+ strncpy(input, "wapiti ", 8);
1254
+ strncat(input, tmp, strlen(input) - 8);
1255
+
1256
+ // remember allocation pointer
1257
+ tmp = input;
1258
+
1259
+ // turn input string into argument vector (using
1260
+ // only the first seventeen tokens from input)
1261
+ for (ap = argv; (*ap = strsep(&input, " \t")) != (char*)0; ++argc) {
1262
+ if ((**ap != '\0') && (++ap >= &argv[18])) break;
1263
+ }
1264
+
1265
+ // call main entry point
1266
+ result = wapiti_main(argc, argv);
1267
+
1268
+ // free allocated memory
1269
+ free(tmp);
1270
+
1271
+ return INT2FIX(result);
1262
1272
  }
1273
+ #endif
1263
1274
 
1264
1275
  /* --- Wapiti Extension Entry Point --- */
1265
1276
 
1266
1277
  void Init_native() {
1267
- mWapiti = rb_const_get(rb_mKernel, rb_intern("Wapiti"));
1268
- mNative = rb_define_module_under(mWapiti, "Native");
1269
-
1270
- cNativeError = rb_const_get(mWapiti, rb_intern("NativeError"));
1271
- cConfigurationError = rb_const_get(mWapiti, rb_intern("ConfigurationError"));
1272
- cLogger = rb_funcall(mWapiti, rb_intern("log"), 0);
1273
-
1274
- rb_define_singleton_method(mNative, "label", label, 1);
1275
- rb_define_singleton_method(mNative, "wapiti", wapiti, 1);
1276
-
1277
- rb_define_const(mNative, "VERSION", rb_str_new2(VERSION));
1278
-
1279
- Init_options();
1280
- Init_model();
1281
- }
1278
+ mWapiti = rb_const_get(rb_mKernel, rb_intern("Wapiti"));
1279
+ mNative = rb_define_module_under(mWapiti, "Native");
1280
+
1281
+ cNativeError = rb_const_get(mWapiti, rb_intern("NativeError"));
1282
+ cConfigurationError = rb_const_get(mWapiti, rb_intern("ConfigurationError"));
1283
+ cLogger = rb_funcall(mWapiti, rb_intern("log"), 0);
1284
+
1285
+ rb_define_singleton_method(mNative, "label", label, 1);
1286
+ // rb_define_singleton_method(mNative, "wapiti", wapiti, 1);
1287
+
1288
+ rb_define_const(mNative, "VERSION", rb_str_new2(VERSION));
1289
+
1290
+ Init_options();
1291
+ Init_model();
1292
+ }