wapiti 0.0.5 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. checksums.yaml +7 -0
  2. data/.simplecov +3 -0
  3. data/Gemfile +25 -2
  4. data/HISTORY.md +5 -1
  5. data/LICENSE +14 -13
  6. data/README.md +9 -16
  7. data/Rakefile +38 -8
  8. data/ext/wapiti/bcd.c +126 -124
  9. data/ext/wapiti/decoder.c +203 -124
  10. data/ext/wapiti/decoder.h +6 -4
  11. data/ext/wapiti/extconf.rb +2 -2
  12. data/ext/wapiti/gradient.c +491 -320
  13. data/ext/wapiti/gradient.h +52 -34
  14. data/ext/wapiti/lbfgs.c +74 -33
  15. data/ext/wapiti/model.c +47 -37
  16. data/ext/wapiti/model.h +22 -20
  17. data/ext/wapiti/native.c +850 -839
  18. data/ext/wapiti/native.h +1 -1
  19. data/ext/wapiti/options.c +52 -20
  20. data/ext/wapiti/options.h +37 -30
  21. data/ext/wapiti/pattern.c +35 -33
  22. data/ext/wapiti/pattern.h +12 -11
  23. data/ext/wapiti/progress.c +14 -13
  24. data/ext/wapiti/progress.h +3 -2
  25. data/ext/wapiti/quark.c +14 -16
  26. data/ext/wapiti/quark.h +6 -5
  27. data/ext/wapiti/reader.c +83 -69
  28. data/ext/wapiti/reader.h +11 -9
  29. data/ext/wapiti/rprop.c +84 -43
  30. data/ext/wapiti/sequence.h +18 -16
  31. data/ext/wapiti/sgdl1.c +45 -43
  32. data/ext/wapiti/thread.c +19 -17
  33. data/ext/wapiti/thread.h +5 -4
  34. data/ext/wapiti/tools.c +7 -7
  35. data/ext/wapiti/tools.h +3 -4
  36. data/ext/wapiti/trainers.h +1 -1
  37. data/ext/wapiti/vmath.c +40 -38
  38. data/ext/wapiti/vmath.h +12 -11
  39. data/ext/wapiti/wapiti.c +159 -37
  40. data/ext/wapiti/wapiti.h +18 -4
  41. data/lib/wapiti.rb +15 -15
  42. data/lib/wapiti/errors.rb +15 -15
  43. data/lib/wapiti/model.rb +92 -84
  44. data/lib/wapiti/options.rb +123 -124
  45. data/lib/wapiti/utility.rb +14 -14
  46. data/lib/wapiti/version.rb +2 -2
  47. data/spec/spec_helper.rb +29 -9
  48. data/spec/wapiti/model_spec.rb +230 -194
  49. data/spec/wapiti/native_spec.rb +7 -8
  50. data/spec/wapiti/options_spec.rb +184 -174
  51. data/wapiti.gemspec +22 -8
  52. metadata +38 -42
  53. data/.gitignore +0 -5
@@ -1,7 +1,7 @@
1
1
  /*
2
2
  * Wapiti - A linear-chain CRF tool
3
3
  *
4
- * Copyright (c) 2009-2011 CNRS
4
+ * Copyright (c) 2009-2013 CNRS
5
5
  * All rights reserved.
6
6
  *
7
7
  * Redistribution and use in source and binary forms, with or without
@@ -29,14 +29,15 @@
29
29
  #define model_h
30
30
 
31
31
  #include <stddef.h>
32
- #include <sys/times.h>
32
+ #include <stdint.h>
33
+ #include <sys/time.h>
33
34
 
34
- #include "wapiti.h"
35
35
  #include "options.h"
36
36
  #include "sequence.h"
37
37
  #include "reader.h"
38
+ #include "wapiti.h"
38
39
 
39
- typedef struct tms tms_t;
40
+ typedef struct timeval tms_t;
40
41
 
41
42
  /* mdl_t:
42
43
  * Represent a linear-chain CRF model. The model contains both unigram and
@@ -60,34 +61,35 @@ typedef struct tms tms_t;
60
61
  */
61
62
  typedef struct mdl_s mdl_t;
62
63
  struct mdl_s {
63
- opt_t *opt; // options for training
64
+ opt_t *opt; // options for training
65
+ int type; // model type
64
66
 
65
67
  // Size of various model parameters
66
- size_t nlbl; // Y number of labels
67
- size_t nobs; // O number of observations
68
- size_t nftr; // F number of features
68
+ uint32_t nlbl; // Y number of labels
69
+ uint64_t nobs; // O number of observations
70
+ uint64_t nftr; // F number of features
69
71
 
70
72
  // Information about observations
71
- char *kind; // [O] observations type
72
- size_t *uoff; // [O] unigram weights offset
73
- size_t *boff; // [O] bigram weights offset
73
+ char *kind; // [O] observations type
74
+ uint64_t *uoff; // [O] unigram weights offset
75
+ uint64_t *boff; // [O] bigram weights offset
74
76
 
75
77
  // The model itself
76
- double *theta; // [F] features weights
78
+ double *theta; // [F] features weights
77
79
 
78
80
  // Datasets
79
- dat_t *train; // training dataset
80
- dat_t *devel; // development dataset
81
- rdr_t *reader;
81
+ dat_t *train; // training dataset
82
+ dat_t *devel; // development dataset
83
+ rdr_t *reader;
82
84
 
83
85
  // Stopping criterion
84
- double *werr; // Window of error rate of last iters
85
- int wcnt; // Number of iters in the window
86
- int wpos; // Position for the next iter
86
+ double *werr; // Window of error rate of last iters
87
+ uint32_t wcnt; // Number of iters in the window
88
+ uint32_t wpos; // Position for the next iter
87
89
 
88
90
  // Timing
89
- tms_t timer; // start time of last iter
90
- double total; // total training time
91
+ tms_t timer; // start time of last iter
92
+ double total; // total training time
91
93
  };
92
94
 
93
95
  mdl_t *mdl_new(rdr_t *rdr);
@@ -1,13 +1,15 @@
1
1
  #include <stdio.h>
2
2
  #include <string.h>
3
3
 
4
- #include "wapiti.h"
5
4
  #include "options.h"
6
5
  #include "reader.h"
6
+ #include "decoder.h"
7
7
  #include "model.h"
8
8
  #include "trainers.h"
9
+ #include "progress.h"
9
10
  #include "quark.h"
10
11
  #include "tools.h"
12
+ #include "wapiti.h"
11
13
 
12
14
  #include "native.h"
13
15
 
@@ -22,27 +24,36 @@ VALUE cConfigurationError;
22
24
  VALUE cLogger;
23
25
 
24
26
 
27
+ /* --- Forward declarations --- */
28
+
29
+ int wapiti_main(int argc, char *argv[argc]);
30
+
31
+ void dolabel(mdl_t *mdl);
32
+ void dotrain(mdl_t *mdl);
33
+ void doupdt(mdl_t *mdl);
34
+
35
+
25
36
  /* --- Utilities --- */
26
37
 
27
38
  static void trn_auto(mdl_t *mdl) {
28
- const int maxiter = mdl->opt->maxiter;
29
- mdl->opt->maxiter = 3;
30
- trn_sgdl1(mdl);
31
- mdl->opt->maxiter = maxiter;
32
- trn_lbfgs(mdl);
39
+ const int maxiter = mdl->opt->maxiter;
40
+ mdl->opt->maxiter = 3;
41
+ trn_sgdl1(mdl);
42
+ mdl->opt->maxiter = maxiter;
43
+ trn_lbfgs(mdl);
33
44
  }
34
45
 
35
46
  static const struct {
36
- char *name;
37
- void (* train)(mdl_t *mdl);
47
+ const char *name;
48
+ void (* train)(mdl_t *mdl);
38
49
  } trn_lst[] = {
39
- {"l-bfgs", trn_lbfgs},
40
- {"sgd-l1", trn_sgdl1},
41
- {"bcd", trn_bcd },
42
- {"rprop", trn_rprop},
43
- {"rprop+", trn_rprop},
44
- {"rprop-", trn_rprop},
45
- {"auto", trn_auto }
50
+ {"l-bfgs", trn_lbfgs},
51
+ {"sgd-l1", trn_sgdl1},
52
+ {"bcd", trn_bcd },
53
+ {"rprop", trn_rprop},
54
+ {"rprop+", trn_rprop},
55
+ {"rprop-", trn_rprop},
56
+ {"auto", trn_auto }
46
57
  };
47
58
  static const int trn_cnt = sizeof(trn_lst) / sizeof(trn_lst[0]);
48
59
 
@@ -52,78 +63,78 @@ static const int trn_cnt = sizeof(trn_lst) / sizeof(trn_lst[0]);
52
63
  // Auxiliary Methods
53
64
 
54
65
  static opt_t *get_options(VALUE self) {
55
- opt_t *options;
56
- Data_Get_Struct(self, opt_t, options);
57
- return options;
66
+ opt_t *options;
67
+ Data_Get_Struct(self, opt_t, options);
68
+ return options;
58
69
  }
59
70
 
60
71
  // Copies a Ruby string to the heap and stores it in a pointer.
61
72
  // Frees the pointer before assigning the new value.
62
73
  static void copy_string(char **dst, VALUE rb_string) {
63
- Check_Type(rb_string, T_STRING);
74
+ Check_Type(rb_string, T_STRING);
64
75
 
65
- if (*dst) { free(*dst); *dst = (char*)0; }
66
- *dst = calloc(RSTRING_LEN(rb_string) + 1, sizeof(char));
76
+ if (*dst) { free(*dst); *dst = (char*)0; }
77
+ *dst = calloc(RSTRING_LEN(rb_string) + 1, sizeof(char));
67
78
 
68
- memcpy(*dst, StringValuePtr(rb_string), RSTRING_LEN(rb_string) + 1);
79
+ memcpy(*dst, StringValuePtr(rb_string), RSTRING_LEN(rb_string) + 1);
69
80
  }
70
81
 
71
82
 
72
83
  // Constructor / Destructor
73
84
 
74
85
  static void mark_options(opt_t* options __attribute__((__unused__))) {
75
- // nothing
86
+ // nothing
76
87
  }
77
88
 
78
89
  static void deallocate_options(opt_t* options) {
79
-
80
- // free string options
81
- if (options->input) { free(options->input); }
82
- if (options->output) { free(options->output); }
83
- if (options->algo) { free(options->algo); }
84
- if (options->devel) { free(options->devel); }
85
- if (options->pattern) { free(options->pattern); }
86
-
87
- free(options);
88
- options = (opt_t*)0;
90
+
91
+ // free string options
92
+ if (options->input) { free(options->input); }
93
+ if (options->output) { free(options->output); }
94
+ if (options->algo) { free((void*)options->algo); }
95
+ if (options->devel) { free(options->devel); }
96
+ if (options->pattern) { free((void*)options->pattern); }
97
+
98
+ free(options);
99
+ options = (opt_t*)0;
89
100
  }
90
101
 
91
102
  static VALUE allocate_options(VALUE self) {
92
- opt_t* options = malloc(sizeof(opt_t));
93
- return Data_Wrap_Struct(self, mark_options, deallocate_options, options);
94
- }
95
-
96
- static VALUE initialize_options(int argc, VALUE *argv, VALUE self) {
97
- opt_t* options = get_options(self);
98
- *options = opt_defaults;
99
-
100
- if (options->maxiter == 0) {
101
- options->maxiter = INT_MAX;
102
- }
103
-
104
- // copy the default algorithm name to the heap so that all options strings
105
- // are on the heap
106
- char* tmp = calloc(strlen(options->algo), sizeof(char));
107
- memcpy(tmp, options->algo, strlen(options->algo));
108
- options->algo = tmp;
109
-
110
- if (argc > 1) {
111
- rb_raise(rb_const_get(rb_mKernel, rb_intern("ArgumentError")),
112
- "wrong number of arguments (%d for 0..1)", argc);
113
- }
114
-
115
- // set defaults
116
- if (argc) {
117
- Check_Type(argv[0], T_HASH);
118
- (void)rb_funcall(self, rb_intern("update"), 1, argv[0]);
119
- }
120
-
121
- // yield self if block_given?
122
- if (rb_block_given_p()) {
123
- rb_yield(self);
124
- }
125
-
126
- return self;
103
+ opt_t* options = malloc(sizeof(opt_t));
104
+ return Data_Wrap_Struct(self, mark_options, deallocate_options, options);
105
+ }
106
+
107
+ static VALUE initialize_options(int argc, VALUE *argv, VALUE self) {
108
+ opt_t* options = get_options(self);
109
+ *options = opt_defaults;
110
+
111
+ if (options->maxiter == 0) {
112
+ options->maxiter = INT_MAX;
113
+ }
114
+
115
+ // copy the default algorithm name to the heap so that all options strings
116
+ // are on the heap
117
+ char* tmp = calloc(strlen(options->algo), sizeof(char));
118
+ memcpy(tmp, options->algo, strlen(options->algo));
119
+ options->algo = tmp;
120
+
121
+ if (argc > 1) {
122
+ rb_raise(rb_const_get(rb_mKernel, rb_intern("ArgumentError")),
123
+ "wrong number of arguments (%d for 0..1)", argc);
124
+ }
125
+
126
+ // set defaults
127
+ if (argc) {
128
+ Check_Type(argv[0], T_HASH);
129
+ (void)rb_funcall(self, rb_intern("update"), 1, argv[0]);
130
+ }
131
+
132
+ // yield self if block_given?
133
+ if (rb_block_given_p()) {
134
+ rb_yield(self);
135
+ }
136
+
137
+ return self;
127
138
  }
128
139
 
129
140
 
@@ -133,192 +144,192 @@ static VALUE initialize_options(int argc, VALUE *argv, VALUE self) {
133
144
  // Fixnum Accessors
134
145
 
135
146
  static VALUE options_nbest(VALUE self) {
136
- return INT2FIX(get_options(self)->nbest);
147
+ return INT2FIX(get_options(self)->nbest);
137
148
  }
138
149
 
139
150
  static VALUE options_set_nbest(VALUE self, VALUE rb_fixnum) {
140
- Check_Type(rb_fixnum, T_FIXNUM);
141
- get_options(self)->nbest = FIX2INT(rb_fixnum);
142
-
143
- return rb_fixnum;
151
+ Check_Type(rb_fixnum, T_FIXNUM);
152
+ get_options(self)->nbest = FIX2INT(rb_fixnum);
153
+
154
+ return rb_fixnum;
144
155
  }
145
156
 
146
157
 
147
158
  static VALUE options_stopwin(VALUE self) {
148
- return INT2FIX(get_options(self)->stopwin);
159
+ return INT2FIX(get_options(self)->stopwin);
149
160
  }
150
161
 
151
162
  static VALUE options_set_stopwin(VALUE self, VALUE rb_fixnum) {
152
- Check_Type(rb_fixnum, T_FIXNUM);
153
- get_options(self)->stopwin = FIX2INT(rb_fixnum);
154
-
155
- return rb_fixnum;
163
+ Check_Type(rb_fixnum, T_FIXNUM);
164
+ get_options(self)->stopwin = FIX2INT(rb_fixnum);
165
+
166
+ return rb_fixnum;
156
167
  }
157
168
 
158
169
  static VALUE options_objwin(VALUE self) {
159
- return INT2FIX(get_options(self)->objwin);
170
+ return INT2FIX(get_options(self)->objwin);
160
171
  }
161
172
 
162
173
  static VALUE options_set_objwin(VALUE self, VALUE rb_fixnum) {
163
- Check_Type(rb_fixnum, T_FIXNUM);
164
- get_options(self)->objwin = FIX2INT(rb_fixnum);
165
-
166
- return rb_fixnum;
174
+ Check_Type(rb_fixnum, T_FIXNUM);
175
+ get_options(self)->objwin = FIX2INT(rb_fixnum);
176
+
177
+ return rb_fixnum;
167
178
  }
168
179
 
169
180
 
170
181
  static VALUE options_maxiter(VALUE self) {
171
- return INT2FIX(get_options(self)->maxiter);
182
+ return INT2FIX(get_options(self)->maxiter);
172
183
  }
173
184
 
174
185
  static VALUE options_set_maxiter(VALUE self, VALUE rb_fixnum) {
175
- opt_t *options = get_options(self);
186
+ opt_t *options = get_options(self);
176
187
 
177
- Check_Type(rb_fixnum, T_FIXNUM);
178
- options->maxiter = FIX2INT(rb_fixnum);
179
-
180
- return rb_fixnum;
188
+ Check_Type(rb_fixnum, T_FIXNUM);
189
+ options->maxiter = FIX2INT(rb_fixnum);
190
+
191
+ return rb_fixnum;
181
192
  }
182
193
 
183
194
  static VALUE options_jobsize(VALUE self) {
184
- return INT2FIX(get_options(self)->jobsize);
195
+ return INT2FIX(get_options(self)->jobsize);
185
196
  }
186
197
 
187
198
  static VALUE options_set_jobsize(VALUE self, VALUE rb_fixnum) {
188
- opt_t *options = get_options(self);
199
+ opt_t *options = get_options(self);
200
+
201
+ Check_Type(rb_fixnum, T_FIXNUM);
202
+ options->jobsize = FIX2INT(rb_fixnum);
189
203
 
190
- Check_Type(rb_fixnum, T_FIXNUM);
191
- options->jobsize = FIX2INT(rb_fixnum);
192
-
193
- return rb_fixnum;
204
+ return rb_fixnum;
194
205
  }
195
206
 
196
207
  static VALUE options_nthread(VALUE self) {
197
- return INT2FIX(get_options(self)->nthread);
208
+ return INT2FIX(get_options(self)->nthread);
198
209
  }
199
210
 
200
211
  static VALUE options_set_nthread(VALUE self, VALUE rb_fixnum) {
201
- opt_t *options = get_options(self);
212
+ opt_t *options = get_options(self);
202
213
 
203
- Check_Type(rb_fixnum, T_FIXNUM);
204
- options->nthread = FIX2INT(rb_fixnum);
205
-
206
- return rb_fixnum;
214
+ Check_Type(rb_fixnum, T_FIXNUM);
215
+ options->nthread = FIX2INT(rb_fixnum);
216
+
217
+ return rb_fixnum;
207
218
  }
208
219
 
209
220
  static VALUE options_histsz(VALUE self) {
210
- return INT2FIX(get_options(self)->lbfgs.histsz);
221
+ return INT2FIX(get_options(self)->lbfgs.histsz);
211
222
  }
212
223
 
213
224
  static VALUE options_set_histsz(VALUE self, VALUE rb_fixnum) {
214
- Check_Type(rb_fixnum, T_FIXNUM);
215
- get_options(self)->lbfgs.histsz = FIX2INT(rb_fixnum);
216
-
217
- return rb_fixnum;
225
+ Check_Type(rb_fixnum, T_FIXNUM);
226
+ get_options(self)->lbfgs.histsz = FIX2INT(rb_fixnum);
227
+
228
+ return rb_fixnum;
218
229
  }
219
230
 
220
231
  static VALUE options_maxls(VALUE self) {
221
- return INT2FIX(get_options(self)->lbfgs.maxls);
232
+ return INT2FIX(get_options(self)->lbfgs.maxls);
222
233
  }
223
234
 
224
235
  static VALUE options_set_maxls(VALUE self, VALUE rb_fixnum) {
225
- Check_Type(rb_fixnum, T_FIXNUM);
226
- get_options(self)->lbfgs.maxls = FIX2INT(rb_fixnum);
227
-
228
- return rb_fixnum;
236
+ Check_Type(rb_fixnum, T_FIXNUM);
237
+ get_options(self)->lbfgs.maxls = FIX2INT(rb_fixnum);
238
+
239
+ return rb_fixnum;
229
240
  }
230
241
 
231
242
 
232
243
  // Float Accessors
233
244
 
234
245
  static VALUE options_rho1(VALUE self) {
235
- return rb_float_new(get_options(self)->rho1);
246
+ return rb_float_new(get_options(self)->rho1);
236
247
  }
237
248
 
238
249
  static VALUE options_set_rho1(VALUE self, VALUE rb_numeric) {
239
- get_options(self)->rho1 = NUM2DBL(rb_numeric);
240
- return rb_numeric;
250
+ get_options(self)->rho1 = NUM2DBL(rb_numeric);
251
+ return rb_numeric;
241
252
  }
242
253
 
243
254
  static VALUE options_rho2(VALUE self) {
244
- return rb_float_new(get_options(self)->rho2);
255
+ return rb_float_new(get_options(self)->rho2);
245
256
  }
246
257
 
247
258
  static VALUE options_set_rho2(VALUE self, VALUE rb_numeric) {
248
- get_options(self)->rho2 = NUM2DBL(rb_numeric);
249
- return rb_numeric;
259
+ get_options(self)->rho2 = NUM2DBL(rb_numeric);
260
+ return rb_numeric;
250
261
  }
251
262
 
252
263
  static VALUE options_stopeps(VALUE self) {
253
- return rb_float_new(get_options(self)->stopeps);
264
+ return rb_float_new(get_options(self)->stopeps);
254
265
  }
255
266
 
256
267
  static VALUE options_set_stopeps(VALUE self, VALUE rb_numeric) {
257
- get_options(self)->stopeps = NUM2DBL(rb_numeric);
258
- return rb_numeric;
268
+ get_options(self)->stopeps = NUM2DBL(rb_numeric);
269
+ return rb_numeric;
259
270
  }
260
271
 
261
272
  static VALUE options_eta0(VALUE self) {
262
- return rb_float_new(get_options(self)->sgdl1.eta0);
273
+ return rb_float_new(get_options(self)->sgdl1.eta0);
263
274
  }
264
275
 
265
276
  static VALUE options_set_eta0(VALUE self, VALUE rb_numeric) {
266
- get_options(self)->sgdl1.eta0 = NUM2DBL(rb_numeric);
267
- return rb_numeric;
277
+ get_options(self)->sgdl1.eta0 = NUM2DBL(rb_numeric);
278
+ return rb_numeric;
268
279
  }
269
280
 
270
281
  static VALUE options_alpha(VALUE self) {
271
- return rb_float_new(get_options(self)->sgdl1.alpha);
282
+ return rb_float_new(get_options(self)->sgdl1.alpha);
272
283
  }
273
284
 
274
285
  static VALUE options_set_alpha(VALUE self, VALUE rb_numeric) {
275
- get_options(self)->sgdl1.alpha = NUM2DBL(rb_numeric);
276
- return rb_numeric;
286
+ get_options(self)->sgdl1.alpha = NUM2DBL(rb_numeric);
287
+ return rb_numeric;
277
288
  }
278
289
 
279
290
  static VALUE options_kappa(VALUE self) {
280
- return rb_float_new(get_options(self)->bcd.kappa);
291
+ return rb_float_new(get_options(self)->bcd.kappa);
281
292
  }
282
293
 
283
294
  static VALUE options_set_kappa(VALUE self, VALUE rb_numeric) {
284
- get_options(self)->bcd.kappa = NUM2DBL(rb_numeric);
285
- return rb_numeric;
295
+ get_options(self)->bcd.kappa = NUM2DBL(rb_numeric);
296
+ return rb_numeric;
286
297
  }
287
298
 
288
299
  static VALUE options_stpmin(VALUE self) {
289
- return rb_float_new(get_options(self)->rprop.stpmin);
300
+ return rb_float_new(get_options(self)->rprop.stpmin);
290
301
  }
291
302
 
292
303
  static VALUE options_set_stpmin(VALUE self, VALUE rb_numeric) {
293
- get_options(self)->rprop.stpmin = NUM2DBL(rb_numeric);
294
- return rb_numeric;
304
+ get_options(self)->rprop.stpmin = NUM2DBL(rb_numeric);
305
+ return rb_numeric;
295
306
  }
296
307
 
297
308
  static VALUE options_stpmax(VALUE self) {
298
- return rb_float_new(get_options(self)->rprop.stpmax);
309
+ return rb_float_new(get_options(self)->rprop.stpmax);
299
310
  }
300
311
 
301
312
  static VALUE options_set_stpmax(VALUE self, VALUE rb_numeric) {
302
- get_options(self)->rprop.stpmax = NUM2DBL(rb_numeric);
303
- return rb_numeric;
313
+ get_options(self)->rprop.stpmax = NUM2DBL(rb_numeric);
314
+ return rb_numeric;
304
315
  }
305
316
 
306
317
  static VALUE options_stpinc(VALUE self) {
307
- return rb_float_new(get_options(self)->rprop.stpinc);
318
+ return rb_float_new(get_options(self)->rprop.stpinc);
308
319
  }
309
320
 
310
321
  static VALUE options_set_stpinc(VALUE self, VALUE rb_numeric) {
311
- get_options(self)->rprop.stpinc = NUM2DBL(rb_numeric);
312
- return rb_numeric;
322
+ get_options(self)->rprop.stpinc = NUM2DBL(rb_numeric);
323
+ return rb_numeric;
313
324
  }
314
325
 
315
326
  static VALUE options_stpdec(VALUE self) {
316
- return rb_float_new(get_options(self)->rprop.stpdec);
327
+ return rb_float_new(get_options(self)->rprop.stpdec);
317
328
  }
318
329
 
319
330
  static VALUE options_set_stpdec(VALUE self, VALUE rb_numeric) {
320
- get_options(self)->rprop.stpdec = NUM2DBL(rb_numeric);
321
- return rb_numeric;
331
+ get_options(self)->rprop.stpdec = NUM2DBL(rb_numeric);
332
+ return rb_numeric;
322
333
  }
323
334
 
324
335
 
@@ -326,84 +337,84 @@ static VALUE options_set_stpdec(VALUE self, VALUE rb_numeric) {
326
337
  // Boolean Accessors
327
338
 
328
339
  static VALUE options_maxent(VALUE self) {
329
- return get_options(self)->maxent ? Qtrue : Qfalse;
340
+ return get_options(self)->maxent ? Qtrue : Qfalse;
330
341
  }
331
342
 
332
343
  static VALUE options_set_maxent(VALUE self, VALUE rb_boolean) {
333
- get_options(self)->maxent = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
334
- return rb_boolean;
344
+ get_options(self)->maxent = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
345
+ return rb_boolean;
335
346
  }
336
347
 
337
348
  static VALUE options_compact(VALUE self) {
338
- return get_options(self)->compact ? Qtrue : Qfalse;
349
+ return get_options(self)->compact ? Qtrue : Qfalse;
339
350
  }
340
351
 
341
352
  static VALUE options_set_compact(VALUE self, VALUE rb_boolean) {
342
- get_options(self)->compact = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
343
- return rb_boolean;
353
+ get_options(self)->compact = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
354
+ return rb_boolean;
344
355
  }
345
356
 
346
357
  static VALUE options_sparse(VALUE self) {
347
- return get_options(self)->sparse ? Qtrue : Qfalse;
358
+ return get_options(self)->sparse ? Qtrue : Qfalse;
348
359
  }
349
360
 
350
361
  static VALUE options_set_sparse(VALUE self, VALUE rb_boolean) {
351
- get_options(self)->sparse = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
352
- return rb_boolean;
362
+ get_options(self)->sparse = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
363
+ return rb_boolean;
353
364
  }
354
365
 
355
366
  static VALUE options_check(VALUE self) {
356
- return get_options(self)->check ? Qtrue : Qfalse;
367
+ return get_options(self)->check ? Qtrue : Qfalse;
357
368
  }
358
369
 
359
370
  static VALUE options_set_check(VALUE self, VALUE rb_boolean) {
360
- get_options(self)->check = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
361
- return rb_boolean;
371
+ get_options(self)->check = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
372
+ return rb_boolean;
362
373
  }
363
374
 
364
375
  static VALUE options_label(VALUE self) {
365
- return get_options(self)->label ? Qtrue : Qfalse;
376
+ return get_options(self)->label ? Qtrue : Qfalse;
366
377
  }
367
378
 
368
379
  static VALUE options_set_label(VALUE self, VALUE rb_boolean) {
369
- get_options(self)->label = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
370
- return rb_boolean;
380
+ get_options(self)->label = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
381
+ return rb_boolean;
371
382
  }
372
383
 
373
384
  static VALUE options_outsc(VALUE self) {
374
- return get_options(self)->outsc ? Qtrue : Qfalse;
385
+ return get_options(self)->outsc ? Qtrue : Qfalse;
375
386
  }
376
387
 
377
388
  static VALUE options_set_outsc(VALUE self, VALUE rb_boolean) {
378
- get_options(self)->outsc = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
379
- return rb_boolean;
389
+ get_options(self)->outsc = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
390
+ return rb_boolean;
380
391
  }
381
392
 
382
393
  static VALUE options_lblpost(VALUE self) {
383
- return get_options(self)->lblpost ? Qtrue : Qfalse;
394
+ return get_options(self)->lblpost ? Qtrue : Qfalse;
384
395
  }
385
396
 
386
397
  static VALUE options_set_lblpost(VALUE self, VALUE rb_boolean) {
387
- get_options(self)->lblpost = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
388
- return rb_boolean;
398
+ get_options(self)->lblpost = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
399
+ return rb_boolean;
389
400
  }
390
401
 
391
402
  static VALUE options_clip(VALUE self) {
392
- return get_options(self)->lbfgs.clip ? Qtrue : Qfalse;
403
+ return get_options(self)->lbfgs.clip ? Qtrue : Qfalse;
393
404
  }
394
405
 
395
406
  static VALUE options_set_clip(VALUE self, VALUE rb_boolean) {
396
- get_options(self)->lbfgs.clip = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
397
- return rb_boolean;
407
+ get_options(self)->lbfgs.clip = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
408
+ return rb_boolean;
398
409
  }
399
410
 
400
411
  static VALUE options_cutoff(VALUE self) {
401
- return get_options(self)->rprop.cutoff ? Qtrue : Qfalse;
412
+ return get_options(self)->rprop.cutoff ? Qtrue : Qfalse;
402
413
  }
403
414
 
404
415
  static VALUE options_set_cutoff(VALUE self, VALUE rb_boolean) {
405
- get_options(self)->rprop.cutoff = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
406
- return rb_boolean;
416
+ get_options(self)->rprop.cutoff = !(TYPE(rb_boolean) == T_NIL || !rb_boolean);
417
+ return rb_boolean;
407
418
  }
408
419
 
409
420
 
@@ -412,201 +423,201 @@ static VALUE options_set_cutoff(VALUE self, VALUE rb_boolean) {
412
423
  // String Accessors
413
424
 
414
425
  static VALUE options_pattern(VALUE self) {
415
- char *pattern = get_options(self)->pattern;
416
- return rb_str_new2(pattern ? pattern : "");
426
+ const char *pattern = get_options(self)->pattern;
427
+ return rb_str_new2(pattern ? pattern : "");
417
428
  }
418
429
 
419
430
  static VALUE options_set_pattern(VALUE self, VALUE rb_string) {
420
- opt_t *options = get_options(self);
421
- copy_string(&(options->pattern), rb_string);
422
-
423
- return rb_string;
431
+ opt_t *options = get_options(self);
432
+ copy_string((char**)&(options->pattern), rb_string);
433
+
434
+ return rb_string;
424
435
  }
425
436
 
426
437
  static VALUE options_model(VALUE self) {
427
- char *model = get_options(self)->model;
428
- return rb_str_new2(model ? model : "");
438
+ const char *model = get_options(self)->model;
439
+ return rb_str_new2(model ? model : "");
429
440
  }
430
441
 
431
442
  static VALUE options_set_model(VALUE self, VALUE rb_string) {
432
- opt_t *options = get_options(self);
433
- copy_string(&(options->model), rb_string);
434
-
435
- return rb_string;
443
+ opt_t *options = get_options(self);
444
+ copy_string(&(options->model), rb_string);
445
+
446
+ return rb_string;
436
447
  }
437
448
 
438
449
  static VALUE options_algorithm(VALUE self) {
439
- char *algorithm = get_options(self)->algo;
440
- return rb_str_new2(algorithm ? algorithm : "");
450
+ const char *algorithm = get_options(self)->algo;
451
+ return rb_str_new2(algorithm ? algorithm : "");
441
452
  }
442
453
 
443
454
  static VALUE options_set_algorithm(VALUE self, VALUE rb_string) {
444
- opt_t *options = get_options(self);
445
- copy_string(&(options->algo), rb_string);
446
-
447
- return rb_string;
455
+ opt_t *options = get_options(self);
456
+ copy_string((char**)&(options->algo), rb_string);
457
+
458
+ return rb_string;
448
459
  }
449
460
 
450
461
  static VALUE options_development_data(VALUE self) {
451
- char *development_data = get_options(self)->devel;
452
- return rb_str_new2(development_data ? development_data : "");
462
+ char *development_data = get_options(self)->devel;
463
+ return rb_str_new2(development_data ? development_data : "");
453
464
  }
454
465
 
455
466
  static VALUE options_set_development_data(VALUE self, VALUE rb_string) {
456
- opt_t *options = get_options(self);
457
- copy_string(&(options->devel), rb_string);
458
-
459
- return rb_string;
467
+ opt_t *options = get_options(self);
468
+ copy_string(&(options->devel), rb_string);
469
+
470
+ return rb_string;
460
471
  }
461
472
 
462
473
 
463
474
  void Init_options() {
464
- cOptions = rb_define_class_under(mWapiti, "Options", rb_cObject);
465
- rb_define_alloc_func(cOptions, allocate_options);
466
-
467
- rb_define_method(cOptions, "initialize", initialize_options, -1);
475
+ cOptions = rb_define_class_under(mWapiti, "Options", rb_cObject);
476
+ rb_define_alloc_func(cOptions, allocate_options);
477
+
478
+ rb_define_method(cOptions, "initialize", initialize_options, -1);
479
+
480
+ // Option Accessors
481
+
482
+ rb_define_method(cOptions, "stopwin", options_stopwin, 0);
483
+ rb_define_method(cOptions, "stopwin=", options_set_stopwin, 1);
484
+
485
+ rb_define_alias(cOptions, "stop_window", "stopwin");
486
+ rb_define_alias(cOptions, "stop_window=", "stopwin=");
468
487
 
469
- // Option Accessors
488
+ rb_define_method(cOptions, "objwin", options_objwin, 0);
489
+ rb_define_method(cOptions, "objwin=", options_set_objwin, 1);
470
490
 
471
- rb_define_method(cOptions, "stopwin", options_stopwin, 0);
472
- rb_define_method(cOptions, "stopwin=", options_set_stopwin, 1);
491
+ rb_define_alias(cOptions, "convergence_window", "objwin");
492
+ rb_define_alias(cOptions, "convergence_window=", "objwin=");
473
493
 
474
- rb_define_alias(cOptions, "stop_window", "stopwin");
475
- rb_define_alias(cOptions, "stop_window=", "stopwin=");
494
+ rb_define_method(cOptions, "maxiter", options_maxiter, 0);
495
+ rb_define_method(cOptions, "maxiter=", options_set_maxiter, 1);
476
496
 
477
- rb_define_method(cOptions, "objwin", options_objwin, 0);
478
- rb_define_method(cOptions, "objwin=", options_set_objwin, 1);
497
+ rb_define_alias(cOptions, "max_iterations", "maxiter");
498
+ rb_define_alias(cOptions, "max_iterations=", "maxiter=");
479
499
 
480
- rb_define_alias(cOptions, "convergence_window", "objwin");
481
- rb_define_alias(cOptions, "convergence_window=", "objwin=");
482
-
483
- rb_define_method(cOptions, "maxiter", options_maxiter, 0);
484
- rb_define_method(cOptions, "maxiter=", options_set_maxiter, 1);
500
+ rb_define_method(cOptions, "jobsize", options_jobsize, 0);
501
+ rb_define_method(cOptions, "jobsize=", options_set_jobsize, 1);
485
502
 
486
- rb_define_alias(cOptions, "max_iterations", "maxiter");
487
- rb_define_alias(cOptions, "max_iterations=", "maxiter=");
488
-
489
- rb_define_method(cOptions, "jobsize", options_jobsize, 0);
490
- rb_define_method(cOptions, "jobsize=", options_set_jobsize, 1);
503
+ rb_define_method(cOptions, "nthread", options_nthread, 0);
504
+ rb_define_method(cOptions, "nthread=", options_set_nthread, 1);
491
505
 
492
- rb_define_method(cOptions, "nthread", options_nthread, 0);
493
- rb_define_method(cOptions, "nthread=", options_set_nthread, 1);
506
+ rb_define_alias(cOptions, "threads", "nthread");
507
+ rb_define_alias(cOptions, "threads=", "nthread=");
494
508
 
495
- rb_define_alias(cOptions, "threads", "nthread");
496
- rb_define_alias(cOptions, "threads=", "nthread=");
509
+ rb_define_method(cOptions, "rho1", options_rho1, 0);
510
+ rb_define_method(cOptions, "rho1=", options_set_rho1, 1);
497
511
 
498
- rb_define_method(cOptions, "rho1", options_rho1, 0);
499
- rb_define_method(cOptions, "rho1=", options_set_rho1, 1);
512
+ rb_define_method(cOptions, "rho2", options_rho2, 0);
513
+ rb_define_method(cOptions, "rho2=", options_set_rho2, 1);
500
514
 
501
- rb_define_method(cOptions, "rho2", options_rho2, 0);
502
- rb_define_method(cOptions, "rho2=", options_set_rho2, 1);
515
+ rb_define_method(cOptions, "stopeps", options_stopeps, 0);
516
+ rb_define_method(cOptions, "stopeps=", options_set_stopeps, 1);
503
517
 
504
- rb_define_method(cOptions, "stopeps", options_stopeps, 0);
505
- rb_define_method(cOptions, "stopeps=", options_set_stopeps, 1);
518
+ rb_define_alias(cOptions, "stop_epsilon", "stopeps");
519
+ rb_define_alias(cOptions, "stop_epsilon=", "stopeps=");
506
520
 
507
- rb_define_alias(cOptions, "stop_epsilon", "stopeps");
508
- rb_define_alias(cOptions, "stop_epsilon=", "stopeps=");
521
+ rb_define_method(cOptions, "maxent", options_maxent, 0);
522
+ rb_define_method(cOptions, "maxent=", options_set_maxent, 1);
509
523
 
510
- rb_define_method(cOptions, "maxent", options_maxent, 0);
511
- rb_define_method(cOptions, "maxent=", options_set_maxent, 1);
524
+ rb_define_alias(cOptions, "maxent?", "maxent");
512
525
 
513
- rb_define_alias(cOptions, "maxent?", "maxent");
526
+ rb_define_method(cOptions, "compact", options_compact, 0);
527
+ rb_define_method(cOptions, "compact=", options_set_compact, 1);
514
528
 
515
- rb_define_method(cOptions, "compact", options_compact, 0);
516
- rb_define_method(cOptions, "compact=", options_set_compact, 1);
529
+ rb_define_alias(cOptions, "compact?", "compact");
517
530
 
518
- rb_define_alias(cOptions, "compact?", "compact");
531
+ rb_define_method(cOptions, "sparse", options_sparse, 0);
532
+ rb_define_method(cOptions, "sparse=", options_set_sparse, 1);
519
533
 
520
- rb_define_method(cOptions, "sparse", options_sparse, 0);
521
- rb_define_method(cOptions, "sparse=", options_set_sparse, 1);
534
+ rb_define_alias(cOptions, "sparse?", "sparse");
522
535
 
523
- rb_define_alias(cOptions, "sparse?", "sparse");
536
+ rb_define_method(cOptions, "skip_tokens", options_label, 0);
537
+ rb_define_method(cOptions, "skip_tokens=", options_set_label, 1);
524
538
 
525
- rb_define_method(cOptions, "skip_tokens", options_label, 0);
526
- rb_define_method(cOptions, "skip_tokens=", options_set_label, 1);
539
+ rb_define_alias(cOptions, "skip_tokens?", "skip_tokens");
527
540
 
528
- rb_define_alias(cOptions, "skip_tokens?", "skip_tokens");
541
+ rb_define_method(cOptions, "check", options_check, 0);
542
+ rb_define_method(cOptions, "check=", options_set_check, 1);
529
543
 
530
- rb_define_method(cOptions, "check", options_check, 0);
531
- rb_define_method(cOptions, "check=", options_set_check, 1);
544
+ rb_define_alias(cOptions, "check?", "check");
532
545
 
533
- rb_define_alias(cOptions, "check?", "check");
546
+ rb_define_method(cOptions, "lblpost", options_lblpost, 0);
547
+ rb_define_method(cOptions, "lblpost=", options_set_lblpost, 1);
534
548
 
535
- rb_define_method(cOptions, "lblpost", options_lblpost, 0);
536
- rb_define_method(cOptions, "lblpost=", options_set_lblpost, 1);
549
+ rb_define_alias(cOptions, "lblpost?", "lblpost");
537
550
 
538
- rb_define_alias(cOptions, "lblpost?", "lblpost");
551
+ rb_define_alias(cOptions, "posterior", "lblpost");
552
+ rb_define_alias(cOptions, "posterior?", "lblpost");
553
+ rb_define_alias(cOptions, "posterior=", "lblpost=");
539
554
 
540
- rb_define_alias(cOptions, "posterior", "lblpost");
541
- rb_define_alias(cOptions, "posterior?", "lblpost");
542
- rb_define_alias(cOptions, "posterior=", "lblpost=");
555
+ rb_define_method(cOptions, "outsc", options_outsc, 0);
556
+ rb_define_method(cOptions, "outsc=", options_set_outsc, 1);
543
557
 
544
- rb_define_method(cOptions, "outsc", options_outsc, 0);
545
- rb_define_method(cOptions, "outsc=", options_set_outsc, 1);
558
+ rb_define_alias(cOptions, "outsc?", "outsc");
546
559
 
547
- rb_define_alias(cOptions, "outsc?", "outsc");
560
+ rb_define_alias(cOptions, "score", "outsc");
561
+ rb_define_alias(cOptions, "score?", "outsc");
562
+ rb_define_alias(cOptions, "score=", "outsc=");
548
563
 
549
- rb_define_alias(cOptions, "score", "outsc");
550
- rb_define_alias(cOptions, "score?", "outsc");
551
- rb_define_alias(cOptions, "score=", "outsc=");
564
+ rb_define_method(cOptions, "pattern", options_pattern, 0);
565
+ rb_define_method(cOptions, "pattern=", options_set_pattern, 1);
552
566
 
553
- rb_define_method(cOptions, "pattern", options_pattern, 0);
554
- rb_define_method(cOptions, "pattern=", options_set_pattern, 1);
567
+ rb_define_alias(cOptions, "template", "pattern");
568
+ rb_define_alias(cOptions, "template=", "pattern=");
555
569
 
556
- rb_define_alias(cOptions, "template", "pattern");
557
- rb_define_alias(cOptions, "template=", "pattern=");
570
+ rb_define_method(cOptions, "model", options_model, 0);
571
+ rb_define_method(cOptions, "model=", options_set_model, 1);
558
572
 
559
- rb_define_method(cOptions, "model", options_model, 0);
560
- rb_define_method(cOptions, "model=", options_set_model, 1);
573
+ rb_define_method(cOptions, "algorithm", options_algorithm, 0);
574
+ rb_define_method(cOptions, "algorithm=", options_set_algorithm, 1);
561
575
 
562
- rb_define_method(cOptions, "algorithm", options_algorithm, 0);
563
- rb_define_method(cOptions, "algorithm=", options_set_algorithm, 1);
576
+ rb_define_alias(cOptions, "algo", "algorithm");
577
+ rb_define_alias(cOptions, "algo=", "algorithm=");
564
578
 
565
- rb_define_alias(cOptions, "algo", "algorithm");
566
- rb_define_alias(cOptions, "algo=", "algorithm=");
579
+ rb_define_method(cOptions, "development_data", options_development_data, 0);
580
+ rb_define_method(cOptions, "development_data=", options_set_development_data, 1);
567
581
 
568
- rb_define_method(cOptions, "development_data", options_development_data, 0);
569
- rb_define_method(cOptions, "development_data=", options_set_development_data, 1);
582
+ rb_define_alias(cOptions, "devel", "development_data");
583
+ rb_define_alias(cOptions, "devel=", "development_data=");
570
584
 
571
- rb_define_alias(cOptions, "devel", "development_data");
572
- rb_define_alias(cOptions, "devel=", "development_data=");
585
+ rb_define_method(cOptions, "clip", options_clip, 0);
586
+ rb_define_method(cOptions, "clip=", options_set_clip, 1);
573
587
 
574
- rb_define_method(cOptions, "clip", options_clip, 0);
575
- rb_define_method(cOptions, "clip=", options_set_clip, 1);
588
+ rb_define_method(cOptions, "histsz", options_histsz, 0);
589
+ rb_define_method(cOptions, "histsz=", options_set_histsz, 1);
576
590
 
577
- rb_define_method(cOptions, "histsz", options_histsz, 0);
578
- rb_define_method(cOptions, "histsz=", options_set_histsz, 1);
591
+ rb_define_method(cOptions, "maxls", options_maxls, 0);
592
+ rb_define_method(cOptions, "maxls=", options_set_maxls, 1);
579
593
 
580
- rb_define_method(cOptions, "maxls", options_maxls, 0);
581
- rb_define_method(cOptions, "maxls=", options_set_maxls, 1);
594
+ rb_define_method(cOptions, "eta0", options_eta0, 0);
595
+ rb_define_method(cOptions, "eta0=", options_set_eta0, 1);
582
596
 
583
- rb_define_method(cOptions, "eta0", options_eta0, 0);
584
- rb_define_method(cOptions, "eta0=", options_set_eta0, 1);
597
+ rb_define_method(cOptions, "alpha", options_alpha, 0);
598
+ rb_define_method(cOptions, "alpha=", options_set_alpha, 1);
585
599
 
586
- rb_define_method(cOptions, "alpha", options_alpha, 0);
587
- rb_define_method(cOptions, "alpha=", options_set_alpha, 1);
600
+ rb_define_method(cOptions, "kappa", options_kappa, 0);
601
+ rb_define_method(cOptions, "kappa=", options_set_kappa, 1);
588
602
 
589
- rb_define_method(cOptions, "kappa", options_kappa, 0);
590
- rb_define_method(cOptions, "kappa=", options_set_kappa, 1);
603
+ rb_define_method(cOptions, "stpmin", options_stpmin, 0);
604
+ rb_define_method(cOptions, "stpmin=", options_set_stpmin, 1);
591
605
 
592
- rb_define_method(cOptions, "stpmin", options_stpmin, 0);
593
- rb_define_method(cOptions, "stpmin=", options_set_stpmin, 1);
606
+ rb_define_method(cOptions, "stpmax", options_stpmax, 0);
607
+ rb_define_method(cOptions, "stpmax=", options_set_stpmax, 1);
594
608
 
595
- rb_define_method(cOptions, "stpmax", options_stpmax, 0);
596
- rb_define_method(cOptions, "stpmax=", options_set_stpmax, 1);
609
+ rb_define_method(cOptions, "stpinc", options_stpinc, 0);
610
+ rb_define_method(cOptions, "stpinc=", options_set_stpinc, 1);
597
611
 
598
- rb_define_method(cOptions, "stpinc", options_stpinc, 0);
599
- rb_define_method(cOptions, "stpinc=", options_set_stpinc, 1);
612
+ rb_define_method(cOptions, "stpdec", options_stpdec, 0);
613
+ rb_define_method(cOptions, "stpdec=", options_set_stpdec, 1);
600
614
 
601
- rb_define_method(cOptions, "stpdec", options_stpdec, 0);
602
- rb_define_method(cOptions, "stpdec=", options_set_stpdec, 1);
615
+ rb_define_method(cOptions, "cutoff", options_cutoff, 0);
616
+ rb_define_method(cOptions, "cutoff=", options_set_cutoff, 1);
617
+
618
+ rb_define_method(cOptions, "nbest", options_nbest, 0);
619
+ rb_define_method(cOptions, "nbest=", options_set_nbest, 1);
603
620
 
604
- rb_define_method(cOptions, "cutoff", options_cutoff, 0);
605
- rb_define_method(cOptions, "cutoff=", options_set_cutoff, 1);
606
-
607
- rb_define_method(cOptions, "nbest", options_nbest, 0);
608
- rb_define_method(cOptions, "nbest=", options_set_nbest, 1);
609
-
610
621
  }
611
622
 
612
623
 
@@ -615,119 +626,119 @@ void Init_options() {
615
626
  // Auxiliary Methods
616
627
 
617
628
  static mdl_t *get_model(VALUE self) {
618
- mdl_t *model;
619
- Data_Get_Struct(self, mdl_t, model);
620
- return model;
629
+ mdl_t *model;
630
+ Data_Get_Struct(self, mdl_t, model);
631
+ return model;
621
632
  }
622
633
 
623
634
  // Constructor / Desctructor
624
635
 
625
636
  static void mark_model(mdl_t *model __attribute__((__unused__))) {
626
- // nothing
637
+ // nothing
627
638
  }
628
639
 
629
640
  static void deallocate_model(mdl_t *model) {
630
- if (model) {
631
- mdl_free(model);
632
- model = (mdl_t*)0;
633
- }
641
+ if (model) {
642
+ mdl_free(model);
643
+ model = (mdl_t*)0;
644
+ }
634
645
  }
635
646
 
636
647
  static VALUE allocate_model(VALUE self) {
637
- mdl_t *model = mdl_new(rdr_new(false));
638
- return Data_Wrap_Struct(self, mark_model, deallocate_model, model);
648
+ mdl_t *model = mdl_new(rdr_new(false));
649
+ return Data_Wrap_Struct(self, mark_model, deallocate_model, model);
639
650
  }
640
651
 
641
- static VALUE model_set_options(VALUE self, VALUE rb_options) {
642
- if (strncmp("Wapiti::Options", rb_obj_classname(rb_options), 15) != 0) {
643
- rb_raise(cNativeError, "argument must be a Wapiti::Options instance");
644
- }
645
-
646
- mdl_t *model = get_model(self);
647
-
648
- // Store reference to options in model struct
649
- model->opt = get_options(rb_options);
650
-
651
- // Update reader
652
- model->reader->maxent = model->opt->maxent;
653
-
654
- // Save instance variable
655
- rb_ivar_set(self, rb_intern("@options"), rb_options);
652
+ static VALUE model_set_options(VALUE self, VALUE rb_options) {
653
+ if (strncmp("Wapiti::Options", rb_obj_classname(rb_options), 15) != 0) {
654
+ rb_raise(cNativeError, "argument must be a Wapiti::Options instance");
655
+ }
656
+
657
+ mdl_t *model = get_model(self);
658
+
659
+ // Store reference to options in model struct
660
+ model->opt = get_options(rb_options);
661
+
662
+ // Update reader
663
+ model->reader->autouni = model->opt->maxent;
664
+
665
+ // Save instance variable
666
+ rb_ivar_set(self, rb_intern("@options"), rb_options);
656
667
 
657
- return rb_options;
668
+ return rb_options;
658
669
  }
659
670
 
660
671
  static VALUE initialize_model(int argc, VALUE *argv, VALUE self) {
661
- VALUE options;
662
-
663
- if (argc > 1) {
664
- rb_raise(rb_const_get(rb_mKernel, rb_intern("ArgumentError")),
665
- "wrong number of arguments (%d for 0..1)", argc);
666
- }
667
-
668
- if (argc) {
669
- if (TYPE(argv[0]) == T_HASH) {
670
- options = rb_funcall(cOptions, rb_intern("new"), 1, argv[0]);
671
- }
672
- else {
673
- if (strncmp("Wapiti::Options", rb_obj_classname(argv[0]), 15) != 0) {
674
- rb_raise(cNativeError, "argument must be a hash or an options instance");
675
- }
676
- options = argv[0];
677
- }
678
- }
679
- else {
680
- options = rb_funcall(cOptions, rb_intern("new"), 0);
681
- }
682
-
683
- // yield options if block_given?
684
- if (rb_block_given_p()) {
685
- rb_yield(options);
686
- }
687
-
688
- model_set_options(self, options);
689
-
690
- // Load a previous model if specified by options
691
- if (get_options(options)->model) {
692
- rb_funcall(self, rb_intern("load"), 0);
693
- }
694
-
695
- // initialize counters
696
- rb_funcall(self, rb_intern("clear_counters"), 0);
697
-
698
- return self;
672
+ VALUE options;
673
+
674
+ if (argc > 1) {
675
+ rb_raise(rb_const_get(rb_mKernel, rb_intern("ArgumentError")),
676
+ "wrong number of arguments (%d for 0..1)", argc);
677
+ }
678
+
679
+ if (argc) {
680
+ if (TYPE(argv[0]) == T_HASH) {
681
+ options = rb_funcall(cOptions, rb_intern("new"), 1, argv[0]);
682
+ }
683
+ else {
684
+ if (strncmp("Wapiti::Options", rb_obj_classname(argv[0]), 15) != 0) {
685
+ rb_raise(cNativeError, "argument must be a hash or an options instance");
686
+ }
687
+ options = argv[0];
688
+ }
689
+ }
690
+ else {
691
+ options = rb_funcall(cOptions, rb_intern("new"), 0);
692
+ }
693
+
694
+ // yield options if block_given?
695
+ if (rb_block_given_p()) {
696
+ rb_yield(options);
697
+ }
698
+
699
+ model_set_options(self, options);
700
+
701
+ // Load a previous model if specified by options
702
+ if (get_options(options)->model) {
703
+ rb_funcall(self, rb_intern("load"), 0);
704
+ }
705
+
706
+ // initialize counters
707
+ rb_funcall(self, rb_intern("clear_counters"), 0);
708
+
709
+ return self;
699
710
  }
700
711
 
701
712
 
702
713
  // Native accessors
703
714
 
704
715
  static VALUE model_nlbl(VALUE self) {
705
- return INT2FIX(get_model(self)->nlbl);
716
+ return INT2FIX(get_model(self)->nlbl);
706
717
  }
707
718
 
708
719
  static VALUE model_nobs(VALUE self) {
709
- return INT2FIX(get_model(self)->nobs);
720
+ return INT2FIX(get_model(self)->nobs);
710
721
  }
711
722
 
712
723
  static VALUE model_nftr(VALUE self) {
713
- return INT2FIX(get_model(self)->nftr);
724
+ return INT2FIX(get_model(self)->nftr);
714
725
  }
715
726
 
716
727
  static VALUE model_total(VALUE self) {
717
- return rb_float_new(get_model(self)->total);
728
+ return rb_float_new(get_model(self)->total);
718
729
  }
719
730
 
720
731
 
721
732
  // Instance methods
722
733
 
723
734
  static VALUE model_sync(VALUE self) {
724
- mdl_sync(get_model(self));
725
- return self;
735
+ mdl_sync(get_model(self));
736
+ return self;
726
737
  }
727
738
 
728
739
  static VALUE model_compact(VALUE self) {
729
- mdl_compact(get_model(self));
730
- return self;
740
+ mdl_compact(get_model(self));
741
+ return self;
731
742
  }
732
743
 
733
744
  // call-seq:
@@ -737,400 +748,398 @@ static VALUE model_compact(VALUE self) {
737
748
  // Saves the model to a file. Uses the Model's path if no argument given,
738
749
  // otherwise uses the passed-in argument as the Model's path.
739
750
  static VALUE model_save(int argc, VALUE *argv, VALUE self) {
740
- if (argc > 1) {
741
- rb_raise(rb_const_get(rb_mKernel, rb_intern("ArgumentError")),
742
- "wrong number of arguments (%d for 0..1)", argc);
743
- }
744
-
745
- mdl_t *model = get_model(self);
746
-
747
- // save passed-in path in options
748
- if (argc) {
749
- Check_Type(argv[0], T_STRING);
750
- rb_ivar_set(self, rb_intern("@path"), argv[0]);
751
- }
752
-
753
- // open the output file
754
- FILE *file = 0;
755
- VALUE path = rb_ivar_get(self, rb_intern("@path"));
756
-
757
- if (NIL_P(path)) {
758
- rb_raise(cNativeError, "failed to save model: no path given");
759
- }
760
-
761
- if (!(file = fopen(StringValueCStr(path), "w"))) {
762
- rb_raise(cNativeError, "failed to save model: failed to open model file");
763
- }
764
-
765
- mdl_save(model, file);
766
- fclose(file);
767
-
768
- return self;
751
+ if (argc > 1) {
752
+ rb_raise(rb_const_get(rb_mKernel, rb_intern("ArgumentError")),
753
+ "wrong number of arguments (%d for 0..1)", argc);
754
+ }
755
+
756
+ mdl_t *model = get_model(self);
757
+
758
+ // save passed-in path in options
759
+ if (argc) {
760
+ Check_Type(argv[0], T_STRING);
761
+ rb_ivar_set(self, rb_intern("@path"), argv[0]);
762
+ }
763
+
764
+ // open the output file
765
+ FILE *file = 0;
766
+ VALUE path = rb_ivar_get(self, rb_intern("@path"));
767
+
768
+ if (NIL_P(path)) {
769
+ rb_raise(cNativeError, "failed to save model: no path given");
770
+ }
771
+
772
+ if (!(file = fopen(StringValueCStr(path), "w"))) {
773
+ rb_raise(cNativeError, "failed to save model: failed to open model file");
774
+ }
775
+
776
+ mdl_save(model, file);
777
+ fclose(file);
778
+
779
+ return self;
769
780
  }
770
781
 
771
782
  static VALUE model_load(int argc, VALUE *argv, VALUE self) {
772
- if (argc > 1) {
773
- rb_raise(rb_const_get(rb_mKernel, rb_intern("ArgumentError")),
774
- "wrong number of arguments (%d for 0..1)", argc);
775
- }
776
-
777
- mdl_t *model = get_model(self);
778
-
779
- // save passed-in argument in options
780
- if (argc) {
781
- Check_Type(argv[0], T_STRING);
782
- rb_ivar_set(self, rb_intern("@path"), argv[0]);
783
- }
784
-
785
- // open the model file
786
- FILE *file = 0;
787
- VALUE path = rb_ivar_get(self, rb_intern("@path"));
788
-
789
- if (NIL_P(path)) {
790
- rb_raise(cNativeError, "failed to load model: no path given");
791
- }
792
-
793
- if (!(file = fopen(StringValueCStr(path), "r"))) {
794
- rb_raise(cNativeError, "failed to load model: failed to open model file");
795
- }
796
-
797
- mdl_load(model, file);
798
- fclose(file);
799
-
800
- return self;
783
+ if (argc > 1) {
784
+ rb_raise(rb_const_get(rb_mKernel, rb_intern("ArgumentError")),
785
+ "wrong number of arguments (%d for 0..1)", argc);
786
+ }
787
+
788
+ mdl_t *model = get_model(self);
789
+
790
+ // save passed-in argument in options
791
+ if (argc) {
792
+ Check_Type(argv[0], T_STRING);
793
+ rb_ivar_set(self, rb_intern("@path"), argv[0]);
794
+ }
795
+
796
+ // open the model file
797
+ FILE *file = 0;
798
+ VALUE path = rb_ivar_get(self, rb_intern("@path"));
799
+
800
+ if (NIL_P(path)) {
801
+ rb_raise(cNativeError, "failed to load model: no path given");
802
+ }
803
+
804
+ if (!(file = fopen(StringValueCStr(path), "r"))) {
805
+ rb_raise(cNativeError, "failed to load model: failed to open model file");
806
+ }
807
+
808
+ mdl_load(model, file);
809
+ fclose(file);
810
+
811
+ return self;
801
812
  }
802
813
 
803
814
  static dat_t *to_dat(rdr_t *reader, VALUE data, bool labelled) {
804
- Check_Type(data, T_ARRAY);
805
-
806
- const unsigned int n = RARRAY_LEN(data);
807
- unsigned int i, j, k;
808
-
809
- dat_t *dat = xmalloc(sizeof(dat_t));
810
- dat->nseq = 0;
811
- dat->mlen = 0;
812
- dat->lbl = labelled;
813
- dat->seq = xmalloc(sizeof(seq_t*) * n);
814
-
815
- for (i = 0; i < n; ++i) {
816
- VALUE sequence = rb_ary_entry(data, i);
817
- Check_Type(sequence, T_ARRAY);
818
-
819
- k = RARRAY_LEN(sequence);
820
- raw_t *raw = xmalloc(sizeof(raw_t) + sizeof(char*) * k);
821
-
822
- for (j = 0; j < k; ++j) {
823
- VALUE line = rb_ary_entry(sequence, j);
824
- Check_Type(line, T_STRING);
825
- raw->lines[j] = StringValueCStr(line);
826
- }
827
-
828
- raw->len = k;
829
-
830
- seq_t *seq = rdr_raw2seq(reader, raw, labelled);
831
- xfree(raw);
832
-
833
- if (seq == 0) { break; }
834
-
835
- // and store the sequence
836
- dat->seq[dat->nseq++] = seq;
837
- dat->mlen = max(dat->mlen, seq->len);
838
-
839
- }
840
-
841
- // if no sequence was read, free memory
842
- if (dat->nseq == 0) {
843
- xfree(dat->seq);
844
- xfree(dat);
845
-
846
- return 0;
847
- }
848
-
849
- return dat;
815
+ Check_Type(data, T_ARRAY);
816
+
817
+ const unsigned int n = RARRAY_LEN(data);
818
+ unsigned int i, j, k;
819
+
820
+ dat_t *dat = wapiti_xmalloc(sizeof(dat_t));
821
+ dat->nseq = 0;
822
+ dat->mlen = 0;
823
+ dat->lbl = labelled;
824
+ dat->seq = wapiti_xmalloc(sizeof(seq_t*) * n);
825
+
826
+ for (i = 0; i < n; ++i) {
827
+ VALUE sequence = rb_ary_entry(data, i);
828
+ Check_Type(sequence, T_ARRAY);
829
+
830
+ k = RARRAY_LEN(sequence);
831
+ raw_t *raw = wapiti_xmalloc(sizeof(raw_t) + sizeof(char*) * k);
832
+
833
+ for (j = 0; j < k; ++j) {
834
+ VALUE line = rb_ary_entry(sequence, j);
835
+ Check_Type(line, T_STRING);
836
+ raw->lines[j] = StringValueCStr(line);
837
+ }
838
+
839
+ raw->len = k;
840
+
841
+ seq_t *seq = rdr_raw2seq(reader, raw, labelled);
842
+ xfree(raw);
843
+
844
+ if (seq == 0) { break; }
845
+
846
+ // and store the sequence
847
+ dat->seq[dat->nseq++] = seq;
848
+ dat->mlen = max(dat->mlen, seq->len);
849
+
850
+ }
851
+
852
+ // if no sequence was read, free memory
853
+ if (dat->nseq == 0) {
854
+ xfree(dat->seq);
855
+ xfree(dat);
856
+
857
+ return 0;
858
+ }
859
+
860
+ return dat;
850
861
  }
851
862
 
852
863
 
853
864
  static VALUE model_train(VALUE self, VALUE data) {
854
-
855
- mdl_t* model = get_model(self);
856
-
857
- int trn;
858
- for (trn = 0; trn < trn_cnt; trn++) {
859
- if (!strcmp(model->opt->algo, trn_lst[trn].name)) break;
860
- }
861
-
862
- if (trn == trn_cnt) {
863
- rb_raise(cNativeError, "failed to train model: unknown algorithm '%s'", model->opt->algo);
864
- }
865
-
866
- FILE *file;
867
-
868
- // Load the pattern file. This will unlock the database if previously
869
- // locked by loading a model.
870
- if (model->opt->pattern) {
871
- file = fopen(model->opt->pattern, "r");
872
-
873
- if (!file) {
874
- rb_raise(cNativeError, "failed to train model: failed to load pattern file '%s'", model->opt->pattern);
875
- }
876
-
877
- rdr_loadpat(model->reader, file);
878
- fclose(file);
879
- }
880
- else {
881
- // rb_raise(cNativeError, "failed to train model: no pattern given");
882
- }
883
-
884
- qrk_lock(model->reader->obs, false);
885
-
886
-
887
- // Load the training data. When this is done we lock the quarks as we
888
- // don't want to put in the model, informations present only in the
889
- // devlopment set.
890
-
891
- switch (TYPE(data)) {
892
- case T_STRING:
893
- if (!(file = fopen(StringValuePtr(data), "r"))) {
894
- rb_raise(cNativeError, "failed to train model: failed to open training data '%s", StringValuePtr(data));
895
- }
896
-
897
- model->train = rdr_readdat(model->reader, file, true);
898
- fclose(file);
899
-
900
- break;
901
- case T_ARRAY:
902
- model->train = to_dat(model->reader, data, true);
903
-
904
- break;
905
- default:
906
- rb_raise(cNativeError, "failed to train model: invalid training data type (expected instance of String or Array)");
907
- }
908
-
909
- qrk_lock(model->reader->lbl, true);
910
- qrk_lock(model->reader->obs, true);
911
-
912
- if (!model->train || model->train->nseq == 0) {
913
- rb_raise(cNativeError, "failed to train model: no training data loaded");
914
- }
915
-
916
- // If present, load the development set in the model. If not specified,
917
- // the training dataset will be used instead.
918
- if (model->opt->devel) {
919
- if (!(file = fopen(model->opt->devel, "r"))) {
920
- rb_raise(cNativeError, "failed to train model: cannot open development file '%s'", model->opt->devel);
921
- }
922
-
923
- model->devel = rdr_readdat(model->reader, file, true);
924
- fclose(file);
925
- }
926
-
927
- // Initialize the model. If a previous model was loaded, this will be
928
- // just a resync, else the model structure will be created.
929
- rb_funcall(self, rb_intern("sync"), 0);
930
-
931
- // Train the model.
932
- uit_setup(model);
933
- trn_lst[trn].train(model);
934
- uit_cleanup(model);
935
-
936
- // If requested compact the model.
937
- if (model->opt->compact) {
938
- const size_t O = model->nobs;
939
- const size_t F = model->nftr;
940
- rb_funcall(self, rb_intern("compact"), 0);
941
- }
942
-
943
- return self;
865
+
866
+ mdl_t* model = get_model(self);
867
+
868
+ int trn;
869
+ for (trn = 0; trn < trn_cnt; trn++) {
870
+ if (!strcmp(model->opt->algo, trn_lst[trn].name)) break;
871
+ }
872
+
873
+ if (trn == trn_cnt) {
874
+ rb_raise(cNativeError, "failed to train model: unknown algorithm '%s'", model->opt->algo);
875
+ }
876
+
877
+ FILE *file;
878
+
879
+ // Load the pattern file. This will unlock the database if previously
880
+ // locked by loading a model.
881
+ if (model->opt->pattern) {
882
+ file = fopen(model->opt->pattern, "r");
883
+
884
+ if (!file) {
885
+ rb_raise(cNativeError, "failed to train model: failed to load pattern file '%s'", model->opt->pattern);
886
+ }
887
+
888
+ rdr_loadpat(model->reader, file);
889
+ fclose(file);
890
+ }
891
+ else {
892
+ // rb_raise(cNativeError, "failed to train model: no pattern given");
893
+ }
894
+
895
+ qrk_lock(model->reader->obs, false);
896
+
897
+
898
+ // Load the training data. When this is done we lock the quarks as we
899
+ // don't want to put in the model, informations present only in the
900
+ // devlopment set.
901
+
902
+ switch (TYPE(data)) {
903
+ case T_STRING:
904
+ if (!(file = fopen(StringValuePtr(data), "r"))) {
905
+ rb_raise(cNativeError, "failed to train model: failed to open training data '%s", StringValuePtr(data));
906
+ }
907
+
908
+ model->train = rdr_readdat(model->reader, file, true);
909
+ fclose(file);
910
+
911
+ break;
912
+ case T_ARRAY:
913
+ model->train = to_dat(model->reader, data, true);
914
+
915
+ break;
916
+ default:
917
+ rb_raise(cNativeError, "failed to train model: invalid training data type (expected instance of String or Array)");
918
+ }
919
+
920
+ qrk_lock(model->reader->lbl, true);
921
+ qrk_lock(model->reader->obs, true);
922
+
923
+ if (!model->train || model->train->nseq == 0) {
924
+ rb_raise(cNativeError, "failed to train model: no training data loaded");
925
+ }
926
+
927
+ // If present, load the development set in the model. If not specified,
928
+ // the training dataset will be used instead.
929
+ if (model->opt->devel) {
930
+ if (!(file = fopen(model->opt->devel, "r"))) {
931
+ rb_raise(cNativeError, "failed to train model: cannot open development file '%s'", model->opt->devel);
932
+ }
933
+
934
+ model->devel = rdr_readdat(model->reader, file, true);
935
+ fclose(file);
936
+ }
937
+
938
+ // Initialize the model. If a previous model was loaded, this will be
939
+ // just a resync, else the model structure will be created.
940
+ rb_funcall(self, rb_intern("sync"), 0);
941
+
942
+ // Train the model.
943
+ uit_setup(model);
944
+ trn_lst[trn].train(model);
945
+ uit_cleanup(model);
946
+
947
+ // If requested compact the model.
948
+ if (model->opt->compact) {
949
+ rb_funcall(self, rb_intern("compact"), 0);
950
+ }
951
+
952
+ return self;
944
953
  }
945
954
 
946
955
  // Returns a sorted list of all labels in the Model's label database.
947
956
  static VALUE model_labels(VALUE self) {
948
- mdl_t *model = get_model(self);
949
- const size_t Y = model->nlbl;
950
-
951
- qrk_t *lp = model->reader->lbl;
952
-
953
- VALUE labels = rb_ary_new2(Y);
954
-
955
- for (unsigned int i = 0; i < Y; ++i) {
956
- rb_ary_push(labels, rb_str_new2(qrk_id2str(lp, i)));
957
- }
958
-
959
- rb_funcall(labels, rb_intern("sort!"), 0);
960
-
961
- return labels;
957
+ mdl_t *model = get_model(self);
958
+ const uint32_t Y = model->nlbl;
959
+
960
+ qrk_t *lp = model->reader->lbl;
961
+
962
+ VALUE labels = rb_ary_new2(Y);
963
+
964
+ for (unsigned int i = 0; i < Y; ++i) {
965
+ rb_ary_push(labels, rb_str_new2(qrk_id2str(lp, i)));
966
+ }
967
+
968
+ rb_funcall(labels, rb_intern("sort!"), 0);
969
+
970
+ return labels;
962
971
  }
963
972
 
964
973
  static VALUE decode_sequence(VALUE self, mdl_t *model, raw_t *raw) {
965
- qrk_t *lbls = model->reader->lbl;
966
-
967
- const unsigned int Y = model->nlbl;
968
- const unsigned int N = model->opt->nbest;
969
-
970
- seq_t *seq = rdr_raw2seq(model->reader, raw, model->opt->check);
971
-
972
- const unsigned int T = seq->len;
973
- unsigned int n, t, tcnt = 0, terr = 0, scnt = 0, serr = 0, stat[3][Y];
974
-
975
- size_t *out = xmalloc(sizeof(size_t) * T * N);
976
- double *psc = xmalloc(sizeof(double) * T * N);
977
- double *scs = xmalloc(sizeof(double) * N);
978
-
979
- VALUE sequence, tokens;
980
-
981
- if (N == 1) {
982
- tag_viterbi(model, seq, (size_t*)out, scs, (double*)psc);
983
- }
984
- else {
985
- tag_nbviterbi(model, seq, N, (size_t*)out, scs, (double*)psc);
986
- }
987
-
988
- sequence = rb_ary_new();
989
-
990
- for (t = 0; t < T; ++t) {
991
- tokens = rb_ary_new();
992
-
993
- if (!model->opt->label) {
994
- VALUE token = rb_str_new2(raw->lines[t]);
995
-
996
- #ifdef HAVE_RUBY_ENCODING_H
997
- int enc = rb_enc_find_index("UTF-8");
998
- rb_enc_associate_index(token, enc);
999
- #endif
1000
-
1001
- rb_ary_push(tokens, token);
1002
- }
1003
-
1004
- for (n = 0; n < N; ++n) {
1005
-
1006
- size_t lbl = out[t * N + n];
1007
- rb_ary_push(tokens, rb_str_new2(qrk_id2str(lbls, lbl)));
1008
-
1009
- // output individual score
1010
- if (model->opt->outsc) {
1011
- rb_ary_push(tokens, rb_float_new(psc[t * N + n]));
1012
- }
1013
-
1014
- }
1015
-
1016
- // yield token/label pair to block if given
1017
- if (rb_block_given_p()) {
1018
- tokens = rb_yield(tokens);
1019
- }
1020
-
1021
- rb_ary_push(sequence, tokens);
1022
-
1023
-
1024
- // TODO output sequence score: scs[n] (float)
1025
-
1026
- }
1027
-
1028
- // Statistics
1029
- if (model->opt->check) {
1030
- int err = 0;
1031
-
1032
- for (t = 0; t < T; ++t) {
1033
- stat[0][seq->pos[t].lbl]++;
1034
- stat[1][out[t * N]]++;
1035
-
1036
- if (seq->pos[t].lbl != out[t * N]) {
1037
- terr++;
1038
- err = 1;
1039
- }
1040
- else {
1041
- stat[2][out[t * N]]++;
1042
- }
1043
- }
1044
-
1045
- tcnt = FIX2INT(rb_ivar_get(self, rb_intern("@token_count")));
1046
- rb_ivar_set(self, rb_intern("@token_count"), INT2FIX(tcnt + (unsigned int)T));
1047
-
1048
- terr += FIX2INT(rb_ivar_get(self, rb_intern("@token_errors")));
1049
- rb_ivar_set(self, rb_intern("@token_errors"), INT2FIX(terr));
1050
-
1051
- scnt = FIX2INT(rb_ivar_get(self, rb_intern("@sequence_count")));
1052
- rb_ivar_set(self, rb_intern("@sequence_count"), INT2FIX(++scnt));
1053
-
1054
- serr = FIX2INT(rb_ivar_get(self, rb_intern("@sequence_errors")));
1055
- rb_ivar_set(self, rb_intern("@sequence_errors"), INT2FIX(serr + err));
1056
-
1057
- }
1058
-
1059
-
1060
- // Cleanup memory used for this sequence
1061
- xfree(scs);
1062
- xfree(psc);
1063
- xfree(out);
1064
-
1065
- rdr_freeseq(seq);
1066
-
1067
- return sequence;
974
+ qrk_t *lbls = model->reader->lbl;
975
+
976
+ const unsigned int Y = model->nlbl;
977
+ const unsigned int N = model->opt->nbest;
978
+
979
+ seq_t *seq = rdr_raw2seq(model->reader, raw, model->opt->check);
980
+
981
+ const unsigned int T = seq->len;
982
+ unsigned int n, t, tcnt = 0, terr = 0, scnt = 0, serr = 0, stat[3][Y];
983
+
984
+ uint32_t *out = wapiti_xmalloc(sizeof(uint32_t) * T * N);
985
+ double *psc = wapiti_xmalloc(sizeof(double) * T * N);
986
+ double *scs = wapiti_xmalloc(sizeof(double) * N);
987
+
988
+ VALUE sequence, tokens;
989
+
990
+ if (N == 1) {
991
+ tag_viterbi(model, seq, out, scs, psc);
992
+ }
993
+ else {
994
+ tag_nbviterbi(model, seq, N, (void*)out, scs, (void*)psc);
995
+ }
996
+
997
+ sequence = rb_ary_new();
998
+
999
+ for (t = 0; t < T; ++t) {
1000
+ tokens = rb_ary_new();
1001
+
1002
+ if (!model->opt->label) {
1003
+ VALUE token = rb_str_new2(raw->lines[t]);
1004
+
1005
+ #ifdef HAVE_RUBY_ENCODING_H
1006
+ int enc = rb_enc_find_index("UTF-8");
1007
+ rb_enc_associate_index(token, enc);
1008
+ #endif
1009
+
1010
+ rb_ary_push(tokens, token);
1011
+ }
1012
+
1013
+ for (n = 0; n < N; ++n) {
1014
+
1015
+ uint64_t lbl = out[t * N + n];
1016
+ rb_ary_push(tokens, rb_str_new2(qrk_id2str(lbls, lbl)));
1017
+
1018
+ // output individual score
1019
+ if (model->opt->outsc) {
1020
+ rb_ary_push(tokens, rb_float_new(psc[t * N + n]));
1021
+ }
1022
+
1023
+ }
1024
+
1025
+ // yield token/label pair to block if given
1026
+ if (rb_block_given_p()) {
1027
+ tokens = rb_yield(tokens);
1028
+ }
1029
+
1030
+ rb_ary_push(sequence, tokens);
1031
+
1032
+
1033
+ // TODO output sequence score: scs[n] (float)
1034
+
1035
+ }
1036
+
1037
+ // Statistics
1038
+ if (model->opt->check) {
1039
+ int err = 0;
1040
+
1041
+ for (t = 0; t < T; ++t) {
1042
+ stat[0][seq->pos[t].lbl]++;
1043
+ stat[1][out[t * N]]++;
1044
+
1045
+ if (seq->pos[t].lbl != out[t * N]) {
1046
+ terr++;
1047
+ err = 1;
1048
+ }
1049
+ else {
1050
+ stat[2][out[t * N]]++;
1051
+ }
1052
+ }
1053
+
1054
+ tcnt = FIX2INT(rb_ivar_get(self, rb_intern("@token_count")));
1055
+ rb_ivar_set(self, rb_intern("@token_count"), INT2FIX(tcnt + (unsigned int)T));
1056
+
1057
+ terr += FIX2INT(rb_ivar_get(self, rb_intern("@token_errors")));
1058
+ rb_ivar_set(self, rb_intern("@token_errors"), INT2FIX(terr));
1059
+
1060
+ scnt = FIX2INT(rb_ivar_get(self, rb_intern("@sequence_count")));
1061
+ rb_ivar_set(self, rb_intern("@sequence_count"), INT2FIX(++scnt));
1062
+
1063
+ serr = FIX2INT(rb_ivar_get(self, rb_intern("@sequence_errors")));
1064
+ rb_ivar_set(self, rb_intern("@sequence_errors"), INT2FIX(serr + err));
1065
+
1066
+ }
1067
+
1068
+
1069
+ // Cleanup memory used for this sequence
1070
+ xfree(scs);
1071
+ xfree(psc);
1072
+ xfree(out);
1073
+
1074
+ rdr_freeseq(seq);
1075
+
1076
+ return sequence;
1068
1077
  }
1069
1078
 
1070
1079
  static VALUE decode_sequence_array(VALUE self, VALUE array) {
1071
- Check_Type(array, T_ARRAY);
1072
- const unsigned int n = RARRAY_LEN(array);
1073
-
1074
- mdl_t *model = get_model(self);
1075
- raw_t *raw;
1076
-
1077
- const unsigned int N = model->opt->nbest;
1078
- unsigned int i, j;
1079
-
1080
- VALUE result = rb_ary_new2(n * N), sequence;
1081
-
1082
- for (i = 0; i < n; ++i) {
1083
- sequence = rb_ary_entry(array, i);
1084
- Check_Type(sequence, T_ARRAY);
1085
-
1086
- const unsigned int k = RARRAY_LEN(sequence);
1087
- raw = xmalloc(sizeof(raw_t) + sizeof(char*) * k);
1088
- raw->len = k;
1089
-
1090
- for (j = 0; j < k; ++j) {
1091
- VALUE line = rb_ary_entry(sequence, j);
1092
- Check_Type(line, T_STRING);
1093
-
1094
- raw->lines[j] = StringValueCStr(line);
1095
- }
1096
-
1097
- rb_ary_push(result, decode_sequence(self, model, raw));
1098
-
1099
- xfree(raw);
1100
- }
1101
-
1102
- return result;
1080
+ Check_Type(array, T_ARRAY);
1081
+ const unsigned int n = RARRAY_LEN(array);
1082
+
1083
+ mdl_t *model = get_model(self);
1084
+ raw_t *raw;
1085
+
1086
+ const unsigned int N = model->opt->nbest;
1087
+ unsigned int i, j;
1088
+
1089
+ VALUE result = rb_ary_new2(n * N), sequence;
1090
+
1091
+ for (i = 0; i < n; ++i) {
1092
+ sequence = rb_ary_entry(array, i);
1093
+ Check_Type(sequence, T_ARRAY);
1094
+
1095
+ const unsigned int k = RARRAY_LEN(sequence);
1096
+ raw = wapiti_xmalloc(sizeof(raw_t) + sizeof(char*) * k);
1097
+ raw->len = k;
1098
+
1099
+ for (j = 0; j < k; ++j) {
1100
+ VALUE line = rb_ary_entry(sequence, j);
1101
+ Check_Type(line, T_STRING);
1102
+
1103
+ raw->lines[j] = StringValueCStr(line);
1104
+ }
1105
+
1106
+ rb_ary_push(result, decode_sequence(self, model, raw));
1107
+
1108
+ xfree(raw);
1109
+ }
1110
+
1111
+ return result;
1103
1112
  }
1104
1113
 
1105
1114
  static VALUE decode_sequence_file(VALUE self, VALUE path) {
1106
- Check_Type(path, T_STRING);
1107
- FILE *file;
1108
-
1109
- if (!(file = fopen(StringValueCStr(path), "r"))) {
1110
- rb_raise(cNativeError, "failed to label data: could not open file '%s'", StringValueCStr(path));
1111
- }
1112
-
1113
- mdl_t *model = get_model(self);
1114
- raw_t *raw;
1115
-
1116
- VALUE result = rb_ary_new();
1117
-
1118
- // Next read the input file sequence by sequence and label them, we have
1119
- // to take care of not discarding the raw input as we want to send it
1120
- // back to the output with the additional predicted labels.
1121
- while (!feof(file)) {
1122
-
1123
- // So, first read an input sequence keeping the raw_t object
1124
- // available, and label it with Viterbi.
1125
- if ((raw = rdr_readraw(model->reader, file)) == 0) {
1126
- break;
1127
- }
1128
-
1129
- rb_ary_push(result, decode_sequence(self, model, raw));
1130
- rdr_freeraw(raw);
1131
- }
1132
-
1133
- return result;
1115
+ Check_Type(path, T_STRING);
1116
+ FILE *file;
1117
+
1118
+ if (!(file = fopen(StringValueCStr(path), "r"))) {
1119
+ rb_raise(cNativeError, "failed to label data: could not open file '%s'", StringValueCStr(path));
1120
+ }
1121
+
1122
+ mdl_t *model = get_model(self);
1123
+ raw_t *raw;
1124
+
1125
+ VALUE result = rb_ary_new();
1126
+
1127
+ // Next read the input file sequence by sequence and label them, we have
1128
+ // to take care of not discarding the raw input as we want to send it
1129
+ // back to the output with the additional predicted labels.
1130
+ while (!feof(file)) {
1131
+
1132
+ // So, first read an input sequence keeping the raw_t object
1133
+ // available, and label it with Viterbi.
1134
+ if ((raw = rdr_readraw(model->reader, file)) == 0) {
1135
+ break;
1136
+ }
1137
+
1138
+ rb_ary_push(result, decode_sequence(self, model, raw));
1139
+ rdr_freeraw(raw);
1140
+ }
1141
+
1142
+ return result;
1134
1143
  }
1135
1144
 
1136
1145
  // cal-seq:
@@ -1138,144 +1147,146 @@ static VALUE decode_sequence_file(VALUE self, VALUE path) {
1138
1147
  // m.label(filename, options = {}) # => array of labelled tokens
1139
1148
  //
1140
1149
  static VALUE model_label(VALUE self, VALUE data) {
1141
- VALUE result;
1142
-
1143
- switch (TYPE(data)) {
1144
- case T_STRING:
1145
- result = decode_sequence_file(self, data);
1146
- break;
1147
- case T_ARRAY:
1148
- result = decode_sequence_array(self, data);
1149
- break;
1150
- default:
1151
- rb_raise(cNativeError, "failed to label data: invalid data (expected type String or Array)");
1152
- }
1153
-
1154
- return result;
1150
+ VALUE result;
1151
+
1152
+ switch (TYPE(data)) {
1153
+ case T_STRING:
1154
+ result = decode_sequence_file(self, data);
1155
+ break;
1156
+ case T_ARRAY:
1157
+ result = decode_sequence_array(self, data);
1158
+ break;
1159
+ default:
1160
+ rb_raise(cNativeError, "failed to label data: invalid data (expected type String or Array)");
1161
+ }
1162
+
1163
+ return result;
1155
1164
  }
1156
1165
 
1157
1166
  static void Init_model() {
1158
- cModel = rb_define_class_under(mWapiti, "Model", rb_cObject);
1159
- rb_define_alloc_func(cModel, allocate_model);
1160
-
1161
- rb_define_method(cModel, "initialize", initialize_model, -1);
1167
+ cModel = rb_define_class_under(mWapiti, "Model", rb_cObject);
1168
+ rb_define_alloc_func(cModel, allocate_model);
1169
+
1170
+ rb_define_method(cModel, "initialize", initialize_model, -1);
1171
+
1172
+ rb_define_attr(cModel, "options", 1, 0);
1173
+
1162
1174
 
1163
- rb_define_attr(cModel, "options", 1, 0);
1175
+ rb_define_method(cModel, "nlbl", model_nlbl, 0);
1176
+ rb_define_method(cModel, "labels", model_labels, 0);
1164
1177
 
1165
-
1166
- rb_define_method(cModel, "nlbl", model_nlbl, 0);
1167
- rb_define_method(cModel, "labels", model_labels, 0);
1168
-
1169
- rb_define_method(cModel, "nobs", model_nobs, 0);
1170
- rb_define_alias(cModel, "observations", "nobs");
1178
+ rb_define_method(cModel, "nobs", model_nobs, 0);
1179
+ rb_define_alias(cModel, "observations", "nobs");
1171
1180
 
1172
- rb_define_method(cModel, "nftr", model_nftr, 0);
1173
- rb_define_alias(cModel, "features", "nftr");
1181
+ rb_define_method(cModel, "nftr", model_nftr, 0);
1182
+ rb_define_alias(cModel, "features", "nftr");
1174
1183
 
1175
- rb_define_method(cModel, "total", model_total, 0);
1184
+ rb_define_method(cModel, "total", model_total, 0);
1176
1185
 
1177
- rb_define_method(cModel, "sync", model_sync, 0);
1178
- rb_define_method(cModel, "compact", model_compact, 0);
1179
- rb_define_method(cModel, "save", model_save, -1);
1180
- rb_define_method(cModel, "load", model_load, -1);
1186
+ rb_define_method(cModel, "sync", model_sync, 0);
1187
+ rb_define_method(cModel, "compact", model_compact, 0);
1188
+ rb_define_method(cModel, "save", model_save, -1);
1189
+ rb_define_method(cModel, "load", model_load, -1);
1181
1190
 
1182
- rb_define_method(cModel, "train", model_train, 1);
1183
- rb_define_method(cModel, "label", model_label, 1);
1191
+ rb_define_method(cModel, "train", model_train, 1);
1192
+ rb_define_method(cModel, "label", model_label, 1);
1184
1193
  }
1185
1194
 
1186
1195
  /* --- Top-Level Utility Methods --- */
1187
1196
 
1188
1197
 
1189
1198
  static VALUE label(VALUE self __attribute__((__unused__)), VALUE rb_options) {
1190
- if (strncmp("Wapiti::Options", rb_obj_classname(rb_options), 15) != 0) {
1191
- rb_raise(cNativeError, "argument must be a native options instance");
1192
- }
1199
+ if (strncmp("Wapiti::Options", rb_obj_classname(rb_options), 15) != 0) {
1200
+ rb_raise(cNativeError, "argument must be a native options instance");
1201
+ }
1193
1202
 
1194
- opt_t *options = get_options(rb_options);
1203
+ opt_t *options = get_options(rb_options);
1195
1204
 
1196
- if (options->mode != 1) {
1197
- rb_raise(cNativeError, "invalid options argument: mode should be set to 1 for labelling");
1198
- }
1205
+ if (options->mode != 1) {
1206
+ rb_raise(cNativeError, "invalid options argument: mode should be set to 1 for labelling");
1207
+ }
1199
1208
 
1200
- mdl_t *model = mdl_new(rdr_new(options->maxent));
1201
- model->opt = options;
1209
+ mdl_t *model = mdl_new(rdr_new(options->maxent));
1210
+ model->opt = options;
1202
1211
 
1203
- dolabel(model);
1204
-
1205
- mdl_free(model);
1212
+ dolabel(model);
1206
1213
 
1207
- return Qnil;
1214
+ mdl_free(model);
1215
+
1216
+ return Qnil;
1208
1217
  }
1209
1218
 
1219
+ #if defined EXTRA
1210
1220
  static VALUE dump(VALUE self __attribute__((__unused__)), VALUE rb_options) {
1211
- if (strncmp("Wapiti::Options", rb_obj_classname(rb_options), 15) != 0) {
1212
- rb_raise(cNativeError, "argument must be a native options instance");
1213
- }
1221
+ if (strncmp("Wapiti::Options", rb_obj_classname(rb_options), 15) != 0) {
1222
+ rb_raise(cNativeError, "argument must be a native options instance");
1223
+ }
1224
+
1225
+ opt_t *options = get_options(rb_options);
1214
1226
 
1215
- opt_t *options = get_options(rb_options);
1227
+ if (options->mode != 2) {
1228
+ rb_raise(cNativeError, "invalid options argument: mode should be set to 2 for training");
1229
+ }
1216
1230
 
1217
- if (options->mode != 2) {
1218
- rb_raise(cNativeError, "invalid options argument: mode should be set to 2 for training");
1219
- }
1231
+ mdl_t *model = mdl_new(rdr_new(options->maxent));
1232
+ model->opt = options;
1220
1233
 
1221
- mdl_t *model = mdl_new(rdr_new(options->maxent));
1222
- model->opt = options;
1234
+ dodump(model);
1223
1235
 
1224
- dodump(model);
1225
-
1226
- mdl_free(model);
1236
+ mdl_free(model);
1227
1237
 
1228
- return Qnil;
1238
+ return Qnil;
1229
1239
  }
1230
1240
 
1231
1241
  // This function is a proxy for Wapiti's main entry point.
1232
1242
  static VALUE wapiti(VALUE self __attribute__((__unused__)), VALUE arguments) {
1233
- int result = -1, argc = 0;
1234
- char **ap, *argv[18], *input, *tmp;
1235
-
1236
- Check_Type(arguments, T_STRING);
1237
- tmp = StringValueCStr(arguments);
1238
-
1239
- // allocate space for argument vector
1240
- input = (char*)malloc(strlen(tmp) + 8);
1241
-
1242
- // prepend command name
1243
- strncpy(input, "wapiti ", 8);
1244
- strncat(input, tmp, strlen(input) - 8);
1245
-
1246
- // remember allocation pointer
1247
- tmp = input;
1248
-
1249
- // turn input string into argument vector (using
1250
- // only the first seventeen tokens from input)
1251
- for (ap = argv; (*ap = strsep(&input, " \t")) != (char*)0; ++argc) {
1252
- if ((**ap != '\0') && (++ap >= &argv[18])) break;
1253
- }
1254
-
1255
- // call main entry point
1256
- result = wapiti_main(argc, argv);
1257
-
1258
- // free allocated memory
1259
- free(tmp);
1260
-
1261
- return INT2FIX(result);
1243
+ int result = -1, argc = 0;
1244
+ char **ap, *argv[18], *input, *tmp;
1245
+
1246
+ Check_Type(arguments, T_STRING);
1247
+ tmp = StringValueCStr(arguments);
1248
+
1249
+ // allocate space for argument vector
1250
+ input = (char*)malloc(strlen(tmp) + 8);
1251
+
1252
+ // prepend command name
1253
+ strncpy(input, "wapiti ", 8);
1254
+ strncat(input, tmp, strlen(input) - 8);
1255
+
1256
+ // remember allocation pointer
1257
+ tmp = input;
1258
+
1259
+ // turn input string into argument vector (using
1260
+ // only the first seventeen tokens from input)
1261
+ for (ap = argv; (*ap = strsep(&input, " \t")) != (char*)0; ++argc) {
1262
+ if ((**ap != '\0') && (++ap >= &argv[18])) break;
1263
+ }
1264
+
1265
+ // call main entry point
1266
+ result = wapiti_main(argc, argv);
1267
+
1268
+ // free allocated memory
1269
+ free(tmp);
1270
+
1271
+ return INT2FIX(result);
1262
1272
  }
1273
+ #endif
1263
1274
 
1264
1275
  /* --- Wapiti Extension Entry Point --- */
1265
1276
 
1266
1277
  void Init_native() {
1267
- mWapiti = rb_const_get(rb_mKernel, rb_intern("Wapiti"));
1268
- mNative = rb_define_module_under(mWapiti, "Native");
1269
-
1270
- cNativeError = rb_const_get(mWapiti, rb_intern("NativeError"));
1271
- cConfigurationError = rb_const_get(mWapiti, rb_intern("ConfigurationError"));
1272
- cLogger = rb_funcall(mWapiti, rb_intern("log"), 0);
1273
-
1274
- rb_define_singleton_method(mNative, "label", label, 1);
1275
- rb_define_singleton_method(mNative, "wapiti", wapiti, 1);
1276
-
1277
- rb_define_const(mNative, "VERSION", rb_str_new2(VERSION));
1278
-
1279
- Init_options();
1280
- Init_model();
1281
- }
1278
+ mWapiti = rb_const_get(rb_mKernel, rb_intern("Wapiti"));
1279
+ mNative = rb_define_module_under(mWapiti, "Native");
1280
+
1281
+ cNativeError = rb_const_get(mWapiti, rb_intern("NativeError"));
1282
+ cConfigurationError = rb_const_get(mWapiti, rb_intern("ConfigurationError"));
1283
+ cLogger = rb_funcall(mWapiti, rb_intern("log"), 0);
1284
+
1285
+ rb_define_singleton_method(mNative, "label", label, 1);
1286
+ // rb_define_singleton_method(mNative, "wapiti", wapiti, 1);
1287
+
1288
+ rb_define_const(mNative, "VERSION", rb_str_new2(VERSION));
1289
+
1290
+ Init_options();
1291
+ Init_model();
1292
+ }