wapiti 0.1.1 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (52) hide show
  1. checksums.yaml +5 -5
  2. data/HISTORY.md +8 -0
  3. data/LICENSE +1 -1
  4. data/README.md +39 -95
  5. data/ext/wapiti/bcd.c +1 -1
  6. data/ext/wapiti/extconf.rb +15 -1
  7. data/ext/wapiti/lbfgs.c +6 -6
  8. data/ext/wapiti/model.c +2 -3
  9. data/ext/wapiti/model.h +0 -7
  10. data/ext/wapiti/native.c +89 -239
  11. data/ext/wapiti/native.h +0 -5
  12. data/ext/wapiti/pattern.c +1 -1
  13. data/ext/wapiti/progress.c +19 -44
  14. data/ext/wapiti/progress.h +1 -4
  15. data/ext/wapiti/rprop.c +3 -4
  16. data/ext/wapiti/sgdl1.c +3 -3
  17. data/ext/wapiti/tools.c +36 -30
  18. data/ext/wapiti/tools.h +9 -4
  19. data/ext/wapiti/trainers.c +55 -0
  20. data/ext/wapiti/trainers.h +4 -1
  21. data/lib/wapiti.rb +4 -24
  22. data/lib/wapiti/dataset.rb +162 -0
  23. data/lib/wapiti/errors.rb +0 -4
  24. data/lib/wapiti/log.rb +29 -0
  25. data/lib/wapiti/model.rb +63 -40
  26. data/lib/wapiti/options.rb +66 -29
  27. data/lib/wapiti/sequence.rb +105 -0
  28. data/lib/wapiti/token.rb +74 -0
  29. data/lib/wapiti/version.rb +1 -1
  30. metadata +20 -80
  31. data/.autotest +0 -13
  32. data/.rspec +0 -3
  33. data/.simplecov +0 -3
  34. data/Gemfile +0 -29
  35. data/Rakefile +0 -63
  36. data/ext/wapiti/wapiti.c +0 -410
  37. data/spec/fixtures/ch.mod +0 -18550
  38. data/spec/fixtures/chpattern.txt +0 -52
  39. data/spec/fixtures/chtest.txt +0 -1973
  40. data/spec/fixtures/chtrain.txt +0 -19995
  41. data/spec/fixtures/nppattern.txt +0 -52
  42. data/spec/fixtures/nptest.txt +0 -1973
  43. data/spec/fixtures/nptrain.txt +0 -19995
  44. data/spec/fixtures/pattern.txt +0 -14
  45. data/spec/fixtures/test.txt +0 -60000
  46. data/spec/fixtures/train.txt +0 -1200
  47. data/spec/spec_helper.rb +0 -41
  48. data/spec/wapiti/model_spec.rb +0 -233
  49. data/spec/wapiti/native_spec.rb +0 -11
  50. data/spec/wapiti/options_spec.rb +0 -185
  51. data/spec/wapiti/utility_spec.rb +0 -22
  52. data/wapiti.gemspec +0 -49
data/ext/wapiti/native.h CHANGED
@@ -2,17 +2,12 @@
2
2
  #define native_h
3
3
 
4
4
  #include <ruby.h>
5
-
6
- #ifdef HAVE_RUBY_ENCODING_H
7
5
  #include <ruby/encoding.h>
8
- #endif
9
6
 
10
7
  extern VALUE mWapiti;
11
8
  extern VALUE mNative;
12
-
13
9
  extern VALUE cOptions;
14
10
  extern VALUE cModel;
15
-
16
11
  extern VALUE cNativeError;
17
12
  extern VALUE cLogger;
18
13
 
data/ext/wapiti/pattern.c CHANGED
@@ -340,7 +340,7 @@ char *pat_exec(const pat_t *pat, const tok_t *tok, uint32_t at) {
340
340
  else if (pos >= (int32_t)T)
341
341
  value = eval[min( pos - (int32_t)T, 4)];
342
342
  else if (col >= tok->cnts[pos])
343
- fatal("missing tokens, cannot apply pattern");
343
+ fatal("missing tokens at %d: %d/%d, cannot apply pattern", pos, col, tok->cnts[pos]);
344
344
  else
345
345
  value = tok->toks[pos][col];
346
346
  }
@@ -24,23 +24,18 @@
24
24
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25
25
  * POSSIBILITY OF SUCH DAMAGE.
26
26
  */
27
+
28
+
27
29
  #include <inttypes.h>
28
- #include <signal.h>
29
- #include <stdbool.h>
30
30
  #include <stddef.h>
31
- #include <stdint.h>
32
31
  #include <stdlib.h>
33
32
  #include <stdio.h>
34
-
33
+ #include <signal.h>
35
34
  #include <unistd.h>
36
- #include <sys/time.h>
37
- #include <sys/resource.h>
38
35
 
39
- #include "wapiti.h"
36
+ #include "progress.h"
40
37
  #include "decoder.h"
41
- #include "model.h"
42
38
  #include "options.h"
43
- #include "progress.h"
44
39
  #include "tools.h"
45
40
 
46
41
  /*******************************************************************************
@@ -74,7 +69,7 @@ bool uit_stop = false;
74
69
  /* uit_signal:
75
70
  * Signal handler to catch interrupt signal. When a signal is received, the
76
71
  * trainer is asked to stop as soon as possible leaving the model in a clean
77
- * state. We don't reinstall the handler so if user send a second interupt
72
+ * state. We don't reinstall the handler so if user send a second interrupt
78
73
  * signal, the program will stop immediately. (to cope with BSD system, we even
79
74
  * reinstall explicitly the default handler)
80
75
  */
@@ -85,15 +80,17 @@ static void uit_signal(int sig) {
85
80
 
86
81
  /* uit_setup:
87
82
  * Install the signal handler for clean early stop from the user if possible
88
- * and start the timer.
89
83
  */
90
84
  void uit_setup(mdl_t *mdl) {
91
85
  uit_stop = false;
92
- if (signal(SIGINT, uit_signal) == SIG_ERR)
86
+ if (signal(SIGINT, uit_signal) == SIG_ERR) {
93
87
  warning("failed to set signal handler, no clean early stop");
94
- gettimeofday(&mdl->timer, NULL);
95
- if (mdl->opt->stopwin != 0)
88
+ }
89
+
90
+ if (mdl->opt->stopwin != 0) {
96
91
  mdl->werr = wapiti_xmalloc(sizeof(double) * mdl->opt->stopwin);
92
+ }
93
+
97
94
  mdl->wcnt = mdl->wpos = 0;
98
95
  }
99
96
 
@@ -107,41 +104,23 @@ void uit_cleanup(mdl_t *mdl) {
107
104
  free(mdl->werr);
108
105
  mdl->werr = NULL;
109
106
  }
107
+
110
108
  signal(SIGINT, SIG_DFL);
111
109
  }
112
110
 
113
111
  /* uit_progress:
114
- * Display a progress repport to the user consisting of some informations
112
+ * Display a progress report to the user consisting of information
115
113
  * provided by the trainer: iteration count and objective function value, and
116
- * some informations computed here on the current model performances.
117
- * This function return true if the trainer have to keep training the model
114
+ * some information computed here on the current model performance.
115
+ * This function returns true if the trainer have to keep training the model
118
116
  * and false if it must stop, so this is where we will implement the trainer
119
117
  * independent stopping criterion.
120
118
  */
121
- bool uit_progress(mdl_t *mdl, uint32_t it, double obj) {
119
+ bool uit_progress(mdl_t *mdl) {
122
120
  // First we just compute the error rate on devel or train data
123
121
  double te, se;
124
122
  tag_eval(mdl, &te, &se);
125
- // Next, we compute the number of active features
126
- uint64_t act = 0;
127
- for (uint64_t f = 0; f < mdl->nftr; f++)
128
- if (mdl->theta[f] != 0.0)
129
- act++;
130
- // Compute timings. As some training algorithms are multi-threaded, we
131
- // cannot use ansi/c function and must rely on posix one to sum time
132
- // spent in main thread and in child ones.
133
- tms_t now; gettimeofday(&now, NULL);
134
- double tm = (now.tv_sec + (double)now.tv_usec * 1.0e-6)
135
- - (mdl->timer.tv_sec + (double)mdl->timer.tv_usec * 1.0e-6);
136
- mdl->total += tm;
137
- mdl->timer = now;
138
- // And display progress report
139
- info(" [%4"PRIu32"]", it);
140
- info(obj >= 0.0 ? " obj=%-10.2f" : " obj=NA", obj);
141
- info(" act=%-8"PRIu64, act);
142
- info(" err=%5.2f%%/%5.2f%%", te, se);
143
- info(" time=%.2fs/%.2fs", tm, mdl->total);
144
- info("\n");
123
+
145
124
  // If requested, check the error rate stopping criterion. We check if the
146
125
  // error rate is stable enough over a few iterations.
147
126
  bool res = true;
@@ -159,10 +138,6 @@ bool uit_progress(mdl_t *mdl, uint32_t it, double obj) {
159
138
  res = false;
160
139
  }
161
140
  }
162
- // And return
163
- if (uit_stop)
164
- return false;
165
- return res;
166
- }
167
-
168
141
 
142
+ return (uit_stop) ? false : res;
143
+ }
@@ -30,15 +30,12 @@
30
30
 
31
31
  #include <stdbool.h>
32
32
  #include <stdint.h>
33
-
34
- #include "wapiti.h"
35
33
  #include "model.h"
36
34
 
37
35
  extern bool uit_stop;
38
36
 
39
37
  void uit_setup(mdl_t *mdl);
40
38
  void uit_cleanup(mdl_t *mdl);
41
- bool uit_progress(mdl_t *mdl, uint32_t it, double obj);
39
+ bool uit_progress(mdl_t *mdl);
42
40
 
43
41
  #endif
44
-
data/ext/wapiti/rprop.c CHANGED
@@ -176,7 +176,7 @@ void trn_rprop(mdl_t *mdl) {
176
176
  for (uint64_t i = 0; i < nftr; i++) {
177
177
  uint64_t f;
178
178
  double vxp, vstp, vgp;
179
- if (fscanf(file, "%"PRIu64" %la %la %la\n", &f, &vxp,
179
+ if (fscanf(file, "%"PRIu64" %le %le %le\n", &f, &vxp,
180
180
  &vstp, &vgp) != 4)
181
181
  fatal(err);
182
182
  if (wbt && !cut) xp[f] = vxp;
@@ -199,11 +199,10 @@ void trn_rprop(mdl_t *mdl) {
199
199
  // And iterate the gradient computation / weight update process until
200
200
  // convergence or stop request
201
201
  for (uint32_t k = 0; !uit_stop && k < K; k++) {
202
- double fx = grd_gradient(grd);
203
202
  if (uit_stop)
204
203
  break;
205
204
  mth_spawn((func_t *)trn_rpropsub, W, (void **)rprop, 0, 0);
206
- if (uit_progress(mdl, k + 1, fx) == false)
205
+ if (uit_progress(mdl) == false)
207
206
  break;
208
207
  }
209
208
  // Save state if user requested it
@@ -216,7 +215,7 @@ void trn_rprop(mdl_t *mdl) {
216
215
  double vxp = xp != NULL ? xp[f] : 0.0;
217
216
  double vstp = stp[f], vgp = gp[f];
218
217
  fprintf(file, "%"PRIu64" ", f);
219
- fprintf(file, "%la %la %la\n", vxp, vstp, vgp);
218
+ fprintf(file, "%le %le %le\n", vxp, vstp, vgp);
220
219
  }
221
220
  fclose(file);
222
221
  }
data/ext/wapiti/sgdl1.c CHANGED
@@ -104,7 +104,7 @@ void trn_sgdl1(mdl_t *mdl) {
104
104
  // The index is a simple table indexed by sequences number. Each entry
105
105
  // point to two lists of observations terminated by <none>, one for
106
106
  // unigrams obss and one for bigrams obss.
107
- info(" - Build the index\n");
107
+ info("build the index");
108
108
  sgd_idx_t *idx = wapiti_xmalloc(sizeof(sgd_idx_t) * S);
109
109
  for (uint32_t s = 0; s < S; s++) {
110
110
  const seq_t *seq = mdl->train->seq[s];
@@ -126,7 +126,7 @@ void trn_sgdl1(mdl_t *mdl) {
126
126
  memcpy(idx[s].uobs, uobs, ucnt * sizeof(uint64_t));
127
127
  memcpy(idx[s].bobs, bobs, bcnt * sizeof(uint64_t));
128
128
  }
129
- info(" Done\n");
129
+ info("indexing done");
130
130
  // We will process sequences in random order in each iteration, so we
131
131
  // will have to permute them. The current permutation is stored in a
132
132
  // vector called <perm> shuffled at the start of each iteration. We
@@ -202,7 +202,7 @@ void trn_sgdl1(mdl_t *mdl) {
202
202
  if (uit_stop)
203
203
  break;
204
204
  // Report progress back to the user
205
- if (!uit_progress(mdl, k + 1, -1.0))
205
+ if (!uit_progress(mdl))
206
206
  break;
207
207
  }
208
208
  grd_stfree(grd_st);
data/ext/wapiti/tools.c CHANGED
@@ -26,13 +26,11 @@
26
26
  */
27
27
 
28
28
  #include <errno.h>
29
- #include <stdarg.h>
30
- #include <stddef.h>
31
29
  #include <stdlib.h>
32
- #include <stdio.h>
33
30
  #include <string.h>
34
31
 
35
32
  #include "tools.h"
33
+ #include "native.h"
36
34
 
37
35
  /*
38
36
  * Wapiti Ruby Logging
@@ -42,8 +40,20 @@
42
40
  *
43
41
  */
44
42
 
45
- #include "native.h"
43
+ FILE *ufopen(VALUE path, const char *mode) {
44
+ FILE *file = (FILE*)0;
45
+ Check_Type(path, T_STRING);
46
46
 
47
+ if (rb_obj_tainted(path)) {
48
+ fatal("failed to open file from tainted string '%s'", StringValueCStr(path));
49
+ }
50
+
51
+ if (!(file = fopen(StringValueCStr(path), mode))) {
52
+ pfatal("failed to open file '%s'", StringValueCStr(path));
53
+ }
54
+
55
+ return file;
56
+ }
47
57
 
48
58
  /*******************************************************************************
49
59
  * Error handling and memory management
@@ -63,13 +73,13 @@
63
73
  * formatting than the printf family and exit program with an error. We let the
64
74
  * OS care about freeing resources.
65
75
  */
66
- void fatal(const char *msg, ...) {
76
+ __attribute__((noreturn)) void fatal(const char *fmt, ...) {
77
+ VALUE msg;
67
78
  va_list args;
68
- va_start(args, msg);
69
-
70
- rb_raise(cNativeError, msg, args);
71
-
79
+ va_start(args, fmt);
80
+ msg = rb_vsprintf(fmt, args);
72
81
  va_end(args);
82
+ rb_raise(cNativeError, StringValueCStr(msg));
73
83
  }
74
84
 
75
85
  /* pfatal:
@@ -79,17 +89,15 @@ void fatal(const char *msg, ...) {
79
89
  * must be careful not to call another function that might reset it before
80
90
  * calling pfatal.
81
91
  */
82
- void pfatal(const char *msg, ...) {
83
- // const char *err = strerror(errno);
92
+ __attribute__((noreturn)) void pfatal(const char *fmt, ...) {
93
+ const char *err = strerror(errno);
94
+ VALUE msg;
84
95
  va_list args;
85
- va_start(args, msg);
86
-
87
- // VALUE message = rb_vsprintf(msg, args);
88
- // rb_str_catf(message, ": <%s>", err);
89
- rb_raise(cNativeError, msg, args);
90
-
96
+ va_start(args, fmt);
97
+ msg = rb_vsprintf(fmt, args);
91
98
  va_end(args);
92
-
99
+ rb_str_catf(msg, ": %s", err);
100
+ rb_raise(cNativeError, StringValueCStr(msg));
93
101
  }
94
102
 
95
103
  /* warning:
@@ -97,14 +105,13 @@ void pfatal(const char *msg, ...) {
97
105
  * exit the program. It is intended to inform the user that something strange
98
106
  * has happened and the result might not be what they expected.
99
107
  */
100
- void warning(const char *msg, ...) {
108
+ void warning(const char *fmt, ...) {
109
+ VALUE msg;
101
110
  va_list args;
102
- va_start(args, msg);
103
-
104
- // (void)rb_funcall(cLogger, rb_intern("warn"), 1, rb_vsprintf(msg, args));
105
- (void)rb_funcall(cLogger, rb_intern("warn"), 1, rb_str_new2(msg));
106
-
111
+ va_start(args, fmt);
112
+ msg = rb_vsprintf(fmt, args);
107
113
  va_end(args);
114
+ (void)rb_funcall(cLogger, rb_intern("warn"), 1, msg);
108
115
  }
109
116
 
110
117
  /* info:
@@ -113,14 +120,13 @@ void warning(const char *msg, ...) {
113
120
  * just a wrapper for printf to stderr. Note that unlike the previous one,
114
121
  * this function doesn't automatically append a new line character.
115
122
  */
116
- void info(const char *msg, ...) {
123
+ void info(const char *fmt, ...) {
124
+ VALUE msg;
117
125
  va_list args;
118
- va_start(args, msg);
119
-
120
- // (void)rb_funcall(cLogger, rb_intern("info"), 1, rb_vsprintf(msg, args));
121
- (void)rb_funcall(cLogger, rb_intern("info"), 1, rb_str_new2(msg));
122
-
126
+ va_start(args, fmt);
127
+ msg = rb_vsprintf(fmt, args);
123
128
  va_end(args);
129
+ (void)rb_funcall(cLogger, rb_intern("info"), 1, msg);
124
130
  }
125
131
 
126
132
  /* wapiti_xmalloc:
data/ext/wapiti/tools.h CHANGED
@@ -31,17 +31,22 @@
31
31
  #include <stddef.h>
32
32
  #include <stdint.h>
33
33
  #include <stdio.h>
34
+ #include <ruby.h>
34
35
 
35
36
  #define unused(v) ((void)(v))
36
37
  #define none ((uint64_t)-1)
37
38
 
39
+ #undef min
38
40
  #define min(a, b) ((a) < (b) ? (a) : (b))
41
+ #undef max
39
42
  #define max(a, b) ((a) < (b) ? (b) : (a))
40
43
 
41
- void fatal(const char *msg, ...);
42
- void pfatal(const char *msg, ...);
43
- void warning(const char *msg, ...);
44
- void info(const char *msg, ...);
44
+ FILE *ufopen(VALUE path, const char *mode);
45
+
46
+ void fatal(const char *fmt, ...);
47
+ void pfatal(const char *fmt, ...);
48
+ void warning(const char *fmt, ...);
49
+ void info(const char *fmt, ...);
45
50
 
46
51
  void *wapiti_xmalloc(size_t size);
47
52
  void *wapiti_xrealloc(void *ptr, size_t size);
@@ -0,0 +1,55 @@
1
+ #include <string.h>
2
+ #include "tools.h"
3
+ #include "trainers.h"
4
+
5
+ static const struct {
6
+ const char *name;
7
+ trn_t train;
8
+ } trn_lst[] = {
9
+ {"l-bfgs", trn_lbfgs},
10
+ {"sgd-l1", trn_sgdl1},
11
+ {"bcd", trn_bcd },
12
+ {"rprop", trn_rprop},
13
+ {"rprop+", trn_rprop},
14
+ {"rprop-", trn_rprop}
15
+ };
16
+
17
+ static const uint32_t trn_cnt = sizeof(trn_lst) / sizeof(trn_lst[0]);
18
+
19
+ trn_t trn_get(const char *algo) {
20
+ uint32_t trn;
21
+
22
+ for (trn = 0; trn < trn_cnt; trn++) {
23
+ if (!strcmp(algo, trn_lst[trn].name)) break;
24
+ }
25
+
26
+ if (trn == trn_cnt) {
27
+ fatal("unknown algorithm '%s'", algo);
28
+ trn = 0;
29
+ }
30
+
31
+ return trn_lst[trn].train;
32
+ }
33
+
34
+ static const char *typ_lst[] = {
35
+ "maxent",
36
+ "memm",
37
+ "crf"
38
+ };
39
+
40
+ static const uint32_t typ_cnt = sizeof(typ_lst) / sizeof(typ_lst[0]);
41
+
42
+ uint32_t typ_get(const char* type) {
43
+ uint32_t typ;
44
+
45
+ for (typ = 0; typ < typ_cnt; typ++) {
46
+ if (!strcmp(type, typ_lst[typ])) break;
47
+ }
48
+
49
+ if (typ == typ_cnt) {
50
+ fatal("unknown model type '%s'", type);
51
+ }
52
+
53
+ return typ;
54
+ }
55
+
@@ -35,5 +35,8 @@ void trn_sgdl1(mdl_t *mdl);
35
35
  void trn_bcd(mdl_t *mdl);
36
36
  void trn_rprop(mdl_t *mdl);
37
37
 
38
- #endif
38
+ typedef void (*trn_t)(mdl_t*);
39
+ trn_t trn_get(const char *algo);
40
+ uint32_t typ_get(const char *type);
39
41
 
42
+ #endif
data/lib/wapiti.rb CHANGED
@@ -1,30 +1,10 @@
1
-
2
- require 'logger'
3
- require 'tempfile'
4
-
5
1
  require 'wapiti/version'
6
-
7
- module Wapiti
8
-
9
- Logger = ::Logger.new(STDOUT)
10
- Logger.level = ::Logger::WARN
11
-
12
- class << self
13
- def log
14
- Logger
15
- end
16
-
17
- def debug!
18
- log.level == ::Logger::DEBUG
19
- end
20
- end
21
-
22
- end
23
-
24
2
  require 'wapiti/errors'
3
+ require 'wapiti/log'
4
+ require 'wapiti/token'
5
+ require 'wapiti/sequence'
6
+ require 'wapiti/dataset'
25
7
  require 'wapiti/native'
26
-
27
8
  require 'wapiti/options'
28
9
  require 'wapiti/model'
29
-
30
10
  require 'wapiti/utility'