wapiti 0.0.5 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53) hide show
  1. checksums.yaml +7 -0
  2. data/.simplecov +3 -0
  3. data/Gemfile +25 -2
  4. data/HISTORY.md +5 -1
  5. data/LICENSE +14 -13
  6. data/README.md +9 -16
  7. data/Rakefile +38 -8
  8. data/ext/wapiti/bcd.c +126 -124
  9. data/ext/wapiti/decoder.c +203 -124
  10. data/ext/wapiti/decoder.h +6 -4
  11. data/ext/wapiti/extconf.rb +2 -2
  12. data/ext/wapiti/gradient.c +491 -320
  13. data/ext/wapiti/gradient.h +52 -34
  14. data/ext/wapiti/lbfgs.c +74 -33
  15. data/ext/wapiti/model.c +47 -37
  16. data/ext/wapiti/model.h +22 -20
  17. data/ext/wapiti/native.c +850 -839
  18. data/ext/wapiti/native.h +1 -1
  19. data/ext/wapiti/options.c +52 -20
  20. data/ext/wapiti/options.h +37 -30
  21. data/ext/wapiti/pattern.c +35 -33
  22. data/ext/wapiti/pattern.h +12 -11
  23. data/ext/wapiti/progress.c +14 -13
  24. data/ext/wapiti/progress.h +3 -2
  25. data/ext/wapiti/quark.c +14 -16
  26. data/ext/wapiti/quark.h +6 -5
  27. data/ext/wapiti/reader.c +83 -69
  28. data/ext/wapiti/reader.h +11 -9
  29. data/ext/wapiti/rprop.c +84 -43
  30. data/ext/wapiti/sequence.h +18 -16
  31. data/ext/wapiti/sgdl1.c +45 -43
  32. data/ext/wapiti/thread.c +19 -17
  33. data/ext/wapiti/thread.h +5 -4
  34. data/ext/wapiti/tools.c +7 -7
  35. data/ext/wapiti/tools.h +3 -4
  36. data/ext/wapiti/trainers.h +1 -1
  37. data/ext/wapiti/vmath.c +40 -38
  38. data/ext/wapiti/vmath.h +12 -11
  39. data/ext/wapiti/wapiti.c +159 -37
  40. data/ext/wapiti/wapiti.h +18 -4
  41. data/lib/wapiti.rb +15 -15
  42. data/lib/wapiti/errors.rb +15 -15
  43. data/lib/wapiti/model.rb +92 -84
  44. data/lib/wapiti/options.rb +123 -124
  45. data/lib/wapiti/utility.rb +14 -14
  46. data/lib/wapiti/version.rb +2 -2
  47. data/spec/spec_helper.rb +29 -9
  48. data/spec/wapiti/model_spec.rb +230 -194
  49. data/spec/wapiti/native_spec.rb +7 -8
  50. data/spec/wapiti/options_spec.rb +184 -174
  51. data/wapiti.gemspec +22 -8
  52. metadata +38 -42
  53. data/.gitignore +0 -5
@@ -1,7 +1,7 @@
1
1
  /*
2
2
  * Wapiti - A linear-chain CRF tool
3
3
  *
4
- * Copyright (c) 2009-2011 CNRS
4
+ * Copyright (c) 2009-2013 CNRS
5
5
  * All rights reserved.
6
6
  *
7
7
  * Redistribution and use in source and binary forms, with or without
@@ -29,27 +29,28 @@
29
29
  #define pattern_h
30
30
 
31
31
  #include <stdbool.h>
32
+ #include <stdint.h>
32
33
 
33
34
  #include "sequence.h"
34
35
 
35
36
  typedef struct pat_s pat_t;
36
37
  typedef struct pat_item_s pat_item_t;
37
38
  struct pat_s {
38
- char *src;
39
- int ntoks;
40
- int nitems;
39
+ char *src;
40
+ uint32_t ntoks;
41
+ uint32_t nitems;
41
42
  struct pat_item_s {
42
- char type;
43
- bool caps;
44
- char *value;
45
- bool absolute;
46
- int offset;
47
- int column;
43
+ char type;
44
+ bool caps;
45
+ char *value;
46
+ bool absolute;
47
+ int32_t offset;
48
+ uint32_t column;
48
49
  } items[];
49
50
  };
50
51
 
51
52
  pat_t *pat_comp(char *p);
52
- char *pat_exec(const pat_t *pat, const tok_t *tok, int at);
53
+ char *pat_exec(const pat_t *pat, const tok_t *tok, uint32_t at);
53
54
  void pat_free(pat_t *pat);
54
55
 
55
56
  #endif
@@ -1,7 +1,7 @@
1
1
  /*
2
2
  * Wapiti - A linear-chain CRF tool
3
3
  *
4
- * Copyright (c) 2009-2011 CNRS
4
+ * Copyright (c) 2009-2013 CNRS
5
5
  * All rights reserved.
6
6
  *
7
7
  * Redistribution and use in source and binary forms, with or without
@@ -24,14 +24,16 @@
24
24
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25
25
  * POSSIBILITY OF SUCH DAMAGE.
26
26
  */
27
+ #include <inttypes.h>
27
28
  #include <signal.h>
28
29
  #include <stdbool.h>
29
30
  #include <stddef.h>
31
+ #include <stdint.h>
30
32
  #include <stdlib.h>
31
33
  #include <stdio.h>
32
34
 
33
35
  #include <unistd.h>
34
- #include <sys/times.h>
36
+ #include <sys/time.h>
35
37
  #include <sys/resource.h>
36
38
 
37
39
  #include "wapiti.h"
@@ -89,7 +91,7 @@ void uit_setup(mdl_t *mdl) {
89
91
  uit_stop = false;
90
92
  if (signal(SIGINT, uit_signal) == SIG_ERR)
91
93
  warning("failed to set signal handler, no clean early stop");
92
- times(&mdl->timer);
94
+ gettimeofday(&mdl->timer, NULL);
93
95
  if (mdl->opt->stopwin != 0)
94
96
  mdl->werr = wapiti_xmalloc(sizeof(double) * mdl->opt->stopwin);
95
97
  mdl->wcnt = mdl->wpos = 0;
@@ -116,28 +118,27 @@ void uit_cleanup(mdl_t *mdl) {
116
118
  * and false if he must stop, so this is were we will implement the trainer
117
119
  * independant stoping criterion.
118
120
  */
119
- bool uit_progress(mdl_t *mdl, int it, double obj) {
121
+ bool uit_progress(mdl_t *mdl, uint32_t it, double obj) {
120
122
  // First we just compute the error rate on devel or train data
121
123
  double te, se;
122
124
  tag_eval(mdl, &te, &se);
123
125
  // Next, we compute the number of active features
124
- size_t act = 0;
125
- for (size_t f = 0; f < mdl->nftr; f++)
126
+ uint64_t act = 0;
127
+ for (uint64_t f = 0; f < mdl->nftr; f++)
126
128
  if (mdl->theta[f] != 0.0)
127
129
  act++;
128
130
  // Compute timings. As some training algorithms are multi-threaded, we
129
131
  // cannot use ansi/c function and must rely on posix one to sum time
130
132
  // spent in main thread and in child ones.
131
- tms_t now; times(&now);
132
- double tm = (now.tms_utime - mdl->timer.tms_utime )
133
- + (now.tms_cutime - mdl->timer.tms_cutime);
134
- tm /= sysconf(_SC_CLK_TCK);
133
+ tms_t now; gettimeofday(&now, NULL);
134
+ double tm = (now.tv_sec + (double)now.tv_usec * 1.0e-6)
135
+ - (mdl->timer.tv_sec + (double)mdl->timer.tv_usec * 1.0e-6);
135
136
  mdl->total += tm;
136
137
  mdl->timer = now;
137
138
  // And display progress report
138
- info(" [%4d]", it);
139
+ info(" [%4"PRIu32"]", it);
139
140
  info(obj >= 0.0 ? " obj=%-10.2f" : " obj=NA", obj);
140
- info(" act=%-8zu", act);
141
+ info(" act=%-8"PRIu64, act);
141
142
  info(" err=%5.2f%%/%5.2f%%", te, se);
142
143
  info(" time=%.2fs/%.2fs", tm, mdl->total);
143
144
  info("\n");
@@ -150,7 +151,7 @@ bool uit_progress(mdl_t *mdl, int it, double obj) {
150
151
  mdl->wcnt++;
151
152
  if (mdl->wcnt >= mdl->opt->stopwin) {
152
153
  double emin = 200.0, emax = -100.0;
153
- for (int i = 0; i < mdl->opt->stopwin; i++) {
154
+ for (uint32_t i = 0; i < mdl->opt->stopwin; i++) {
154
155
  emin = min(emin, mdl->werr[i]);
155
156
  emax = max(emax, mdl->werr[i]);
156
157
  }
@@ -1,7 +1,7 @@
1
1
  /*
2
2
  * Wapiti - A linear-chain CRF tool
3
3
  *
4
- * Copyright (c) 2009-2011 CNRS
4
+ * Copyright (c) 2009-2013 CNRS
5
5
  * All rights reserved.
6
6
  *
7
7
  * Redistribution and use in source and binary forms, with or without
@@ -29,6 +29,7 @@
29
29
  #define progress_h
30
30
 
31
31
  #include <stdbool.h>
32
+ #include <stdint.h>
32
33
 
33
34
  #include "wapiti.h"
34
35
  #include "model.h"
@@ -37,7 +38,7 @@ extern bool uit_stop;
37
38
 
38
39
  void uit_setup(mdl_t *mdl);
39
40
  void uit_cleanup(mdl_t *mdl);
40
- bool uit_progress(mdl_t *mdl, int it, double obj);
41
+ bool uit_progress(mdl_t *mdl, uint32_t it, double obj);
41
42
 
42
43
  #endif
43
44
 
@@ -1,7 +1,7 @@
1
1
  /*
2
2
  * Wapiti - A linear-chain CRF tool
3
3
  *
4
- * Copyright (c) 2009-2011 CNRS
4
+ * Copyright (c) 2009-2013 CNRS
5
5
  * All rights reserved.
6
6
  *
7
7
  * Redistribution and use in source and binary forms, with or without
@@ -24,11 +24,12 @@
24
24
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25
25
  * POSSIBILITY OF SUCH DAMAGE.
26
26
  */
27
+ #include <inttypes.h>
27
28
  #include <stdbool.h>
28
29
  #include <stddef.h>
29
30
  #include <stdlib.h>
30
- #include <stdio.h>
31
31
  #include <stdint.h>
32
+ #include <stdio.h>
32
33
  #include <string.h>
33
34
 
34
35
  #include "quark.h"
@@ -47,7 +48,7 @@
47
48
  * Information Coded in Alphanumeric, Journal of the ACM 15 (4): pp. 514--534,
48
49
  * 1968. DOI:10.1145/321479.321481
49
50
  *
50
- * This code is copyright 2002-2011 Thomas Lavergne and licenced under the BSD
51
+ * This code is copyright 2002-2013 Thomas Lavergne and licenced under the BSD
51
52
  * Licence like the remaining of Wapiti.
52
53
  ******************************************************************************/
53
54
 
@@ -68,8 +69,6 @@ struct qrk_s {
68
69
  uint64_t size;
69
70
  };
70
71
 
71
- #define qrk_none ((uint64_t)-1)
72
-
73
72
  #define qrk_lf2nd(lf) ((node_t *)((intptr_t)(lf) | 1))
74
73
  #define qrk_nd2lf(nd) ((leaf_t *)((intptr_t)(nd) & ~1))
75
74
  #define qrk_isleaf(nd) ((intptr_t)(nd) & 1)
@@ -86,7 +85,7 @@ qrk_t *qrk_new(void) {
86
85
  qrk->count = 0;
87
86
  qrk->lock = false;
88
87
  qrk->size = size;
89
- qrk->leafs = wapiti_xmalloc(sizeof(leaf_t) * size);
88
+ qrk->leafs = wapiti_xmalloc(sizeof(leaf_t *) * size);
90
89
  return qrk;
91
90
  }
92
91
 
@@ -96,10 +95,10 @@ qrk_t *qrk_new(void) {
96
95
  * qrk_unmap become invalid and must not be used anymore.
97
96
  */
98
97
  void qrk_free(qrk_t *qrk) {
99
- const size_t stkmax = 1024;
98
+ const uint32_t stkmax = 1024;
100
99
  if (qrk->count != 0) {
101
100
  node_t *stk[stkmax];
102
- int cnt = 0;
101
+ uint32_t cnt = 0;
103
102
  stk[cnt++] = qrk->root;
104
103
  while (cnt != 0) {
105
104
  node_t *nd = stk[--cnt];
@@ -122,7 +121,7 @@ void qrk_free(qrk_t *qrk) {
122
121
  * pair inside the quark. This function is not thread safe and should not be
123
122
  * called on the same map from different thread without locking.
124
123
  */
125
- size_t qrk_str2id(qrk_t *qrk, const char *key) {
124
+ uint64_t qrk_str2id(qrk_t *qrk, const char *key) {
126
125
  const uint8_t *raw = (void *)key;
127
126
  const size_t len = strlen(key);
128
127
  // We first take care of the empty trie case so later we can safely
@@ -213,7 +212,7 @@ size_t qrk_str2id(qrk_t *qrk, const char *key) {
213
212
  * remain valid only for the life time of the quark, a call to qrk_free will
214
213
  * make this pointer invalid.
215
214
  */
216
- const char *qrk_id2str(const qrk_t *qrk, size_t id) {
215
+ const char *qrk_id2str(const qrk_t *qrk, uint64_t id) {
217
216
  if (id >= qrk->count)
218
217
  fatal("invalid identifier");
219
218
  return qrk->leafs[id]->key;
@@ -225,7 +224,7 @@ const char *qrk_id2str(const qrk_t *qrk, size_t id) {
225
224
  * number correspond to the id.
226
225
  */
227
226
  void qrk_save(const qrk_t *qrk, FILE *file) {
228
- if (fprintf(file, "#qrk#%zu\n", (size_t)qrk->count) < 0)
227
+ if (fprintf(file, "#qrk#%"PRIu64"\n", qrk->count) < 0)
229
228
  pfatal("cannot write to file");
230
229
  if (qrk->count == 0)
231
230
  return;
@@ -240,13 +239,13 @@ void qrk_save(const qrk_t *qrk, FILE *file) {
240
239
  * initilay empty, this will load a map exactly as saved by qrk_save.
241
240
  */
242
241
  void qrk_load(qrk_t *qrk, FILE *file) {
243
- size_t cnt = 0;
244
- if (fscanf(file, "#qrk#%zu\n", &cnt) != 1) {
242
+ uint64_t cnt = 0;
243
+ if (fscanf(file, "#qrk#%"SCNu64"\n", &cnt) != 1) {
245
244
  if (ferror(file) != 0)
246
245
  pfatal("cannot read from file");
247
246
  pfatal("invalid format");
248
247
  }
249
- for (size_t n = 0; n < cnt; ++n) {
248
+ for (uint64_t n = 0; n < cnt; ++n) {
250
249
  char *str = ns_readstr(file);
251
250
  qrk_str2id(qrk, str);
252
251
  free(str);
@@ -256,7 +255,7 @@ void qrk_load(qrk_t *qrk, FILE *file) {
256
255
  /* qrk_count:
257
256
  * Return the number of mappings stored in the quark.
258
257
  */
259
- size_t qrk_count(const qrk_t *qrk) {
258
+ uint64_t qrk_count(const qrk_t *qrk) {
260
259
  return qrk->count;
261
260
  }
262
261
 
@@ -269,4 +268,3 @@ bool qrk_lock(qrk_t *qrk, bool lock) {
269
268
  return old;
270
269
  }
271
270
 
272
-
@@ -1,7 +1,7 @@
1
1
  /*
2
2
  * Wapiti - A linear-chain CRF tool
3
3
  *
4
- * Copyright (c) 2009-2011 CNRS
4
+ * Copyright (c) 2009-2013 CNRS
5
5
  * All rights reserved.
6
6
  *
7
7
  * Redistribution and use in source and binary forms, with or without
@@ -28,17 +28,18 @@
28
28
  #ifndef quark_h
29
29
  #define quark_h
30
30
 
31
- #include <stddef.h>
31
+ #include <stdbool.h>
32
+ #include <stdint.h>
32
33
  #include <stdio.h>
33
34
 
34
35
  typedef struct qrk_s qrk_t;
35
36
 
36
37
  qrk_t *qrk_new(void);
37
38
  void qrk_free(qrk_t *qrk);
38
- size_t qrk_count(const qrk_t *qrk);
39
+ uint64_t qrk_count(const qrk_t *qrk);
39
40
  bool qrk_lock(qrk_t *qrk, bool lock);
40
- const char *qrk_id2str(const qrk_t *qrk, size_t id);
41
- size_t qrk_str2id(qrk_t *qrk, const char *key);
41
+ const char *qrk_id2str(const qrk_t *qrk, uint64_t id);
42
+ uint64_t qrk_str2id(qrk_t *qrk, const char *key);
42
43
  void qrk_load(qrk_t *qrk, FILE *file);
43
44
  void qrk_save(const qrk_t *qrk, FILE *file);
44
45
 
@@ -1,7 +1,7 @@
1
1
  /*
2
2
  * Wapiti - A linear-chain CRF tool
3
3
  *
4
- * Copyright (c) 2009-2011 CNRS
4
+ * Copyright (c) 2009-2013 CNRS
5
5
  * All rights reserved.
6
6
  *
7
7
  * Redistribution and use in source and binary forms, with or without
@@ -25,6 +25,7 @@
25
25
  * POSSIBILITY OF SUCH DAMAGE.
26
26
  */
27
27
  #include <ctype.h>
28
+ #include <inttypes.h>
28
29
  #include <stdbool.h>
29
30
  #include <stddef.h>
30
31
  #include <stdlib.h>
@@ -59,12 +60,14 @@
59
60
  ******************************************************************************/
60
61
 
61
62
  /* rdr_new:
62
- * Create a new empty reader object. You mut load patterns in it or a
63
- * previously saved reader if you want to use it for reading sequences.
63
+ * Create a new empty reader object. If no patterns are loaded before you
64
+ * start using the reader the input data are assumed to be already prepared
65
+ * list of features. They must either start with a prefix 'u', 'b', or '*', or
66
+ * you must set autouni to true in order to automatically add a 'u' prefix.
64
67
  */
65
- rdr_t *rdr_new(bool maxent) {
68
+ rdr_t *rdr_new(bool autouni) {
66
69
  rdr_t *rdr = wapiti_xmalloc(sizeof(rdr_t));
67
- rdr->maxent = maxent;
70
+ rdr->autouni = autouni;
68
71
  rdr->npats = rdr->nuni = rdr->nbi = 0;
69
72
  rdr->ntoks = 0;
70
73
  rdr->pats = NULL;
@@ -78,7 +81,7 @@ rdr_t *rdr_new(bool maxent) {
78
81
  * any string returned by them must not be used after this call.
79
82
  */
80
83
  void rdr_free(rdr_t *rdr) {
81
- for (int i = 0; i < rdr->npats; i++)
84
+ for (uint32_t i = 0; i < rdr->npats; i++)
82
85
  pat_free(rdr->pats[i]);
83
86
  free(rdr->pats);
84
87
  qrk_free(rdr->lbl);
@@ -90,7 +93,7 @@ void rdr_free(rdr_t *rdr) {
90
93
  * Free all memory used by a raw_t object.
91
94
  */
92
95
  void rdr_freeraw(raw_t *raw) {
93
- for (int t = 0; t < raw->len; t++)
96
+ for (uint32_t t = 0; t < raw->len; t++)
94
97
  free(raw->lines[t]);
95
98
  free(raw);
96
99
  }
@@ -107,7 +110,7 @@ void rdr_freeseq(seq_t *seq) {
107
110
  * Free all memory used by a dat_t object.
108
111
  */
109
112
  void rdr_freedat(dat_t *dat) {
110
- for (size_t i = 0; i < dat->nseq; i++)
113
+ for (uint32_t i = 0; i < dat->nseq; i++)
111
114
  rdr_freeseq(dat->seq[i]);
112
115
  free(dat->seq);
113
116
  free(dat);
@@ -118,11 +121,11 @@ void rdr_freedat(dat_t *dat) {
118
121
  * available memory, a buffer large enough is allocated and returned. The
119
122
  * caller is responsible to free it. On end-of-file, NULL is returned.
120
123
  */
121
- static char *rdr_readline(FILE *file) {
124
+ char *rdr_readline(FILE *file) {
122
125
  if (feof(file))
123
126
  return NULL;
124
127
  // Initialize the buffer
125
- int len = 0, size = 16;
128
+ uint32_t len = 0, size = 16;
126
129
  char *buffer = wapiti_xmalloc(size);
127
130
  // We read the line chunk by chunk until end of line, file or error
128
131
  while (!feof(file)) {
@@ -203,7 +206,7 @@ raw_t *rdr_readraw(rdr_t *rdr, FILE *file) {
203
206
  if (feof(file))
204
207
  return NULL;
205
208
  // Prepare the raw sequence object
206
- int size = 32, cnt = 0;
209
+ uint32_t size = 32, cnt = 0;
207
210
  raw_t *raw = wapiti_xmalloc(sizeof(raw_t) + sizeof(char *) * size);
208
211
  // And read the next sequence in the file, this will skip any blank line
209
212
  // before reading the sequence stoping at end of file or on a new blank
@@ -232,9 +235,9 @@ raw_t *rdr_readraw(rdr_t *rdr, FILE *file) {
232
235
  + sizeof(char *) * size);
233
236
  }
234
237
  raw->lines[cnt++] = line;
235
- // In maxent mode, we only have to load one line for each sample
236
- // so we can stop here.
237
- if (rdr->maxent)
238
+ // In autouni mode, there will be only unigram features so we
239
+ // can use small sequences to improve multi-theading.
240
+ if (rdr->autouni)
238
241
  break;
239
242
  }
240
243
  // If no lines was read, we just free allocated memory and return NULL
@@ -251,13 +254,12 @@ raw_t *rdr_readraw(rdr_t *rdr, FILE *file) {
251
254
 
252
255
  /* rdr_mapobs:
253
256
  * Map an observation to its identifier, automatically adding a 'u' prefix in
254
- * pure maxent mode.
257
+ * 'autouni' mode.
255
258
  */
256
- static size_t rdr_mapobs(rdr_t *rdr, const char *str) {
257
- if (!rdr->maxent)
259
+ static uint64_t rdr_mapobs(rdr_t *rdr, const char *str) {
260
+ if (!rdr->autouni)
258
261
  return qrk_str2id(rdr->obs, str);
259
- size_t len = strlen(str) + 2;
260
- char tmp[len];
262
+ char tmp[strlen(str) + 2];
261
263
  tmp[0] = 'u';
262
264
  strcpy(tmp + 1, str);
263
265
  return qrk_str2id(rdr->obs, tmp);
@@ -268,13 +270,13 @@ static size_t rdr_mapobs(rdr_t *rdr, const char *str) {
268
270
  * applying patterns.
269
271
  */
270
272
  static seq_t *rdr_rawtok2seq(rdr_t *rdr, const tok_t *tok) {
271
- const int T = tok->len;
272
- int size = 0;
273
- if (rdr->maxent) {
273
+ const uint32_t T = tok->len;
274
+ uint32_t size = 0;
275
+ if (rdr->autouni) {
274
276
  size = tok->cnts[0];
275
277
  } else {
276
- for (int t = 0; t < T; t++) {
277
- for (int n = 0; n < tok->cnts[t]; n++) {
278
+ for (uint32_t t = 0; t < T; t++) {
279
+ for (uint32_t n = 0; n < tok->cnts[t]; n++) {
278
280
  const char *o = tok->toks[t][n];
279
281
  switch (o[0]) {
280
282
  case 'u': size += 1; break;
@@ -287,30 +289,30 @@ static seq_t *rdr_rawtok2seq(rdr_t *rdr, const tok_t *tok) {
287
289
  }
288
290
  }
289
291
  seq_t *seq = wapiti_xmalloc(sizeof(seq_t) + sizeof(pos_t) * T);
290
- seq->raw = wapiti_xmalloc(sizeof(size_t) * size);
292
+ seq->raw = wapiti_xmalloc(sizeof(uint64_t) * size);
291
293
  seq->len = T;
292
- size_t *raw = seq->raw;
293
- for (int t = 0; t < T; t++) {
294
- seq->pos[t].lbl = none;
294
+ uint64_t *raw = seq->raw;
295
+ for (uint32_t t = 0; t < T; t++) {
296
+ seq->pos[t].lbl = (uint32_t)-1;
295
297
  seq->pos[t].ucnt = 0;
296
298
  seq->pos[t].uobs = raw;
297
- for (int n = 0; n < tok->cnts[t]; n++) {
298
- if (tok->toks[t][n][0] == 'b')
299
+ for (uint32_t n = 0; n < tok->cnts[t]; n++) {
300
+ if (!rdr->autouni && tok->toks[t][n][0] == 'b')
299
301
  continue;
300
- size_t id = rdr_mapobs(rdr, tok->toks[t][n]);
302
+ uint64_t id = rdr_mapobs(rdr, tok->toks[t][n]);
301
303
  if (id != none) {
302
304
  (*raw++) = id;
303
305
  seq->pos[t].ucnt++;
304
306
  }
305
307
  }
306
308
  seq->pos[t].bcnt = 0;
307
- if (rdr->maxent)
309
+ if (rdr->autouni)
308
310
  continue;
309
311
  seq->pos[t].bobs = raw;
310
- for (int n = 0; n < tok->cnts[t]; n++) {
312
+ for (uint32_t n = 0; n < tok->cnts[t]; n++) {
311
313
  if (tok->toks[t][n][0] == 'u')
312
314
  continue;
313
- size_t id = rdr_mapobs(rdr, tok->toks[t][n]);
315
+ uint64_t id = rdr_mapobs(rdr, tok->toks[t][n]);
314
316
  if (id != none) {
315
317
  (*raw++) = id;
316
318
  seq->pos[t].bcnt++;
@@ -319,9 +321,9 @@ static seq_t *rdr_rawtok2seq(rdr_t *rdr, const tok_t *tok) {
319
321
  }
320
322
  // And finally, if the user specified it, populate the labels
321
323
  if (tok->lbl != NULL) {
322
- for (int t = 0; t < T; t++) {
324
+ for (uint32_t t = 0; t < T; t++) {
323
325
  const char *lbl = tok->lbl[t];
324
- size_t id = qrk_str2id(rdr->lbl, lbl);
326
+ uint64_t id = qrk_str2id(rdr->lbl, lbl);
325
327
  seq->pos[t].lbl = id;
326
328
  }
327
329
  }
@@ -332,35 +334,35 @@ static seq_t *rdr_rawtok2seq(rdr_t *rdr, const tok_t *tok) {
332
334
  * Convert a tok_t to a seq_t object by applying the patterns of the reader.
333
335
  */
334
336
  static seq_t *rdr_pattok2seq(rdr_t *rdr, const tok_t *tok) {
335
- const int T = tok->len;
337
+ const uint32_t T = tok->len;
336
338
  // So now the tok object is ready, we can start building the seq_t
337
339
  // object by appling patterns. First we allocate the seq_t object. The
338
340
  // sequence itself as well as the sub array are allocated in one time.
339
341
  seq_t *seq = wapiti_xmalloc(sizeof(seq_t) + sizeof(pos_t) * T);
340
- seq->raw = wapiti_xmalloc(sizeof(size_t) * (rdr->nuni + rdr->nbi) * T);
342
+ seq->raw = wapiti_xmalloc(sizeof(uint64_t) * (rdr->nuni + rdr->nbi) * T);
341
343
  seq->len = T;
342
- size_t *tmp = seq->raw;
343
- for (int t = 0; t < T; t++) {
344
- seq->pos[t].lbl = none;
344
+ uint64_t *tmp = seq->raw;
345
+ for (uint32_t t = 0; t < T; t++) {
346
+ seq->pos[t].lbl = (uint32_t)-1;
345
347
  seq->pos[t].uobs = tmp; tmp += rdr->nuni;
346
348
  seq->pos[t].bobs = tmp; tmp += rdr->nbi;
347
349
  }
348
350
  // Next, we can build the observations list by applying the patterns on
349
351
  // the tok_t sequence.
350
- for (int t = 0; t < T; t++) {
352
+ for (uint32_t t = 0; t < T; t++) {
351
353
  pos_t *pos = &seq->pos[t];
352
354
  pos->ucnt = 0;
353
355
  pos->bcnt = 0;
354
- for (int x = 0; x < rdr->npats; x++) {
356
+ for (uint32_t x = 0; x < rdr->npats; x++) {
355
357
  // Get the observation and map it to an identifier
356
358
  char *obs = pat_exec(rdr->pats[x], tok, t);
357
- size_t id = rdr_mapobs(rdr, obs);
359
+ uint64_t id = rdr_mapobs(rdr, obs);
358
360
  if (id == none) {
359
361
  free(obs);
360
362
  continue;
361
363
  }
362
364
  // If the observation is ok, add it to the lists
363
- int kind = 0;
365
+ char kind = 0;
364
366
  switch (obs[0]) {
365
367
  case 'u': kind = 1; break;
366
368
  case 'b': kind = 2; break;
@@ -375,9 +377,9 @@ static seq_t *rdr_pattok2seq(rdr_t *rdr, const tok_t *tok) {
375
377
  }
376
378
  // And finally, if the user specified it, populate the labels
377
379
  if (tok->lbl != NULL) {
378
- for (int t = 0; t < T; t++) {
380
+ for (uint32_t t = 0; t < T; t++) {
379
381
  const char *lbl = tok->lbl[t];
380
- size_t id = qrk_str2id(rdr->lbl, lbl);
382
+ uint64_t id = qrk_str2id(rdr->lbl, lbl);
381
383
  seq->pos[t].lbl = id;
382
384
  }
383
385
  }
@@ -390,11 +392,11 @@ static seq_t *rdr_pattok2seq(rdr_t *rdr, const tok_t *tok) {
390
392
  * interned also.
391
393
  */
392
394
  seq_t *rdr_raw2seq(rdr_t *rdr, const raw_t *raw, bool lbl) {
393
- const int T = raw->len;
395
+ const uint32_t T = raw->len;
394
396
  // Allocate the tok_t object, the label array is allocated only if they
395
397
  // are requested by the user.
396
398
  tok_t *tok = wapiti_xmalloc(sizeof(tok_t) + T * sizeof(char **));
397
- tok->cnts = wapiti_xmalloc(sizeof(size_t) * T);
399
+ tok->cnts = wapiti_xmalloc(sizeof(uint32_t) * T);
398
400
  tok->lbl = NULL;
399
401
  if (lbl == true)
400
402
  tok->lbl = wapiti_xmalloc(sizeof(char *) * T);
@@ -402,16 +404,15 @@ seq_t *rdr_raw2seq(rdr_t *rdr, const raw_t *raw, bool lbl) {
402
404
  // tokens. To reduce memory fragmentation, the raw line is copied and
403
405
  // his reference is kept by the first tokens, next tokens are pointer to
404
406
  // this copy.
405
- for (int t = 0; t < T; t++) {
407
+ for (uint32_t t = 0; t < T; t++) {
406
408
  // Get a copy of the raw line skiping leading space characters
407
409
  const char *src = raw->lines[t];
408
410
  while (isspace(*src))
409
411
  src++;
410
412
  char *line = xstrdup(src);
411
413
  // Split it in tokens
412
- const int len = strlen(line);
413
- char *toks[len / 2];
414
- int cnt = 0;
414
+ char *toks[strlen(line) / 2 + 1];
415
+ uint32_t cnt = 0;
415
416
  while (*line != '\0') {
416
417
  toks[cnt++] = line;
417
418
  while (*line != '\0' && !isspace(*line))
@@ -441,7 +442,7 @@ seq_t *rdr_raw2seq(rdr_t *rdr, const raw_t *raw, bool lbl) {
441
442
  else
442
443
  seq = rdr_pattok2seq(rdr, tok);
443
444
  // Before returning the sequence, we have to free the tok_t
444
- for (int t = 0; t < T; t++) {
445
+ for (uint32_t t = 0; t < T; t++) {
445
446
  if (tok->cnts[t] == 0)
446
447
  continue;
447
448
  free(tok->toks[t][0]);
@@ -477,7 +478,7 @@ seq_t *rdr_readseq(rdr_t *rdr, FILE *file, bool lbl) {
477
478
  */
478
479
  dat_t *rdr_readdat(rdr_t *rdr, FILE *file, bool lbl) {
479
480
  // Prepare dataset
480
- size_t size = 1000;
481
+ uint32_t size = 1000;
481
482
  dat_t *dat = wapiti_xmalloc(sizeof(dat_t));
482
483
  dat->nseq = 0;
483
484
  dat->mlen = 0;
@@ -498,7 +499,7 @@ dat_t *rdr_readdat(rdr_t *rdr, FILE *file, bool lbl) {
498
499
  dat->seq[dat->nseq++] = seq;
499
500
  dat->mlen = max(dat->mlen, seq->len);
500
501
  if (dat->nseq % 1000 == 0)
501
- info("%7d sequences loaded\n", dat->nseq);
502
+ info("%7"PRIu32" sequences loaded\n", dat->nseq);
502
503
  }
503
504
  // If no sequence readed, cleanup and repport
504
505
  if (dat->nseq == 0) {
@@ -520,18 +521,30 @@ dat_t *rdr_readdat(rdr_t *rdr, FILE *file, bool lbl) {
520
521
  */
521
522
  void rdr_load(rdr_t *rdr, FILE *file) {
522
523
  const char *err = "broken file, invalid reader format";
523
- if (fscanf(file, "#rdr#%d/%d\n", &rdr->npats, &rdr->ntoks) != 2)
524
- fatal(err);
524
+ int autouni = rdr->autouni;
525
+ fpos_t pos;
526
+ fgetpos(file, &pos);
527
+ if (fscanf(file, "#rdr#%"PRIu32"/%"PRIu32"/%d\n",
528
+ &rdr->npats, &rdr->ntoks, &autouni) != 3) {
529
+ // This for compatibility with previous file format
530
+ fsetpos(file, &pos);
531
+ if (fscanf(file, "#rdr#%"PRIu32"/%"PRIu32"\n",
532
+ &rdr->npats, &rdr->ntoks) != 2)
533
+ fatal(err);
534
+ }
535
+ rdr->autouni = autouni;
525
536
  rdr->nuni = rdr->nbi = 0;
526
- rdr->pats = wapiti_xmalloc(sizeof(pat_t *) * rdr->npats);
527
- for (int p = 0; p < rdr->npats; p++) {
528
- char *pat = ns_readstr(file);
529
- rdr->pats[p] = pat_comp(pat);
530
- switch (tolower(pat[0])) {
531
- case 'u': rdr->nuni++; break;
532
- case 'b': rdr->nbi++; break;
533
- case '*': rdr->nuni++;
534
- rdr->nbi++; break;
537
+ if (rdr->npats != 0) {
538
+ rdr->pats = wapiti_xmalloc(sizeof(pat_t *) * rdr->npats);
539
+ for (uint32_t p = 0; p < rdr->npats; p++) {
540
+ char *pat = ns_readstr(file);
541
+ rdr->pats[p] = pat_comp(pat);
542
+ switch (tolower(pat[0])) {
543
+ case 'u': rdr->nuni++; break;
544
+ case 'b': rdr->nbi++; break;
545
+ case '*': rdr->nuni++;
546
+ rdr->nbi++; break;
547
+ }
535
548
  }
536
549
  }
537
550
  qrk_load(rdr->lbl, file);
@@ -543,9 +556,10 @@ void rdr_load(rdr_t *rdr, FILE *file) {
543
556
  * is plain text and portable accros computers.
544
557
  */
545
558
  void rdr_save(const rdr_t *rdr, FILE *file) {
546
- if(fprintf(file, "#rdr#%d/%d\n", rdr->npats, rdr->ntoks) < 0)
559
+ if (fprintf(file, "#rdr#%"PRIu32"/%"PRIu32"/%d\n",
560
+ rdr->npats, rdr->ntoks, rdr->autouni) < 0)
547
561
  pfatal("cannot write to file");
548
- for (int p = 0; p < rdr->npats; p++)
562
+ for (uint32_t p = 0; p < rdr->npats; p++)
549
563
  ns_writestr(file, rdr->pats[p]->src);
550
564
  qrk_save(rdr->lbl, file);
551
565
  qrk_save(rdr->obs, file);