wapiti 0.0.5 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. checksums.yaml +7 -0
  2. data/.simplecov +3 -0
  3. data/Gemfile +25 -2
  4. data/HISTORY.md +5 -1
  5. data/LICENSE +14 -13
  6. data/README.md +9 -16
  7. data/Rakefile +38 -8
  8. data/ext/wapiti/bcd.c +126 -124
  9. data/ext/wapiti/decoder.c +203 -124
  10. data/ext/wapiti/decoder.h +6 -4
  11. data/ext/wapiti/extconf.rb +2 -2
  12. data/ext/wapiti/gradient.c +491 -320
  13. data/ext/wapiti/gradient.h +52 -34
  14. data/ext/wapiti/lbfgs.c +74 -33
  15. data/ext/wapiti/model.c +47 -37
  16. data/ext/wapiti/model.h +22 -20
  17. data/ext/wapiti/native.c +850 -839
  18. data/ext/wapiti/native.h +1 -1
  19. data/ext/wapiti/options.c +52 -20
  20. data/ext/wapiti/options.h +37 -30
  21. data/ext/wapiti/pattern.c +35 -33
  22. data/ext/wapiti/pattern.h +12 -11
  23. data/ext/wapiti/progress.c +14 -13
  24. data/ext/wapiti/progress.h +3 -2
  25. data/ext/wapiti/quark.c +14 -16
  26. data/ext/wapiti/quark.h +6 -5
  27. data/ext/wapiti/reader.c +83 -69
  28. data/ext/wapiti/reader.h +11 -9
  29. data/ext/wapiti/rprop.c +84 -43
  30. data/ext/wapiti/sequence.h +18 -16
  31. data/ext/wapiti/sgdl1.c +45 -43
  32. data/ext/wapiti/thread.c +19 -17
  33. data/ext/wapiti/thread.h +5 -4
  34. data/ext/wapiti/tools.c +7 -7
  35. data/ext/wapiti/tools.h +3 -4
  36. data/ext/wapiti/trainers.h +1 -1
  37. data/ext/wapiti/vmath.c +40 -38
  38. data/ext/wapiti/vmath.h +12 -11
  39. data/ext/wapiti/wapiti.c +159 -37
  40. data/ext/wapiti/wapiti.h +18 -4
  41. data/lib/wapiti.rb +15 -15
  42. data/lib/wapiti/errors.rb +15 -15
  43. data/lib/wapiti/model.rb +92 -84
  44. data/lib/wapiti/options.rb +123 -124
  45. data/lib/wapiti/utility.rb +14 -14
  46. data/lib/wapiti/version.rb +2 -2
  47. data/spec/spec_helper.rb +29 -9
  48. data/spec/wapiti/model_spec.rb +230 -194
  49. data/spec/wapiti/native_spec.rb +7 -8
  50. data/spec/wapiti/options_spec.rb +184 -174
  51. data/wapiti.gemspec +22 -8
  52. metadata +38 -42
  53. data/.gitignore +0 -5
@@ -1,7 +1,7 @@
1
1
  /*
2
2
  * Wapiti - A linear-chain CRF tool
3
3
  *
4
- * Copyright (c) 2009-2011 CNRS
4
+ * Copyright (c) 2009-2013 CNRS
5
5
  * All rights reserved.
6
6
  *
7
7
  * Redistribution and use in source and binary forms, with or without
@@ -29,27 +29,28 @@
29
29
  #define pattern_h
30
30
 
31
31
  #include <stdbool.h>
32
+ #include <stdint.h>
32
33
 
33
34
  #include "sequence.h"
34
35
 
35
36
  typedef struct pat_s pat_t;
36
37
  typedef struct pat_item_s pat_item_t;
37
38
  struct pat_s {
38
- char *src;
39
- int ntoks;
40
- int nitems;
39
+ char *src;
40
+ uint32_t ntoks;
41
+ uint32_t nitems;
41
42
  struct pat_item_s {
42
- char type;
43
- bool caps;
44
- char *value;
45
- bool absolute;
46
- int offset;
47
- int column;
43
+ char type;
44
+ bool caps;
45
+ char *value;
46
+ bool absolute;
47
+ int32_t offset;
48
+ uint32_t column;
48
49
  } items[];
49
50
  };
50
51
 
51
52
  pat_t *pat_comp(char *p);
52
- char *pat_exec(const pat_t *pat, const tok_t *tok, int at);
53
+ char *pat_exec(const pat_t *pat, const tok_t *tok, uint32_t at);
53
54
  void pat_free(pat_t *pat);
54
55
 
55
56
  #endif
@@ -1,7 +1,7 @@
1
1
  /*
2
2
  * Wapiti - A linear-chain CRF tool
3
3
  *
4
- * Copyright (c) 2009-2011 CNRS
4
+ * Copyright (c) 2009-2013 CNRS
5
5
  * All rights reserved.
6
6
  *
7
7
  * Redistribution and use in source and binary forms, with or without
@@ -24,14 +24,16 @@
24
24
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25
25
  * POSSIBILITY OF SUCH DAMAGE.
26
26
  */
27
+ #include <inttypes.h>
27
28
  #include <signal.h>
28
29
  #include <stdbool.h>
29
30
  #include <stddef.h>
31
+ #include <stdint.h>
30
32
  #include <stdlib.h>
31
33
  #include <stdio.h>
32
34
 
33
35
  #include <unistd.h>
34
- #include <sys/times.h>
36
+ #include <sys/time.h>
35
37
  #include <sys/resource.h>
36
38
 
37
39
  #include "wapiti.h"
@@ -89,7 +91,7 @@ void uit_setup(mdl_t *mdl) {
89
91
  uit_stop = false;
90
92
  if (signal(SIGINT, uit_signal) == SIG_ERR)
91
93
  warning("failed to set signal handler, no clean early stop");
92
- times(&mdl->timer);
94
+ gettimeofday(&mdl->timer, NULL);
93
95
  if (mdl->opt->stopwin != 0)
94
96
  mdl->werr = wapiti_xmalloc(sizeof(double) * mdl->opt->stopwin);
95
97
  mdl->wcnt = mdl->wpos = 0;
@@ -116,28 +118,27 @@ void uit_cleanup(mdl_t *mdl) {
116
118
  * and false if he must stop, so this is where we will implement the trainer
117
119
  * independent stopping criterion.
118
120
  */
119
- bool uit_progress(mdl_t *mdl, int it, double obj) {
121
+ bool uit_progress(mdl_t *mdl, uint32_t it, double obj) {
120
122
  // First we just compute the error rate on devel or train data
121
123
  double te, se;
122
124
  tag_eval(mdl, &te, &se);
123
125
  // Next, we compute the number of active features
124
- size_t act = 0;
125
- for (size_t f = 0; f < mdl->nftr; f++)
126
+ uint64_t act = 0;
127
+ for (uint64_t f = 0; f < mdl->nftr; f++)
126
128
  if (mdl->theta[f] != 0.0)
127
129
  act++;
128
130
  // Compute timings. As some training algorithms are multi-threaded, we
129
131
  // cannot use ansi/c function and must rely on posix one to sum time
130
132
  // spent in main thread and in child ones.
131
- tms_t now; times(&now);
132
- double tm = (now.tms_utime - mdl->timer.tms_utime )
133
- + (now.tms_cutime - mdl->timer.tms_cutime);
134
- tm /= sysconf(_SC_CLK_TCK);
133
+ tms_t now; gettimeofday(&now, NULL);
134
+ double tm = (now.tv_sec + (double)now.tv_usec * 1.0e-6)
135
+ - (mdl->timer.tv_sec + (double)mdl->timer.tv_usec * 1.0e-6);
135
136
  mdl->total += tm;
136
137
  mdl->timer = now;
137
138
  // And display progress report
138
- info(" [%4d]", it);
139
+ info(" [%4"PRIu32"]", it);
139
140
  info(obj >= 0.0 ? " obj=%-10.2f" : " obj=NA", obj);
140
- info(" act=%-8zu", act);
141
+ info(" act=%-8"PRIu64, act);
141
142
  info(" err=%5.2f%%/%5.2f%%", te, se);
142
143
  info(" time=%.2fs/%.2fs", tm, mdl->total);
143
144
  info("\n");
@@ -150,7 +151,7 @@ bool uit_progress(mdl_t *mdl, int it, double obj) {
150
151
  mdl->wcnt++;
151
152
  if (mdl->wcnt >= mdl->opt->stopwin) {
152
153
  double emin = 200.0, emax = -100.0;
153
- for (int i = 0; i < mdl->opt->stopwin; i++) {
154
+ for (uint32_t i = 0; i < mdl->opt->stopwin; i++) {
154
155
  emin = min(emin, mdl->werr[i]);
155
156
  emax = max(emax, mdl->werr[i]);
156
157
  }
@@ -1,7 +1,7 @@
1
1
  /*
2
2
  * Wapiti - A linear-chain CRF tool
3
3
  *
4
- * Copyright (c) 2009-2011 CNRS
4
+ * Copyright (c) 2009-2013 CNRS
5
5
  * All rights reserved.
6
6
  *
7
7
  * Redistribution and use in source and binary forms, with or without
@@ -29,6 +29,7 @@
29
29
  #define progress_h
30
30
 
31
31
  #include <stdbool.h>
32
+ #include <stdint.h>
32
33
 
33
34
  #include "wapiti.h"
34
35
  #include "model.h"
@@ -37,7 +38,7 @@ extern bool uit_stop;
37
38
 
38
39
  void uit_setup(mdl_t *mdl);
39
40
  void uit_cleanup(mdl_t *mdl);
40
- bool uit_progress(mdl_t *mdl, int it, double obj);
41
+ bool uit_progress(mdl_t *mdl, uint32_t it, double obj);
41
42
 
42
43
  #endif
43
44
 
@@ -1,7 +1,7 @@
1
1
  /*
2
2
  * Wapiti - A linear-chain CRF tool
3
3
  *
4
- * Copyright (c) 2009-2011 CNRS
4
+ * Copyright (c) 2009-2013 CNRS
5
5
  * All rights reserved.
6
6
  *
7
7
  * Redistribution and use in source and binary forms, with or without
@@ -24,11 +24,12 @@
24
24
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25
25
  * POSSIBILITY OF SUCH DAMAGE.
26
26
  */
27
+ #include <inttypes.h>
27
28
  #include <stdbool.h>
28
29
  #include <stddef.h>
29
30
  #include <stdlib.h>
30
- #include <stdio.h>
31
31
  #include <stdint.h>
32
+ #include <stdio.h>
32
33
  #include <string.h>
33
34
 
34
35
  #include "quark.h"
@@ -47,7 +48,7 @@
47
48
  * Information Coded in Alphanumeric, Journal of the ACM 15 (4): pp. 514--534,
48
49
  * 1968. DOI:10.1145/321479.321481
49
50
  *
50
- * This code is copyright 2002-2011 Thomas Lavergne and licenced under the BSD
51
+ * This code is copyright 2002-2013 Thomas Lavergne and licenced under the BSD
51
52
  * Licence like the remaining of Wapiti.
52
53
  ******************************************************************************/
53
54
 
@@ -68,8 +69,6 @@ struct qrk_s {
68
69
  uint64_t size;
69
70
  };
70
71
 
71
- #define qrk_none ((uint64_t)-1)
72
-
73
72
  #define qrk_lf2nd(lf) ((node_t *)((intptr_t)(lf) | 1))
74
73
  #define qrk_nd2lf(nd) ((leaf_t *)((intptr_t)(nd) & ~1))
75
74
  #define qrk_isleaf(nd) ((intptr_t)(nd) & 1)
@@ -86,7 +85,7 @@ qrk_t *qrk_new(void) {
86
85
  qrk->count = 0;
87
86
  qrk->lock = false;
88
87
  qrk->size = size;
89
- qrk->leafs = wapiti_xmalloc(sizeof(leaf_t) * size);
88
+ qrk->leafs = wapiti_xmalloc(sizeof(leaf_t *) * size);
90
89
  return qrk;
91
90
  }
92
91
 
@@ -96,10 +95,10 @@ qrk_t *qrk_new(void) {
96
95
  * qrk_unmap become invalid and must not be used anymore.
97
96
  */
98
97
  void qrk_free(qrk_t *qrk) {
99
- const size_t stkmax = 1024;
98
+ const uint32_t stkmax = 1024;
100
99
  if (qrk->count != 0) {
101
100
  node_t *stk[stkmax];
102
- int cnt = 0;
101
+ uint32_t cnt = 0;
103
102
  stk[cnt++] = qrk->root;
104
103
  while (cnt != 0) {
105
104
  node_t *nd = stk[--cnt];
@@ -122,7 +121,7 @@ void qrk_free(qrk_t *qrk) {
122
121
  * pair inside the quark. This function is not thread safe and should not be
123
122
  * called on the same map from different thread without locking.
124
123
  */
125
- size_t qrk_str2id(qrk_t *qrk, const char *key) {
124
+ uint64_t qrk_str2id(qrk_t *qrk, const char *key) {
126
125
  const uint8_t *raw = (void *)key;
127
126
  const size_t len = strlen(key);
128
127
  // We first take care of the empty trie case so later we can safely
@@ -213,7 +212,7 @@ size_t qrk_str2id(qrk_t *qrk, const char *key) {
213
212
  * remain valid only for the life time of the quark, a call to qrk_free will
214
213
  * make this pointer invalid.
215
214
  */
216
- const char *qrk_id2str(const qrk_t *qrk, size_t id) {
215
+ const char *qrk_id2str(const qrk_t *qrk, uint64_t id) {
217
216
  if (id >= qrk->count)
218
217
  fatal("invalid identifier");
219
218
  return qrk->leafs[id]->key;
@@ -225,7 +224,7 @@ const char *qrk_id2str(const qrk_t *qrk, size_t id) {
225
224
  * number correspond to the id.
226
225
  */
227
226
  void qrk_save(const qrk_t *qrk, FILE *file) {
228
- if (fprintf(file, "#qrk#%zu\n", (size_t)qrk->count) < 0)
227
+ if (fprintf(file, "#qrk#%"PRIu64"\n", qrk->count) < 0)
229
228
  pfatal("cannot write to file");
230
229
  if (qrk->count == 0)
231
230
  return;
@@ -240,13 +239,13 @@ void qrk_save(const qrk_t *qrk, FILE *file) {
240
239
  * initially empty, this will load a map exactly as saved by qrk_save.
241
240
  */
242
241
  void qrk_load(qrk_t *qrk, FILE *file) {
243
- size_t cnt = 0;
244
- if (fscanf(file, "#qrk#%zu\n", &cnt) != 1) {
242
+ uint64_t cnt = 0;
243
+ if (fscanf(file, "#qrk#%"SCNu64"\n", &cnt) != 1) {
245
244
  if (ferror(file) != 0)
246
245
  pfatal("cannot read from file");
247
246
  pfatal("invalid format");
248
247
  }
249
- for (size_t n = 0; n < cnt; ++n) {
248
+ for (uint64_t n = 0; n < cnt; ++n) {
250
249
  char *str = ns_readstr(file);
251
250
  qrk_str2id(qrk, str);
252
251
  free(str);
@@ -256,7 +255,7 @@ void qrk_load(qrk_t *qrk, FILE *file) {
256
255
  /* qrk_count:
257
256
  * Return the number of mappings stored in the quark.
258
257
  */
259
- size_t qrk_count(const qrk_t *qrk) {
258
+ uint64_t qrk_count(const qrk_t *qrk) {
260
259
  return qrk->count;
261
260
  }
262
261
 
@@ -269,4 +268,3 @@ bool qrk_lock(qrk_t *qrk, bool lock) {
269
268
  return old;
270
269
  }
271
270
 
272
-
@@ -1,7 +1,7 @@
1
1
  /*
2
2
  * Wapiti - A linear-chain CRF tool
3
3
  *
4
- * Copyright (c) 2009-2011 CNRS
4
+ * Copyright (c) 2009-2013 CNRS
5
5
  * All rights reserved.
6
6
  *
7
7
  * Redistribution and use in source and binary forms, with or without
@@ -28,17 +28,18 @@
28
28
  #ifndef quark_h
29
29
  #define quark_h
30
30
 
31
- #include <stddef.h>
31
+ #include <stdbool.h>
32
+ #include <stdint.h>
32
33
  #include <stdio.h>
33
34
 
34
35
  typedef struct qrk_s qrk_t;
35
36
 
36
37
  qrk_t *qrk_new(void);
37
38
  void qrk_free(qrk_t *qrk);
38
- size_t qrk_count(const qrk_t *qrk);
39
+ uint64_t qrk_count(const qrk_t *qrk);
39
40
  bool qrk_lock(qrk_t *qrk, bool lock);
40
- const char *qrk_id2str(const qrk_t *qrk, size_t id);
41
- size_t qrk_str2id(qrk_t *qrk, const char *key);
41
+ const char *qrk_id2str(const qrk_t *qrk, uint64_t id);
42
+ uint64_t qrk_str2id(qrk_t *qrk, const char *key);
42
43
  void qrk_load(qrk_t *qrk, FILE *file);
43
44
  void qrk_save(const qrk_t *qrk, FILE *file);
44
45
 
@@ -1,7 +1,7 @@
1
1
  /*
2
2
  * Wapiti - A linear-chain CRF tool
3
3
  *
4
- * Copyright (c) 2009-2011 CNRS
4
+ * Copyright (c) 2009-2013 CNRS
5
5
  * All rights reserved.
6
6
  *
7
7
  * Redistribution and use in source and binary forms, with or without
@@ -25,6 +25,7 @@
25
25
  * POSSIBILITY OF SUCH DAMAGE.
26
26
  */
27
27
  #include <ctype.h>
28
+ #include <inttypes.h>
28
29
  #include <stdbool.h>
29
30
  #include <stddef.h>
30
31
  #include <stdlib.h>
@@ -59,12 +60,14 @@
59
60
  ******************************************************************************/
60
61
 
61
62
  /* rdr_new:
62
- * Create a new empty reader object. You mut load patterns in it or a
63
- * previously saved reader if you want to use it for reading sequences.
63
+ * Create a new empty reader object. If no patterns are loaded before you
64
+ * start using the reader the input data are assumed to be already prepared
65
+ * list of features. They must either start with a prefix 'u', 'b', or '*', or
66
+ * you must set autouni to true in order to automatically add a 'u' prefix.
64
67
  */
65
- rdr_t *rdr_new(bool maxent) {
68
+ rdr_t *rdr_new(bool autouni) {
66
69
  rdr_t *rdr = wapiti_xmalloc(sizeof(rdr_t));
67
- rdr->maxent = maxent;
70
+ rdr->autouni = autouni;
68
71
  rdr->npats = rdr->nuni = rdr->nbi = 0;
69
72
  rdr->ntoks = 0;
70
73
  rdr->pats = NULL;
@@ -78,7 +81,7 @@ rdr_t *rdr_new(bool maxent) {
78
81
  * any string returned by them must not be used after this call.
79
82
  */
80
83
  void rdr_free(rdr_t *rdr) {
81
- for (int i = 0; i < rdr->npats; i++)
84
+ for (uint32_t i = 0; i < rdr->npats; i++)
82
85
  pat_free(rdr->pats[i]);
83
86
  free(rdr->pats);
84
87
  qrk_free(rdr->lbl);
@@ -90,7 +93,7 @@ void rdr_free(rdr_t *rdr) {
90
93
  * Free all memory used by a raw_t object.
91
94
  */
92
95
  void rdr_freeraw(raw_t *raw) {
93
- for (int t = 0; t < raw->len; t++)
96
+ for (uint32_t t = 0; t < raw->len; t++)
94
97
  free(raw->lines[t]);
95
98
  free(raw);
96
99
  }
@@ -107,7 +110,7 @@ void rdr_freeseq(seq_t *seq) {
107
110
  * Free all memory used by a dat_t object.
108
111
  */
109
112
  void rdr_freedat(dat_t *dat) {
110
- for (size_t i = 0; i < dat->nseq; i++)
113
+ for (uint32_t i = 0; i < dat->nseq; i++)
111
114
  rdr_freeseq(dat->seq[i]);
112
115
  free(dat->seq);
113
116
  free(dat);
@@ -118,11 +121,11 @@ void rdr_freedat(dat_t *dat) {
118
121
  * available memory, a buffer large enough is allocated and returned. The
119
122
  * caller is responsible to free it. On end-of-file, NULL is returned.
120
123
  */
121
- static char *rdr_readline(FILE *file) {
124
+ char *rdr_readline(FILE *file) {
122
125
  if (feof(file))
123
126
  return NULL;
124
127
  // Initialize the buffer
125
- int len = 0, size = 16;
128
+ uint32_t len = 0, size = 16;
126
129
  char *buffer = wapiti_xmalloc(size);
127
130
  // We read the line chunk by chunk until end of line, file or error
128
131
  while (!feof(file)) {
@@ -203,7 +206,7 @@ raw_t *rdr_readraw(rdr_t *rdr, FILE *file) {
203
206
  if (feof(file))
204
207
  return NULL;
205
208
  // Prepare the raw sequence object
206
- int size = 32, cnt = 0;
209
+ uint32_t size = 32, cnt = 0;
207
210
  raw_t *raw = wapiti_xmalloc(sizeof(raw_t) + sizeof(char *) * size);
208
211
  // And read the next sequence in the file, this will skip any blank line
209
212
  // before reading the sequence stopping at end of file or on a new blank
@@ -232,9 +235,9 @@ raw_t *rdr_readraw(rdr_t *rdr, FILE *file) {
232
235
  + sizeof(char *) * size);
233
236
  }
234
237
  raw->lines[cnt++] = line;
235
- // In maxent mode, we only have to load one line for each sample
236
- // so we can stop here.
237
- if (rdr->maxent)
238
+ // In autouni mode, there will be only unigram features so we
239
+ // can use small sequences to improve multi-threading.
240
+ if (rdr->autouni)
238
241
  break;
239
242
  }
240
243
  // If no line was read, we just free allocated memory and return NULL
@@ -251,13 +254,12 @@ raw_t *rdr_readraw(rdr_t *rdr, FILE *file) {
251
254
 
252
255
  /* rdr_mapobs:
253
256
  * Map an observation to its identifier, automatically adding a 'u' prefix in
254
- * pure maxent mode.
257
+ * 'autouni' mode.
255
258
  */
256
- static size_t rdr_mapobs(rdr_t *rdr, const char *str) {
257
- if (!rdr->maxent)
259
+ static uint64_t rdr_mapobs(rdr_t *rdr, const char *str) {
260
+ if (!rdr->autouni)
258
261
  return qrk_str2id(rdr->obs, str);
259
- size_t len = strlen(str) + 2;
260
- char tmp[len];
262
+ char tmp[strlen(str) + 2];
261
263
  tmp[0] = 'u';
262
264
  strcpy(tmp + 1, str);
263
265
  return qrk_str2id(rdr->obs, tmp);
@@ -268,13 +270,13 @@ static size_t rdr_mapobs(rdr_t *rdr, const char *str) {
268
270
  * applying patterns.
269
271
  */
270
272
  static seq_t *rdr_rawtok2seq(rdr_t *rdr, const tok_t *tok) {
271
- const int T = tok->len;
272
- int size = 0;
273
- if (rdr->maxent) {
273
+ const uint32_t T = tok->len;
274
+ uint32_t size = 0;
275
+ if (rdr->autouni) {
274
276
  size = tok->cnts[0];
275
277
  } else {
276
- for (int t = 0; t < T; t++) {
277
- for (int n = 0; n < tok->cnts[t]; n++) {
278
+ for (uint32_t t = 0; t < T; t++) {
279
+ for (uint32_t n = 0; n < tok->cnts[t]; n++) {
278
280
  const char *o = tok->toks[t][n];
279
281
  switch (o[0]) {
280
282
  case 'u': size += 1; break;
@@ -287,30 +289,30 @@ static seq_t *rdr_rawtok2seq(rdr_t *rdr, const tok_t *tok) {
287
289
  }
288
290
  }
289
291
  seq_t *seq = wapiti_xmalloc(sizeof(seq_t) + sizeof(pos_t) * T);
290
- seq->raw = wapiti_xmalloc(sizeof(size_t) * size);
292
+ seq->raw = wapiti_xmalloc(sizeof(uint64_t) * size);
291
293
  seq->len = T;
292
- size_t *raw = seq->raw;
293
- for (int t = 0; t < T; t++) {
294
- seq->pos[t].lbl = none;
294
+ uint64_t *raw = seq->raw;
295
+ for (uint32_t t = 0; t < T; t++) {
296
+ seq->pos[t].lbl = (uint32_t)-1;
295
297
  seq->pos[t].ucnt = 0;
296
298
  seq->pos[t].uobs = raw;
297
- for (int n = 0; n < tok->cnts[t]; n++) {
298
- if (tok->toks[t][n][0] == 'b')
299
+ for (uint32_t n = 0; n < tok->cnts[t]; n++) {
300
+ if (!rdr->autouni && tok->toks[t][n][0] == 'b')
299
301
  continue;
300
- size_t id = rdr_mapobs(rdr, tok->toks[t][n]);
302
+ uint64_t id = rdr_mapobs(rdr, tok->toks[t][n]);
301
303
  if (id != none) {
302
304
  (*raw++) = id;
303
305
  seq->pos[t].ucnt++;
304
306
  }
305
307
  }
306
308
  seq->pos[t].bcnt = 0;
307
- if (rdr->maxent)
309
+ if (rdr->autouni)
308
310
  continue;
309
311
  seq->pos[t].bobs = raw;
310
- for (int n = 0; n < tok->cnts[t]; n++) {
312
+ for (uint32_t n = 0; n < tok->cnts[t]; n++) {
311
313
  if (tok->toks[t][n][0] == 'u')
312
314
  continue;
313
- size_t id = rdr_mapobs(rdr, tok->toks[t][n]);
315
+ uint64_t id = rdr_mapobs(rdr, tok->toks[t][n]);
314
316
  if (id != none) {
315
317
  (*raw++) = id;
316
318
  seq->pos[t].bcnt++;
@@ -319,9 +321,9 @@ static seq_t *rdr_rawtok2seq(rdr_t *rdr, const tok_t *tok) {
319
321
  }
320
322
  // And finally, if the user specified it, populate the labels
321
323
  if (tok->lbl != NULL) {
322
- for (int t = 0; t < T; t++) {
324
+ for (uint32_t t = 0; t < T; t++) {
323
325
  const char *lbl = tok->lbl[t];
324
- size_t id = qrk_str2id(rdr->lbl, lbl);
326
+ uint64_t id = qrk_str2id(rdr->lbl, lbl);
325
327
  seq->pos[t].lbl = id;
326
328
  }
327
329
  }
@@ -332,35 +334,35 @@ static seq_t *rdr_rawtok2seq(rdr_t *rdr, const tok_t *tok) {
332
334
  * Convert a tok_t to a seq_t object by applying the patterns of the reader.
333
335
  */
334
336
  static seq_t *rdr_pattok2seq(rdr_t *rdr, const tok_t *tok) {
335
- const int T = tok->len;
337
+ const uint32_t T = tok->len;
336
338
  // So now the tok object is ready, we can start building the seq_t
337
339
  // object by applying patterns. First we allocate the seq_t object. The
338
340
  // sequence itself as well as the sub array are allocated in one time.
339
341
  seq_t *seq = wapiti_xmalloc(sizeof(seq_t) + sizeof(pos_t) * T);
340
- seq->raw = wapiti_xmalloc(sizeof(size_t) * (rdr->nuni + rdr->nbi) * T);
342
+ seq->raw = wapiti_xmalloc(sizeof(uint64_t) * (rdr->nuni + rdr->nbi) * T);
341
343
  seq->len = T;
342
- size_t *tmp = seq->raw;
343
- for (int t = 0; t < T; t++) {
344
- seq->pos[t].lbl = none;
344
+ uint64_t *tmp = seq->raw;
345
+ for (uint32_t t = 0; t < T; t++) {
346
+ seq->pos[t].lbl = (uint32_t)-1;
345
347
  seq->pos[t].uobs = tmp; tmp += rdr->nuni;
346
348
  seq->pos[t].bobs = tmp; tmp += rdr->nbi;
347
349
  }
348
350
  // Next, we can build the observations list by applying the patterns on
349
351
  // the tok_t sequence.
350
- for (int t = 0; t < T; t++) {
352
+ for (uint32_t t = 0; t < T; t++) {
351
353
  pos_t *pos = &seq->pos[t];
352
354
  pos->ucnt = 0;
353
355
  pos->bcnt = 0;
354
- for (int x = 0; x < rdr->npats; x++) {
356
+ for (uint32_t x = 0; x < rdr->npats; x++) {
355
357
  // Get the observation and map it to an identifier
356
358
  char *obs = pat_exec(rdr->pats[x], tok, t);
357
- size_t id = rdr_mapobs(rdr, obs);
359
+ uint64_t id = rdr_mapobs(rdr, obs);
358
360
  if (id == none) {
359
361
  free(obs);
360
362
  continue;
361
363
  }
362
364
  // If the observation is ok, add it to the lists
363
- int kind = 0;
365
+ char kind = 0;
364
366
  switch (obs[0]) {
365
367
  case 'u': kind = 1; break;
366
368
  case 'b': kind = 2; break;
@@ -375,9 +377,9 @@ static seq_t *rdr_pattok2seq(rdr_t *rdr, const tok_t *tok) {
375
377
  }
376
378
  // And finally, if the user specified it, populate the labels
377
379
  if (tok->lbl != NULL) {
378
- for (int t = 0; t < T; t++) {
380
+ for (uint32_t t = 0; t < T; t++) {
379
381
  const char *lbl = tok->lbl[t];
380
- size_t id = qrk_str2id(rdr->lbl, lbl);
382
+ uint64_t id = qrk_str2id(rdr->lbl, lbl);
381
383
  seq->pos[t].lbl = id;
382
384
  }
383
385
  }
@@ -390,11 +392,11 @@ static seq_t *rdr_pattok2seq(rdr_t *rdr, const tok_t *tok) {
390
392
  * interned also.
391
393
  */
392
394
  seq_t *rdr_raw2seq(rdr_t *rdr, const raw_t *raw, bool lbl) {
393
- const int T = raw->len;
395
+ const uint32_t T = raw->len;
394
396
  // Allocate the tok_t object, the label array is allocated only if they
395
397
  // are requested by the user.
396
398
  tok_t *tok = wapiti_xmalloc(sizeof(tok_t) + T * sizeof(char **));
397
- tok->cnts = wapiti_xmalloc(sizeof(size_t) * T);
399
+ tok->cnts = wapiti_xmalloc(sizeof(uint32_t) * T);
398
400
  tok->lbl = NULL;
399
401
  if (lbl == true)
400
402
  tok->lbl = wapiti_xmalloc(sizeof(char *) * T);
@@ -402,16 +404,15 @@ seq_t *rdr_raw2seq(rdr_t *rdr, const raw_t *raw, bool lbl) {
402
404
  // tokens. To reduce memory fragmentation, the raw line is copied and
403
405
  // his reference is kept by the first tokens, next tokens are pointer to
404
406
  // this copy.
405
- for (int t = 0; t < T; t++) {
407
+ for (uint32_t t = 0; t < T; t++) {
406
408
  // Get a copy of the raw line skipping leading space characters
407
409
  const char *src = raw->lines[t];
408
410
  while (isspace(*src))
409
411
  src++;
410
412
  char *line = xstrdup(src);
411
413
  // Split it in tokens
412
- const int len = strlen(line);
413
- char *toks[len / 2];
414
- int cnt = 0;
414
+ char *toks[strlen(line) / 2 + 1];
415
+ uint32_t cnt = 0;
415
416
  while (*line != '\0') {
416
417
  toks[cnt++] = line;
417
418
  while (*line != '\0' && !isspace(*line))
@@ -441,7 +442,7 @@ seq_t *rdr_raw2seq(rdr_t *rdr, const raw_t *raw, bool lbl) {
441
442
  else
442
443
  seq = rdr_pattok2seq(rdr, tok);
443
444
  // Before returning the sequence, we have to free the tok_t
444
- for (int t = 0; t < T; t++) {
445
+ for (uint32_t t = 0; t < T; t++) {
445
446
  if (tok->cnts[t] == 0)
446
447
  continue;
447
448
  free(tok->toks[t][0]);
@@ -477,7 +478,7 @@ seq_t *rdr_readseq(rdr_t *rdr, FILE *file, bool lbl) {
477
478
  */
478
479
  dat_t *rdr_readdat(rdr_t *rdr, FILE *file, bool lbl) {
479
480
  // Prepare dataset
480
- size_t size = 1000;
481
+ uint32_t size = 1000;
481
482
  dat_t *dat = wapiti_xmalloc(sizeof(dat_t));
482
483
  dat->nseq = 0;
483
484
  dat->mlen = 0;
@@ -498,7 +499,7 @@ dat_t *rdr_readdat(rdr_t *rdr, FILE *file, bool lbl) {
498
499
  dat->seq[dat->nseq++] = seq;
499
500
  dat->mlen = max(dat->mlen, seq->len);
500
501
  if (dat->nseq % 1000 == 0)
501
- info("%7d sequences loaded\n", dat->nseq);
502
+ info("%7"PRIu32" sequences loaded\n", dat->nseq);
502
503
  }
503
504
  // If no sequence was read, clean up and report
504
505
  if (dat->nseq == 0) {
@@ -520,18 +521,30 @@ dat_t *rdr_readdat(rdr_t *rdr, FILE *file, bool lbl) {
520
521
  */
521
522
  void rdr_load(rdr_t *rdr, FILE *file) {
522
523
  const char *err = "broken file, invalid reader format";
523
- if (fscanf(file, "#rdr#%d/%d\n", &rdr->npats, &rdr->ntoks) != 2)
524
- fatal(err);
524
+ int autouni = rdr->autouni;
525
+ fpos_t pos;
526
+ fgetpos(file, &pos);
527
+ if (fscanf(file, "#rdr#%"PRIu32"/%"PRIu32"/%d\n",
528
+ &rdr->npats, &rdr->ntoks, &autouni) != 3) {
529
+ // This is for compatibility with the previous file format
530
+ fsetpos(file, &pos);
531
+ if (fscanf(file, "#rdr#%"PRIu32"/%"PRIu32"\n",
532
+ &rdr->npats, &rdr->ntoks) != 2)
533
+ fatal(err);
534
+ }
535
+ rdr->autouni = autouni;
525
536
  rdr->nuni = rdr->nbi = 0;
526
- rdr->pats = wapiti_xmalloc(sizeof(pat_t *) * rdr->npats);
527
- for (int p = 0; p < rdr->npats; p++) {
528
- char *pat = ns_readstr(file);
529
- rdr->pats[p] = pat_comp(pat);
530
- switch (tolower(pat[0])) {
531
- case 'u': rdr->nuni++; break;
532
- case 'b': rdr->nbi++; break;
533
- case '*': rdr->nuni++;
534
- rdr->nbi++; break;
537
+ if (rdr->npats != 0) {
538
+ rdr->pats = wapiti_xmalloc(sizeof(pat_t *) * rdr->npats);
539
+ for (uint32_t p = 0; p < rdr->npats; p++) {
540
+ char *pat = ns_readstr(file);
541
+ rdr->pats[p] = pat_comp(pat);
542
+ switch (tolower(pat[0])) {
543
+ case 'u': rdr->nuni++; break;
544
+ case 'b': rdr->nbi++; break;
545
+ case '*': rdr->nuni++;
546
+ rdr->nbi++; break;
547
+ }
535
548
  }
536
549
  }
537
550
  qrk_load(rdr->lbl, file);
@@ -543,9 +556,10 @@ void rdr_load(rdr_t *rdr, FILE *file) {
543
556
  * is plain text and portable accros computers.
544
557
  */
545
558
  void rdr_save(const rdr_t *rdr, FILE *file) {
546
- if(fprintf(file, "#rdr#%d/%d\n", rdr->npats, rdr->ntoks) < 0)
559
+ if (fprintf(file, "#rdr#%"PRIu32"/%"PRIu32"/%d\n",
560
+ rdr->npats, rdr->ntoks, rdr->autouni) < 0)
547
561
  pfatal("cannot write to file");
548
- for (int p = 0; p < rdr->npats; p++)
562
+ for (uint32_t p = 0; p < rdr->npats; p++)
549
563
  ns_writestr(file, rdr->pats[p]->src);
550
564
  qrk_save(rdr->lbl, file);
551
565
  qrk_save(rdr->obs, file);