wapiti 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (63) hide show
  1. data/.autotest +13 -0
  2. data/.gitignore +5 -0
  3. data/.rspec +3 -0
  4. data/Gemfile +6 -0
  5. data/LICENSE +30 -0
  6. data/README.md +153 -0
  7. data/Rakefile +33 -0
  8. data/ext/wapiti/bcd.c +392 -0
  9. data/ext/wapiti/decoder.c +535 -0
  10. data/ext/wapiti/decoder.h +46 -0
  11. data/ext/wapiti/extconf.rb +8 -0
  12. data/ext/wapiti/gradient.c +818 -0
  13. data/ext/wapiti/gradient.h +81 -0
  14. data/ext/wapiti/lbfgs.c +294 -0
  15. data/ext/wapiti/model.c +296 -0
  16. data/ext/wapiti/model.h +100 -0
  17. data/ext/wapiti/native.c +1238 -0
  18. data/ext/wapiti/native.h +15 -0
  19. data/ext/wapiti/options.c +278 -0
  20. data/ext/wapiti/options.h +91 -0
  21. data/ext/wapiti/pattern.c +395 -0
  22. data/ext/wapiti/pattern.h +56 -0
  23. data/ext/wapiti/progress.c +167 -0
  24. data/ext/wapiti/progress.h +43 -0
  25. data/ext/wapiti/quark.c +272 -0
  26. data/ext/wapiti/quark.h +46 -0
  27. data/ext/wapiti/reader.c +553 -0
  28. data/ext/wapiti/reader.h +73 -0
  29. data/ext/wapiti/rprop.c +191 -0
  30. data/ext/wapiti/sequence.h +148 -0
  31. data/ext/wapiti/sgdl1.c +218 -0
  32. data/ext/wapiti/thread.c +171 -0
  33. data/ext/wapiti/thread.h +42 -0
  34. data/ext/wapiti/tools.c +202 -0
  35. data/ext/wapiti/tools.h +54 -0
  36. data/ext/wapiti/trainers.h +39 -0
  37. data/ext/wapiti/vmath.c +372 -0
  38. data/ext/wapiti/vmath.h +51 -0
  39. data/ext/wapiti/wapiti.c +288 -0
  40. data/ext/wapiti/wapiti.h +45 -0
  41. data/lib/wapiti.rb +30 -0
  42. data/lib/wapiti/errors.rb +17 -0
  43. data/lib/wapiti/model.rb +49 -0
  44. data/lib/wapiti/options.rb +113 -0
  45. data/lib/wapiti/utility.rb +15 -0
  46. data/lib/wapiti/version.rb +3 -0
  47. data/spec/fixtures/ch.mod +18550 -0
  48. data/spec/fixtures/chpattern.txt +52 -0
  49. data/spec/fixtures/chtest.txt +1973 -0
  50. data/spec/fixtures/chtrain.txt +19995 -0
  51. data/spec/fixtures/nppattern.txt +52 -0
  52. data/spec/fixtures/nptest.txt +1973 -0
  53. data/spec/fixtures/nptrain.txt +19995 -0
  54. data/spec/fixtures/pattern.txt +14 -0
  55. data/spec/fixtures/test.txt +60000 -0
  56. data/spec/fixtures/train.txt +1200 -0
  57. data/spec/spec_helper.rb +21 -0
  58. data/spec/wapiti/model_spec.rb +173 -0
  59. data/spec/wapiti/native_spec.rb +12 -0
  60. data/spec/wapiti/options_spec.rb +175 -0
  61. data/spec/wapiti/utility_spec.rb +22 -0
  62. data/wapiti.gemspec +35 -0
  63. metadata +178 -0
@@ -0,0 +1,46 @@
1
+ /*
2
+ * Wapiti - A linear-chain CRF tool
3
+ *
4
+ * Copyright (c) 2009-2011 CNRS
5
+ * All rights reserved.
6
+ *
7
+ * Redistribution and use in source and binary forms, with or without
8
+ * modification, are permitted provided that the following conditions are met:
9
+ * * Redistributions of source code must retain the above copyright
10
+ * notice, this list of conditions and the following disclaimer.
11
+ * * Redistributions in binary form must reproduce the above copyright
12
+ * notice, this list of conditions and the following disclaimer in the
13
+ * documentation and/or other materials provided with the distribution.
14
+ *
15
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25
+ * POSSIBILITY OF SUCH DAMAGE.
26
+ */
27
+
28
#ifndef quark_h
#define quark_h

/* <stdbool.h> is required here: qrk_lock below uses `bool` in its prototype,
 * so without it this header only compiles when the includer happens to have
 * pulled in <stdbool.h> first.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Opaque quark database handle; the structure itself is defined by the
 * implementation file and is only manipulated through the functions below.
 */
typedef struct qrk_s qrk_t;

/* Create / destroy a quark database. */
qrk_t *qrk_new(void);
void qrk_free(qrk_t *qrk);

/* Number of entries in the database. */
size_t qrk_count(const qrk_t *qrk);

/* Set the lock state of the database and return a bool.
 * NOTE(review): presumably the previous lock state -- confirm in quark.c.
 */
bool qrk_lock(qrk_t *qrk, bool lock);

/* Conversions between string keys and numeric identifiers. */
const char *qrk_id2str(const qrk_t *qrk, size_t id);
size_t qrk_str2id(qrk_t *qrk, const char *key);

/* Read / write the database from / to an already opened stream. */
void qrk_load(qrk_t *qrk, FILE *file);
void qrk_save(const qrk_t *qrk, FILE *file);

#endif
46
+
@@ -0,0 +1,553 @@
1
+ /*
2
+ * Wapiti - A linear-chain CRF tool
3
+ *
4
+ * Copyright (c) 2009-2011 CNRS
5
+ * All rights reserved.
6
+ *
7
+ * Redistribution and use in source and binary forms, with or without
8
+ * modification, are permitted provided that the following conditions are met:
9
+ * * Redistributions of source code must retain the above copyright
10
+ * notice, this list of conditions and the following disclaimer.
11
+ * * Redistributions in binary form must reproduce the above copyright
12
+ * notice, this list of conditions and the following disclaimer in the
13
+ * documentation and/or other materials provided with the distribution.
14
+ *
15
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25
+ * POSSIBILITY OF SUCH DAMAGE.
26
+ */
27
+ #include <ctype.h>
28
+ #include <stdbool.h>
29
+ #include <stddef.h>
30
+ #include <stdlib.h>
31
+ #include <stdio.h>
32
+ #include <string.h>
33
+
34
+ #include "wapiti.h"
35
+ #include "pattern.h"
36
+ #include "quark.h"
37
+ #include "reader.h"
38
+ #include "sequence.h"
39
+ #include "tools.h"
40
+
41
+ /*******************************************************************************
42
+ * Datafile reader
43
+ *
44
+ * And now come the data file reader which use the previous module to parse
45
+ * the input data in order to produce seq_t objects representing interned
46
+ * sequences.
47
+ *
48
+ * This is where the sequence will go through the three steps to build seq_t
49
+ * objects used internally. There are two ways to do this. The simpler one is
50
+ * to use the rdr_readseq function which directly read a sequence from a file
51
+ * and convert it to a seq_t object transparently. This is how the training
52
+ * and development data are loaded.
53
+ * The second way consists of reading a raw sequence with rdr_readraw and then
54
+ * converting it to a seq_t object with rdr_raw2seq. This allow the caller to
55
+ * keep the raw sequence and is used by the tagger to produce a clean output.
56
+ *
57
+ * There is no public interface to the tok_t object as it is intended only for
58
+ * internal use in the reader as an intermediate step to apply patterns.
59
+ ******************************************************************************/
60
+
61
+ /* rdr_new:
62
+ * Create a new empty reader object. You mut load patterns in it or a
63
+ * previously saved reader if you want to use it for reading sequences.
64
+ */
65
+ rdr_t *rdr_new(bool maxent) {
66
+ rdr_t *rdr = wapiti_xmalloc(sizeof(rdr_t));
67
+ rdr->maxent = maxent;
68
+ rdr->npats = rdr->nuni = rdr->nbi = 0;
69
+ rdr->ntoks = 0;
70
+ rdr->pats = NULL;
71
+ rdr->lbl = qrk_new();
72
+ rdr->obs = qrk_new();
73
+ return rdr;
74
+ }
75
+
76
+ /* rdr_free:
77
+ * Free all memory used by a reader object including the quark database, so
78
+ * any string returned by them must not be used after this call.
79
+ */
80
+ void rdr_free(rdr_t *rdr) {
81
+ for (int i = 0; i < rdr->npats; i++)
82
+ pat_free(rdr->pats[i]);
83
+ free(rdr->pats);
84
+ qrk_free(rdr->lbl);
85
+ qrk_free(rdr->obs);
86
+ free(rdr);
87
+ }
88
+
89
+ /* rdr_freeraw:
90
+ * Free all memory used by a raw_t object.
91
+ */
92
+ void rdr_freeraw(raw_t *raw) {
93
+ for (int t = 0; t < raw->len; t++)
94
+ free(raw->lines[t]);
95
+ free(raw);
96
+ }
97
+
98
+ /* rdr_freeseq:
99
+ * Free all memory used by a seq_t object.
100
+ */
101
+ void rdr_freeseq(seq_t *seq) {
102
+ free(seq->raw);
103
+ free(seq);
104
+ }
105
+
106
+ /* rdr_freedat:
107
+ * Free all memory used by a dat_t object.
108
+ */
109
+ void rdr_freedat(dat_t *dat) {
110
+ for (size_t i = 0; i < dat->nseq; i++)
111
+ rdr_freeseq(dat->seq[i]);
112
+ free(dat->seq);
113
+ free(dat);
114
+ }
115
+
116
/* rdr_readline:
 *   Read one input line from <file>. The line can be of any size, limited only
 *   by available memory: the buffer is grown as needed and shrunk to fit
 *   before being returned. The trailing newline, if any, is removed. The
 *   caller is responsible for freeing the returned buffer. NULL is returned
 *   when the end of file is reached before any character was read.
 */
static char *rdr_readline(FILE *file) {
    if (feof(file))
        return NULL;
    // Initialize the buffer
    int len = 0, size = 16;
    char *buffer = wapiti_xmalloc(size);
    // We read the line chunk by chunk until end of line, file or error
    while (!feof(file)) {
        if (fgets(buffer + len, size - len, file) == NULL) {
            // On NULL return there are two possible cases, either
            // an error or the end of file
            if (ferror(file))
                pfatal("cannot read from file");
            // On end of file, we must check if we have already
            // read some data or not
            if (len == 0) {
                free(buffer);
                return NULL;
            }
            break;
        }
        // Check for end of line: if the chunk filled the buffer
        // without reaching a newline, enlarge it and read more data
        len += strlen(buffer + len);
        if (len == size - 1 && buffer[len - 1] != '\n') {
            size = size * 1.4;
            buffer = wapiti_xrealloc(buffer, size);
            continue;
        }
        break;
    }
    // Remove the end of line if present and resize the buffer to fit the
    // data. The length must be checked first: a line starting with a NUL
    // byte makes strlen report zero characters, and buffer[-1] must not be
    // read in that case.
    if (len > 0 && buffer[len - 1] == '\n')
        buffer[--len] = '\0';
    return wapiti_xrealloc(buffer, len + 1);
}
159
+
160
+ /* rdr_loadpat:
161
+ * Load and compile patterns from given file and store them in the reader. As
162
+ * we compile patterns, syntax errors in them will be raised at this time.
163
+ */
164
+ void rdr_loadpat(rdr_t *rdr, FILE *file) {
165
+ while (!feof(file)) {
166
+ // Read raw input line
167
+ char *line = rdr_readline(file);
168
+ if (line == NULL)
169
+ break;
170
+ // Remove comments and trailing spaces
171
+ int end = strcspn(line, "#");
172
+ while (end != 0 && isspace(line[end - 1]))
173
+ end--;
174
+ if (end == 0) {
175
+ free(line);
176
+ continue;
177
+ }
178
+ line[end] = '\0';
179
+ line[0] = tolower(line[0]);
180
+ // Compile pattern and add it to the list
181
+ pat_t *pat = pat_comp(line);
182
+ rdr->npats++;
183
+ switch (line[0]) {
184
+ case 'u': rdr->nuni++; break;
185
+ case 'b': rdr->nbi++; break;
186
+ case '*': rdr->nuni++;
187
+ rdr->nbi++; break;
188
+ default:
189
+ fatal("unknown pattern type '%c'", line[0]);
190
+ }
191
+ rdr->pats = wapiti_xrealloc(rdr->pats, sizeof(char *) * rdr->npats);
192
+ rdr->pats[rdr->npats - 1] = pat;
193
+ rdr->ntoks = max(rdr->ntoks, pat->ntoks);
194
+ }
195
+ }
196
+
197
+ /* rdr_readraw:
198
+ * Read a raw sequence from given file: a set of lines terminated by end of
199
+ * file or by an empty line. Return NULL if file end was reached before any
200
+ * sequence was read.
201
+ */
202
+ raw_t *rdr_readraw(rdr_t *rdr, FILE *file) {
203
+ if (feof(file))
204
+ return NULL;
205
+ // Prepare the raw sequence object
206
+ int size = 32, cnt = 0;
207
+ raw_t *raw = wapiti_xmalloc(sizeof(raw_t) + sizeof(char *) * size);
208
+ // And read the next sequence in the file, this will skip any blank line
209
+ // before reading the sequence stoping at end of file or on a new blank
210
+ // line.
211
+ while (!feof(file)) {
212
+ char *line = rdr_readline(file);
213
+ if (line == NULL)
214
+ break;
215
+ // Check for empty line marking the end of the current sequence
216
+ int len = strlen(line);
217
+ while (len != 0 && isspace(line[len - 1]))
218
+ len--;
219
+ if (len == 0) {
220
+ free(line);
221
+ // Special case when no line was already read, we try
222
+ // again. This allow multiple blank lines beetwen
223
+ // sequences.
224
+ if (cnt == 0)
225
+ continue;
226
+ break;
227
+ }
228
+ // Next, grow the buffer if needed and add the new line in it
229
+ if (size == cnt) {
230
+ size *= 1.4;
231
+ raw = wapiti_xrealloc(raw, sizeof(raw_t)
232
+ + sizeof(char *) * size);
233
+ }
234
+ raw->lines[cnt++] = line;
235
+ // In maxent mode, we only have to load one line for each sample
236
+ // so we can stop here.
237
+ if (rdr->maxent)
238
+ break;
239
+ }
240
+ // If no lines was read, we just free allocated memory and return NULL
241
+ // to signal the end of file to the caller. Else, we adjust the object
242
+ // size and return it.
243
+ if (cnt == 0) {
244
+ free(raw);
245
+ return NULL;
246
+ }
247
+ raw = wapiti_xrealloc(raw, sizeof(raw_t) + sizeof(char *) * cnt);
248
+ raw->len = cnt;
249
+ return raw;
250
+ }
251
+
252
+ /* rdr_mapobs:
253
+ * Map an observation to its identifier, automatically adding a 'u' prefix in
254
+ * pure maxent mode.
255
+ */
256
+ static size_t rdr_mapobs(rdr_t *rdr, const char *str) {
257
+ if (!rdr->maxent)
258
+ return qrk_str2id(rdr->obs, str);
259
+ size_t len = strlen(str) + 2;
260
+ char tmp[len];
261
+ tmp[0] = 'u';
262
+ strcpy(tmp + 1, str);
263
+ return qrk_str2id(rdr->obs, tmp);
264
+ }
265
+
266
+ /* rdr_rawtok2seq:
267
+ * Convert a tok_t to a seq_t object taking each tokens as a feature without
268
+ * applying patterns.
269
+ */
270
+ static seq_t *rdr_rawtok2seq(rdr_t *rdr, const tok_t *tok) {
271
+ const int T = tok->len;
272
+ int size = 0;
273
+ if (rdr->maxent) {
274
+ size = tok->cnts[0];
275
+ } else {
276
+ for (int t = 0; t < T; t++) {
277
+ for (int n = 0; n < tok->cnts[t]; n++) {
278
+ const char *o = tok->toks[t][n];
279
+ switch (o[0]) {
280
+ case 'u': size += 1; break;
281
+ case 'b': size += 1; break;
282
+ case '*': size += 2; break;
283
+ default:
284
+ fatal("invalid feature: %s", o);
285
+ }
286
+ }
287
+ }
288
+ }
289
+ seq_t *seq = wapiti_xmalloc(sizeof(seq_t) + sizeof(pos_t) * T);
290
+ seq->raw = wapiti_xmalloc(sizeof(size_t) * size);
291
+ seq->len = T;
292
+ size_t *raw = seq->raw;
293
+ for (int t = 0; t < T; t++) {
294
+ seq->pos[t].lbl = none;
295
+ seq->pos[t].ucnt = 0;
296
+ seq->pos[t].uobs = raw;
297
+ for (int n = 0; n < tok->cnts[t]; n++) {
298
+ if (tok->toks[t][n][0] == 'b')
299
+ continue;
300
+ size_t id = rdr_mapobs(rdr, tok->toks[t][n]);
301
+ if (id != none) {
302
+ (*raw++) = id;
303
+ seq->pos[t].ucnt++;
304
+ }
305
+ }
306
+ seq->pos[t].bcnt = 0;
307
+ if (rdr->maxent)
308
+ continue;
309
+ seq->pos[t].bobs = raw;
310
+ for (int n = 0; n < tok->cnts[t]; n++) {
311
+ if (tok->toks[t][n][0] == 'u')
312
+ continue;
313
+ size_t id = rdr_mapobs(rdr, tok->toks[t][n]);
314
+ if (id != none) {
315
+ (*raw++) = id;
316
+ seq->pos[t].bcnt++;
317
+ }
318
+ }
319
+ }
320
+ // And finally, if the user specified it, populate the labels
321
+ if (tok->lbl != NULL) {
322
+ for (int t = 0; t < T; t++) {
323
+ const char *lbl = tok->lbl[t];
324
+ size_t id = qrk_str2id(rdr->lbl, lbl);
325
+ seq->pos[t].lbl = id;
326
+ }
327
+ }
328
+ return seq;
329
+ }
330
+
331
+ /* rdr_pattok2seq:
332
+ * Convert a tok_t to a seq_t object by applying the patterns of the reader.
333
+ */
334
+ static seq_t *rdr_pattok2seq(rdr_t *rdr, const tok_t *tok) {
335
+ const int T = tok->len;
336
+ // So now the tok object is ready, we can start building the seq_t
337
+ // object by appling patterns. First we allocate the seq_t object. The
338
+ // sequence itself as well as the sub array are allocated in one time.
339
+ seq_t *seq = wapiti_xmalloc(sizeof(seq_t) + sizeof(pos_t) * T);
340
+ seq->raw = wapiti_xmalloc(sizeof(size_t) * (rdr->nuni + rdr->nbi) * T);
341
+ seq->len = T;
342
+ size_t *tmp = seq->raw;
343
+ for (int t = 0; t < T; t++) {
344
+ seq->pos[t].lbl = none;
345
+ seq->pos[t].uobs = tmp; tmp += rdr->nuni;
346
+ seq->pos[t].bobs = tmp; tmp += rdr->nbi;
347
+ }
348
+ // Next, we can build the observations list by applying the patterns on
349
+ // the tok_t sequence.
350
+ for (int t = 0; t < T; t++) {
351
+ pos_t *pos = &seq->pos[t];
352
+ pos->ucnt = 0;
353
+ pos->bcnt = 0;
354
+ for (int x = 0; x < rdr->npats; x++) {
355
+ // Get the observation and map it to an identifier
356
+ char *obs = pat_exec(rdr->pats[x], tok, t);
357
+ size_t id = rdr_mapobs(rdr, obs);
358
+ if (id == none) {
359
+ free(obs);
360
+ continue;
361
+ }
362
+ // If the observation is ok, add it to the lists
363
+ int kind = 0;
364
+ switch (obs[0]) {
365
+ case 'u': kind = 1; break;
366
+ case 'b': kind = 2; break;
367
+ case '*': kind = 3; break;
368
+ }
369
+ if (kind & 1)
370
+ pos->uobs[pos->ucnt++] = id;
371
+ if (kind & 2)
372
+ pos->bobs[pos->bcnt++] = id;
373
+ free(obs);
374
+ }
375
+ }
376
+ // And finally, if the user specified it, populate the labels
377
+ if (tok->lbl != NULL) {
378
+ for (int t = 0; t < T; t++) {
379
+ const char *lbl = tok->lbl[t];
380
+ size_t id = qrk_str2id(rdr->lbl, lbl);
381
+ seq->pos[t].lbl = id;
382
+ }
383
+ }
384
+ return seq;
385
+ }
386
+
387
+ /* rdr_raw2seq:
388
+ * Convert a raw sequence to a seq_t object suitable for training or
389
+ * labelling. If lbl is true, the last column is assumed to be a label and
390
+ * interned also.
391
+ */
392
+ seq_t *rdr_raw2seq(rdr_t *rdr, const raw_t *raw, bool lbl) {
393
+ const int T = raw->len;
394
+ // Allocate the tok_t object, the label array is allocated only if they
395
+ // are requested by the user.
396
+ tok_t *tok = wapiti_xmalloc(sizeof(tok_t) + T * sizeof(char **));
397
+ tok->cnts = wapiti_xmalloc(sizeof(size_t) * T);
398
+ tok->lbl = NULL;
399
+ if (lbl == true)
400
+ tok->lbl = wapiti_xmalloc(sizeof(char *) * T);
401
+ // We now take the raw sequence line by line and split them in list of
402
+ // tokens. To reduce memory fragmentation, the raw line is copied and
403
+ // his reference is kept by the first tokens, next tokens are pointer to
404
+ // this copy.
405
+ for (int t = 0; t < T; t++) {
406
+ // Get a copy of the raw line skiping leading space characters
407
+ const char *src = raw->lines[t];
408
+ while (isspace(*src))
409
+ src++;
410
+ char *line = xstrdup(src);
411
+ // Split it in tokens
412
+ const int len = strlen(line);
413
+ char *toks[len / 2];
414
+ int cnt = 0;
415
+ while (*line != '\0') {
416
+ toks[cnt++] = line;
417
+ while (*line != '\0' && !isspace(*line))
418
+ line++;
419
+ if (*line == '\0')
420
+ break;
421
+ *line++ = '\0';
422
+ while (*line != '\0' && isspace(*line))
423
+ line++;
424
+ }
425
+ // If user specified that data are labelled, move the last token
426
+ // to the label array.
427
+ if (lbl == true) {
428
+ tok->lbl[t] = toks[cnt - 1];
429
+ cnt--;
430
+ }
431
+ // And put the remaining tokens in the tok_t object
432
+ tok->cnts[t] = cnt;
433
+ tok->toks[t] = wapiti_xmalloc(sizeof(char *) * cnt);
434
+ memcpy(tok->toks[t], toks, sizeof(char *) * cnt);
435
+ }
436
+ tok->len = T;
437
+ // Convert the tok_t to a seq_t
438
+ seq_t *seq = NULL;
439
+ if (rdr->npats == 0)
440
+ seq = rdr_rawtok2seq(rdr, tok);
441
+ else
442
+ seq = rdr_pattok2seq(rdr, tok);
443
+ // Before returning the sequence, we have to free the tok_t
444
+ for (int t = 0; t < T; t++) {
445
+ if (tok->cnts[t] == 0)
446
+ continue;
447
+ free(tok->toks[t][0]);
448
+ free(tok->toks[t]);
449
+ }
450
+ free(tok->cnts);
451
+ if (lbl == true)
452
+ free(tok->lbl);
453
+ free(tok);
454
+ return seq;
455
+ }
456
+
457
+ /* rdr_readseq:
458
+ * Simple wrapper around rdr_readraw and rdr_raw2seq to directly read a
459
+ * sequence as a seq_t object from file. This take care of all the process
460
+ * and correctly free temporary data. If lbl is true the sequence is assumed
461
+ * to be labeled.
462
+ * Return NULL if end of file occure before anything as been read.
463
+ */
464
+ seq_t *rdr_readseq(rdr_t *rdr, FILE *file, bool lbl) {
465
+ raw_t *raw = rdr_readraw(rdr, file);
466
+ if (raw == NULL)
467
+ return NULL;
468
+ seq_t *seq = rdr_raw2seq(rdr, raw, lbl);
469
+ rdr_freeraw(raw);
470
+ return seq;
471
+ }
472
+
473
+ /* rdr_readdat:
474
+ * Read a full dataset at once and return it as a dat_t object. This function
475
+ * take and interpret his parameters like the single sequence reading
476
+ * function.
477
+ */
478
+ dat_t *rdr_readdat(rdr_t *rdr, FILE *file, bool lbl) {
479
+ // Prepare dataset
480
+ size_t size = 1000;
481
+ dat_t *dat = wapiti_xmalloc(sizeof(dat_t));
482
+ dat->nseq = 0;
483
+ dat->mlen = 0;
484
+ dat->lbl = lbl;
485
+ dat->seq = wapiti_xmalloc(sizeof(seq_t *) * size);
486
+ // Load sequences
487
+ while (!feof(file)) {
488
+ // Read the next sequence
489
+ seq_t *seq = rdr_readseq(rdr, file, lbl);
490
+ if (seq == NULL)
491
+ break;
492
+ // Grow the buffer if needed
493
+ if (dat->nseq == size) {
494
+ size *= 1.4;
495
+ dat->seq = wapiti_xrealloc(dat->seq, sizeof(seq_t *) * size);
496
+ }
497
+ // And store the sequence
498
+ dat->seq[dat->nseq++] = seq;
499
+ dat->mlen = max(dat->mlen, seq->len);
500
+ if (dat->nseq % 1000 == 0)
501
+ info("%7d sequences loaded\n", dat->nseq);
502
+ }
503
+ // If no sequence readed, cleanup and repport
504
+ if (dat->nseq == 0) {
505
+ free(dat->seq);
506
+ free(dat);
507
+ return NULL;
508
+ }
509
+ // Adjust the dataset size and return
510
+ if (size > dat->nseq)
511
+ dat->seq = wapiti_xrealloc(dat->seq, sizeof(seq_t *) * dat->nseq);
512
+ return dat;
513
+ }
514
+
515
+ /* rdr_load:
516
+ * Read from the given file a reader saved previously with rdr_save. The given
517
+ * reader must be empty, comming fresh from rdr_new. Be carefull that this
518
+ * function performs almost no checks on the input data, so if you modify the
519
+ * reader and make a mistake, it will probably result in a crash.
520
+ */
521
+ void rdr_load(rdr_t *rdr, FILE *file) {
522
+ const char *err = "broken file, invalid reader format";
523
+ if (fscanf(file, "#rdr#%d/%d\n", &rdr->npats, &rdr->ntoks) != 2)
524
+ fatal(err);
525
+ rdr->nuni = rdr->nbi = 0;
526
+ rdr->pats = wapiti_xmalloc(sizeof(pat_t *) * rdr->npats);
527
+ for (int p = 0; p < rdr->npats; p++) {
528
+ char *pat = ns_readstr(file);
529
+ rdr->pats[p] = pat_comp(pat);
530
+ switch (tolower(pat[0])) {
531
+ case 'u': rdr->nuni++; break;
532
+ case 'b': rdr->nbi++; break;
533
+ case '*': rdr->nuni++;
534
+ rdr->nbi++; break;
535
+ }
536
+ }
537
+ qrk_load(rdr->lbl, file);
538
+ qrk_load(rdr->obs, file);
539
+ }
540
+
541
+ /* rdr_save:
542
+ * Save the reader to the given file so it can be loaded back. The save format
543
+ * is plain text and portable accros computers.
544
+ */
545
+ void rdr_save(const rdr_t *rdr, FILE *file) {
546
+ if(fprintf(file, "#rdr#%d/%d\n", rdr->npats, rdr->ntoks) < 0)
547
+ pfatal("cannot write to file");
548
+ for (int p = 0; p < rdr->npats; p++)
549
+ ns_writestr(file, rdr->pats[p]->src);
550
+ qrk_save(rdr->lbl, file);
551
+ qrk_save(rdr->obs, file);
552
+ }
553
+