wapiti 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. data/.autotest +13 -0
  2. data/.gitignore +5 -0
  3. data/.rspec +3 -0
  4. data/Gemfile +6 -0
  5. data/LICENSE +30 -0
  6. data/README.md +153 -0
  7. data/Rakefile +33 -0
  8. data/ext/wapiti/bcd.c +392 -0
  9. data/ext/wapiti/decoder.c +535 -0
  10. data/ext/wapiti/decoder.h +46 -0
  11. data/ext/wapiti/extconf.rb +8 -0
  12. data/ext/wapiti/gradient.c +818 -0
  13. data/ext/wapiti/gradient.h +81 -0
  14. data/ext/wapiti/lbfgs.c +294 -0
  15. data/ext/wapiti/model.c +296 -0
  16. data/ext/wapiti/model.h +100 -0
  17. data/ext/wapiti/native.c +1238 -0
  18. data/ext/wapiti/native.h +15 -0
  19. data/ext/wapiti/options.c +278 -0
  20. data/ext/wapiti/options.h +91 -0
  21. data/ext/wapiti/pattern.c +395 -0
  22. data/ext/wapiti/pattern.h +56 -0
  23. data/ext/wapiti/progress.c +167 -0
  24. data/ext/wapiti/progress.h +43 -0
  25. data/ext/wapiti/quark.c +272 -0
  26. data/ext/wapiti/quark.h +46 -0
  27. data/ext/wapiti/reader.c +553 -0
  28. data/ext/wapiti/reader.h +73 -0
  29. data/ext/wapiti/rprop.c +191 -0
  30. data/ext/wapiti/sequence.h +148 -0
  31. data/ext/wapiti/sgdl1.c +218 -0
  32. data/ext/wapiti/thread.c +171 -0
  33. data/ext/wapiti/thread.h +42 -0
  34. data/ext/wapiti/tools.c +202 -0
  35. data/ext/wapiti/tools.h +54 -0
  36. data/ext/wapiti/trainers.h +39 -0
  37. data/ext/wapiti/vmath.c +372 -0
  38. data/ext/wapiti/vmath.h +51 -0
  39. data/ext/wapiti/wapiti.c +288 -0
  40. data/ext/wapiti/wapiti.h +45 -0
  41. data/lib/wapiti.rb +30 -0
  42. data/lib/wapiti/errors.rb +17 -0
  43. data/lib/wapiti/model.rb +49 -0
  44. data/lib/wapiti/options.rb +113 -0
  45. data/lib/wapiti/utility.rb +15 -0
  46. data/lib/wapiti/version.rb +3 -0
  47. data/spec/fixtures/ch.mod +18550 -0
  48. data/spec/fixtures/chpattern.txt +52 -0
  49. data/spec/fixtures/chtest.txt +1973 -0
  50. data/spec/fixtures/chtrain.txt +19995 -0
  51. data/spec/fixtures/nppattern.txt +52 -0
  52. data/spec/fixtures/nptest.txt +1973 -0
  53. data/spec/fixtures/nptrain.txt +19995 -0
  54. data/spec/fixtures/pattern.txt +14 -0
  55. data/spec/fixtures/test.txt +60000 -0
  56. data/spec/fixtures/train.txt +1200 -0
  57. data/spec/spec_helper.rb +21 -0
  58. data/spec/wapiti/model_spec.rb +173 -0
  59. data/spec/wapiti/native_spec.rb +12 -0
  60. data/spec/wapiti/options_spec.rb +175 -0
  61. data/spec/wapiti/utility_spec.rb +22 -0
  62. data/wapiti.gemspec +35 -0
  63. metadata +178 -0
@@ -0,0 +1,46 @@
1
+ /*
2
+ * Wapiti - A linear-chain CRF tool
3
+ *
4
+ * Copyright (c) 2009-2011 CNRS
5
+ * All rights reserved.
6
+ *
7
+ * Redistribution and use in source and binary forms, with or without
8
+ * modification, are permitted provided that the following conditions are met:
9
+ * * Redistributions of source code must retain the above copyright
10
+ * notice, this list of conditions and the following disclaimer.
11
+ * * Redistributions in binary form must reproduce the above copyright
12
+ * notice, this list of conditions and the following disclaimer in the
13
+ * documentation and/or other materials provided with the distribution.
14
+ *
15
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25
+ * POSSIBILITY OF SUCH DAMAGE.
26
+ */
27
+
28
#ifndef quark_h
#define quark_h

/* <stdbool.h> is required here because qrk_lock uses bool in its prototype;
 * without it this header only compiles when the includer happens to pull in
 * <stdbool.h> first.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* qrk_t:
 *   Opaque quark database handle. The structure is defined in the
 *   implementation file; users only manipulate pointers to it.
 */
typedef struct qrk_s qrk_t;

/* Create a new quark database / release one obtained from qrk_new. */
qrk_t *qrk_new(void);
void qrk_free(qrk_t *qrk);

/* Return a count for the database (presumably the number of interned
 * strings -- confirm against quark.c).
 */
size_t qrk_count(const qrk_t *qrk);

/* Set the lock flag of the database to <lock>. The meaning of the returned
 * bool is defined by the implementation (NOTE(review): likely the previous
 * lock state -- confirm against quark.c).
 */
bool qrk_lock(qrk_t *qrk, bool lock);

/* Conversions between strings and their identifiers. */
const char *qrk_id2str(const qrk_t *qrk, size_t id);
size_t qrk_str2id(qrk_t *qrk, const char *key);

/* Read a database from / write a database to an already opened stream. */
void qrk_load(qrk_t *qrk, FILE *file);
void qrk_save(const qrk_t *qrk, FILE *file);

#endif
46
+
@@ -0,0 +1,553 @@
1
+ /*
2
+ * Wapiti - A linear-chain CRF tool
3
+ *
4
+ * Copyright (c) 2009-2011 CNRS
5
+ * All rights reserved.
6
+ *
7
+ * Redistribution and use in source and binary forms, with or without
8
+ * modification, are permitted provided that the following conditions are met:
9
+ * * Redistributions of source code must retain the above copyright
10
+ * notice, this list of conditions and the following disclaimer.
11
+ * * Redistributions in binary form must reproduce the above copyright
12
+ * notice, this list of conditions and the following disclaimer in the
13
+ * documentation and/or other materials provided with the distribution.
14
+ *
15
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25
+ * POSSIBILITY OF SUCH DAMAGE.
26
+ */
27
+ #include <ctype.h>
28
+ #include <stdbool.h>
29
+ #include <stddef.h>
30
+ #include <stdlib.h>
31
+ #include <stdio.h>
32
+ #include <string.h>
33
+
34
+ #include "wapiti.h"
35
+ #include "pattern.h"
36
+ #include "quark.h"
37
+ #include "reader.h"
38
+ #include "sequence.h"
39
+ #include "tools.h"
40
+
41
+ /*******************************************************************************
42
+ * Datafile reader
43
+ *
44
+ * And now comes the data file reader, which uses the previous module to parse
45
+ * the input data in order to produce seq_t objects representing interned
46
+ * sequences.
47
+ *
48
+ * This is where the sequence will go through the three steps to build seq_t
49
+ * objects used internally. There are two ways to do this. The simpler one is
50
+ * to use the rdr_readseq function which directly read a sequence from a file
51
+ * and convert it to a seq_t object transparently. This is how the training
52
+ * and development data are loaded.
53
+ * The second way consists of reading a raw sequence with rdr_readraw and then
54
+ * converting it to a seq_t object with rdr_raw2seq. This allows the caller to
55
+ * keep the raw sequence and is used by the tagger to produce a clean output.
56
+ *
57
+ * There is no public interface to the tok_t object as it is intended only for
58
+ * internal use in the reader as an intermediate step to apply patterns.
59
+ ******************************************************************************/
60
+
61
+ /* rdr_new:
62
+ * Create a new empty reader object. You mut load patterns in it or a
63
+ * previously saved reader if you want to use it for reading sequences.
64
+ */
65
+ rdr_t *rdr_new(bool maxent) {
66
+ rdr_t *rdr = wapiti_xmalloc(sizeof(rdr_t));
67
+ rdr->maxent = maxent;
68
+ rdr->npats = rdr->nuni = rdr->nbi = 0;
69
+ rdr->ntoks = 0;
70
+ rdr->pats = NULL;
71
+ rdr->lbl = qrk_new();
72
+ rdr->obs = qrk_new();
73
+ return rdr;
74
+ }
75
+
76
+ /* rdr_free:
77
+ * Free all memory used by a reader object including the quark database, so
78
+ * any string returned by them must not be used after this call.
79
+ */
80
+ void rdr_free(rdr_t *rdr) {
81
+ for (int i = 0; i < rdr->npats; i++)
82
+ pat_free(rdr->pats[i]);
83
+ free(rdr->pats);
84
+ qrk_free(rdr->lbl);
85
+ qrk_free(rdr->obs);
86
+ free(rdr);
87
+ }
88
+
89
+ /* rdr_freeraw:
90
+ * Free all memory used by a raw_t object.
91
+ */
92
+ void rdr_freeraw(raw_t *raw) {
93
+ for (int t = 0; t < raw->len; t++)
94
+ free(raw->lines[t]);
95
+ free(raw);
96
+ }
97
+
98
+ /* rdr_freeseq:
99
+ * Free all memory used by a seq_t object.
100
+ */
101
+ void rdr_freeseq(seq_t *seq) {
102
+ free(seq->raw);
103
+ free(seq);
104
+ }
105
+
106
+ /* rdr_freedat:
107
+ * Free all memory used by a dat_t object.
108
+ */
109
+ void rdr_freedat(dat_t *dat) {
110
+ for (size_t i = 0; i < dat->nseq; i++)
111
+ rdr_freeseq(dat->seq[i]);
112
+ free(dat->seq);
113
+ free(dat);
114
+ }
115
+
116
/* rdr_readline:
 *   Read one input line from <file>. The line can be of any size, limited only
 *   by available memory: the buffer is grown as needed and returned without
 *   its trailing newline. The caller is responsible for freeing it. NULL is
 *   returned when the end of file is reached before any character is read. A
 *   read error is reported through pfatal.
 *
 *   The line is measured with strlen, so a NUL byte embedded in the input
 *   truncates what is seen of the current chunk. A chunk starting with a NUL
 *   byte therefore yields an empty string instead of reading before the start
 *   of the buffer.
 */
static char *rdr_readline(FILE *file) {
	if (feof(file))
		return NULL;
	// Initialize the buffer
	int len = 0, size = 16;
	char *buffer = wapiti_xmalloc(size);
	// We read the line chunk by chunk until end of line, file or error
	while (!feof(file)) {
		if (fgets(buffer + len, size - len, file) == NULL) {
			// On NULL return there are two possible cases, either
			// an error or the end of file
			if (ferror(file))
				pfatal("cannot read from file");
			// On end of file, we must check if we have already read
			// some data or not
			if (len == 0) {
				free(buffer);
				return NULL;
			}
			break;
		}
		// Check for end of line: if the chunk filled the buffer without
		// reaching a newline, enlarge the buffer and go read more data.
		// len is at least 15 here, so buffer[len - 1] is valid.
		len += strlen(buffer + len);
		if (len == size - 1 && buffer[len - 1] != '\n') {
			// Grow by a factor of 1.4 using integer arithmetic
			size += (size * 2) / 5;
			buffer = wapiti_xrealloc(buffer, size);
			continue;
		}
		break;
	}
	// Remove the end of line if present. len can be zero here when the
	// chunk started with a NUL byte, so it must be checked before looking
	// at the last character.
	if (len > 0 && buffer[len - 1] == '\n')
		len--;
	buffer[len] = '\0';
	// Shrink the buffer to fit the data
	return wapiti_xrealloc(buffer, len + 1);
}
159
+
160
+ /* rdr_loadpat:
161
+ * Load and compile patterns from given file and store them in the reader. As
162
+ * we compile patterns, syntax errors in them will be raised at this time.
163
+ */
164
+ void rdr_loadpat(rdr_t *rdr, FILE *file) {
165
+ while (!feof(file)) {
166
+ // Read raw input line
167
+ char *line = rdr_readline(file);
168
+ if (line == NULL)
169
+ break;
170
+ // Remove comments and trailing spaces
171
+ int end = strcspn(line, "#");
172
+ while (end != 0 && isspace(line[end - 1]))
173
+ end--;
174
+ if (end == 0) {
175
+ free(line);
176
+ continue;
177
+ }
178
+ line[end] = '\0';
179
+ line[0] = tolower(line[0]);
180
+ // Compile pattern and add it to the list
181
+ pat_t *pat = pat_comp(line);
182
+ rdr->npats++;
183
+ switch (line[0]) {
184
+ case 'u': rdr->nuni++; break;
185
+ case 'b': rdr->nbi++; break;
186
+ case '*': rdr->nuni++;
187
+ rdr->nbi++; break;
188
+ default:
189
+ fatal("unknown pattern type '%c'", line[0]);
190
+ }
191
+ rdr->pats = wapiti_xrealloc(rdr->pats, sizeof(char *) * rdr->npats);
192
+ rdr->pats[rdr->npats - 1] = pat;
193
+ rdr->ntoks = max(rdr->ntoks, pat->ntoks);
194
+ }
195
+ }
196
+
197
+ /* rdr_readraw:
198
+ * Read a raw sequence from given file: a set of lines terminated by end of
199
+ * file or by an empty line. Return NULL if file end was reached before any
200
+ * sequence was read.
201
+ */
202
+ raw_t *rdr_readraw(rdr_t *rdr, FILE *file) {
203
+ if (feof(file))
204
+ return NULL;
205
+ // Prepare the raw sequence object
206
+ int size = 32, cnt = 0;
207
+ raw_t *raw = wapiti_xmalloc(sizeof(raw_t) + sizeof(char *) * size);
208
+ // And read the next sequence in the file, this will skip any blank line
209
+ // before reading the sequence stoping at end of file or on a new blank
210
+ // line.
211
+ while (!feof(file)) {
212
+ char *line = rdr_readline(file);
213
+ if (line == NULL)
214
+ break;
215
+ // Check for empty line marking the end of the current sequence
216
+ int len = strlen(line);
217
+ while (len != 0 && isspace(line[len - 1]))
218
+ len--;
219
+ if (len == 0) {
220
+ free(line);
221
+ // Special case when no line was already read, we try
222
+ // again. This allow multiple blank lines beetwen
223
+ // sequences.
224
+ if (cnt == 0)
225
+ continue;
226
+ break;
227
+ }
228
+ // Next, grow the buffer if needed and add the new line in it
229
+ if (size == cnt) {
230
+ size *= 1.4;
231
+ raw = wapiti_xrealloc(raw, sizeof(raw_t)
232
+ + sizeof(char *) * size);
233
+ }
234
+ raw->lines[cnt++] = line;
235
+ // In maxent mode, we only have to load one line for each sample
236
+ // so we can stop here.
237
+ if (rdr->maxent)
238
+ break;
239
+ }
240
+ // If no lines was read, we just free allocated memory and return NULL
241
+ // to signal the end of file to the caller. Else, we adjust the object
242
+ // size and return it.
243
+ if (cnt == 0) {
244
+ free(raw);
245
+ return NULL;
246
+ }
247
+ raw = wapiti_xrealloc(raw, sizeof(raw_t) + sizeof(char *) * cnt);
248
+ raw->len = cnt;
249
+ return raw;
250
+ }
251
+
252
+ /* rdr_mapobs:
253
+ * Map an observation to its identifier, automatically adding a 'u' prefix in
254
+ * pure maxent mode.
255
+ */
256
+ static size_t rdr_mapobs(rdr_t *rdr, const char *str) {
257
+ if (!rdr->maxent)
258
+ return qrk_str2id(rdr->obs, str);
259
+ size_t len = strlen(str) + 2;
260
+ char tmp[len];
261
+ tmp[0] = 'u';
262
+ strcpy(tmp + 1, str);
263
+ return qrk_str2id(rdr->obs, tmp);
264
+ }
265
+
266
+ /* rdr_rawtok2seq:
267
+ * Convert a tok_t to a seq_t object taking each tokens as a feature without
268
+ * applying patterns.
269
+ */
270
+ static seq_t *rdr_rawtok2seq(rdr_t *rdr, const tok_t *tok) {
271
+ const int T = tok->len;
272
+ int size = 0;
273
+ if (rdr->maxent) {
274
+ size = tok->cnts[0];
275
+ } else {
276
+ for (int t = 0; t < T; t++) {
277
+ for (int n = 0; n < tok->cnts[t]; n++) {
278
+ const char *o = tok->toks[t][n];
279
+ switch (o[0]) {
280
+ case 'u': size += 1; break;
281
+ case 'b': size += 1; break;
282
+ case '*': size += 2; break;
283
+ default:
284
+ fatal("invalid feature: %s", o);
285
+ }
286
+ }
287
+ }
288
+ }
289
+ seq_t *seq = wapiti_xmalloc(sizeof(seq_t) + sizeof(pos_t) * T);
290
+ seq->raw = wapiti_xmalloc(sizeof(size_t) * size);
291
+ seq->len = T;
292
+ size_t *raw = seq->raw;
293
+ for (int t = 0; t < T; t++) {
294
+ seq->pos[t].lbl = none;
295
+ seq->pos[t].ucnt = 0;
296
+ seq->pos[t].uobs = raw;
297
+ for (int n = 0; n < tok->cnts[t]; n++) {
298
+ if (tok->toks[t][n][0] == 'b')
299
+ continue;
300
+ size_t id = rdr_mapobs(rdr, tok->toks[t][n]);
301
+ if (id != none) {
302
+ (*raw++) = id;
303
+ seq->pos[t].ucnt++;
304
+ }
305
+ }
306
+ seq->pos[t].bcnt = 0;
307
+ if (rdr->maxent)
308
+ continue;
309
+ seq->pos[t].bobs = raw;
310
+ for (int n = 0; n < tok->cnts[t]; n++) {
311
+ if (tok->toks[t][n][0] == 'u')
312
+ continue;
313
+ size_t id = rdr_mapobs(rdr, tok->toks[t][n]);
314
+ if (id != none) {
315
+ (*raw++) = id;
316
+ seq->pos[t].bcnt++;
317
+ }
318
+ }
319
+ }
320
+ // And finally, if the user specified it, populate the labels
321
+ if (tok->lbl != NULL) {
322
+ for (int t = 0; t < T; t++) {
323
+ const char *lbl = tok->lbl[t];
324
+ size_t id = qrk_str2id(rdr->lbl, lbl);
325
+ seq->pos[t].lbl = id;
326
+ }
327
+ }
328
+ return seq;
329
+ }
330
+
331
+ /* rdr_pattok2seq:
332
+ * Convert a tok_t to a seq_t object by applying the patterns of the reader.
333
+ */
334
+ static seq_t *rdr_pattok2seq(rdr_t *rdr, const tok_t *tok) {
335
+ const int T = tok->len;
336
+ // So now the tok object is ready, we can start building the seq_t
337
+ // object by appling patterns. First we allocate the seq_t object. The
338
+ // sequence itself as well as the sub array are allocated in one time.
339
+ seq_t *seq = wapiti_xmalloc(sizeof(seq_t) + sizeof(pos_t) * T);
340
+ seq->raw = wapiti_xmalloc(sizeof(size_t) * (rdr->nuni + rdr->nbi) * T);
341
+ seq->len = T;
342
+ size_t *tmp = seq->raw;
343
+ for (int t = 0; t < T; t++) {
344
+ seq->pos[t].lbl = none;
345
+ seq->pos[t].uobs = tmp; tmp += rdr->nuni;
346
+ seq->pos[t].bobs = tmp; tmp += rdr->nbi;
347
+ }
348
+ // Next, we can build the observations list by applying the patterns on
349
+ // the tok_t sequence.
350
+ for (int t = 0; t < T; t++) {
351
+ pos_t *pos = &seq->pos[t];
352
+ pos->ucnt = 0;
353
+ pos->bcnt = 0;
354
+ for (int x = 0; x < rdr->npats; x++) {
355
+ // Get the observation and map it to an identifier
356
+ char *obs = pat_exec(rdr->pats[x], tok, t);
357
+ size_t id = rdr_mapobs(rdr, obs);
358
+ if (id == none) {
359
+ free(obs);
360
+ continue;
361
+ }
362
+ // If the observation is ok, add it to the lists
363
+ int kind = 0;
364
+ switch (obs[0]) {
365
+ case 'u': kind = 1; break;
366
+ case 'b': kind = 2; break;
367
+ case '*': kind = 3; break;
368
+ }
369
+ if (kind & 1)
370
+ pos->uobs[pos->ucnt++] = id;
371
+ if (kind & 2)
372
+ pos->bobs[pos->bcnt++] = id;
373
+ free(obs);
374
+ }
375
+ }
376
+ // And finally, if the user specified it, populate the labels
377
+ if (tok->lbl != NULL) {
378
+ for (int t = 0; t < T; t++) {
379
+ const char *lbl = tok->lbl[t];
380
+ size_t id = qrk_str2id(rdr->lbl, lbl);
381
+ seq->pos[t].lbl = id;
382
+ }
383
+ }
384
+ return seq;
385
+ }
386
+
387
+ /* rdr_raw2seq:
388
+ * Convert a raw sequence to a seq_t object suitable for training or
389
+ * labelling. If lbl is true, the last column is assumed to be a label and
390
+ * interned also.
391
+ */
392
+ seq_t *rdr_raw2seq(rdr_t *rdr, const raw_t *raw, bool lbl) {
393
+ const int T = raw->len;
394
+ // Allocate the tok_t object, the label array is allocated only if they
395
+ // are requested by the user.
396
+ tok_t *tok = wapiti_xmalloc(sizeof(tok_t) + T * sizeof(char **));
397
+ tok->cnts = wapiti_xmalloc(sizeof(size_t) * T);
398
+ tok->lbl = NULL;
399
+ if (lbl == true)
400
+ tok->lbl = wapiti_xmalloc(sizeof(char *) * T);
401
+ // We now take the raw sequence line by line and split them in list of
402
+ // tokens. To reduce memory fragmentation, the raw line is copied and
403
+ // his reference is kept by the first tokens, next tokens are pointer to
404
+ // this copy.
405
+ for (int t = 0; t < T; t++) {
406
+ // Get a copy of the raw line skiping leading space characters
407
+ const char *src = raw->lines[t];
408
+ while (isspace(*src))
409
+ src++;
410
+ char *line = xstrdup(src);
411
+ // Split it in tokens
412
+ const int len = strlen(line);
413
+ char *toks[len / 2];
414
+ int cnt = 0;
415
+ while (*line != '\0') {
416
+ toks[cnt++] = line;
417
+ while (*line != '\0' && !isspace(*line))
418
+ line++;
419
+ if (*line == '\0')
420
+ break;
421
+ *line++ = '\0';
422
+ while (*line != '\0' && isspace(*line))
423
+ line++;
424
+ }
425
+ // If user specified that data are labelled, move the last token
426
+ // to the label array.
427
+ if (lbl == true) {
428
+ tok->lbl[t] = toks[cnt - 1];
429
+ cnt--;
430
+ }
431
+ // And put the remaining tokens in the tok_t object
432
+ tok->cnts[t] = cnt;
433
+ tok->toks[t] = wapiti_xmalloc(sizeof(char *) * cnt);
434
+ memcpy(tok->toks[t], toks, sizeof(char *) * cnt);
435
+ }
436
+ tok->len = T;
437
+ // Convert the tok_t to a seq_t
438
+ seq_t *seq = NULL;
439
+ if (rdr->npats == 0)
440
+ seq = rdr_rawtok2seq(rdr, tok);
441
+ else
442
+ seq = rdr_pattok2seq(rdr, tok);
443
+ // Before returning the sequence, we have to free the tok_t
444
+ for (int t = 0; t < T; t++) {
445
+ if (tok->cnts[t] == 0)
446
+ continue;
447
+ free(tok->toks[t][0]);
448
+ free(tok->toks[t]);
449
+ }
450
+ free(tok->cnts);
451
+ if (lbl == true)
452
+ free(tok->lbl);
453
+ free(tok);
454
+ return seq;
455
+ }
456
+
457
+ /* rdr_readseq:
458
+ * Simple wrapper around rdr_readraw and rdr_raw2seq to directly read a
459
+ * sequence as a seq_t object from file. This take care of all the process
460
+ * and correctly free temporary data. If lbl is true the sequence is assumed
461
+ * to be labeled.
462
+ * Return NULL if end of file occure before anything as been read.
463
+ */
464
+ seq_t *rdr_readseq(rdr_t *rdr, FILE *file, bool lbl) {
465
+ raw_t *raw = rdr_readraw(rdr, file);
466
+ if (raw == NULL)
467
+ return NULL;
468
+ seq_t *seq = rdr_raw2seq(rdr, raw, lbl);
469
+ rdr_freeraw(raw);
470
+ return seq;
471
+ }
472
+
473
+ /* rdr_readdat:
474
+ * Read a full dataset at once and return it as a dat_t object. This function
475
+ * take and interpret his parameters like the single sequence reading
476
+ * function.
477
+ */
478
+ dat_t *rdr_readdat(rdr_t *rdr, FILE *file, bool lbl) {
479
+ // Prepare dataset
480
+ size_t size = 1000;
481
+ dat_t *dat = wapiti_xmalloc(sizeof(dat_t));
482
+ dat->nseq = 0;
483
+ dat->mlen = 0;
484
+ dat->lbl = lbl;
485
+ dat->seq = wapiti_xmalloc(sizeof(seq_t *) * size);
486
+ // Load sequences
487
+ while (!feof(file)) {
488
+ // Read the next sequence
489
+ seq_t *seq = rdr_readseq(rdr, file, lbl);
490
+ if (seq == NULL)
491
+ break;
492
+ // Grow the buffer if needed
493
+ if (dat->nseq == size) {
494
+ size *= 1.4;
495
+ dat->seq = wapiti_xrealloc(dat->seq, sizeof(seq_t *) * size);
496
+ }
497
+ // And store the sequence
498
+ dat->seq[dat->nseq++] = seq;
499
+ dat->mlen = max(dat->mlen, seq->len);
500
+ if (dat->nseq % 1000 == 0)
501
+ info("%7d sequences loaded\n", dat->nseq);
502
+ }
503
+ // If no sequence readed, cleanup and repport
504
+ if (dat->nseq == 0) {
505
+ free(dat->seq);
506
+ free(dat);
507
+ return NULL;
508
+ }
509
+ // Adjust the dataset size and return
510
+ if (size > dat->nseq)
511
+ dat->seq = wapiti_xrealloc(dat->seq, sizeof(seq_t *) * dat->nseq);
512
+ return dat;
513
+ }
514
+
515
+ /* rdr_load:
516
+ * Read from the given file a reader saved previously with rdr_save. The given
517
+ * reader must be empty, comming fresh from rdr_new. Be carefull that this
518
+ * function performs almost no checks on the input data, so if you modify the
519
+ * reader and make a mistake, it will probably result in a crash.
520
+ */
521
+ void rdr_load(rdr_t *rdr, FILE *file) {
522
+ const char *err = "broken file, invalid reader format";
523
+ if (fscanf(file, "#rdr#%d/%d\n", &rdr->npats, &rdr->ntoks) != 2)
524
+ fatal(err);
525
+ rdr->nuni = rdr->nbi = 0;
526
+ rdr->pats = wapiti_xmalloc(sizeof(pat_t *) * rdr->npats);
527
+ for (int p = 0; p < rdr->npats; p++) {
528
+ char *pat = ns_readstr(file);
529
+ rdr->pats[p] = pat_comp(pat);
530
+ switch (tolower(pat[0])) {
531
+ case 'u': rdr->nuni++; break;
532
+ case 'b': rdr->nbi++; break;
533
+ case '*': rdr->nuni++;
534
+ rdr->nbi++; break;
535
+ }
536
+ }
537
+ qrk_load(rdr->lbl, file);
538
+ qrk_load(rdr->obs, file);
539
+ }
540
+
541
+ /* rdr_save:
542
+ * Save the reader to the given file so it can be loaded back. The save format
543
+ * is plain text and portable accros computers.
544
+ */
545
+ void rdr_save(const rdr_t *rdr, FILE *file) {
546
+ if(fprintf(file, "#rdr#%d/%d\n", rdr->npats, rdr->ntoks) < 0)
547
+ pfatal("cannot write to file");
548
+ for (int p = 0; p < rdr->npats; p++)
549
+ ns_writestr(file, rdr->pats[p]->src);
550
+ qrk_save(rdr->lbl, file);
551
+ qrk_save(rdr->obs, file);
552
+ }
553
+