wapiti 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.autotest +13 -0
- data/.gitignore +5 -0
- data/.rspec +3 -0
- data/Gemfile +6 -0
- data/LICENSE +30 -0
- data/README.md +153 -0
- data/Rakefile +33 -0
- data/ext/wapiti/bcd.c +392 -0
- data/ext/wapiti/decoder.c +535 -0
- data/ext/wapiti/decoder.h +46 -0
- data/ext/wapiti/extconf.rb +8 -0
- data/ext/wapiti/gradient.c +818 -0
- data/ext/wapiti/gradient.h +81 -0
- data/ext/wapiti/lbfgs.c +294 -0
- data/ext/wapiti/model.c +296 -0
- data/ext/wapiti/model.h +100 -0
- data/ext/wapiti/native.c +1238 -0
- data/ext/wapiti/native.h +15 -0
- data/ext/wapiti/options.c +278 -0
- data/ext/wapiti/options.h +91 -0
- data/ext/wapiti/pattern.c +395 -0
- data/ext/wapiti/pattern.h +56 -0
- data/ext/wapiti/progress.c +167 -0
- data/ext/wapiti/progress.h +43 -0
- data/ext/wapiti/quark.c +272 -0
- data/ext/wapiti/quark.h +46 -0
- data/ext/wapiti/reader.c +553 -0
- data/ext/wapiti/reader.h +73 -0
- data/ext/wapiti/rprop.c +191 -0
- data/ext/wapiti/sequence.h +148 -0
- data/ext/wapiti/sgdl1.c +218 -0
- data/ext/wapiti/thread.c +171 -0
- data/ext/wapiti/thread.h +42 -0
- data/ext/wapiti/tools.c +202 -0
- data/ext/wapiti/tools.h +54 -0
- data/ext/wapiti/trainers.h +39 -0
- data/ext/wapiti/vmath.c +372 -0
- data/ext/wapiti/vmath.h +51 -0
- data/ext/wapiti/wapiti.c +288 -0
- data/ext/wapiti/wapiti.h +45 -0
- data/lib/wapiti.rb +30 -0
- data/lib/wapiti/errors.rb +17 -0
- data/lib/wapiti/model.rb +49 -0
- data/lib/wapiti/options.rb +113 -0
- data/lib/wapiti/utility.rb +15 -0
- data/lib/wapiti/version.rb +3 -0
- data/spec/fixtures/ch.mod +18550 -0
- data/spec/fixtures/chpattern.txt +52 -0
- data/spec/fixtures/chtest.txt +1973 -0
- data/spec/fixtures/chtrain.txt +19995 -0
- data/spec/fixtures/nppattern.txt +52 -0
- data/spec/fixtures/nptest.txt +1973 -0
- data/spec/fixtures/nptrain.txt +19995 -0
- data/spec/fixtures/pattern.txt +14 -0
- data/spec/fixtures/test.txt +60000 -0
- data/spec/fixtures/train.txt +1200 -0
- data/spec/spec_helper.rb +21 -0
- data/spec/wapiti/model_spec.rb +173 -0
- data/spec/wapiti/native_spec.rb +12 -0
- data/spec/wapiti/options_spec.rb +175 -0
- data/spec/wapiti/utility_spec.rb +22 -0
- data/wapiti.gemspec +35 -0
- metadata +178 -0
data/ext/wapiti/quark.h
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
/*
|
2
|
+
* Wapiti - A linear-chain CRF tool
|
3
|
+
*
|
4
|
+
* Copyright (c) 2009-2011 CNRS
|
5
|
+
* All rights reserved.
|
6
|
+
*
|
7
|
+
* Redistribution and use in source and binary forms, with or without
|
8
|
+
* modification, are permitted provided that the following conditions are met:
|
9
|
+
* * Redistributions of source code must retain the above copyright
|
10
|
+
* notice, this list of conditions and the following disclaimer.
|
11
|
+
* * Redistributions in binary form must reproduce the above copyright
|
12
|
+
* notice, this list of conditions and the following disclaimer in the
|
13
|
+
* documentation and/or other materials provided with the distribution.
|
14
|
+
*
|
15
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
16
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
18
|
+
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
19
|
+
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
20
|
+
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
21
|
+
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
22
|
+
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
23
|
+
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
24
|
+
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
25
|
+
* POSSIBILITY OF SUCH DAMAGE.
|
26
|
+
*/
|
27
|
+
|
28
|
+
#ifndef quark_h
#define quark_h

/* <stdbool.h> is needed by the qrk_lock prototype below; including it here
 * lets this header compile regardless of what the including file pulled in
 * beforehand.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Opaque handle: struct qrk_s is only declared here, not defined. */
typedef struct qrk_s qrk_t;

/* Creation and destruction of a quark object. */
qrk_t *qrk_new(void);
void qrk_free(qrk_t *qrk);

/* Queries and string <-> identifier mapping. */
size_t qrk_count(const qrk_t *qrk);
bool qrk_lock(qrk_t *qrk, bool lock);
const char *qrk_id2str(const qrk_t *qrk, size_t id);
size_t qrk_str2id(qrk_t *qrk, const char *key);

/* Serialization from and to an already opened stream. */
void qrk_load(qrk_t *qrk, FILE *file);
void qrk_save(const qrk_t *qrk, FILE *file);

#endif
|
46
|
+
|
data/ext/wapiti/reader.c
ADDED
@@ -0,0 +1,553 @@
|
|
1
|
+
/*
|
2
|
+
* Wapiti - A linear-chain CRF tool
|
3
|
+
*
|
4
|
+
* Copyright (c) 2009-2011 CNRS
|
5
|
+
* All rights reserved.
|
6
|
+
*
|
7
|
+
* Redistribution and use in source and binary forms, with or without
|
8
|
+
* modification, are permitted provided that the following conditions are met:
|
9
|
+
* * Redistributions of source code must retain the above copyright
|
10
|
+
* notice, this list of conditions and the following disclaimer.
|
11
|
+
* * Redistributions in binary form must reproduce the above copyright
|
12
|
+
* notice, this list of conditions and the following disclaimer in the
|
13
|
+
* documentation and/or other materials provided with the distribution.
|
14
|
+
*
|
15
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
16
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
18
|
+
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
19
|
+
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
20
|
+
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
21
|
+
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
22
|
+
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
23
|
+
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
24
|
+
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
25
|
+
* POSSIBILITY OF SUCH DAMAGE.
|
26
|
+
*/
|
27
|
+
#include <ctype.h>
|
28
|
+
#include <stdbool.h>
|
29
|
+
#include <stddef.h>
|
30
|
+
#include <stdlib.h>
|
31
|
+
#include <stdio.h>
|
32
|
+
#include <string.h>
|
33
|
+
|
34
|
+
#include "wapiti.h"
|
35
|
+
#include "pattern.h"
|
36
|
+
#include "quark.h"
|
37
|
+
#include "reader.h"
|
38
|
+
#include "sequence.h"
|
39
|
+
#include "tools.h"
|
40
|
+
|
41
|
+
/*******************************************************************************
 * Datafile reader
 *
 *   And now comes the data file reader, which uses the previous module to parse
 *   the input data in order to produce seq_t objects representing interned
 *   sequences.
 *
 *   This is where a sequence goes through the three steps needed to build the
 *   seq_t objects used internally. There are two ways to do this. The first and
 *   simpler one is to use the rdr_readseq function, which directly reads a
 *   sequence from a file and converts it to a seq_t object transparently. This
 *   is how the training and development data are loaded.
 *   The second way consists of reading a raw sequence with rdr_readraw and then
 *   converting it to a seq_t object with rdr_raw2seq. This allows the caller to
 *   keep the raw sequence and is used by the tagger to produce a clean output.
 *
 *   There is no public interface to the tok_t object as it is intended only for
 *   internal use in the reader, as an intermediate step to apply patterns.
 ******************************************************************************/
|
60
|
+
|
61
|
+
/* rdr_new:
|
62
|
+
* Create a new empty reader object. You mut load patterns in it or a
|
63
|
+
* previously saved reader if you want to use it for reading sequences.
|
64
|
+
*/
|
65
|
+
rdr_t *rdr_new(bool maxent) {
|
66
|
+
rdr_t *rdr = wapiti_xmalloc(sizeof(rdr_t));
|
67
|
+
rdr->maxent = maxent;
|
68
|
+
rdr->npats = rdr->nuni = rdr->nbi = 0;
|
69
|
+
rdr->ntoks = 0;
|
70
|
+
rdr->pats = NULL;
|
71
|
+
rdr->lbl = qrk_new();
|
72
|
+
rdr->obs = qrk_new();
|
73
|
+
return rdr;
|
74
|
+
}
|
75
|
+
|
76
|
+
/* rdr_free:
|
77
|
+
* Free all memory used by a reader object including the quark database, so
|
78
|
+
* any string returned by them must not be used after this call.
|
79
|
+
*/
|
80
|
+
void rdr_free(rdr_t *rdr) {
|
81
|
+
for (int i = 0; i < rdr->npats; i++)
|
82
|
+
pat_free(rdr->pats[i]);
|
83
|
+
free(rdr->pats);
|
84
|
+
qrk_free(rdr->lbl);
|
85
|
+
qrk_free(rdr->obs);
|
86
|
+
free(rdr);
|
87
|
+
}
|
88
|
+
|
89
|
+
/* rdr_freeraw:
|
90
|
+
* Free all memory used by a raw_t object.
|
91
|
+
*/
|
92
|
+
void rdr_freeraw(raw_t *raw) {
|
93
|
+
for (int t = 0; t < raw->len; t++)
|
94
|
+
free(raw->lines[t]);
|
95
|
+
free(raw);
|
96
|
+
}
|
97
|
+
|
98
|
+
/* rdr_freeseq:
|
99
|
+
* Free all memory used by a seq_t object.
|
100
|
+
*/
|
101
|
+
void rdr_freeseq(seq_t *seq) {
|
102
|
+
free(seq->raw);
|
103
|
+
free(seq);
|
104
|
+
}
|
105
|
+
|
106
|
+
/* rdr_freedat:
|
107
|
+
* Free all memory used by a dat_t object.
|
108
|
+
*/
|
109
|
+
void rdr_freedat(dat_t *dat) {
|
110
|
+
for (size_t i = 0; i < dat->nseq; i++)
|
111
|
+
rdr_freeseq(dat->seq[i]);
|
112
|
+
free(dat->seq);
|
113
|
+
free(dat);
|
114
|
+
}
|
115
|
+
|
116
|
+
/* rdr_readline:
 *   Read an input line from <file>. The line can be of any size limited only by
 *   available memory: the buffer is grown as needed and the final one, trimmed
 *   to fit the data, is returned without its trailing newline. The caller is
 *   responsible for freeing it. NULL is returned when the end of file is
 *   reached before any character could be read. A read error is reported
 *   through pfatal.
 */
static char *rdr_readline(FILE *file) {
	if (feof(file))
		return NULL;
	// Initialize the buffer
	int len = 0, size = 16;
	char *buffer = wapiti_xmalloc(size);
	// We read the line chunk by chunk until end of line, file or error
	while (!feof(file)) {
		if (fgets(buffer + len, size - len, file) == NULL) {
			// On NULL return there are two possible cases, either
			// an error or the end of file
			if (ferror(file))
				pfatal("cannot read from file");
			// On end of file, we must check if we have already
			// read some data or not
			if (len == 0) {
				free(buffer);
				return NULL;
			}
			break;
		}
		// Check for end of line, if this is not the case enlarge the
		// buffer and go read more data. The chunk is full when only
		// the terminator slot is left, so len is at least 1 here.
		len += strlen(buffer + len);
		if (len == size - 1 && buffer[len - 1] != '\n') {
			size = size * 1.4;
			buffer = wapiti_xrealloc(buffer, size);
			continue;
		}
		break;
	}
	// Remove the end of line if present and resize the buffer to fit the
	// data. len can still be zero here: fgets succeeds on a chunk starting
	// with a NUL byte while strlen reports it as empty, so the index must
	// be guarded to avoid reading one byte before the buffer.
	if (len > 0 && buffer[len - 1] == '\n')
		buffer[--len] = '\0';
	return wapiti_xrealloc(buffer, len + 1);
}
|
159
|
+
|
160
|
+
/* rdr_loadpat:
|
161
|
+
* Load and compile patterns from given file and store them in the reader. As
|
162
|
+
* we compile patterns, syntax errors in them will be raised at this time.
|
163
|
+
*/
|
164
|
+
void rdr_loadpat(rdr_t *rdr, FILE *file) {
|
165
|
+
while (!feof(file)) {
|
166
|
+
// Read raw input line
|
167
|
+
char *line = rdr_readline(file);
|
168
|
+
if (line == NULL)
|
169
|
+
break;
|
170
|
+
// Remove comments and trailing spaces
|
171
|
+
int end = strcspn(line, "#");
|
172
|
+
while (end != 0 && isspace(line[end - 1]))
|
173
|
+
end--;
|
174
|
+
if (end == 0) {
|
175
|
+
free(line);
|
176
|
+
continue;
|
177
|
+
}
|
178
|
+
line[end] = '\0';
|
179
|
+
line[0] = tolower(line[0]);
|
180
|
+
// Compile pattern and add it to the list
|
181
|
+
pat_t *pat = pat_comp(line);
|
182
|
+
rdr->npats++;
|
183
|
+
switch (line[0]) {
|
184
|
+
case 'u': rdr->nuni++; break;
|
185
|
+
case 'b': rdr->nbi++; break;
|
186
|
+
case '*': rdr->nuni++;
|
187
|
+
rdr->nbi++; break;
|
188
|
+
default:
|
189
|
+
fatal("unknown pattern type '%c'", line[0]);
|
190
|
+
}
|
191
|
+
rdr->pats = wapiti_xrealloc(rdr->pats, sizeof(char *) * rdr->npats);
|
192
|
+
rdr->pats[rdr->npats - 1] = pat;
|
193
|
+
rdr->ntoks = max(rdr->ntoks, pat->ntoks);
|
194
|
+
}
|
195
|
+
}
|
196
|
+
|
197
|
+
/* rdr_readraw:
|
198
|
+
* Read a raw sequence from given file: a set of lines terminated by end of
|
199
|
+
* file or by an empty line. Return NULL if file end was reached before any
|
200
|
+
* sequence was read.
|
201
|
+
*/
|
202
|
+
raw_t *rdr_readraw(rdr_t *rdr, FILE *file) {
|
203
|
+
if (feof(file))
|
204
|
+
return NULL;
|
205
|
+
// Prepare the raw sequence object
|
206
|
+
int size = 32, cnt = 0;
|
207
|
+
raw_t *raw = wapiti_xmalloc(sizeof(raw_t) + sizeof(char *) * size);
|
208
|
+
// And read the next sequence in the file, this will skip any blank line
|
209
|
+
// before reading the sequence stoping at end of file or on a new blank
|
210
|
+
// line.
|
211
|
+
while (!feof(file)) {
|
212
|
+
char *line = rdr_readline(file);
|
213
|
+
if (line == NULL)
|
214
|
+
break;
|
215
|
+
// Check for empty line marking the end of the current sequence
|
216
|
+
int len = strlen(line);
|
217
|
+
while (len != 0 && isspace(line[len - 1]))
|
218
|
+
len--;
|
219
|
+
if (len == 0) {
|
220
|
+
free(line);
|
221
|
+
// Special case when no line was already read, we try
|
222
|
+
// again. This allow multiple blank lines beetwen
|
223
|
+
// sequences.
|
224
|
+
if (cnt == 0)
|
225
|
+
continue;
|
226
|
+
break;
|
227
|
+
}
|
228
|
+
// Next, grow the buffer if needed and add the new line in it
|
229
|
+
if (size == cnt) {
|
230
|
+
size *= 1.4;
|
231
|
+
raw = wapiti_xrealloc(raw, sizeof(raw_t)
|
232
|
+
+ sizeof(char *) * size);
|
233
|
+
}
|
234
|
+
raw->lines[cnt++] = line;
|
235
|
+
// In maxent mode, we only have to load one line for each sample
|
236
|
+
// so we can stop here.
|
237
|
+
if (rdr->maxent)
|
238
|
+
break;
|
239
|
+
}
|
240
|
+
// If no lines was read, we just free allocated memory and return NULL
|
241
|
+
// to signal the end of file to the caller. Else, we adjust the object
|
242
|
+
// size and return it.
|
243
|
+
if (cnt == 0) {
|
244
|
+
free(raw);
|
245
|
+
return NULL;
|
246
|
+
}
|
247
|
+
raw = wapiti_xrealloc(raw, sizeof(raw_t) + sizeof(char *) * cnt);
|
248
|
+
raw->len = cnt;
|
249
|
+
return raw;
|
250
|
+
}
|
251
|
+
|
252
|
+
/* rdr_mapobs:
|
253
|
+
* Map an observation to its identifier, automatically adding a 'u' prefix in
|
254
|
+
* pure maxent mode.
|
255
|
+
*/
|
256
|
+
static size_t rdr_mapobs(rdr_t *rdr, const char *str) {
|
257
|
+
if (!rdr->maxent)
|
258
|
+
return qrk_str2id(rdr->obs, str);
|
259
|
+
size_t len = strlen(str) + 2;
|
260
|
+
char tmp[len];
|
261
|
+
tmp[0] = 'u';
|
262
|
+
strcpy(tmp + 1, str);
|
263
|
+
return qrk_str2id(rdr->obs, tmp);
|
264
|
+
}
|
265
|
+
|
266
|
+
/* rdr_rawtok2seq:
|
267
|
+
* Convert a tok_t to a seq_t object taking each tokens as a feature without
|
268
|
+
* applying patterns.
|
269
|
+
*/
|
270
|
+
static seq_t *rdr_rawtok2seq(rdr_t *rdr, const tok_t *tok) {
|
271
|
+
const int T = tok->len;
|
272
|
+
int size = 0;
|
273
|
+
if (rdr->maxent) {
|
274
|
+
size = tok->cnts[0];
|
275
|
+
} else {
|
276
|
+
for (int t = 0; t < T; t++) {
|
277
|
+
for (int n = 0; n < tok->cnts[t]; n++) {
|
278
|
+
const char *o = tok->toks[t][n];
|
279
|
+
switch (o[0]) {
|
280
|
+
case 'u': size += 1; break;
|
281
|
+
case 'b': size += 1; break;
|
282
|
+
case '*': size += 2; break;
|
283
|
+
default:
|
284
|
+
fatal("invalid feature: %s", o);
|
285
|
+
}
|
286
|
+
}
|
287
|
+
}
|
288
|
+
}
|
289
|
+
seq_t *seq = wapiti_xmalloc(sizeof(seq_t) + sizeof(pos_t) * T);
|
290
|
+
seq->raw = wapiti_xmalloc(sizeof(size_t) * size);
|
291
|
+
seq->len = T;
|
292
|
+
size_t *raw = seq->raw;
|
293
|
+
for (int t = 0; t < T; t++) {
|
294
|
+
seq->pos[t].lbl = none;
|
295
|
+
seq->pos[t].ucnt = 0;
|
296
|
+
seq->pos[t].uobs = raw;
|
297
|
+
for (int n = 0; n < tok->cnts[t]; n++) {
|
298
|
+
if (tok->toks[t][n][0] == 'b')
|
299
|
+
continue;
|
300
|
+
size_t id = rdr_mapobs(rdr, tok->toks[t][n]);
|
301
|
+
if (id != none) {
|
302
|
+
(*raw++) = id;
|
303
|
+
seq->pos[t].ucnt++;
|
304
|
+
}
|
305
|
+
}
|
306
|
+
seq->pos[t].bcnt = 0;
|
307
|
+
if (rdr->maxent)
|
308
|
+
continue;
|
309
|
+
seq->pos[t].bobs = raw;
|
310
|
+
for (int n = 0; n < tok->cnts[t]; n++) {
|
311
|
+
if (tok->toks[t][n][0] == 'u')
|
312
|
+
continue;
|
313
|
+
size_t id = rdr_mapobs(rdr, tok->toks[t][n]);
|
314
|
+
if (id != none) {
|
315
|
+
(*raw++) = id;
|
316
|
+
seq->pos[t].bcnt++;
|
317
|
+
}
|
318
|
+
}
|
319
|
+
}
|
320
|
+
// And finally, if the user specified it, populate the labels
|
321
|
+
if (tok->lbl != NULL) {
|
322
|
+
for (int t = 0; t < T; t++) {
|
323
|
+
const char *lbl = tok->lbl[t];
|
324
|
+
size_t id = qrk_str2id(rdr->lbl, lbl);
|
325
|
+
seq->pos[t].lbl = id;
|
326
|
+
}
|
327
|
+
}
|
328
|
+
return seq;
|
329
|
+
}
|
330
|
+
|
331
|
+
/* rdr_pattok2seq:
|
332
|
+
* Convert a tok_t to a seq_t object by applying the patterns of the reader.
|
333
|
+
*/
|
334
|
+
static seq_t *rdr_pattok2seq(rdr_t *rdr, const tok_t *tok) {
|
335
|
+
const int T = tok->len;
|
336
|
+
// So now the tok object is ready, we can start building the seq_t
|
337
|
+
// object by appling patterns. First we allocate the seq_t object. The
|
338
|
+
// sequence itself as well as the sub array are allocated in one time.
|
339
|
+
seq_t *seq = wapiti_xmalloc(sizeof(seq_t) + sizeof(pos_t) * T);
|
340
|
+
seq->raw = wapiti_xmalloc(sizeof(size_t) * (rdr->nuni + rdr->nbi) * T);
|
341
|
+
seq->len = T;
|
342
|
+
size_t *tmp = seq->raw;
|
343
|
+
for (int t = 0; t < T; t++) {
|
344
|
+
seq->pos[t].lbl = none;
|
345
|
+
seq->pos[t].uobs = tmp; tmp += rdr->nuni;
|
346
|
+
seq->pos[t].bobs = tmp; tmp += rdr->nbi;
|
347
|
+
}
|
348
|
+
// Next, we can build the observations list by applying the patterns on
|
349
|
+
// the tok_t sequence.
|
350
|
+
for (int t = 0; t < T; t++) {
|
351
|
+
pos_t *pos = &seq->pos[t];
|
352
|
+
pos->ucnt = 0;
|
353
|
+
pos->bcnt = 0;
|
354
|
+
for (int x = 0; x < rdr->npats; x++) {
|
355
|
+
// Get the observation and map it to an identifier
|
356
|
+
char *obs = pat_exec(rdr->pats[x], tok, t);
|
357
|
+
size_t id = rdr_mapobs(rdr, obs);
|
358
|
+
if (id == none) {
|
359
|
+
free(obs);
|
360
|
+
continue;
|
361
|
+
}
|
362
|
+
// If the observation is ok, add it to the lists
|
363
|
+
int kind = 0;
|
364
|
+
switch (obs[0]) {
|
365
|
+
case 'u': kind = 1; break;
|
366
|
+
case 'b': kind = 2; break;
|
367
|
+
case '*': kind = 3; break;
|
368
|
+
}
|
369
|
+
if (kind & 1)
|
370
|
+
pos->uobs[pos->ucnt++] = id;
|
371
|
+
if (kind & 2)
|
372
|
+
pos->bobs[pos->bcnt++] = id;
|
373
|
+
free(obs);
|
374
|
+
}
|
375
|
+
}
|
376
|
+
// And finally, if the user specified it, populate the labels
|
377
|
+
if (tok->lbl != NULL) {
|
378
|
+
for (int t = 0; t < T; t++) {
|
379
|
+
const char *lbl = tok->lbl[t];
|
380
|
+
size_t id = qrk_str2id(rdr->lbl, lbl);
|
381
|
+
seq->pos[t].lbl = id;
|
382
|
+
}
|
383
|
+
}
|
384
|
+
return seq;
|
385
|
+
}
|
386
|
+
|
387
|
+
/* rdr_raw2seq:
|
388
|
+
* Convert a raw sequence to a seq_t object suitable for training or
|
389
|
+
* labelling. If lbl is true, the last column is assumed to be a label and
|
390
|
+
* interned also.
|
391
|
+
*/
|
392
|
+
seq_t *rdr_raw2seq(rdr_t *rdr, const raw_t *raw, bool lbl) {
|
393
|
+
const int T = raw->len;
|
394
|
+
// Allocate the tok_t object, the label array is allocated only if they
|
395
|
+
// are requested by the user.
|
396
|
+
tok_t *tok = wapiti_xmalloc(sizeof(tok_t) + T * sizeof(char **));
|
397
|
+
tok->cnts = wapiti_xmalloc(sizeof(size_t) * T);
|
398
|
+
tok->lbl = NULL;
|
399
|
+
if (lbl == true)
|
400
|
+
tok->lbl = wapiti_xmalloc(sizeof(char *) * T);
|
401
|
+
// We now take the raw sequence line by line and split them in list of
|
402
|
+
// tokens. To reduce memory fragmentation, the raw line is copied and
|
403
|
+
// his reference is kept by the first tokens, next tokens are pointer to
|
404
|
+
// this copy.
|
405
|
+
for (int t = 0; t < T; t++) {
|
406
|
+
// Get a copy of the raw line skiping leading space characters
|
407
|
+
const char *src = raw->lines[t];
|
408
|
+
while (isspace(*src))
|
409
|
+
src++;
|
410
|
+
char *line = xstrdup(src);
|
411
|
+
// Split it in tokens
|
412
|
+
const int len = strlen(line);
|
413
|
+
char *toks[len / 2];
|
414
|
+
int cnt = 0;
|
415
|
+
while (*line != '\0') {
|
416
|
+
toks[cnt++] = line;
|
417
|
+
while (*line != '\0' && !isspace(*line))
|
418
|
+
line++;
|
419
|
+
if (*line == '\0')
|
420
|
+
break;
|
421
|
+
*line++ = '\0';
|
422
|
+
while (*line != '\0' && isspace(*line))
|
423
|
+
line++;
|
424
|
+
}
|
425
|
+
// If user specified that data are labelled, move the last token
|
426
|
+
// to the label array.
|
427
|
+
if (lbl == true) {
|
428
|
+
tok->lbl[t] = toks[cnt - 1];
|
429
|
+
cnt--;
|
430
|
+
}
|
431
|
+
// And put the remaining tokens in the tok_t object
|
432
|
+
tok->cnts[t] = cnt;
|
433
|
+
tok->toks[t] = wapiti_xmalloc(sizeof(char *) * cnt);
|
434
|
+
memcpy(tok->toks[t], toks, sizeof(char *) * cnt);
|
435
|
+
}
|
436
|
+
tok->len = T;
|
437
|
+
// Convert the tok_t to a seq_t
|
438
|
+
seq_t *seq = NULL;
|
439
|
+
if (rdr->npats == 0)
|
440
|
+
seq = rdr_rawtok2seq(rdr, tok);
|
441
|
+
else
|
442
|
+
seq = rdr_pattok2seq(rdr, tok);
|
443
|
+
// Before returning the sequence, we have to free the tok_t
|
444
|
+
for (int t = 0; t < T; t++) {
|
445
|
+
if (tok->cnts[t] == 0)
|
446
|
+
continue;
|
447
|
+
free(tok->toks[t][0]);
|
448
|
+
free(tok->toks[t]);
|
449
|
+
}
|
450
|
+
free(tok->cnts);
|
451
|
+
if (lbl == true)
|
452
|
+
free(tok->lbl);
|
453
|
+
free(tok);
|
454
|
+
return seq;
|
455
|
+
}
|
456
|
+
|
457
|
+
/* rdr_readseq:
|
458
|
+
* Simple wrapper around rdr_readraw and rdr_raw2seq to directly read a
|
459
|
+
* sequence as a seq_t object from file. This take care of all the process
|
460
|
+
* and correctly free temporary data. If lbl is true the sequence is assumed
|
461
|
+
* to be labeled.
|
462
|
+
* Return NULL if end of file occure before anything as been read.
|
463
|
+
*/
|
464
|
+
seq_t *rdr_readseq(rdr_t *rdr, FILE *file, bool lbl) {
|
465
|
+
raw_t *raw = rdr_readraw(rdr, file);
|
466
|
+
if (raw == NULL)
|
467
|
+
return NULL;
|
468
|
+
seq_t *seq = rdr_raw2seq(rdr, raw, lbl);
|
469
|
+
rdr_freeraw(raw);
|
470
|
+
return seq;
|
471
|
+
}
|
472
|
+
|
473
|
+
/* rdr_readdat:
|
474
|
+
* Read a full dataset at once and return it as a dat_t object. This function
|
475
|
+
* take and interpret his parameters like the single sequence reading
|
476
|
+
* function.
|
477
|
+
*/
|
478
|
+
dat_t *rdr_readdat(rdr_t *rdr, FILE *file, bool lbl) {
|
479
|
+
// Prepare dataset
|
480
|
+
size_t size = 1000;
|
481
|
+
dat_t *dat = wapiti_xmalloc(sizeof(dat_t));
|
482
|
+
dat->nseq = 0;
|
483
|
+
dat->mlen = 0;
|
484
|
+
dat->lbl = lbl;
|
485
|
+
dat->seq = wapiti_xmalloc(sizeof(seq_t *) * size);
|
486
|
+
// Load sequences
|
487
|
+
while (!feof(file)) {
|
488
|
+
// Read the next sequence
|
489
|
+
seq_t *seq = rdr_readseq(rdr, file, lbl);
|
490
|
+
if (seq == NULL)
|
491
|
+
break;
|
492
|
+
// Grow the buffer if needed
|
493
|
+
if (dat->nseq == size) {
|
494
|
+
size *= 1.4;
|
495
|
+
dat->seq = wapiti_xrealloc(dat->seq, sizeof(seq_t *) * size);
|
496
|
+
}
|
497
|
+
// And store the sequence
|
498
|
+
dat->seq[dat->nseq++] = seq;
|
499
|
+
dat->mlen = max(dat->mlen, seq->len);
|
500
|
+
if (dat->nseq % 1000 == 0)
|
501
|
+
info("%7d sequences loaded\n", dat->nseq);
|
502
|
+
}
|
503
|
+
// If no sequence readed, cleanup and repport
|
504
|
+
if (dat->nseq == 0) {
|
505
|
+
free(dat->seq);
|
506
|
+
free(dat);
|
507
|
+
return NULL;
|
508
|
+
}
|
509
|
+
// Adjust the dataset size and return
|
510
|
+
if (size > dat->nseq)
|
511
|
+
dat->seq = wapiti_xrealloc(dat->seq, sizeof(seq_t *) * dat->nseq);
|
512
|
+
return dat;
|
513
|
+
}
|
514
|
+
|
515
|
+
/* rdr_load:
|
516
|
+
* Read from the given file a reader saved previously with rdr_save. The given
|
517
|
+
* reader must be empty, comming fresh from rdr_new. Be carefull that this
|
518
|
+
* function performs almost no checks on the input data, so if you modify the
|
519
|
+
* reader and make a mistake, it will probably result in a crash.
|
520
|
+
*/
|
521
|
+
void rdr_load(rdr_t *rdr, FILE *file) {
|
522
|
+
const char *err = "broken file, invalid reader format";
|
523
|
+
if (fscanf(file, "#rdr#%d/%d\n", &rdr->npats, &rdr->ntoks) != 2)
|
524
|
+
fatal(err);
|
525
|
+
rdr->nuni = rdr->nbi = 0;
|
526
|
+
rdr->pats = wapiti_xmalloc(sizeof(pat_t *) * rdr->npats);
|
527
|
+
for (int p = 0; p < rdr->npats; p++) {
|
528
|
+
char *pat = ns_readstr(file);
|
529
|
+
rdr->pats[p] = pat_comp(pat);
|
530
|
+
switch (tolower(pat[0])) {
|
531
|
+
case 'u': rdr->nuni++; break;
|
532
|
+
case 'b': rdr->nbi++; break;
|
533
|
+
case '*': rdr->nuni++;
|
534
|
+
rdr->nbi++; break;
|
535
|
+
}
|
536
|
+
}
|
537
|
+
qrk_load(rdr->lbl, file);
|
538
|
+
qrk_load(rdr->obs, file);
|
539
|
+
}
|
540
|
+
|
541
|
+
/* rdr_save:
|
542
|
+
* Save the reader to the given file so it can be loaded back. The save format
|
543
|
+
* is plain text and portable accros computers.
|
544
|
+
*/
|
545
|
+
void rdr_save(const rdr_t *rdr, FILE *file) {
|
546
|
+
if(fprintf(file, "#rdr#%d/%d\n", rdr->npats, rdr->ntoks) < 0)
|
547
|
+
pfatal("cannot write to file");
|
548
|
+
for (int p = 0; p < rdr->npats; p++)
|
549
|
+
ns_writestr(file, rdr->pats[p]->src);
|
550
|
+
qrk_save(rdr->lbl, file);
|
551
|
+
qrk_save(rdr->obs, file);
|
552
|
+
}
|
553
|
+
|