wapiti 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.autotest +13 -0
- data/.gitignore +5 -0
- data/.rspec +3 -0
- data/Gemfile +6 -0
- data/LICENSE +30 -0
- data/README.md +153 -0
- data/Rakefile +33 -0
- data/ext/wapiti/bcd.c +392 -0
- data/ext/wapiti/decoder.c +535 -0
- data/ext/wapiti/decoder.h +46 -0
- data/ext/wapiti/extconf.rb +8 -0
- data/ext/wapiti/gradient.c +818 -0
- data/ext/wapiti/gradient.h +81 -0
- data/ext/wapiti/lbfgs.c +294 -0
- data/ext/wapiti/model.c +296 -0
- data/ext/wapiti/model.h +100 -0
- data/ext/wapiti/native.c +1238 -0
- data/ext/wapiti/native.h +15 -0
- data/ext/wapiti/options.c +278 -0
- data/ext/wapiti/options.h +91 -0
- data/ext/wapiti/pattern.c +395 -0
- data/ext/wapiti/pattern.h +56 -0
- data/ext/wapiti/progress.c +167 -0
- data/ext/wapiti/progress.h +43 -0
- data/ext/wapiti/quark.c +272 -0
- data/ext/wapiti/quark.h +46 -0
- data/ext/wapiti/reader.c +553 -0
- data/ext/wapiti/reader.h +73 -0
- data/ext/wapiti/rprop.c +191 -0
- data/ext/wapiti/sequence.h +148 -0
- data/ext/wapiti/sgdl1.c +218 -0
- data/ext/wapiti/thread.c +171 -0
- data/ext/wapiti/thread.h +42 -0
- data/ext/wapiti/tools.c +202 -0
- data/ext/wapiti/tools.h +54 -0
- data/ext/wapiti/trainers.h +39 -0
- data/ext/wapiti/vmath.c +372 -0
- data/ext/wapiti/vmath.h +51 -0
- data/ext/wapiti/wapiti.c +288 -0
- data/ext/wapiti/wapiti.h +45 -0
- data/lib/wapiti.rb +30 -0
- data/lib/wapiti/errors.rb +17 -0
- data/lib/wapiti/model.rb +49 -0
- data/lib/wapiti/options.rb +113 -0
- data/lib/wapiti/utility.rb +15 -0
- data/lib/wapiti/version.rb +3 -0
- data/spec/fixtures/ch.mod +18550 -0
- data/spec/fixtures/chpattern.txt +52 -0
- data/spec/fixtures/chtest.txt +1973 -0
- data/spec/fixtures/chtrain.txt +19995 -0
- data/spec/fixtures/nppattern.txt +52 -0
- data/spec/fixtures/nptest.txt +1973 -0
- data/spec/fixtures/nptrain.txt +19995 -0
- data/spec/fixtures/pattern.txt +14 -0
- data/spec/fixtures/test.txt +60000 -0
- data/spec/fixtures/train.txt +1200 -0
- data/spec/spec_helper.rb +21 -0
- data/spec/wapiti/model_spec.rb +173 -0
- data/spec/wapiti/native_spec.rb +12 -0
- data/spec/wapiti/options_spec.rb +175 -0
- data/spec/wapiti/utility_spec.rb +22 -0
- data/wapiti.gemspec +35 -0
- metadata +178 -0
data/ext/wapiti/quark.h
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
/*
|
2
|
+
* Wapiti - A linear-chain CRF tool
|
3
|
+
*
|
4
|
+
* Copyright (c) 2009-2011 CNRS
|
5
|
+
* All rights reserved.
|
6
|
+
*
|
7
|
+
* Redistribution and use in source and binary forms, with or without
|
8
|
+
* modification, are permitted provided that the following conditions are met:
|
9
|
+
* * Redistributions of source code must retain the above copyright
|
10
|
+
* notice, this list of conditions and the following disclaimer.
|
11
|
+
* * Redistributions in binary form must reproduce the above copyright
|
12
|
+
* notice, this list of conditions and the following disclaimer in the
|
13
|
+
* documentation and/or other materials provided with the distribution.
|
14
|
+
*
|
15
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
16
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
18
|
+
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
19
|
+
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
20
|
+
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
21
|
+
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
22
|
+
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
23
|
+
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
24
|
+
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
25
|
+
* POSSIBILITY OF SUCH DAMAGE.
|
26
|
+
*/
|
27
|
+
|
28
|
+
#ifndef quark_h
|
29
|
+
#define quark_h
|
30
|
+
|
31
|
+
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Opaque quark database handle; the structure itself is not defined here. */
typedef struct qrk_s qrk_t;

/* Create a new quark database. */
qrk_t *qrk_new(void);

/* Release a quark database. Per the reader's documentation, strings obtained
 * from the database must not be used after this call. */
void qrk_free(qrk_t *qrk);

/* NOTE(review): presumably the number of entries stored -- confirm in quark.c */
size_t qrk_count(const qrk_t *qrk);

/* NOTE(review): presumably sets the lock state and returns the previous one --
 * confirm in quark.c */
bool qrk_lock(qrk_t *qrk, bool lock);

/* Translate an identifier back to its string. */
const char *qrk_id2str(const qrk_t *qrk, size_t id);

/* Translate a string to its identifier; the reader compares the result
 * against `none` to detect keys that have no identifier. */
size_t qrk_str2id(qrk_t *qrk, const char *key);

/* Read a database from <file> / write a database to <file>. */
void qrk_load(qrk_t *qrk, FILE *file);
void qrk_save(const qrk_t *qrk, FILE *file);
|
44
|
+
|
45
|
+
#endif
|
46
|
+
|
data/ext/wapiti/reader.c
ADDED
@@ -0,0 +1,553 @@
|
|
1
|
+
/*
|
2
|
+
* Wapiti - A linear-chain CRF tool
|
3
|
+
*
|
4
|
+
* Copyright (c) 2009-2011 CNRS
|
5
|
+
* All rights reserved.
|
6
|
+
*
|
7
|
+
* Redistribution and use in source and binary forms, with or without
|
8
|
+
* modification, are permitted provided that the following conditions are met:
|
9
|
+
* * Redistributions of source code must retain the above copyright
|
10
|
+
* notice, this list of conditions and the following disclaimer.
|
11
|
+
* * Redistributions in binary form must reproduce the above copyright
|
12
|
+
* notice, this list of conditions and the following disclaimer in the
|
13
|
+
* documentation and/or other materials provided with the distribution.
|
14
|
+
*
|
15
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
16
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
18
|
+
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
19
|
+
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
20
|
+
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
21
|
+
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
22
|
+
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
23
|
+
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
24
|
+
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
25
|
+
* POSSIBILITY OF SUCH DAMAGE.
|
26
|
+
*/
|
27
|
+
#include <ctype.h>
|
28
|
+
#include <stdbool.h>
|
29
|
+
#include <stddef.h>
|
30
|
+
#include <stdlib.h>
|
31
|
+
#include <stdio.h>
|
32
|
+
#include <string.h>
|
33
|
+
|
34
|
+
#include "wapiti.h"
|
35
|
+
#include "pattern.h"
|
36
|
+
#include "quark.h"
|
37
|
+
#include "reader.h"
|
38
|
+
#include "sequence.h"
|
39
|
+
#include "tools.h"
|
40
|
+
|
41
|
+
/*******************************************************************************
|
42
|
+
* Datafile reader
|
43
|
+
*
|
44
|
+
* And now comes the data file reader, which uses the previous module to parse
|
45
|
+
* the input data in order to produce seq_t objects representing interned
|
46
|
+
* sequences.
|
47
|
+
*
|
48
|
+
* This is where the sequence will go through the three steps to build seq_t
|
49
|
+
* objects used internally. There are two ways to do this. First, the simpler is
|
50
|
+
* to use the rdr_readseq function which directly read a sequence from a file
|
51
|
+
* and convert it to a seq_t object transparently. This is how the training
|
52
|
+
* and development data are loaded.
|
53
|
+
* The second way consists of reading a raw sequence with rdr_readraw and next
|
54
|
+
* converting it to a seq_t object with rdr_raw2seq. This allows the caller to
|
55
|
+
* keep the raw sequence and is used by the tagger to produce a clean output.
|
56
|
+
*
|
57
|
+
* There is no public interface to the tok_t object as it is intended only for
|
58
|
+
* internal use in the reader as an intermediate step to apply patterns.
|
59
|
+
******************************************************************************/
|
60
|
+
|
61
|
+
/* rdr_new:
|
62
|
+
* Create a new empty reader object. You mut load patterns in it or a
|
63
|
+
* previously saved reader if you want to use it for reading sequences.
|
64
|
+
*/
|
65
|
+
rdr_t *rdr_new(bool maxent) {
|
66
|
+
rdr_t *rdr = wapiti_xmalloc(sizeof(rdr_t));
|
67
|
+
rdr->maxent = maxent;
|
68
|
+
rdr->npats = rdr->nuni = rdr->nbi = 0;
|
69
|
+
rdr->ntoks = 0;
|
70
|
+
rdr->pats = NULL;
|
71
|
+
rdr->lbl = qrk_new();
|
72
|
+
rdr->obs = qrk_new();
|
73
|
+
return rdr;
|
74
|
+
}
|
75
|
+
|
76
|
+
/* rdr_free:
|
77
|
+
* Free all memory used by a reader object including the quark database, so
|
78
|
+
* any string returned by them must not be used after this call.
|
79
|
+
*/
|
80
|
+
void rdr_free(rdr_t *rdr) {
|
81
|
+
for (int i = 0; i < rdr->npats; i++)
|
82
|
+
pat_free(rdr->pats[i]);
|
83
|
+
free(rdr->pats);
|
84
|
+
qrk_free(rdr->lbl);
|
85
|
+
qrk_free(rdr->obs);
|
86
|
+
free(rdr);
|
87
|
+
}
|
88
|
+
|
89
|
+
/* rdr_freeraw:
|
90
|
+
* Free all memory used by a raw_t object.
|
91
|
+
*/
|
92
|
+
void rdr_freeraw(raw_t *raw) {
|
93
|
+
for (int t = 0; t < raw->len; t++)
|
94
|
+
free(raw->lines[t]);
|
95
|
+
free(raw);
|
96
|
+
}
|
97
|
+
|
98
|
+
/* rdr_freeseq:
|
99
|
+
* Free all memory used by a seq_t object.
|
100
|
+
*/
|
101
|
+
void rdr_freeseq(seq_t *seq) {
|
102
|
+
free(seq->raw);
|
103
|
+
free(seq);
|
104
|
+
}
|
105
|
+
|
106
|
+
/* rdr_freedat:
|
107
|
+
* Free all memory used by a dat_t object.
|
108
|
+
*/
|
109
|
+
void rdr_freedat(dat_t *dat) {
|
110
|
+
for (size_t i = 0; i < dat->nseq; i++)
|
111
|
+
rdr_freeseq(dat->seq[i]);
|
112
|
+
free(dat->seq);
|
113
|
+
free(dat);
|
114
|
+
}
|
115
|
+
|
116
|
+
/* rdr_readline:
 *   Read one input line from <file>. The line can be of any size, limited
 *   only by available memory: the buffer starts at 16 bytes and is enlarged
 *   until the whole line fits. The trailing newline, if any, is removed and
 *   the buffer is shrunk to fit the data. The caller is responsible for
 *   freeing the returned buffer. NULL is returned when the end of file is
 *   reached before any character was read; a read error is reported through
 *   pfatal.
 */
static char *rdr_readline(FILE *file) {
	if (feof(file))
		return NULL;
	// Initialize the buffer
	int len = 0, size = 16;
	char *buffer = wapiti_xmalloc(size);
	// We read the line chunk by chunk until end of line, file or error
	while (!feof(file)) {
		if (fgets(buffer + len, size - len, file) == NULL) {
			// On NULL return there are two possible cases, either
			// an error or the end of file
			if (ferror(file))
				pfatal("cannot read from file");
			// On end of file, we must check if we have already
			// read some data or not
			if (len == 0) {
				free(buffer);
				return NULL;
			}
			break;
		}
		// Check for end of line, if this is not the case enlarge the
		// buffer and go read more data
		len += strlen(buffer + len);
		if (len == size - 1 && buffer[len - 1] != '\n') {
			size = size * 1.4;
			buffer = wapiti_xrealloc(buffer, size);
			continue;
		}
		break;
	}
	// Remove the end of line if present. The length is checked first since
	// it is still zero when the data read starts with a NUL byte, in which
	// case there is no last character to look at.
	if (len > 0 && buffer[len - 1] == '\n')
		buffer[--len] = '\0';
	// Resize the buffer to fit the data
	return wapiti_xrealloc(buffer, len + 1);
}
|
159
|
+
|
160
|
+
/* rdr_loadpat:
|
161
|
+
* Load and compile patterns from given file and store them in the reader. As
|
162
|
+
* we compile patterns, syntax errors in them will be raised at this time.
|
163
|
+
*/
|
164
|
+
void rdr_loadpat(rdr_t *rdr, FILE *file) {
|
165
|
+
while (!feof(file)) {
|
166
|
+
// Read raw input line
|
167
|
+
char *line = rdr_readline(file);
|
168
|
+
if (line == NULL)
|
169
|
+
break;
|
170
|
+
// Remove comments and trailing spaces
|
171
|
+
int end = strcspn(line, "#");
|
172
|
+
while (end != 0 && isspace(line[end - 1]))
|
173
|
+
end--;
|
174
|
+
if (end == 0) {
|
175
|
+
free(line);
|
176
|
+
continue;
|
177
|
+
}
|
178
|
+
line[end] = '\0';
|
179
|
+
line[0] = tolower(line[0]);
|
180
|
+
// Compile pattern and add it to the list
|
181
|
+
pat_t *pat = pat_comp(line);
|
182
|
+
rdr->npats++;
|
183
|
+
switch (line[0]) {
|
184
|
+
case 'u': rdr->nuni++; break;
|
185
|
+
case 'b': rdr->nbi++; break;
|
186
|
+
case '*': rdr->nuni++;
|
187
|
+
rdr->nbi++; break;
|
188
|
+
default:
|
189
|
+
fatal("unknown pattern type '%c'", line[0]);
|
190
|
+
}
|
191
|
+
rdr->pats = wapiti_xrealloc(rdr->pats, sizeof(char *) * rdr->npats);
|
192
|
+
rdr->pats[rdr->npats - 1] = pat;
|
193
|
+
rdr->ntoks = max(rdr->ntoks, pat->ntoks);
|
194
|
+
}
|
195
|
+
}
|
196
|
+
|
197
|
+
/* rdr_readraw:
|
198
|
+
* Read a raw sequence from given file: a set of lines terminated by end of
|
199
|
+
* file or by an empty line. Return NULL if file end was reached before any
|
200
|
+
* sequence was read.
|
201
|
+
*/
|
202
|
+
raw_t *rdr_readraw(rdr_t *rdr, FILE *file) {
|
203
|
+
if (feof(file))
|
204
|
+
return NULL;
|
205
|
+
// Prepare the raw sequence object
|
206
|
+
int size = 32, cnt = 0;
|
207
|
+
raw_t *raw = wapiti_xmalloc(sizeof(raw_t) + sizeof(char *) * size);
|
208
|
+
// And read the next sequence in the file, this will skip any blank line
|
209
|
+
// before reading the sequence stoping at end of file or on a new blank
|
210
|
+
// line.
|
211
|
+
while (!feof(file)) {
|
212
|
+
char *line = rdr_readline(file);
|
213
|
+
if (line == NULL)
|
214
|
+
break;
|
215
|
+
// Check for empty line marking the end of the current sequence
|
216
|
+
int len = strlen(line);
|
217
|
+
while (len != 0 && isspace(line[len - 1]))
|
218
|
+
len--;
|
219
|
+
if (len == 0) {
|
220
|
+
free(line);
|
221
|
+
// Special case when no line was already read, we try
|
222
|
+
// again. This allow multiple blank lines beetwen
|
223
|
+
// sequences.
|
224
|
+
if (cnt == 0)
|
225
|
+
continue;
|
226
|
+
break;
|
227
|
+
}
|
228
|
+
// Next, grow the buffer if needed and add the new line in it
|
229
|
+
if (size == cnt) {
|
230
|
+
size *= 1.4;
|
231
|
+
raw = wapiti_xrealloc(raw, sizeof(raw_t)
|
232
|
+
+ sizeof(char *) * size);
|
233
|
+
}
|
234
|
+
raw->lines[cnt++] = line;
|
235
|
+
// In maxent mode, we only have to load one line for each sample
|
236
|
+
// so we can stop here.
|
237
|
+
if (rdr->maxent)
|
238
|
+
break;
|
239
|
+
}
|
240
|
+
// If no lines was read, we just free allocated memory and return NULL
|
241
|
+
// to signal the end of file to the caller. Else, we adjust the object
|
242
|
+
// size and return it.
|
243
|
+
if (cnt == 0) {
|
244
|
+
free(raw);
|
245
|
+
return NULL;
|
246
|
+
}
|
247
|
+
raw = wapiti_xrealloc(raw, sizeof(raw_t) + sizeof(char *) * cnt);
|
248
|
+
raw->len = cnt;
|
249
|
+
return raw;
|
250
|
+
}
|
251
|
+
|
252
|
+
/* rdr_mapobs:
|
253
|
+
* Map an observation to its identifier, automatically adding a 'u' prefix in
|
254
|
+
* pure maxent mode.
|
255
|
+
*/
|
256
|
+
static size_t rdr_mapobs(rdr_t *rdr, const char *str) {
|
257
|
+
if (!rdr->maxent)
|
258
|
+
return qrk_str2id(rdr->obs, str);
|
259
|
+
size_t len = strlen(str) + 2;
|
260
|
+
char tmp[len];
|
261
|
+
tmp[0] = 'u';
|
262
|
+
strcpy(tmp + 1, str);
|
263
|
+
return qrk_str2id(rdr->obs, tmp);
|
264
|
+
}
|
265
|
+
|
266
|
+
/* rdr_rawtok2seq:
|
267
|
+
* Convert a tok_t to a seq_t object taking each tokens as a feature without
|
268
|
+
* applying patterns.
|
269
|
+
*/
|
270
|
+
static seq_t *rdr_rawtok2seq(rdr_t *rdr, const tok_t *tok) {
|
271
|
+
const int T = tok->len;
|
272
|
+
int size = 0;
|
273
|
+
if (rdr->maxent) {
|
274
|
+
size = tok->cnts[0];
|
275
|
+
} else {
|
276
|
+
for (int t = 0; t < T; t++) {
|
277
|
+
for (int n = 0; n < tok->cnts[t]; n++) {
|
278
|
+
const char *o = tok->toks[t][n];
|
279
|
+
switch (o[0]) {
|
280
|
+
case 'u': size += 1; break;
|
281
|
+
case 'b': size += 1; break;
|
282
|
+
case '*': size += 2; break;
|
283
|
+
default:
|
284
|
+
fatal("invalid feature: %s", o);
|
285
|
+
}
|
286
|
+
}
|
287
|
+
}
|
288
|
+
}
|
289
|
+
seq_t *seq = wapiti_xmalloc(sizeof(seq_t) + sizeof(pos_t) * T);
|
290
|
+
seq->raw = wapiti_xmalloc(sizeof(size_t) * size);
|
291
|
+
seq->len = T;
|
292
|
+
size_t *raw = seq->raw;
|
293
|
+
for (int t = 0; t < T; t++) {
|
294
|
+
seq->pos[t].lbl = none;
|
295
|
+
seq->pos[t].ucnt = 0;
|
296
|
+
seq->pos[t].uobs = raw;
|
297
|
+
for (int n = 0; n < tok->cnts[t]; n++) {
|
298
|
+
if (tok->toks[t][n][0] == 'b')
|
299
|
+
continue;
|
300
|
+
size_t id = rdr_mapobs(rdr, tok->toks[t][n]);
|
301
|
+
if (id != none) {
|
302
|
+
(*raw++) = id;
|
303
|
+
seq->pos[t].ucnt++;
|
304
|
+
}
|
305
|
+
}
|
306
|
+
seq->pos[t].bcnt = 0;
|
307
|
+
if (rdr->maxent)
|
308
|
+
continue;
|
309
|
+
seq->pos[t].bobs = raw;
|
310
|
+
for (int n = 0; n < tok->cnts[t]; n++) {
|
311
|
+
if (tok->toks[t][n][0] == 'u')
|
312
|
+
continue;
|
313
|
+
size_t id = rdr_mapobs(rdr, tok->toks[t][n]);
|
314
|
+
if (id != none) {
|
315
|
+
(*raw++) = id;
|
316
|
+
seq->pos[t].bcnt++;
|
317
|
+
}
|
318
|
+
}
|
319
|
+
}
|
320
|
+
// And finally, if the user specified it, populate the labels
|
321
|
+
if (tok->lbl != NULL) {
|
322
|
+
for (int t = 0; t < T; t++) {
|
323
|
+
const char *lbl = tok->lbl[t];
|
324
|
+
size_t id = qrk_str2id(rdr->lbl, lbl);
|
325
|
+
seq->pos[t].lbl = id;
|
326
|
+
}
|
327
|
+
}
|
328
|
+
return seq;
|
329
|
+
}
|
330
|
+
|
331
|
+
/* rdr_pattok2seq:
|
332
|
+
* Convert a tok_t to a seq_t object by applying the patterns of the reader.
|
333
|
+
*/
|
334
|
+
static seq_t *rdr_pattok2seq(rdr_t *rdr, const tok_t *tok) {
|
335
|
+
const int T = tok->len;
|
336
|
+
// So now the tok object is ready, we can start building the seq_t
|
337
|
+
// object by appling patterns. First we allocate the seq_t object. The
|
338
|
+
// sequence itself as well as the sub array are allocated in one time.
|
339
|
+
seq_t *seq = wapiti_xmalloc(sizeof(seq_t) + sizeof(pos_t) * T);
|
340
|
+
seq->raw = wapiti_xmalloc(sizeof(size_t) * (rdr->nuni + rdr->nbi) * T);
|
341
|
+
seq->len = T;
|
342
|
+
size_t *tmp = seq->raw;
|
343
|
+
for (int t = 0; t < T; t++) {
|
344
|
+
seq->pos[t].lbl = none;
|
345
|
+
seq->pos[t].uobs = tmp; tmp += rdr->nuni;
|
346
|
+
seq->pos[t].bobs = tmp; tmp += rdr->nbi;
|
347
|
+
}
|
348
|
+
// Next, we can build the observations list by applying the patterns on
|
349
|
+
// the tok_t sequence.
|
350
|
+
for (int t = 0; t < T; t++) {
|
351
|
+
pos_t *pos = &seq->pos[t];
|
352
|
+
pos->ucnt = 0;
|
353
|
+
pos->bcnt = 0;
|
354
|
+
for (int x = 0; x < rdr->npats; x++) {
|
355
|
+
// Get the observation and map it to an identifier
|
356
|
+
char *obs = pat_exec(rdr->pats[x], tok, t);
|
357
|
+
size_t id = rdr_mapobs(rdr, obs);
|
358
|
+
if (id == none) {
|
359
|
+
free(obs);
|
360
|
+
continue;
|
361
|
+
}
|
362
|
+
// If the observation is ok, add it to the lists
|
363
|
+
int kind = 0;
|
364
|
+
switch (obs[0]) {
|
365
|
+
case 'u': kind = 1; break;
|
366
|
+
case 'b': kind = 2; break;
|
367
|
+
case '*': kind = 3; break;
|
368
|
+
}
|
369
|
+
if (kind & 1)
|
370
|
+
pos->uobs[pos->ucnt++] = id;
|
371
|
+
if (kind & 2)
|
372
|
+
pos->bobs[pos->bcnt++] = id;
|
373
|
+
free(obs);
|
374
|
+
}
|
375
|
+
}
|
376
|
+
// And finally, if the user specified it, populate the labels
|
377
|
+
if (tok->lbl != NULL) {
|
378
|
+
for (int t = 0; t < T; t++) {
|
379
|
+
const char *lbl = tok->lbl[t];
|
380
|
+
size_t id = qrk_str2id(rdr->lbl, lbl);
|
381
|
+
seq->pos[t].lbl = id;
|
382
|
+
}
|
383
|
+
}
|
384
|
+
return seq;
|
385
|
+
}
|
386
|
+
|
387
|
+
/* rdr_raw2seq:
|
388
|
+
* Convert a raw sequence to a seq_t object suitable for training or
|
389
|
+
* labelling. If lbl is true, the last column is assumed to be a label and
|
390
|
+
* interned also.
|
391
|
+
*/
|
392
|
+
seq_t *rdr_raw2seq(rdr_t *rdr, const raw_t *raw, bool lbl) {
|
393
|
+
const int T = raw->len;
|
394
|
+
// Allocate the tok_t object, the label array is allocated only if they
|
395
|
+
// are requested by the user.
|
396
|
+
tok_t *tok = wapiti_xmalloc(sizeof(tok_t) + T * sizeof(char **));
|
397
|
+
tok->cnts = wapiti_xmalloc(sizeof(size_t) * T);
|
398
|
+
tok->lbl = NULL;
|
399
|
+
if (lbl == true)
|
400
|
+
tok->lbl = wapiti_xmalloc(sizeof(char *) * T);
|
401
|
+
// We now take the raw sequence line by line and split them in list of
|
402
|
+
// tokens. To reduce memory fragmentation, the raw line is copied and
|
403
|
+
// his reference is kept by the first tokens, next tokens are pointer to
|
404
|
+
// this copy.
|
405
|
+
for (int t = 0; t < T; t++) {
|
406
|
+
// Get a copy of the raw line skiping leading space characters
|
407
|
+
const char *src = raw->lines[t];
|
408
|
+
while (isspace(*src))
|
409
|
+
src++;
|
410
|
+
char *line = xstrdup(src);
|
411
|
+
// Split it in tokens
|
412
|
+
const int len = strlen(line);
|
413
|
+
char *toks[len / 2];
|
414
|
+
int cnt = 0;
|
415
|
+
while (*line != '\0') {
|
416
|
+
toks[cnt++] = line;
|
417
|
+
while (*line != '\0' && !isspace(*line))
|
418
|
+
line++;
|
419
|
+
if (*line == '\0')
|
420
|
+
break;
|
421
|
+
*line++ = '\0';
|
422
|
+
while (*line != '\0' && isspace(*line))
|
423
|
+
line++;
|
424
|
+
}
|
425
|
+
// If user specified that data are labelled, move the last token
|
426
|
+
// to the label array.
|
427
|
+
if (lbl == true) {
|
428
|
+
tok->lbl[t] = toks[cnt - 1];
|
429
|
+
cnt--;
|
430
|
+
}
|
431
|
+
// And put the remaining tokens in the tok_t object
|
432
|
+
tok->cnts[t] = cnt;
|
433
|
+
tok->toks[t] = wapiti_xmalloc(sizeof(char *) * cnt);
|
434
|
+
memcpy(tok->toks[t], toks, sizeof(char *) * cnt);
|
435
|
+
}
|
436
|
+
tok->len = T;
|
437
|
+
// Convert the tok_t to a seq_t
|
438
|
+
seq_t *seq = NULL;
|
439
|
+
if (rdr->npats == 0)
|
440
|
+
seq = rdr_rawtok2seq(rdr, tok);
|
441
|
+
else
|
442
|
+
seq = rdr_pattok2seq(rdr, tok);
|
443
|
+
// Before returning the sequence, we have to free the tok_t
|
444
|
+
for (int t = 0; t < T; t++) {
|
445
|
+
if (tok->cnts[t] == 0)
|
446
|
+
continue;
|
447
|
+
free(tok->toks[t][0]);
|
448
|
+
free(tok->toks[t]);
|
449
|
+
}
|
450
|
+
free(tok->cnts);
|
451
|
+
if (lbl == true)
|
452
|
+
free(tok->lbl);
|
453
|
+
free(tok);
|
454
|
+
return seq;
|
455
|
+
}
|
456
|
+
|
457
|
+
/* rdr_readseq:
|
458
|
+
* Simple wrapper around rdr_readraw and rdr_raw2seq to directly read a
|
459
|
+
* sequence as a seq_t object from file. This take care of all the process
|
460
|
+
* and correctly free temporary data. If lbl is true the sequence is assumed
|
461
|
+
* to be labeled.
|
462
|
+
* Return NULL if end of file occure before anything as been read.
|
463
|
+
*/
|
464
|
+
seq_t *rdr_readseq(rdr_t *rdr, FILE *file, bool lbl) {
|
465
|
+
raw_t *raw = rdr_readraw(rdr, file);
|
466
|
+
if (raw == NULL)
|
467
|
+
return NULL;
|
468
|
+
seq_t *seq = rdr_raw2seq(rdr, raw, lbl);
|
469
|
+
rdr_freeraw(raw);
|
470
|
+
return seq;
|
471
|
+
}
|
472
|
+
|
473
|
+
/* rdr_readdat:
|
474
|
+
* Read a full dataset at once and return it as a dat_t object. This function
|
475
|
+
* take and interpret his parameters like the single sequence reading
|
476
|
+
* function.
|
477
|
+
*/
|
478
|
+
dat_t *rdr_readdat(rdr_t *rdr, FILE *file, bool lbl) {
|
479
|
+
// Prepare dataset
|
480
|
+
size_t size = 1000;
|
481
|
+
dat_t *dat = wapiti_xmalloc(sizeof(dat_t));
|
482
|
+
dat->nseq = 0;
|
483
|
+
dat->mlen = 0;
|
484
|
+
dat->lbl = lbl;
|
485
|
+
dat->seq = wapiti_xmalloc(sizeof(seq_t *) * size);
|
486
|
+
// Load sequences
|
487
|
+
while (!feof(file)) {
|
488
|
+
// Read the next sequence
|
489
|
+
seq_t *seq = rdr_readseq(rdr, file, lbl);
|
490
|
+
if (seq == NULL)
|
491
|
+
break;
|
492
|
+
// Grow the buffer if needed
|
493
|
+
if (dat->nseq == size) {
|
494
|
+
size *= 1.4;
|
495
|
+
dat->seq = wapiti_xrealloc(dat->seq, sizeof(seq_t *) * size);
|
496
|
+
}
|
497
|
+
// And store the sequence
|
498
|
+
dat->seq[dat->nseq++] = seq;
|
499
|
+
dat->mlen = max(dat->mlen, seq->len);
|
500
|
+
if (dat->nseq % 1000 == 0)
|
501
|
+
info("%7d sequences loaded\n", dat->nseq);
|
502
|
+
}
|
503
|
+
// If no sequence readed, cleanup and repport
|
504
|
+
if (dat->nseq == 0) {
|
505
|
+
free(dat->seq);
|
506
|
+
free(dat);
|
507
|
+
return NULL;
|
508
|
+
}
|
509
|
+
// Adjust the dataset size and return
|
510
|
+
if (size > dat->nseq)
|
511
|
+
dat->seq = wapiti_xrealloc(dat->seq, sizeof(seq_t *) * dat->nseq);
|
512
|
+
return dat;
|
513
|
+
}
|
514
|
+
|
515
|
+
/* rdr_load:
|
516
|
+
* Read from the given file a reader saved previously with rdr_save. The given
|
517
|
+
* reader must be empty, comming fresh from rdr_new. Be carefull that this
|
518
|
+
* function performs almost no checks on the input data, so if you modify the
|
519
|
+
* reader and make a mistake, it will probably result in a crash.
|
520
|
+
*/
|
521
|
+
void rdr_load(rdr_t *rdr, FILE *file) {
|
522
|
+
const char *err = "broken file, invalid reader format";
|
523
|
+
if (fscanf(file, "#rdr#%d/%d\n", &rdr->npats, &rdr->ntoks) != 2)
|
524
|
+
fatal(err);
|
525
|
+
rdr->nuni = rdr->nbi = 0;
|
526
|
+
rdr->pats = wapiti_xmalloc(sizeof(pat_t *) * rdr->npats);
|
527
|
+
for (int p = 0; p < rdr->npats; p++) {
|
528
|
+
char *pat = ns_readstr(file);
|
529
|
+
rdr->pats[p] = pat_comp(pat);
|
530
|
+
switch (tolower(pat[0])) {
|
531
|
+
case 'u': rdr->nuni++; break;
|
532
|
+
case 'b': rdr->nbi++; break;
|
533
|
+
case '*': rdr->nuni++;
|
534
|
+
rdr->nbi++; break;
|
535
|
+
}
|
536
|
+
}
|
537
|
+
qrk_load(rdr->lbl, file);
|
538
|
+
qrk_load(rdr->obs, file);
|
539
|
+
}
|
540
|
+
|
541
|
+
/* rdr_save:
|
542
|
+
* Save the reader to the given file so it can be loaded back. The save format
|
543
|
+
* is plain text and portable accros computers.
|
544
|
+
*/
|
545
|
+
void rdr_save(const rdr_t *rdr, FILE *file) {
|
546
|
+
if(fprintf(file, "#rdr#%d/%d\n", rdr->npats, rdr->ntoks) < 0)
|
547
|
+
pfatal("cannot write to file");
|
548
|
+
for (int p = 0; p < rdr->npats; p++)
|
549
|
+
ns_writestr(file, rdr->pats[p]->src);
|
550
|
+
qrk_save(rdr->lbl, file);
|
551
|
+
qrk_save(rdr->obs, file);
|
552
|
+
}
|
553
|
+
|