wapiti 0.0.5 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.simplecov +3 -0
- data/Gemfile +25 -2
- data/HISTORY.md +5 -1
- data/LICENSE +14 -13
- data/README.md +9 -16
- data/Rakefile +38 -8
- data/ext/wapiti/bcd.c +126 -124
- data/ext/wapiti/decoder.c +203 -124
- data/ext/wapiti/decoder.h +6 -4
- data/ext/wapiti/extconf.rb +2 -2
- data/ext/wapiti/gradient.c +491 -320
- data/ext/wapiti/gradient.h +52 -34
- data/ext/wapiti/lbfgs.c +74 -33
- data/ext/wapiti/model.c +47 -37
- data/ext/wapiti/model.h +22 -20
- data/ext/wapiti/native.c +850 -839
- data/ext/wapiti/native.h +1 -1
- data/ext/wapiti/options.c +52 -20
- data/ext/wapiti/options.h +37 -30
- data/ext/wapiti/pattern.c +35 -33
- data/ext/wapiti/pattern.h +12 -11
- data/ext/wapiti/progress.c +14 -13
- data/ext/wapiti/progress.h +3 -2
- data/ext/wapiti/quark.c +14 -16
- data/ext/wapiti/quark.h +6 -5
- data/ext/wapiti/reader.c +83 -69
- data/ext/wapiti/reader.h +11 -9
- data/ext/wapiti/rprop.c +84 -43
- data/ext/wapiti/sequence.h +18 -16
- data/ext/wapiti/sgdl1.c +45 -43
- data/ext/wapiti/thread.c +19 -17
- data/ext/wapiti/thread.h +5 -4
- data/ext/wapiti/tools.c +7 -7
- data/ext/wapiti/tools.h +3 -4
- data/ext/wapiti/trainers.h +1 -1
- data/ext/wapiti/vmath.c +40 -38
- data/ext/wapiti/vmath.h +12 -11
- data/ext/wapiti/wapiti.c +159 -37
- data/ext/wapiti/wapiti.h +18 -4
- data/lib/wapiti.rb +15 -15
- data/lib/wapiti/errors.rb +15 -15
- data/lib/wapiti/model.rb +92 -84
- data/lib/wapiti/options.rb +123 -124
- data/lib/wapiti/utility.rb +14 -14
- data/lib/wapiti/version.rb +2 -2
- data/spec/spec_helper.rb +29 -9
- data/spec/wapiti/model_spec.rb +230 -194
- data/spec/wapiti/native_spec.rb +7 -8
- data/spec/wapiti/options_spec.rb +184 -174
- data/wapiti.gemspec +22 -8
- metadata +38 -42
- data/.gitignore +0 -5
data/ext/wapiti/pattern.h
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
/*
|
2
2
|
* Wapiti - A linear-chain CRF tool
|
3
3
|
*
|
4
|
-
* Copyright (c) 2009-
|
4
|
+
* Copyright (c) 2009-2013 CNRS
|
5
5
|
* All rights reserved.
|
6
6
|
*
|
7
7
|
* Redistribution and use in source and binary forms, with or without
|
@@ -29,27 +29,28 @@
|
|
29
29
|
#define pattern_h
|
30
30
|
|
31
31
|
#include <stdbool.h>
|
32
|
+
#include <stdint.h>
|
32
33
|
|
33
34
|
#include "sequence.h"
|
34
35
|
|
35
36
|
typedef struct pat_s pat_t;
|
36
37
|
typedef struct pat_item_s pat_item_t;
|
37
38
|
struct pat_s {
|
38
|
-
char
|
39
|
-
|
40
|
-
|
39
|
+
char *src;
|
40
|
+
uint32_t ntoks;
|
41
|
+
uint32_t nitems;
|
41
42
|
struct pat_item_s {
|
42
|
-
char
|
43
|
-
bool
|
44
|
-
char
|
45
|
-
bool
|
46
|
-
|
47
|
-
|
43
|
+
char type;
|
44
|
+
bool caps;
|
45
|
+
char *value;
|
46
|
+
bool absolute;
|
47
|
+
int32_t offset;
|
48
|
+
uint32_t column;
|
48
49
|
} items[];
|
49
50
|
};
|
50
51
|
|
51
52
|
pat_t *pat_comp(char *p);
|
52
|
-
char *pat_exec(const pat_t *pat, const tok_t *tok,
|
53
|
+
char *pat_exec(const pat_t *pat, const tok_t *tok, uint32_t at);
|
53
54
|
void pat_free(pat_t *pat);
|
54
55
|
|
55
56
|
#endif
|
data/ext/wapiti/progress.c
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
/*
|
2
2
|
* Wapiti - A linear-chain CRF tool
|
3
3
|
*
|
4
|
-
* Copyright (c) 2009-
|
4
|
+
* Copyright (c) 2009-2013 CNRS
|
5
5
|
* All rights reserved.
|
6
6
|
*
|
7
7
|
* Redistribution and use in source and binary forms, with or without
|
@@ -24,14 +24,16 @@
|
|
24
24
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
25
25
|
* POSSIBILITY OF SUCH DAMAGE.
|
26
26
|
*/
|
27
|
+
#include <inttypes.h>
|
27
28
|
#include <signal.h>
|
28
29
|
#include <stdbool.h>
|
29
30
|
#include <stddef.h>
|
31
|
+
#include <stdint.h>
|
30
32
|
#include <stdlib.h>
|
31
33
|
#include <stdio.h>
|
32
34
|
|
33
35
|
#include <unistd.h>
|
34
|
-
#include <sys/
|
36
|
+
#include <sys/time.h>
|
35
37
|
#include <sys/resource.h>
|
36
38
|
|
37
39
|
#include "wapiti.h"
|
@@ -89,7 +91,7 @@ void uit_setup(mdl_t *mdl) {
|
|
89
91
|
uit_stop = false;
|
90
92
|
if (signal(SIGINT, uit_signal) == SIG_ERR)
|
91
93
|
warning("failed to set signal handler, no clean early stop");
|
92
|
-
|
94
|
+
gettimeofday(&mdl->timer, NULL);
|
93
95
|
if (mdl->opt->stopwin != 0)
|
94
96
|
mdl->werr = wapiti_xmalloc(sizeof(double) * mdl->opt->stopwin);
|
95
97
|
mdl->wcnt = mdl->wpos = 0;
|
@@ -116,28 +118,27 @@ void uit_cleanup(mdl_t *mdl) {
|
|
116
118
|
* and false if he must stop, so this is where we will implement the trainer
|
117
119
|
* independent stopping criterion.
|
118
120
|
*/
|
119
|
-
bool uit_progress(mdl_t *mdl,
|
121
|
+
bool uit_progress(mdl_t *mdl, uint32_t it, double obj) {
|
120
122
|
// First we just compute the error rate on devel or train data
|
121
123
|
double te, se;
|
122
124
|
tag_eval(mdl, &te, &se);
|
123
125
|
// Next, we compute the number of active features
|
124
|
-
|
125
|
-
for (
|
126
|
+
uint64_t act = 0;
|
127
|
+
for (uint64_t f = 0; f < mdl->nftr; f++)
|
126
128
|
if (mdl->theta[f] != 0.0)
|
127
129
|
act++;
|
128
130
|
// Compute timings. As some training algorithms are multi-threaded, we
|
129
131
|
// cannot use ansi/c function and must rely on posix one to sum time
|
130
132
|
// spent in main thread and in child ones.
|
131
|
-
tms_t now;
|
132
|
-
double tm = (now.
|
133
|
-
|
134
|
-
tm /= sysconf(_SC_CLK_TCK);
|
133
|
+
tms_t now; gettimeofday(&now, NULL);
|
134
|
+
double tm = (now.tv_sec + (double)now.tv_usec * 1.0e-6)
|
135
|
+
- (mdl->timer.tv_sec + (double)mdl->timer.tv_usec * 1.0e-6);
|
135
136
|
mdl->total += tm;
|
136
137
|
mdl->timer = now;
|
137
138
|
// And display progress report
|
138
|
-
info(" [%
|
139
|
+
info(" [%4"PRIu32"]", it);
|
139
140
|
info(obj >= 0.0 ? " obj=%-10.2f" : " obj=NA", obj);
|
140
|
-
info(" act=%-
|
141
|
+
info(" act=%-8"PRIu64, act);
|
141
142
|
info(" err=%5.2f%%/%5.2f%%", te, se);
|
142
143
|
info(" time=%.2fs/%.2fs", tm, mdl->total);
|
143
144
|
info("\n");
|
@@ -150,7 +151,7 @@ bool uit_progress(mdl_t *mdl, int it, double obj) {
|
|
150
151
|
mdl->wcnt++;
|
151
152
|
if (mdl->wcnt >= mdl->opt->stopwin) {
|
152
153
|
double emin = 200.0, emax = -100.0;
|
153
|
-
for (
|
154
|
+
for (uint32_t i = 0; i < mdl->opt->stopwin; i++) {
|
154
155
|
emin = min(emin, mdl->werr[i]);
|
155
156
|
emax = max(emax, mdl->werr[i]);
|
156
157
|
}
|
data/ext/wapiti/progress.h
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
/*
|
2
2
|
* Wapiti - A linear-chain CRF tool
|
3
3
|
*
|
4
|
-
* Copyright (c) 2009-
|
4
|
+
* Copyright (c) 2009-2013 CNRS
|
5
5
|
* All rights reserved.
|
6
6
|
*
|
7
7
|
* Redistribution and use in source and binary forms, with or without
|
@@ -29,6 +29,7 @@
|
|
29
29
|
#define progress_h
|
30
30
|
|
31
31
|
#include <stdbool.h>
|
32
|
+
#include <stdint.h>
|
32
33
|
|
33
34
|
#include "wapiti.h"
|
34
35
|
#include "model.h"
|
@@ -37,7 +38,7 @@ extern bool uit_stop;
|
|
37
38
|
|
38
39
|
void uit_setup(mdl_t *mdl);
|
39
40
|
void uit_cleanup(mdl_t *mdl);
|
40
|
-
bool uit_progress(mdl_t *mdl,
|
41
|
+
bool uit_progress(mdl_t *mdl, uint32_t it, double obj);
|
41
42
|
|
42
43
|
#endif
|
43
44
|
|
data/ext/wapiti/quark.c
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
/*
|
2
2
|
* Wapiti - A linear-chain CRF tool
|
3
3
|
*
|
4
|
-
* Copyright (c) 2009-
|
4
|
+
* Copyright (c) 2009-2013 CNRS
|
5
5
|
* All rights reserved.
|
6
6
|
*
|
7
7
|
* Redistribution and use in source and binary forms, with or without
|
@@ -24,11 +24,12 @@
|
|
24
24
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
25
25
|
* POSSIBILITY OF SUCH DAMAGE.
|
26
26
|
*/
|
27
|
+
#include <inttypes.h>
|
27
28
|
#include <stdbool.h>
|
28
29
|
#include <stddef.h>
|
29
30
|
#include <stdlib.h>
|
30
|
-
#include <stdio.h>
|
31
31
|
#include <stdint.h>
|
32
|
+
#include <stdio.h>
|
32
33
|
#include <string.h>
|
33
34
|
|
34
35
|
#include "quark.h"
|
@@ -47,7 +48,7 @@
|
|
47
48
|
* Information Coded in Alphanumeric, Journal of the ACM 15 (4): pp. 514--534,
|
48
49
|
* 1968. DOI:10.1145/321479.321481
|
49
50
|
*
|
50
|
-
* This code is copyright 2002-
|
51
|
+
* This code is copyright 2002-2013 Thomas Lavergne and licenced under the BSD
|
51
52
|
* Licence like the remaining of Wapiti.
|
52
53
|
******************************************************************************/
|
53
54
|
|
@@ -68,8 +69,6 @@ struct qrk_s {
|
|
68
69
|
uint64_t size;
|
69
70
|
};
|
70
71
|
|
71
|
-
#define qrk_none ((uint64_t)-1)
|
72
|
-
|
73
72
|
#define qrk_lf2nd(lf) ((node_t *)((intptr_t)(lf) | 1))
|
74
73
|
#define qrk_nd2lf(nd) ((leaf_t *)((intptr_t)(nd) & ~1))
|
75
74
|
#define qrk_isleaf(nd) ((intptr_t)(nd) & 1)
|
@@ -86,7 +85,7 @@ qrk_t *qrk_new(void) {
|
|
86
85
|
qrk->count = 0;
|
87
86
|
qrk->lock = false;
|
88
87
|
qrk->size = size;
|
89
|
-
qrk->leafs = wapiti_xmalloc(sizeof(leaf_t) * size);
|
88
|
+
qrk->leafs = wapiti_xmalloc(sizeof(leaf_t *) * size);
|
90
89
|
return qrk;
|
91
90
|
}
|
92
91
|
|
@@ -96,10 +95,10 @@ qrk_t *qrk_new(void) {
|
|
96
95
|
* qrk_unmap become invalid and must not be used anymore.
|
97
96
|
*/
|
98
97
|
void qrk_free(qrk_t *qrk) {
|
99
|
-
const
|
98
|
+
const uint32_t stkmax = 1024;
|
100
99
|
if (qrk->count != 0) {
|
101
100
|
node_t *stk[stkmax];
|
102
|
-
|
101
|
+
uint32_t cnt = 0;
|
103
102
|
stk[cnt++] = qrk->root;
|
104
103
|
while (cnt != 0) {
|
105
104
|
node_t *nd = stk[--cnt];
|
@@ -122,7 +121,7 @@ void qrk_free(qrk_t *qrk) {
|
|
122
121
|
* pair inside the quark. This function is not thread safe and should not be
|
123
122
|
* called on the same map from different threads without locking.
|
124
123
|
*/
|
125
|
-
|
124
|
+
uint64_t qrk_str2id(qrk_t *qrk, const char *key) {
|
126
125
|
const uint8_t *raw = (void *)key;
|
127
126
|
const size_t len = strlen(key);
|
128
127
|
// We first take care of the empty trie case so later we can safely
|
@@ -213,7 +212,7 @@ size_t qrk_str2id(qrk_t *qrk, const char *key) {
|
|
213
212
|
* remain valid only for the life time of the quark, a call to qrk_free will
|
214
213
|
* make this pointer invalid.
|
215
214
|
*/
|
216
|
-
const char *qrk_id2str(const qrk_t *qrk,
|
215
|
+
const char *qrk_id2str(const qrk_t *qrk, uint64_t id) {
|
217
216
|
if (id >= qrk->count)
|
218
217
|
fatal("invalid identifier");
|
219
218
|
return qrk->leafs[id]->key;
|
@@ -225,7 +224,7 @@ const char *qrk_id2str(const qrk_t *qrk, size_t id) {
|
|
225
224
|
* number correspond to the id.
|
226
225
|
*/
|
227
226
|
void qrk_save(const qrk_t *qrk, FILE *file) {
|
228
|
-
if (fprintf(file, "#qrk#%
|
227
|
+
if (fprintf(file, "#qrk#%"PRIu64"\n", qrk->count) < 0)
|
229
228
|
pfatal("cannot write to file");
|
230
229
|
if (qrk->count == 0)
|
231
230
|
return;
|
@@ -240,13 +239,13 @@ void qrk_save(const qrk_t *qrk, FILE *file) {
|
|
240
239
|
* initially empty, this will load a map exactly as saved by qrk_save.
|
241
240
|
*/
|
242
241
|
void qrk_load(qrk_t *qrk, FILE *file) {
|
243
|
-
|
244
|
-
if (fscanf(file, "#qrk#%
|
242
|
+
uint64_t cnt = 0;
|
243
|
+
if (fscanf(file, "#qrk#%"SCNu64"\n", &cnt) != 1) {
|
245
244
|
if (ferror(file) != 0)
|
246
245
|
pfatal("cannot read from file");
|
247
246
|
pfatal("invalid format");
|
248
247
|
}
|
249
|
-
for (
|
248
|
+
for (uint64_t n = 0; n < cnt; ++n) {
|
250
249
|
char *str = ns_readstr(file);
|
251
250
|
qrk_str2id(qrk, str);
|
252
251
|
free(str);
|
@@ -256,7 +255,7 @@ void qrk_load(qrk_t *qrk, FILE *file) {
|
|
256
255
|
/* qrk_count:
|
257
256
|
* Return the number of mappings stored in the quark.
|
258
257
|
*/
|
259
|
-
|
258
|
+
uint64_t qrk_count(const qrk_t *qrk) {
|
260
259
|
return qrk->count;
|
261
260
|
}
|
262
261
|
|
@@ -269,4 +268,3 @@ bool qrk_lock(qrk_t *qrk, bool lock) {
|
|
269
268
|
return old;
|
270
269
|
}
|
271
270
|
|
272
|
-
|
data/ext/wapiti/quark.h
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
/*
|
2
2
|
* Wapiti - A linear-chain CRF tool
|
3
3
|
*
|
4
|
-
* Copyright (c) 2009-
|
4
|
+
* Copyright (c) 2009-2013 CNRS
|
5
5
|
* All rights reserved.
|
6
6
|
*
|
7
7
|
* Redistribution and use in source and binary forms, with or without
|
@@ -28,17 +28,18 @@
|
|
28
28
|
#ifndef quark_h
|
29
29
|
#define quark_h
|
30
30
|
|
31
|
-
#include <
|
31
|
+
#include <stdbool.h>
|
32
|
+
#include <stdint.h>
|
32
33
|
#include <stdio.h>
|
33
34
|
|
34
35
|
typedef struct qrk_s qrk_t;
|
35
36
|
|
36
37
|
qrk_t *qrk_new(void);
|
37
38
|
void qrk_free(qrk_t *qrk);
|
38
|
-
|
39
|
+
uint64_t qrk_count(const qrk_t *qrk);
|
39
40
|
bool qrk_lock(qrk_t *qrk, bool lock);
|
40
|
-
const char *qrk_id2str(const qrk_t *qrk,
|
41
|
-
|
41
|
+
const char *qrk_id2str(const qrk_t *qrk, uint64_t id);
|
42
|
+
uint64_t qrk_str2id(qrk_t *qrk, const char *key);
|
42
43
|
void qrk_load(qrk_t *qrk, FILE *file);
|
43
44
|
void qrk_save(const qrk_t *qrk, FILE *file);
|
44
45
|
|
data/ext/wapiti/reader.c
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
/*
|
2
2
|
* Wapiti - A linear-chain CRF tool
|
3
3
|
*
|
4
|
-
* Copyright (c) 2009-
|
4
|
+
* Copyright (c) 2009-2013 CNRS
|
5
5
|
* All rights reserved.
|
6
6
|
*
|
7
7
|
* Redistribution and use in source and binary forms, with or without
|
@@ -25,6 +25,7 @@
|
|
25
25
|
* POSSIBILITY OF SUCH DAMAGE.
|
26
26
|
*/
|
27
27
|
#include <ctype.h>
|
28
|
+
#include <inttypes.h>
|
28
29
|
#include <stdbool.h>
|
29
30
|
#include <stddef.h>
|
30
31
|
#include <stdlib.h>
|
@@ -59,12 +60,14 @@
|
|
59
60
|
******************************************************************************/
|
60
61
|
|
61
62
|
/* rdr_new:
|
62
|
-
* Create a new empty reader object.
|
63
|
-
*
|
63
|
+
* Create a new empty reader object. If no patterns are loaded before you
|
64
|
+
* start using the reader the input data are assumed to be already prepared
|
65
|
+
* list of features. They must either start with a prefix 'u', 'b', or '*', or
|
66
|
+
* you must set autouni to true in order to automatically add a 'u' prefix.
|
64
67
|
*/
|
65
|
-
rdr_t *rdr_new(bool
|
68
|
+
rdr_t *rdr_new(bool autouni) {
|
66
69
|
rdr_t *rdr = wapiti_xmalloc(sizeof(rdr_t));
|
67
|
-
rdr->
|
70
|
+
rdr->autouni = autouni;
|
68
71
|
rdr->npats = rdr->nuni = rdr->nbi = 0;
|
69
72
|
rdr->ntoks = 0;
|
70
73
|
rdr->pats = NULL;
|
@@ -78,7 +81,7 @@ rdr_t *rdr_new(bool maxent) {
|
|
78
81
|
* any string returned by them must not be used after this call.
|
79
82
|
*/
|
80
83
|
void rdr_free(rdr_t *rdr) {
|
81
|
-
for (
|
84
|
+
for (uint32_t i = 0; i < rdr->npats; i++)
|
82
85
|
pat_free(rdr->pats[i]);
|
83
86
|
free(rdr->pats);
|
84
87
|
qrk_free(rdr->lbl);
|
@@ -90,7 +93,7 @@ void rdr_free(rdr_t *rdr) {
|
|
90
93
|
* Free all memory used by a raw_t object.
|
91
94
|
*/
|
92
95
|
void rdr_freeraw(raw_t *raw) {
|
93
|
-
for (
|
96
|
+
for (uint32_t t = 0; t < raw->len; t++)
|
94
97
|
free(raw->lines[t]);
|
95
98
|
free(raw);
|
96
99
|
}
|
@@ -107,7 +110,7 @@ void rdr_freeseq(seq_t *seq) {
|
|
107
110
|
* Free all memory used by a dat_t object.
|
108
111
|
*/
|
109
112
|
void rdr_freedat(dat_t *dat) {
|
110
|
-
for (
|
113
|
+
for (uint32_t i = 0; i < dat->nseq; i++)
|
111
114
|
rdr_freeseq(dat->seq[i]);
|
112
115
|
free(dat->seq);
|
113
116
|
free(dat);
|
@@ -118,11 +121,11 @@ void rdr_freedat(dat_t *dat) {
|
|
118
121
|
* available memory, a buffer large enough is allocated and returned. The
|
119
122
|
* caller is responsible to free it. On end-of-file, NULL is returned.
|
120
123
|
*/
|
121
|
-
|
124
|
+
char *rdr_readline(FILE *file) {
|
122
125
|
if (feof(file))
|
123
126
|
return NULL;
|
124
127
|
// Initialize the buffer
|
125
|
-
|
128
|
+
uint32_t len = 0, size = 16;
|
126
129
|
char *buffer = wapiti_xmalloc(size);
|
127
130
|
// We read the line chunk by chunk until end of line, file or error
|
128
131
|
while (!feof(file)) {
|
@@ -203,7 +206,7 @@ raw_t *rdr_readraw(rdr_t *rdr, FILE *file) {
|
|
203
206
|
if (feof(file))
|
204
207
|
return NULL;
|
205
208
|
// Prepare the raw sequence object
|
206
|
-
|
209
|
+
uint32_t size = 32, cnt = 0;
|
207
210
|
raw_t *raw = wapiti_xmalloc(sizeof(raw_t) + sizeof(char *) * size);
|
208
211
|
// And read the next sequence in the file, this will skip any blank line
|
209
212
|
// before reading the sequence stopping at end of file or on a new blank
|
@@ -232,9 +235,9 @@ raw_t *rdr_readraw(rdr_t *rdr, FILE *file) {
|
|
232
235
|
+ sizeof(char *) * size);
|
233
236
|
}
|
234
237
|
raw->lines[cnt++] = line;
|
235
|
-
// In
|
236
|
-
//
|
237
|
-
if (rdr->
|
238
|
+
// In autouni mode, there will be only unigram features so we
|
239
|
+
// can use small sequences to improve multi-threading.
|
240
|
+
if (rdr->autouni)
|
238
241
|
break;
|
239
242
|
}
|
240
243
|
// If no line was read, we just free allocated memory and return NULL
|
@@ -251,13 +254,12 @@ raw_t *rdr_readraw(rdr_t *rdr, FILE *file) {
|
|
251
254
|
|
252
255
|
/* rdr_mapobs:
|
253
256
|
* Map an observation to its identifier, automatically adding a 'u' prefix in
|
254
|
-
*
|
257
|
+
* 'autouni' mode.
|
255
258
|
*/
|
256
|
-
static
|
257
|
-
if (!rdr->
|
259
|
+
static uint64_t rdr_mapobs(rdr_t *rdr, const char *str) {
|
260
|
+
if (!rdr->autouni)
|
258
261
|
return qrk_str2id(rdr->obs, str);
|
259
|
-
|
260
|
-
char tmp[len];
|
262
|
+
char tmp[strlen(str) + 2];
|
261
263
|
tmp[0] = 'u';
|
262
264
|
strcpy(tmp + 1, str);
|
263
265
|
return qrk_str2id(rdr->obs, tmp);
|
@@ -268,13 +270,13 @@ static size_t rdr_mapobs(rdr_t *rdr, const char *str) {
|
|
268
270
|
* applying patterns.
|
269
271
|
*/
|
270
272
|
static seq_t *rdr_rawtok2seq(rdr_t *rdr, const tok_t *tok) {
|
271
|
-
const
|
272
|
-
|
273
|
-
if (rdr->
|
273
|
+
const uint32_t T = tok->len;
|
274
|
+
uint32_t size = 0;
|
275
|
+
if (rdr->autouni) {
|
274
276
|
size = tok->cnts[0];
|
275
277
|
} else {
|
276
|
-
for (
|
277
|
-
for (
|
278
|
+
for (uint32_t t = 0; t < T; t++) {
|
279
|
+
for (uint32_t n = 0; n < tok->cnts[t]; n++) {
|
278
280
|
const char *o = tok->toks[t][n];
|
279
281
|
switch (o[0]) {
|
280
282
|
case 'u': size += 1; break;
|
@@ -287,30 +289,30 @@ static seq_t *rdr_rawtok2seq(rdr_t *rdr, const tok_t *tok) {
|
|
287
289
|
}
|
288
290
|
}
|
289
291
|
seq_t *seq = wapiti_xmalloc(sizeof(seq_t) + sizeof(pos_t) * T);
|
290
|
-
seq->raw = wapiti_xmalloc(sizeof(
|
292
|
+
seq->raw = wapiti_xmalloc(sizeof(uint64_t) * size);
|
291
293
|
seq->len = T;
|
292
|
-
|
293
|
-
for (
|
294
|
-
seq->pos[t].lbl =
|
294
|
+
uint64_t *raw = seq->raw;
|
295
|
+
for (uint32_t t = 0; t < T; t++) {
|
296
|
+
seq->pos[t].lbl = (uint32_t)-1;
|
295
297
|
seq->pos[t].ucnt = 0;
|
296
298
|
seq->pos[t].uobs = raw;
|
297
|
-
for (
|
298
|
-
if (tok->toks[t][n][0] == 'b')
|
299
|
+
for (uint32_t n = 0; n < tok->cnts[t]; n++) {
|
300
|
+
if (!rdr->autouni && tok->toks[t][n][0] == 'b')
|
299
301
|
continue;
|
300
|
-
|
302
|
+
uint64_t id = rdr_mapobs(rdr, tok->toks[t][n]);
|
301
303
|
if (id != none) {
|
302
304
|
(*raw++) = id;
|
303
305
|
seq->pos[t].ucnt++;
|
304
306
|
}
|
305
307
|
}
|
306
308
|
seq->pos[t].bcnt = 0;
|
307
|
-
if (rdr->
|
309
|
+
if (rdr->autouni)
|
308
310
|
continue;
|
309
311
|
seq->pos[t].bobs = raw;
|
310
|
-
for (
|
312
|
+
for (uint32_t n = 0; n < tok->cnts[t]; n++) {
|
311
313
|
if (tok->toks[t][n][0] == 'u')
|
312
314
|
continue;
|
313
|
-
|
315
|
+
uint64_t id = rdr_mapobs(rdr, tok->toks[t][n]);
|
314
316
|
if (id != none) {
|
315
317
|
(*raw++) = id;
|
316
318
|
seq->pos[t].bcnt++;
|
@@ -319,9 +321,9 @@ static seq_t *rdr_rawtok2seq(rdr_t *rdr, const tok_t *tok) {
|
|
319
321
|
}
|
320
322
|
// And finally, if the user specified it, populate the labels
|
321
323
|
if (tok->lbl != NULL) {
|
322
|
-
for (
|
324
|
+
for (uint32_t t = 0; t < T; t++) {
|
323
325
|
const char *lbl = tok->lbl[t];
|
324
|
-
|
326
|
+
uint64_t id = qrk_str2id(rdr->lbl, lbl);
|
325
327
|
seq->pos[t].lbl = id;
|
326
328
|
}
|
327
329
|
}
|
@@ -332,35 +334,35 @@ static seq_t *rdr_rawtok2seq(rdr_t *rdr, const tok_t *tok) {
|
|
332
334
|
* Convert a tok_t to a seq_t object by applying the patterns of the reader.
|
333
335
|
*/
|
334
336
|
static seq_t *rdr_pattok2seq(rdr_t *rdr, const tok_t *tok) {
|
335
|
-
const
|
337
|
+
const uint32_t T = tok->len;
|
336
338
|
// So now the tok object is ready, we can start building the seq_t
|
337
339
|
// object by appling patterns. First we allocate the seq_t object. The
|
338
340
|
// sequence itself as well as the sub array are allocated in one time.
|
339
341
|
seq_t *seq = wapiti_xmalloc(sizeof(seq_t) + sizeof(pos_t) * T);
|
340
|
-
seq->raw = wapiti_xmalloc(sizeof(
|
342
|
+
seq->raw = wapiti_xmalloc(sizeof(uint64_t) * (rdr->nuni + rdr->nbi) * T);
|
341
343
|
seq->len = T;
|
342
|
-
|
343
|
-
for (
|
344
|
-
seq->pos[t].lbl =
|
344
|
+
uint64_t *tmp = seq->raw;
|
345
|
+
for (uint32_t t = 0; t < T; t++) {
|
346
|
+
seq->pos[t].lbl = (uint32_t)-1;
|
345
347
|
seq->pos[t].uobs = tmp; tmp += rdr->nuni;
|
346
348
|
seq->pos[t].bobs = tmp; tmp += rdr->nbi;
|
347
349
|
}
|
348
350
|
// Next, we can build the observations list by applying the patterns on
|
349
351
|
// the tok_t sequence.
|
350
|
-
for (
|
352
|
+
for (uint32_t t = 0; t < T; t++) {
|
351
353
|
pos_t *pos = &seq->pos[t];
|
352
354
|
pos->ucnt = 0;
|
353
355
|
pos->bcnt = 0;
|
354
|
-
for (
|
356
|
+
for (uint32_t x = 0; x < rdr->npats; x++) {
|
355
357
|
// Get the observation and map it to an identifier
|
356
358
|
char *obs = pat_exec(rdr->pats[x], tok, t);
|
357
|
-
|
359
|
+
uint64_t id = rdr_mapobs(rdr, obs);
|
358
360
|
if (id == none) {
|
359
361
|
free(obs);
|
360
362
|
continue;
|
361
363
|
}
|
362
364
|
// If the observation is ok, add it to the lists
|
363
|
-
|
365
|
+
char kind = 0;
|
364
366
|
switch (obs[0]) {
|
365
367
|
case 'u': kind = 1; break;
|
366
368
|
case 'b': kind = 2; break;
|
@@ -375,9 +377,9 @@ static seq_t *rdr_pattok2seq(rdr_t *rdr, const tok_t *tok) {
|
|
375
377
|
}
|
376
378
|
// And finally, if the user specified it, populate the labels
|
377
379
|
if (tok->lbl != NULL) {
|
378
|
-
for (
|
380
|
+
for (uint32_t t = 0; t < T; t++) {
|
379
381
|
const char *lbl = tok->lbl[t];
|
380
|
-
|
382
|
+
uint64_t id = qrk_str2id(rdr->lbl, lbl);
|
381
383
|
seq->pos[t].lbl = id;
|
382
384
|
}
|
383
385
|
}
|
@@ -390,11 +392,11 @@ static seq_t *rdr_pattok2seq(rdr_t *rdr, const tok_t *tok) {
|
|
390
392
|
* interned also.
|
391
393
|
*/
|
392
394
|
seq_t *rdr_raw2seq(rdr_t *rdr, const raw_t *raw, bool lbl) {
|
393
|
-
const
|
395
|
+
const uint32_t T = raw->len;
|
394
396
|
// Allocate the tok_t object, the label array is allocated only if they
|
395
397
|
// are requested by the user.
|
396
398
|
tok_t *tok = wapiti_xmalloc(sizeof(tok_t) + T * sizeof(char **));
|
397
|
-
tok->cnts = wapiti_xmalloc(sizeof(
|
399
|
+
tok->cnts = wapiti_xmalloc(sizeof(uint32_t) * T);
|
398
400
|
tok->lbl = NULL;
|
399
401
|
if (lbl == true)
|
400
402
|
tok->lbl = wapiti_xmalloc(sizeof(char *) * T);
|
@@ -402,16 +404,15 @@ seq_t *rdr_raw2seq(rdr_t *rdr, const raw_t *raw, bool lbl) {
|
|
402
404
|
// tokens. To reduce memory fragmentation, the raw line is copied and
|
403
405
|
// its reference is kept by the first token, next tokens are pointers to
|
404
406
|
// this copy.
|
405
|
-
for (
|
407
|
+
for (uint32_t t = 0; t < T; t++) {
|
406
408
|
// Get a copy of the raw line skipping leading space characters
|
407
409
|
const char *src = raw->lines[t];
|
408
410
|
while (isspace(*src))
|
409
411
|
src++;
|
410
412
|
char *line = xstrdup(src);
|
411
413
|
// Split it in tokens
|
412
|
-
|
413
|
-
|
414
|
-
int cnt = 0;
|
414
|
+
char *toks[strlen(line) / 2 + 1];
|
415
|
+
uint32_t cnt = 0;
|
415
416
|
while (*line != '\0') {
|
416
417
|
toks[cnt++] = line;
|
417
418
|
while (*line != '\0' && !isspace(*line))
|
@@ -441,7 +442,7 @@ seq_t *rdr_raw2seq(rdr_t *rdr, const raw_t *raw, bool lbl) {
|
|
441
442
|
else
|
442
443
|
seq = rdr_pattok2seq(rdr, tok);
|
443
444
|
// Before returning the sequence, we have to free the tok_t
|
444
|
-
for (
|
445
|
+
for (uint32_t t = 0; t < T; t++) {
|
445
446
|
if (tok->cnts[t] == 0)
|
446
447
|
continue;
|
447
448
|
free(tok->toks[t][0]);
|
@@ -477,7 +478,7 @@ seq_t *rdr_readseq(rdr_t *rdr, FILE *file, bool lbl) {
|
|
477
478
|
*/
|
478
479
|
dat_t *rdr_readdat(rdr_t *rdr, FILE *file, bool lbl) {
|
479
480
|
// Prepare dataset
|
480
|
-
|
481
|
+
uint32_t size = 1000;
|
481
482
|
dat_t *dat = wapiti_xmalloc(sizeof(dat_t));
|
482
483
|
dat->nseq = 0;
|
483
484
|
dat->mlen = 0;
|
@@ -498,7 +499,7 @@ dat_t *rdr_readdat(rdr_t *rdr, FILE *file, bool lbl) {
|
|
498
499
|
dat->seq[dat->nseq++] = seq;
|
499
500
|
dat->mlen = max(dat->mlen, seq->len);
|
500
501
|
if (dat->nseq % 1000 == 0)
|
501
|
-
info("%
|
502
|
+
info("%7"PRIu32" sequences loaded\n", dat->nseq);
|
502
503
|
}
|
503
504
|
// If no sequence was read, clean up and report
|
504
505
|
if (dat->nseq == 0) {
|
@@ -520,18 +521,30 @@ dat_t *rdr_readdat(rdr_t *rdr, FILE *file, bool lbl) {
|
|
520
521
|
*/
|
521
522
|
void rdr_load(rdr_t *rdr, FILE *file) {
|
522
523
|
const char *err = "broken file, invalid reader format";
|
523
|
-
|
524
|
-
|
524
|
+
int autouni = rdr->autouni;
|
525
|
+
fpos_t pos;
|
526
|
+
fgetpos(file, &pos);
|
527
|
+
if (fscanf(file, "#rdr#%"PRIu32"/%"PRIu32"/%d\n",
|
528
|
+
&rdr->npats, &rdr->ntoks, &autouni) != 3) {
|
529
|
+
// This for compatibility with previous file format
|
530
|
+
fsetpos(file, &pos);
|
531
|
+
if (fscanf(file, "#rdr#%"PRIu32"/%"PRIu32"\n",
|
532
|
+
&rdr->npats, &rdr->ntoks) != 2)
|
533
|
+
fatal(err);
|
534
|
+
}
|
535
|
+
rdr->autouni = autouni;
|
525
536
|
rdr->nuni = rdr->nbi = 0;
|
526
|
-
rdr->
|
527
|
-
|
528
|
-
|
529
|
-
|
530
|
-
|
531
|
-
|
532
|
-
|
533
|
-
|
534
|
-
|
537
|
+
if (rdr->npats != 0) {
|
538
|
+
rdr->pats = wapiti_xmalloc(sizeof(pat_t *) * rdr->npats);
|
539
|
+
for (uint32_t p = 0; p < rdr->npats; p++) {
|
540
|
+
char *pat = ns_readstr(file);
|
541
|
+
rdr->pats[p] = pat_comp(pat);
|
542
|
+
switch (tolower(pat[0])) {
|
543
|
+
case 'u': rdr->nuni++; break;
|
544
|
+
case 'b': rdr->nbi++; break;
|
545
|
+
case '*': rdr->nuni++;
|
546
|
+
rdr->nbi++; break;
|
547
|
+
}
|
535
548
|
}
|
536
549
|
}
|
537
550
|
qrk_load(rdr->lbl, file);
|
@@ -543,9 +556,10 @@ void rdr_load(rdr_t *rdr, FILE *file) {
|
|
543
556
|
* is plain text and portable across computers.
|
544
557
|
*/
|
545
558
|
void rdr_save(const rdr_t *rdr, FILE *file) {
|
546
|
-
if(fprintf(file, "#rdr#%
|
559
|
+
if (fprintf(file, "#rdr#%"PRIu32"/%"PRIu32"/%d\n",
|
560
|
+
rdr->npats, rdr->ntoks, rdr->autouni) < 0)
|
547
561
|
pfatal("cannot write to file");
|
548
|
-
for (
|
562
|
+
for (uint32_t p = 0; p < rdr->npats; p++)
|
549
563
|
ns_writestr(file, rdr->pats[p]->src);
|
550
564
|
qrk_save(rdr->lbl, file);
|
551
565
|
qrk_save(rdr->obs, file);
|