wapiti 0.0.5 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.simplecov +3 -0
- data/Gemfile +25 -2
- data/HISTORY.md +5 -1
- data/LICENSE +14 -13
- data/README.md +9 -16
- data/Rakefile +38 -8
- data/ext/wapiti/bcd.c +126 -124
- data/ext/wapiti/decoder.c +203 -124
- data/ext/wapiti/decoder.h +6 -4
- data/ext/wapiti/extconf.rb +2 -2
- data/ext/wapiti/gradient.c +491 -320
- data/ext/wapiti/gradient.h +52 -34
- data/ext/wapiti/lbfgs.c +74 -33
- data/ext/wapiti/model.c +47 -37
- data/ext/wapiti/model.h +22 -20
- data/ext/wapiti/native.c +850 -839
- data/ext/wapiti/native.h +1 -1
- data/ext/wapiti/options.c +52 -20
- data/ext/wapiti/options.h +37 -30
- data/ext/wapiti/pattern.c +35 -33
- data/ext/wapiti/pattern.h +12 -11
- data/ext/wapiti/progress.c +14 -13
- data/ext/wapiti/progress.h +3 -2
- data/ext/wapiti/quark.c +14 -16
- data/ext/wapiti/quark.h +6 -5
- data/ext/wapiti/reader.c +83 -69
- data/ext/wapiti/reader.h +11 -9
- data/ext/wapiti/rprop.c +84 -43
- data/ext/wapiti/sequence.h +18 -16
- data/ext/wapiti/sgdl1.c +45 -43
- data/ext/wapiti/thread.c +19 -17
- data/ext/wapiti/thread.h +5 -4
- data/ext/wapiti/tools.c +7 -7
- data/ext/wapiti/tools.h +3 -4
- data/ext/wapiti/trainers.h +1 -1
- data/ext/wapiti/vmath.c +40 -38
- data/ext/wapiti/vmath.h +12 -11
- data/ext/wapiti/wapiti.c +159 -37
- data/ext/wapiti/wapiti.h +18 -4
- data/lib/wapiti.rb +15 -15
- data/lib/wapiti/errors.rb +15 -15
- data/lib/wapiti/model.rb +92 -84
- data/lib/wapiti/options.rb +123 -124
- data/lib/wapiti/utility.rb +14 -14
- data/lib/wapiti/version.rb +2 -2
- data/spec/spec_helper.rb +29 -9
- data/spec/wapiti/model_spec.rb +230 -194
- data/spec/wapiti/native_spec.rb +7 -8
- data/spec/wapiti/options_spec.rb +184 -174
- data/wapiti.gemspec +22 -8
- metadata +38 -42
- data/.gitignore +0 -5
data/ext/wapiti/pattern.h
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
/*
|
2
2
|
* Wapiti - A linear-chain CRF tool
|
3
3
|
*
|
4
|
-
* Copyright (c) 2009-
|
4
|
+
* Copyright (c) 2009-2013 CNRS
|
5
5
|
* All rights reserved.
|
6
6
|
*
|
7
7
|
* Redistribution and use in source and binary forms, with or without
|
@@ -29,27 +29,28 @@
|
|
29
29
|
#define pattern_h
|
30
30
|
|
31
31
|
#include <stdbool.h>
|
32
|
+
#include <stdint.h>
|
32
33
|
|
33
34
|
#include "sequence.h"
|
34
35
|
|
35
36
|
typedef struct pat_s pat_t;
|
36
37
|
typedef struct pat_item_s pat_item_t;
|
37
38
|
struct pat_s {
|
38
|
-
char
|
39
|
-
|
40
|
-
|
39
|
+
char *src;
|
40
|
+
uint32_t ntoks;
|
41
|
+
uint32_t nitems;
|
41
42
|
struct pat_item_s {
|
42
|
-
char
|
43
|
-
bool
|
44
|
-
char
|
45
|
-
bool
|
46
|
-
|
47
|
-
|
43
|
+
char type;
|
44
|
+
bool caps;
|
45
|
+
char *value;
|
46
|
+
bool absolute;
|
47
|
+
int32_t offset;
|
48
|
+
uint32_t column;
|
48
49
|
} items[];
|
49
50
|
};
|
50
51
|
|
51
52
|
pat_t *pat_comp(char *p);
|
52
|
-
char *pat_exec(const pat_t *pat, const tok_t *tok,
|
53
|
+
char *pat_exec(const pat_t *pat, const tok_t *tok, uint32_t at);
|
53
54
|
void pat_free(pat_t *pat);
|
54
55
|
|
55
56
|
#endif
|
data/ext/wapiti/progress.c
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
/*
|
2
2
|
* Wapiti - A linear-chain CRF tool
|
3
3
|
*
|
4
|
-
* Copyright (c) 2009-
|
4
|
+
* Copyright (c) 2009-2013 CNRS
|
5
5
|
* All rights reserved.
|
6
6
|
*
|
7
7
|
* Redistribution and use in source and binary forms, with or without
|
@@ -24,14 +24,16 @@
|
|
24
24
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
25
25
|
* POSSIBILITY OF SUCH DAMAGE.
|
26
26
|
*/
|
27
|
+
#include <inttypes.h>
|
27
28
|
#include <signal.h>
|
28
29
|
#include <stdbool.h>
|
29
30
|
#include <stddef.h>
|
31
|
+
#include <stdint.h>
|
30
32
|
#include <stdlib.h>
|
31
33
|
#include <stdio.h>
|
32
34
|
|
33
35
|
#include <unistd.h>
|
34
|
-
#include <sys/
|
36
|
+
#include <sys/time.h>
|
35
37
|
#include <sys/resource.h>
|
36
38
|
|
37
39
|
#include "wapiti.h"
|
@@ -89,7 +91,7 @@ void uit_setup(mdl_t *mdl) {
|
|
89
91
|
uit_stop = false;
|
90
92
|
if (signal(SIGINT, uit_signal) == SIG_ERR)
|
91
93
|
warning("failed to set signal handler, no clean early stop");
|
92
|
-
|
94
|
+
gettimeofday(&mdl->timer, NULL);
|
93
95
|
if (mdl->opt->stopwin != 0)
|
94
96
|
mdl->werr = wapiti_xmalloc(sizeof(double) * mdl->opt->stopwin);
|
95
97
|
mdl->wcnt = mdl->wpos = 0;
|
@@ -116,28 +118,27 @@ void uit_cleanup(mdl_t *mdl) {
|
|
116
118
|
* and false if he must stop, so this is where we will implement the trainer
|
117
119
|
* independent stopping criterion.
|
118
120
|
*/
|
119
|
-
bool uit_progress(mdl_t *mdl,
|
121
|
+
bool uit_progress(mdl_t *mdl, uint32_t it, double obj) {
|
120
122
|
// First we just compute the error rate on devel or train data
|
121
123
|
double te, se;
|
122
124
|
tag_eval(mdl, &te, &se);
|
123
125
|
// Next, we compute the number of active features
|
124
|
-
|
125
|
-
for (
|
126
|
+
uint64_t act = 0;
|
127
|
+
for (uint64_t f = 0; f < mdl->nftr; f++)
|
126
128
|
if (mdl->theta[f] != 0.0)
|
127
129
|
act++;
|
128
130
|
// Compute timings. As some training algorithms are multi-threaded, we
|
129
131
|
// cannot use ansi/c function and must rely on posix one to sum time
|
130
132
|
// spent in main thread and in child ones.
|
131
|
-
tms_t now;
|
132
|
-
double tm = (now.
|
133
|
-
|
134
|
-
tm /= sysconf(_SC_CLK_TCK);
|
133
|
+
tms_t now; gettimeofday(&now, NULL);
|
134
|
+
double tm = (now.tv_sec + (double)now.tv_usec * 1.0e-6)
|
135
|
+
- (mdl->timer.tv_sec + (double)mdl->timer.tv_usec * 1.0e-6);
|
135
136
|
mdl->total += tm;
|
136
137
|
mdl->timer = now;
|
137
138
|
// And display progress report
|
138
|
-
info(" [%
|
139
|
+
info(" [%4"PRIu32"]", it);
|
139
140
|
info(obj >= 0.0 ? " obj=%-10.2f" : " obj=NA", obj);
|
140
|
-
info(" act=%-
|
141
|
+
info(" act=%-8"PRIu64, act);
|
141
142
|
info(" err=%5.2f%%/%5.2f%%", te, se);
|
142
143
|
info(" time=%.2fs/%.2fs", tm, mdl->total);
|
143
144
|
info("\n");
|
@@ -150,7 +151,7 @@ bool uit_progress(mdl_t *mdl, int it, double obj) {
|
|
150
151
|
mdl->wcnt++;
|
151
152
|
if (mdl->wcnt >= mdl->opt->stopwin) {
|
152
153
|
double emin = 200.0, emax = -100.0;
|
153
|
-
for (
|
154
|
+
for (uint32_t i = 0; i < mdl->opt->stopwin; i++) {
|
154
155
|
emin = min(emin, mdl->werr[i]);
|
155
156
|
emax = max(emax, mdl->werr[i]);
|
156
157
|
}
|
data/ext/wapiti/progress.h
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
/*
|
2
2
|
* Wapiti - A linear-chain CRF tool
|
3
3
|
*
|
4
|
-
* Copyright (c) 2009-
|
4
|
+
* Copyright (c) 2009-2013 CNRS
|
5
5
|
* All rights reserved.
|
6
6
|
*
|
7
7
|
* Redistribution and use in source and binary forms, with or without
|
@@ -29,6 +29,7 @@
|
|
29
29
|
#define progress_h
|
30
30
|
|
31
31
|
#include <stdbool.h>
|
32
|
+
#include <stdint.h>
|
32
33
|
|
33
34
|
#include "wapiti.h"
|
34
35
|
#include "model.h"
|
@@ -37,7 +38,7 @@ extern bool uit_stop;
|
|
37
38
|
|
38
39
|
void uit_setup(mdl_t *mdl);
|
39
40
|
void uit_cleanup(mdl_t *mdl);
|
40
|
-
bool uit_progress(mdl_t *mdl,
|
41
|
+
bool uit_progress(mdl_t *mdl, uint32_t it, double obj);
|
41
42
|
|
42
43
|
#endif
|
43
44
|
|
data/ext/wapiti/quark.c
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
/*
|
2
2
|
* Wapiti - A linear-chain CRF tool
|
3
3
|
*
|
4
|
-
* Copyright (c) 2009-
|
4
|
+
* Copyright (c) 2009-2013 CNRS
|
5
5
|
* All rights reserved.
|
6
6
|
*
|
7
7
|
* Redistribution and use in source and binary forms, with or without
|
@@ -24,11 +24,12 @@
|
|
24
24
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
25
25
|
* POSSIBILITY OF SUCH DAMAGE.
|
26
26
|
*/
|
27
|
+
#include <inttypes.h>
|
27
28
|
#include <stdbool.h>
|
28
29
|
#include <stddef.h>
|
29
30
|
#include <stdlib.h>
|
30
|
-
#include <stdio.h>
|
31
31
|
#include <stdint.h>
|
32
|
+
#include <stdio.h>
|
32
33
|
#include <string.h>
|
33
34
|
|
34
35
|
#include "quark.h"
|
@@ -47,7 +48,7 @@
|
|
47
48
|
* Information Coded in Alphanumeric, Journal of the ACM 15 (4): pp. 514--534,
|
48
49
|
* 1968. DOI:10.1145/321479.321481
|
49
50
|
*
|
50
|
-
* This code is copyright 2002-
|
51
|
+
* This code is copyright 2002-2013 Thomas Lavergne and licenced under the BSD
|
51
52
|
* Licence like the remaining of Wapiti.
|
52
53
|
******************************************************************************/
|
53
54
|
|
@@ -68,8 +69,6 @@ struct qrk_s {
|
|
68
69
|
uint64_t size;
|
69
70
|
};
|
70
71
|
|
71
|
-
#define qrk_none ((uint64_t)-1)
|
72
|
-
|
73
72
|
#define qrk_lf2nd(lf) ((node_t *)((intptr_t)(lf) | 1))
|
74
73
|
#define qrk_nd2lf(nd) ((leaf_t *)((intptr_t)(nd) & ~1))
|
75
74
|
#define qrk_isleaf(nd) ((intptr_t)(nd) & 1)
|
@@ -86,7 +85,7 @@ qrk_t *qrk_new(void) {
|
|
86
85
|
qrk->count = 0;
|
87
86
|
qrk->lock = false;
|
88
87
|
qrk->size = size;
|
89
|
-
qrk->leafs = wapiti_xmalloc(sizeof(leaf_t) * size);
|
88
|
+
qrk->leafs = wapiti_xmalloc(sizeof(leaf_t *) * size);
|
90
89
|
return qrk;
|
91
90
|
}
|
92
91
|
|
@@ -96,10 +95,10 @@ qrk_t *qrk_new(void) {
|
|
96
95
|
* qrk_unmap become invalid and must not be used anymore.
|
97
96
|
*/
|
98
97
|
void qrk_free(qrk_t *qrk) {
|
99
|
-
const
|
98
|
+
const uint32_t stkmax = 1024;
|
100
99
|
if (qrk->count != 0) {
|
101
100
|
node_t *stk[stkmax];
|
102
|
-
|
101
|
+
uint32_t cnt = 0;
|
103
102
|
stk[cnt++] = qrk->root;
|
104
103
|
while (cnt != 0) {
|
105
104
|
node_t *nd = stk[--cnt];
|
@@ -122,7 +121,7 @@ void qrk_free(qrk_t *qrk) {
|
|
122
121
|
* pair inside the quark. This function is not thread safe and should not be
|
123
122
|
* called on the same map from different thread without locking.
|
124
123
|
*/
|
125
|
-
|
124
|
+
uint64_t qrk_str2id(qrk_t *qrk, const char *key) {
|
126
125
|
const uint8_t *raw = (void *)key;
|
127
126
|
const size_t len = strlen(key);
|
128
127
|
// We first take care of the empty trie case so later we can safely
|
@@ -213,7 +212,7 @@ size_t qrk_str2id(qrk_t *qrk, const char *key) {
|
|
213
212
|
* remain valid only for the life time of the quark, a call to qrk_free will
|
214
213
|
* make this pointer invalid.
|
215
214
|
*/
|
216
|
-
const char *qrk_id2str(const qrk_t *qrk,
|
215
|
+
const char *qrk_id2str(const qrk_t *qrk, uint64_t id) {
|
217
216
|
if (id >= qrk->count)
|
218
217
|
fatal("invalid identifier");
|
219
218
|
return qrk->leafs[id]->key;
|
@@ -225,7 +224,7 @@ const char *qrk_id2str(const qrk_t *qrk, size_t id) {
|
|
225
224
|
* number correspond to the id.
|
226
225
|
*/
|
227
226
|
void qrk_save(const qrk_t *qrk, FILE *file) {
|
228
|
-
if (fprintf(file, "#qrk#%
|
227
|
+
if (fprintf(file, "#qrk#%"PRIu64"\n", qrk->count) < 0)
|
229
228
|
pfatal("cannot write to file");
|
230
229
|
if (qrk->count == 0)
|
231
230
|
return;
|
@@ -240,13 +239,13 @@ void qrk_save(const qrk_t *qrk, FILE *file) {
|
|
240
239
|
* initially empty, this will load a map exactly as saved by qrk_save.
|
241
240
|
*/
|
242
241
|
void qrk_load(qrk_t *qrk, FILE *file) {
|
243
|
-
|
244
|
-
if (fscanf(file, "#qrk#%
|
242
|
+
uint64_t cnt = 0;
|
243
|
+
if (fscanf(file, "#qrk#%"SCNu64"\n", &cnt) != 1) {
|
245
244
|
if (ferror(file) != 0)
|
246
245
|
pfatal("cannot read from file");
|
247
246
|
pfatal("invalid format");
|
248
247
|
}
|
249
|
-
for (
|
248
|
+
for (uint64_t n = 0; n < cnt; ++n) {
|
250
249
|
char *str = ns_readstr(file);
|
251
250
|
qrk_str2id(qrk, str);
|
252
251
|
free(str);
|
@@ -256,7 +255,7 @@ void qrk_load(qrk_t *qrk, FILE *file) {
|
|
256
255
|
/* qrk_count:
|
257
256
|
* Return the number of mappings stored in the quark.
|
258
257
|
*/
|
259
|
-
|
258
|
+
uint64_t qrk_count(const qrk_t *qrk) {
|
260
259
|
return qrk->count;
|
261
260
|
}
|
262
261
|
|
@@ -269,4 +268,3 @@ bool qrk_lock(qrk_t *qrk, bool lock) {
|
|
269
268
|
return old;
|
270
269
|
}
|
271
270
|
|
272
|
-
|
data/ext/wapiti/quark.h
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
/*
|
2
2
|
* Wapiti - A linear-chain CRF tool
|
3
3
|
*
|
4
|
-
* Copyright (c) 2009-
|
4
|
+
* Copyright (c) 2009-2013 CNRS
|
5
5
|
* All rights reserved.
|
6
6
|
*
|
7
7
|
* Redistribution and use in source and binary forms, with or without
|
@@ -28,17 +28,18 @@
|
|
28
28
|
#ifndef quark_h
|
29
29
|
#define quark_h
|
30
30
|
|
31
|
-
#include <
|
31
|
+
#include <stdbool.h>
|
32
|
+
#include <stdint.h>
|
32
33
|
#include <stdio.h>
|
33
34
|
|
34
35
|
typedef struct qrk_s qrk_t;
|
35
36
|
|
36
37
|
qrk_t *qrk_new(void);
|
37
38
|
void qrk_free(qrk_t *qrk);
|
38
|
-
|
39
|
+
uint64_t qrk_count(const qrk_t *qrk);
|
39
40
|
bool qrk_lock(qrk_t *qrk, bool lock);
|
40
|
-
const char *qrk_id2str(const qrk_t *qrk,
|
41
|
-
|
41
|
+
const char *qrk_id2str(const qrk_t *qrk, uint64_t id);
|
42
|
+
uint64_t qrk_str2id(qrk_t *qrk, const char *key);
|
42
43
|
void qrk_load(qrk_t *qrk, FILE *file);
|
43
44
|
void qrk_save(const qrk_t *qrk, FILE *file);
|
44
45
|
|
data/ext/wapiti/reader.c
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
/*
|
2
2
|
* Wapiti - A linear-chain CRF tool
|
3
3
|
*
|
4
|
-
* Copyright (c) 2009-
|
4
|
+
* Copyright (c) 2009-2013 CNRS
|
5
5
|
* All rights reserved.
|
6
6
|
*
|
7
7
|
* Redistribution and use in source and binary forms, with or without
|
@@ -25,6 +25,7 @@
|
|
25
25
|
* POSSIBILITY OF SUCH DAMAGE.
|
26
26
|
*/
|
27
27
|
#include <ctype.h>
|
28
|
+
#include <inttypes.h>
|
28
29
|
#include <stdbool.h>
|
29
30
|
#include <stddef.h>
|
30
31
|
#include <stdlib.h>
|
@@ -59,12 +60,14 @@
|
|
59
60
|
******************************************************************************/
|
60
61
|
|
61
62
|
/* rdr_new:
|
62
|
-
* Create a new empty reader object.
|
63
|
-
*
|
63
|
+
* Create a new empty reader object. If no patterns are loaded before you
|
64
|
+
* start using the reader the input data are assumed to be already prepared
|
65
|
+
* list of features. They must either start with a prefix 'u', 'b', or '*', or
|
66
|
+
* you must set autouni to true in order to automatically add a 'u' prefix.
|
64
67
|
*/
|
65
|
-
rdr_t *rdr_new(bool
|
68
|
+
rdr_t *rdr_new(bool autouni) {
|
66
69
|
rdr_t *rdr = wapiti_xmalloc(sizeof(rdr_t));
|
67
|
-
rdr->
|
70
|
+
rdr->autouni = autouni;
|
68
71
|
rdr->npats = rdr->nuni = rdr->nbi = 0;
|
69
72
|
rdr->ntoks = 0;
|
70
73
|
rdr->pats = NULL;
|
@@ -78,7 +81,7 @@ rdr_t *rdr_new(bool maxent) {
|
|
78
81
|
* any string returned by them must not be used after this call.
|
79
82
|
*/
|
80
83
|
void rdr_free(rdr_t *rdr) {
|
81
|
-
for (
|
84
|
+
for (uint32_t i = 0; i < rdr->npats; i++)
|
82
85
|
pat_free(rdr->pats[i]);
|
83
86
|
free(rdr->pats);
|
84
87
|
qrk_free(rdr->lbl);
|
@@ -90,7 +93,7 @@ void rdr_free(rdr_t *rdr) {
|
|
90
93
|
* Free all memory used by a raw_t object.
|
91
94
|
*/
|
92
95
|
void rdr_freeraw(raw_t *raw) {
|
93
|
-
for (
|
96
|
+
for (uint32_t t = 0; t < raw->len; t++)
|
94
97
|
free(raw->lines[t]);
|
95
98
|
free(raw);
|
96
99
|
}
|
@@ -107,7 +110,7 @@ void rdr_freeseq(seq_t *seq) {
|
|
107
110
|
* Free all memory used by a dat_t object.
|
108
111
|
*/
|
109
112
|
void rdr_freedat(dat_t *dat) {
|
110
|
-
for (
|
113
|
+
for (uint32_t i = 0; i < dat->nseq; i++)
|
111
114
|
rdr_freeseq(dat->seq[i]);
|
112
115
|
free(dat->seq);
|
113
116
|
free(dat);
|
@@ -118,11 +121,11 @@ void rdr_freedat(dat_t *dat) {
|
|
118
121
|
* available memory, a buffer large enough is allocated and returned. The
|
119
122
|
* caller is responsible to free it. On end-of-file, NULL is returned.
|
120
123
|
*/
|
121
|
-
|
124
|
+
char *rdr_readline(FILE *file) {
|
122
125
|
if (feof(file))
|
123
126
|
return NULL;
|
124
127
|
// Initialize the buffer
|
125
|
-
|
128
|
+
uint32_t len = 0, size = 16;
|
126
129
|
char *buffer = wapiti_xmalloc(size);
|
127
130
|
// We read the line chunk by chunk until end of line, file or error
|
128
131
|
while (!feof(file)) {
|
@@ -203,7 +206,7 @@ raw_t *rdr_readraw(rdr_t *rdr, FILE *file) {
|
|
203
206
|
if (feof(file))
|
204
207
|
return NULL;
|
205
208
|
// Prepare the raw sequence object
|
206
|
-
|
209
|
+
uint32_t size = 32, cnt = 0;
|
207
210
|
raw_t *raw = wapiti_xmalloc(sizeof(raw_t) + sizeof(char *) * size);
|
208
211
|
// And read the next sequence in the file, this will skip any blank line
|
209
212
|
// before reading the sequence stopping at end of file or on a new blank
|
@@ -232,9 +235,9 @@ raw_t *rdr_readraw(rdr_t *rdr, FILE *file) {
|
|
232
235
|
+ sizeof(char *) * size);
|
233
236
|
}
|
234
237
|
raw->lines[cnt++] = line;
|
235
|
-
// In
|
236
|
-
//
|
237
|
-
if (rdr->
|
238
|
+
// In autouni mode, there will be only unigram features so we
|
239
|
+
// can use small sequences to improve multi-threading.
|
240
|
+
if (rdr->autouni)
|
238
241
|
break;
|
239
242
|
}
|
240
243
|
// If no lines was read, we just free allocated memory and return NULL
|
@@ -251,13 +254,12 @@ raw_t *rdr_readraw(rdr_t *rdr, FILE *file) {
|
|
251
254
|
|
252
255
|
/* rdr_mapobs:
|
253
256
|
* Map an observation to its identifier, automatically adding a 'u' prefix in
|
254
|
-
*
|
257
|
+
* 'autouni' mode.
|
255
258
|
*/
|
256
|
-
static
|
257
|
-
if (!rdr->
|
259
|
+
static uint64_t rdr_mapobs(rdr_t *rdr, const char *str) {
|
260
|
+
if (!rdr->autouni)
|
258
261
|
return qrk_str2id(rdr->obs, str);
|
259
|
-
|
260
|
-
char tmp[len];
|
262
|
+
char tmp[strlen(str) + 2];
|
261
263
|
tmp[0] = 'u';
|
262
264
|
strcpy(tmp + 1, str);
|
263
265
|
return qrk_str2id(rdr->obs, tmp);
|
@@ -268,13 +270,13 @@ static size_t rdr_mapobs(rdr_t *rdr, const char *str) {
|
|
268
270
|
* applying patterns.
|
269
271
|
*/
|
270
272
|
static seq_t *rdr_rawtok2seq(rdr_t *rdr, const tok_t *tok) {
|
271
|
-
const
|
272
|
-
|
273
|
-
if (rdr->
|
273
|
+
const uint32_t T = tok->len;
|
274
|
+
uint32_t size = 0;
|
275
|
+
if (rdr->autouni) {
|
274
276
|
size = tok->cnts[0];
|
275
277
|
} else {
|
276
|
-
for (
|
277
|
-
for (
|
278
|
+
for (uint32_t t = 0; t < T; t++) {
|
279
|
+
for (uint32_t n = 0; n < tok->cnts[t]; n++) {
|
278
280
|
const char *o = tok->toks[t][n];
|
279
281
|
switch (o[0]) {
|
280
282
|
case 'u': size += 1; break;
|
@@ -287,30 +289,30 @@ static seq_t *rdr_rawtok2seq(rdr_t *rdr, const tok_t *tok) {
|
|
287
289
|
}
|
288
290
|
}
|
289
291
|
seq_t *seq = wapiti_xmalloc(sizeof(seq_t) + sizeof(pos_t) * T);
|
290
|
-
seq->raw = wapiti_xmalloc(sizeof(
|
292
|
+
seq->raw = wapiti_xmalloc(sizeof(uint64_t) * size);
|
291
293
|
seq->len = T;
|
292
|
-
|
293
|
-
for (
|
294
|
-
seq->pos[t].lbl =
|
294
|
+
uint64_t *raw = seq->raw;
|
295
|
+
for (uint32_t t = 0; t < T; t++) {
|
296
|
+
seq->pos[t].lbl = (uint32_t)-1;
|
295
297
|
seq->pos[t].ucnt = 0;
|
296
298
|
seq->pos[t].uobs = raw;
|
297
|
-
for (
|
298
|
-
if (tok->toks[t][n][0] == 'b')
|
299
|
+
for (uint32_t n = 0; n < tok->cnts[t]; n++) {
|
300
|
+
if (!rdr->autouni && tok->toks[t][n][0] == 'b')
|
299
301
|
continue;
|
300
|
-
|
302
|
+
uint64_t id = rdr_mapobs(rdr, tok->toks[t][n]);
|
301
303
|
if (id != none) {
|
302
304
|
(*raw++) = id;
|
303
305
|
seq->pos[t].ucnt++;
|
304
306
|
}
|
305
307
|
}
|
306
308
|
seq->pos[t].bcnt = 0;
|
307
|
-
if (rdr->
|
309
|
+
if (rdr->autouni)
|
308
310
|
continue;
|
309
311
|
seq->pos[t].bobs = raw;
|
310
|
-
for (
|
312
|
+
for (uint32_t n = 0; n < tok->cnts[t]; n++) {
|
311
313
|
if (tok->toks[t][n][0] == 'u')
|
312
314
|
continue;
|
313
|
-
|
315
|
+
uint64_t id = rdr_mapobs(rdr, tok->toks[t][n]);
|
314
316
|
if (id != none) {
|
315
317
|
(*raw++) = id;
|
316
318
|
seq->pos[t].bcnt++;
|
@@ -319,9 +321,9 @@ static seq_t *rdr_rawtok2seq(rdr_t *rdr, const tok_t *tok) {
|
|
319
321
|
}
|
320
322
|
// And finally, if the user specified it, populate the labels
|
321
323
|
if (tok->lbl != NULL) {
|
322
|
-
for (
|
324
|
+
for (uint32_t t = 0; t < T; t++) {
|
323
325
|
const char *lbl = tok->lbl[t];
|
324
|
-
|
326
|
+
uint64_t id = qrk_str2id(rdr->lbl, lbl);
|
325
327
|
seq->pos[t].lbl = id;
|
326
328
|
}
|
327
329
|
}
|
@@ -332,35 +334,35 @@ static seq_t *rdr_rawtok2seq(rdr_t *rdr, const tok_t *tok) {
|
|
332
334
|
* Convert a tok_t to a seq_t object by applying the patterns of the reader.
|
333
335
|
*/
|
334
336
|
static seq_t *rdr_pattok2seq(rdr_t *rdr, const tok_t *tok) {
|
335
|
-
const
|
337
|
+
const uint32_t T = tok->len;
|
336
338
|
// So now the tok object is ready, we can start building the seq_t
|
337
339
|
// object by appling patterns. First we allocate the seq_t object. The
|
338
340
|
// sequence itself as well as the sub array are allocated in one time.
|
339
341
|
seq_t *seq = wapiti_xmalloc(sizeof(seq_t) + sizeof(pos_t) * T);
|
340
|
-
seq->raw = wapiti_xmalloc(sizeof(
|
342
|
+
seq->raw = wapiti_xmalloc(sizeof(uint64_t) * (rdr->nuni + rdr->nbi) * T);
|
341
343
|
seq->len = T;
|
342
|
-
|
343
|
-
for (
|
344
|
-
seq->pos[t].lbl =
|
344
|
+
uint64_t *tmp = seq->raw;
|
345
|
+
for (uint32_t t = 0; t < T; t++) {
|
346
|
+
seq->pos[t].lbl = (uint32_t)-1;
|
345
347
|
seq->pos[t].uobs = tmp; tmp += rdr->nuni;
|
346
348
|
seq->pos[t].bobs = tmp; tmp += rdr->nbi;
|
347
349
|
}
|
348
350
|
// Next, we can build the observations list by applying the patterns on
|
349
351
|
// the tok_t sequence.
|
350
|
-
for (
|
352
|
+
for (uint32_t t = 0; t < T; t++) {
|
351
353
|
pos_t *pos = &seq->pos[t];
|
352
354
|
pos->ucnt = 0;
|
353
355
|
pos->bcnt = 0;
|
354
|
-
for (
|
356
|
+
for (uint32_t x = 0; x < rdr->npats; x++) {
|
355
357
|
// Get the observation and map it to an identifier
|
356
358
|
char *obs = pat_exec(rdr->pats[x], tok, t);
|
357
|
-
|
359
|
+
uint64_t id = rdr_mapobs(rdr, obs);
|
358
360
|
if (id == none) {
|
359
361
|
free(obs);
|
360
362
|
continue;
|
361
363
|
}
|
362
364
|
// If the observation is ok, add it to the lists
|
363
|
-
|
365
|
+
char kind = 0;
|
364
366
|
switch (obs[0]) {
|
365
367
|
case 'u': kind = 1; break;
|
366
368
|
case 'b': kind = 2; break;
|
@@ -375,9 +377,9 @@ static seq_t *rdr_pattok2seq(rdr_t *rdr, const tok_t *tok) {
|
|
375
377
|
}
|
376
378
|
// And finally, if the user specified it, populate the labels
|
377
379
|
if (tok->lbl != NULL) {
|
378
|
-
for (
|
380
|
+
for (uint32_t t = 0; t < T; t++) {
|
379
381
|
const char *lbl = tok->lbl[t];
|
380
|
-
|
382
|
+
uint64_t id = qrk_str2id(rdr->lbl, lbl);
|
381
383
|
seq->pos[t].lbl = id;
|
382
384
|
}
|
383
385
|
}
|
@@ -390,11 +392,11 @@ static seq_t *rdr_pattok2seq(rdr_t *rdr, const tok_t *tok) {
|
|
390
392
|
* interned also.
|
391
393
|
*/
|
392
394
|
seq_t *rdr_raw2seq(rdr_t *rdr, const raw_t *raw, bool lbl) {
|
393
|
-
const
|
395
|
+
const uint32_t T = raw->len;
|
394
396
|
// Allocate the tok_t object, the label array is allocated only if they
|
395
397
|
// are requested by the user.
|
396
398
|
tok_t *tok = wapiti_xmalloc(sizeof(tok_t) + T * sizeof(char **));
|
397
|
-
tok->cnts = wapiti_xmalloc(sizeof(
|
399
|
+
tok->cnts = wapiti_xmalloc(sizeof(uint32_t) * T);
|
398
400
|
tok->lbl = NULL;
|
399
401
|
if (lbl == true)
|
400
402
|
tok->lbl = wapiti_xmalloc(sizeof(char *) * T);
|
@@ -402,16 +404,15 @@ seq_t *rdr_raw2seq(rdr_t *rdr, const raw_t *raw, bool lbl) {
|
|
402
404
|
// tokens. To reduce memory fragmentation, the raw line is copied and
|
403
405
|
// his reference is kept by the first tokens, next tokens are pointer to
|
404
406
|
// this copy.
|
405
|
-
for (
|
407
|
+
for (uint32_t t = 0; t < T; t++) {
|
406
408
|
// Get a copy of the raw line skipping leading space characters
|
407
409
|
const char *src = raw->lines[t];
|
408
410
|
while (isspace(*src))
|
409
411
|
src++;
|
410
412
|
char *line = xstrdup(src);
|
411
413
|
// Split it in tokens
|
412
|
-
|
413
|
-
|
414
|
-
int cnt = 0;
|
414
|
+
char *toks[strlen(line) / 2 + 1];
|
415
|
+
uint32_t cnt = 0;
|
415
416
|
while (*line != '\0') {
|
416
417
|
toks[cnt++] = line;
|
417
418
|
while (*line != '\0' && !isspace(*line))
|
@@ -441,7 +442,7 @@ seq_t *rdr_raw2seq(rdr_t *rdr, const raw_t *raw, bool lbl) {
|
|
441
442
|
else
|
442
443
|
seq = rdr_pattok2seq(rdr, tok);
|
443
444
|
// Before returning the sequence, we have to free the tok_t
|
444
|
-
for (
|
445
|
+
for (uint32_t t = 0; t < T; t++) {
|
445
446
|
if (tok->cnts[t] == 0)
|
446
447
|
continue;
|
447
448
|
free(tok->toks[t][0]);
|
@@ -477,7 +478,7 @@ seq_t *rdr_readseq(rdr_t *rdr, FILE *file, bool lbl) {
|
|
477
478
|
*/
|
478
479
|
dat_t *rdr_readdat(rdr_t *rdr, FILE *file, bool lbl) {
|
479
480
|
// Prepare dataset
|
480
|
-
|
481
|
+
uint32_t size = 1000;
|
481
482
|
dat_t *dat = wapiti_xmalloc(sizeof(dat_t));
|
482
483
|
dat->nseq = 0;
|
483
484
|
dat->mlen = 0;
|
@@ -498,7 +499,7 @@ dat_t *rdr_readdat(rdr_t *rdr, FILE *file, bool lbl) {
|
|
498
499
|
dat->seq[dat->nseq++] = seq;
|
499
500
|
dat->mlen = max(dat->mlen, seq->len);
|
500
501
|
if (dat->nseq % 1000 == 0)
|
501
|
-
info("%
|
502
|
+
info("%7"PRIu32" sequences loaded\n", dat->nseq);
|
502
503
|
}
|
503
504
|
// If no sequence was read, cleanup and report
|
504
505
|
if (dat->nseq == 0) {
|
@@ -520,18 +521,30 @@ dat_t *rdr_readdat(rdr_t *rdr, FILE *file, bool lbl) {
|
|
520
521
|
*/
|
521
522
|
void rdr_load(rdr_t *rdr, FILE *file) {
|
522
523
|
const char *err = "broken file, invalid reader format";
|
523
|
-
|
524
|
-
|
524
|
+
int autouni = rdr->autouni;
|
525
|
+
fpos_t pos;
|
526
|
+
fgetpos(file, &pos);
|
527
|
+
if (fscanf(file, "#rdr#%"PRIu32"/%"PRIu32"/%d\n",
|
528
|
+
&rdr->npats, &rdr->ntoks, &autouni) != 3) {
|
529
|
+
// This for compatibility with previous file format
|
530
|
+
fsetpos(file, &pos);
|
531
|
+
if (fscanf(file, "#rdr#%"PRIu32"/%"PRIu32"\n",
|
532
|
+
&rdr->npats, &rdr->ntoks) != 2)
|
533
|
+
fatal(err);
|
534
|
+
}
|
535
|
+
rdr->autouni = autouni;
|
525
536
|
rdr->nuni = rdr->nbi = 0;
|
526
|
-
rdr->
|
527
|
-
|
528
|
-
|
529
|
-
|
530
|
-
|
531
|
-
|
532
|
-
|
533
|
-
|
534
|
-
|
537
|
+
if (rdr->npats != 0) {
|
538
|
+
rdr->pats = wapiti_xmalloc(sizeof(pat_t *) * rdr->npats);
|
539
|
+
for (uint32_t p = 0; p < rdr->npats; p++) {
|
540
|
+
char *pat = ns_readstr(file);
|
541
|
+
rdr->pats[p] = pat_comp(pat);
|
542
|
+
switch (tolower(pat[0])) {
|
543
|
+
case 'u': rdr->nuni++; break;
|
544
|
+
case 'b': rdr->nbi++; break;
|
545
|
+
case '*': rdr->nuni++;
|
546
|
+
rdr->nbi++; break;
|
547
|
+
}
|
535
548
|
}
|
536
549
|
}
|
537
550
|
qrk_load(rdr->lbl, file);
|
@@ -543,9 +556,10 @@ void rdr_load(rdr_t *rdr, FILE *file) {
|
|
543
556
|
* is plain text and portable across computers.
|
544
557
|
*/
|
545
558
|
void rdr_save(const rdr_t *rdr, FILE *file) {
|
546
|
-
if(fprintf(file, "#rdr#%
|
559
|
+
if (fprintf(file, "#rdr#%"PRIu32"/%"PRIu32"/%d\n",
|
560
|
+
rdr->npats, rdr->ntoks, rdr->autouni) < 0)
|
547
561
|
pfatal("cannot write to file");
|
548
|
-
for (
|
562
|
+
for (uint32_t p = 0; p < rdr->npats; p++)
|
549
563
|
ns_writestr(file, rdr->pats[p]->src);
|
550
564
|
qrk_save(rdr->lbl, file);
|
551
565
|
qrk_save(rdr->obs, file);
|