wapiti 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.autotest +13 -0
- data/.gitignore +5 -0
- data/.rspec +3 -0
- data/Gemfile +6 -0
- data/LICENSE +30 -0
- data/README.md +153 -0
- data/Rakefile +33 -0
- data/ext/wapiti/bcd.c +392 -0
- data/ext/wapiti/decoder.c +535 -0
- data/ext/wapiti/decoder.h +46 -0
- data/ext/wapiti/extconf.rb +8 -0
- data/ext/wapiti/gradient.c +818 -0
- data/ext/wapiti/gradient.h +81 -0
- data/ext/wapiti/lbfgs.c +294 -0
- data/ext/wapiti/model.c +296 -0
- data/ext/wapiti/model.h +100 -0
- data/ext/wapiti/native.c +1238 -0
- data/ext/wapiti/native.h +15 -0
- data/ext/wapiti/options.c +278 -0
- data/ext/wapiti/options.h +91 -0
- data/ext/wapiti/pattern.c +395 -0
- data/ext/wapiti/pattern.h +56 -0
- data/ext/wapiti/progress.c +167 -0
- data/ext/wapiti/progress.h +43 -0
- data/ext/wapiti/quark.c +272 -0
- data/ext/wapiti/quark.h +46 -0
- data/ext/wapiti/reader.c +553 -0
- data/ext/wapiti/reader.h +73 -0
- data/ext/wapiti/rprop.c +191 -0
- data/ext/wapiti/sequence.h +148 -0
- data/ext/wapiti/sgdl1.c +218 -0
- data/ext/wapiti/thread.c +171 -0
- data/ext/wapiti/thread.h +42 -0
- data/ext/wapiti/tools.c +202 -0
- data/ext/wapiti/tools.h +54 -0
- data/ext/wapiti/trainers.h +39 -0
- data/ext/wapiti/vmath.c +372 -0
- data/ext/wapiti/vmath.h +51 -0
- data/ext/wapiti/wapiti.c +288 -0
- data/ext/wapiti/wapiti.h +45 -0
- data/lib/wapiti.rb +30 -0
- data/lib/wapiti/errors.rb +17 -0
- data/lib/wapiti/model.rb +49 -0
- data/lib/wapiti/options.rb +113 -0
- data/lib/wapiti/utility.rb +15 -0
- data/lib/wapiti/version.rb +3 -0
- data/spec/fixtures/ch.mod +18550 -0
- data/spec/fixtures/chpattern.txt +52 -0
- data/spec/fixtures/chtest.txt +1973 -0
- data/spec/fixtures/chtrain.txt +19995 -0
- data/spec/fixtures/nppattern.txt +52 -0
- data/spec/fixtures/nptest.txt +1973 -0
- data/spec/fixtures/nptrain.txt +19995 -0
- data/spec/fixtures/pattern.txt +14 -0
- data/spec/fixtures/test.txt +60000 -0
- data/spec/fixtures/train.txt +1200 -0
- data/spec/spec_helper.rb +21 -0
- data/spec/wapiti/model_spec.rb +173 -0
- data/spec/wapiti/native_spec.rb +12 -0
- data/spec/wapiti/options_spec.rb +175 -0
- data/spec/wapiti/utility_spec.rb +22 -0
- data/wapiti.gemspec +35 -0
- metadata +178 -0
@@ -0,0 +1,56 @@
|
|
1
|
+
/*
|
2
|
+
* Wapiti - A linear-chain CRF tool
|
3
|
+
*
|
4
|
+
* Copyright (c) 2009-2011 CNRS
|
5
|
+
* All rights reserved.
|
6
|
+
*
|
7
|
+
* Redistribution and use in source and binary forms, with or without
|
8
|
+
* modification, are permitted provided that the following conditions are met:
|
9
|
+
* * Redistributions of source code must retain the above copyright
|
10
|
+
* notice, this list of conditions and the following disclaimer.
|
11
|
+
* * Redistributions in binary form must reproduce the above copyright
|
12
|
+
* notice, this list of conditions and the following disclaimer in the
|
13
|
+
* documentation and/or other materials provided with the distribution.
|
14
|
+
*
|
15
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
16
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
18
|
+
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
19
|
+
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
20
|
+
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
21
|
+
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
22
|
+
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
23
|
+
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
24
|
+
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
25
|
+
* POSSIBILITY OF SUCH DAMAGE.
|
26
|
+
*/
|
27
|
+
|
28
|
+
#ifndef pattern_h
|
29
|
+
#define pattern_h
|
30
|
+
|
31
|
+
#include <stdbool.h>
|
32
|
+
|
33
|
+
#include "sequence.h"
|
34
|
+
|
35
|
+
typedef struct pat_s      pat_t;
typedef struct pat_item_s pat_item_t;

/* pat_item_s:
 *   One element of a compiled pattern. The exact meaning of each field is
 *   defined by the pattern compiler and executor (pat_comp / pat_exec), which
 *   are not part of this header.
 *   NOTE(review): presumably `type` selects the kind of item, `caps` a case
 *   related option, `value` an attached string, and `absolute`/`offset`/
 *   `column` the token cell the item refers to -- confirm against pattern.c.
 */
struct pat_item_s {
	char  type;
	bool  caps;
	char *value;
	bool  absolute;
	int   offset;
	int   column;
};

/* pat_s:
 *   A compiled pattern: a small header followed by a flexible array of
 *   `items`. Objects of this type are obtained from pat_comp and released
 *   with pat_free.
 *   NOTE(review): `src` presumably keeps the pattern source text, `ntoks` a
 *   token count and `nitems` the number of entries in `items` -- confirm
 *   against pattern.c.
 */
struct pat_s {
	char       *src;
	int         ntoks;
	int         nitems;
	pat_item_t  items[];
};
|
50
|
+
|
51
|
+
pat_t *pat_comp(char *p);
|
52
|
+
char *pat_exec(const pat_t *pat, const tok_t *tok, int at);
|
53
|
+
void pat_free(pat_t *pat);
|
54
|
+
|
55
|
+
#endif
|
56
|
+
|
@@ -0,0 +1,167 @@
|
|
1
|
+
/*
|
2
|
+
* Wapiti - A linear-chain CRF tool
|
3
|
+
*
|
4
|
+
* Copyright (c) 2009-2011 CNRS
|
5
|
+
* All rights reserved.
|
6
|
+
*
|
7
|
+
* Redistribution and use in source and binary forms, with or without
|
8
|
+
* modification, are permitted provided that the following conditions are met:
|
9
|
+
* * Redistributions of source code must retain the above copyright
|
10
|
+
* notice, this list of conditions and the following disclaimer.
|
11
|
+
* * Redistributions in binary form must reproduce the above copyright
|
12
|
+
* notice, this list of conditions and the following disclaimer in the
|
13
|
+
* documentation and/or other materials provided with the distribution.
|
14
|
+
*
|
15
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
16
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
18
|
+
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
19
|
+
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
20
|
+
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
21
|
+
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
22
|
+
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
23
|
+
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
24
|
+
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
25
|
+
* POSSIBILITY OF SUCH DAMAGE.
|
26
|
+
*/
|
27
|
+
#include <signal.h>
|
28
|
+
#include <stdbool.h>
|
29
|
+
#include <stddef.h>
|
30
|
+
#include <stdlib.h>
|
31
|
+
#include <stdio.h>
|
32
|
+
|
33
|
+
#include <unistd.h>
|
34
|
+
#include <sys/times.h>
|
35
|
+
#include <sys/resource.h>
|
36
|
+
|
37
|
+
#include "wapiti.h"
|
38
|
+
#include "decoder.h"
|
39
|
+
#include "model.h"
|
40
|
+
#include "options.h"
|
41
|
+
#include "progress.h"
|
42
|
+
#include "tools.h"
|
43
|
+
|
44
|
+
/*******************************************************************************
|
45
|
+
* User interaction during training
|
46
|
+
*
|
47
|
+
* Handle progress reporting during training and clean early stopping. Trainers
|
48
|
+
* have to call uit_progress at the end of each iteration; this will display
|
49
|
+
* various information for the user.
|
50
|
+
* Timing is also done here, an iteration is assumed to take all the time
|
51
|
+
* between two calls to the progress function, and evaluations on the devel data
|
52
|
+
* are included.
|
53
|
+
*
|
54
|
+
* This module sets up a signal handler for SIGINT. If this signal is caught,
|
55
|
+
* the uit_stop global variable is set to inform the trainer that it has to stop as
|
56
|
+
* early as possible, discarding the recent computations if they cannot be
|
57
|
+
* integrated very quickly. They must leave the model in a clean state. Any
|
58
|
+
* further signal will terminate the program. So it's simple :
|
59
|
+
* - 1 signal mean "I can wait a little so try to stop as soon as possible
|
60
|
+
* but leave me a working model"
|
61
|
+
* - 2 signals mean "Stop immediately what you are doing, I can't wait and
|
62
|
+
* don't care about getting a working model"
|
63
|
+
******************************************************************************/
|
64
|
+
|
65
|
+
/* uit_stop:
 *   Set to true when the user requests the trainer to stop. Trainers should
 *   then stop as soon as possible while leaving the model in a clean state,
 *   discarding their latest computations if these cannot be integrated
 *   quickly.
 */
bool uit_stop = false;

/* uit_signal:
 *   Handler for the interrupt signal. It raises uit_stop so the trainer can
 *   finish cleanly. The default disposition is put back explicitly (some
 *   systems do not reset it automatically), so a second interrupt terminates
 *   the program immediately.
 */
static void uit_signal(int sig) {
	// Back to the default action first: the next signal kills the process.
	(void)signal(sig, SIG_DFL);
	// Then ask the trainer to stop at the next opportunity.
	uit_stop = true;
}
|
83
|
+
|
84
|
+
/* uit_setup:
|
85
|
+
* Install the signal handler for clean early stop from the user if possible
|
86
|
+
* and start the timer.
|
87
|
+
*/
|
88
|
+
void uit_setup(mdl_t *mdl) {
|
89
|
+
uit_stop = false;
|
90
|
+
if (signal(SIGINT, uit_signal) == SIG_ERR)
|
91
|
+
warning("failed to set signal handler, no clean early stop");
|
92
|
+
times(&mdl->timer);
|
93
|
+
if (mdl->opt->stopwin != 0)
|
94
|
+
mdl->werr = wapiti_xmalloc(sizeof(double) * mdl->opt->stopwin);
|
95
|
+
mdl->wcnt = mdl->wpos = 0;
|
96
|
+
}
|
97
|
+
|
98
|
+
/* uit_cleanup:
|
99
|
+
* Remove the signal handler restoring the defaul behavior in case of
|
100
|
+
* interrupt.
|
101
|
+
*/
|
102
|
+
void uit_cleanup(mdl_t *mdl) {
|
103
|
+
unused(mdl);
|
104
|
+
if (mdl->opt->stopwin != 0) {
|
105
|
+
free(mdl->werr);
|
106
|
+
mdl->werr = NULL;
|
107
|
+
}
|
108
|
+
signal(SIGINT, SIG_DFL);
|
109
|
+
}
|
110
|
+
|
111
|
+
/* uit_progress:
|
112
|
+
* Display a progress repport to the user consisting of some informations
|
113
|
+
* provided by the trainer: iteration count and objective function value, and
|
114
|
+
* some informations computed here on the current model performances.
|
115
|
+
* This function return true if the trainer have to keep training the model
|
116
|
+
* and false if he must stop, so this is were we will implement the trainer
|
117
|
+
* independant stoping criterion.
|
118
|
+
*/
|
119
|
+
bool uit_progress(mdl_t *mdl, int it, double obj) {
|
120
|
+
// First we just compute the error rate on devel or train data
|
121
|
+
double te, se;
|
122
|
+
tag_eval(mdl, &te, &se);
|
123
|
+
// Next, we compute the number of active features
|
124
|
+
size_t act = 0;
|
125
|
+
for (size_t f = 0; f < mdl->nftr; f++)
|
126
|
+
if (mdl->theta[f] != 0.0)
|
127
|
+
act++;
|
128
|
+
// Compute timings. As some training algorithms are multi-threaded, we
|
129
|
+
// cannot use ansi/c function and must rely on posix one to sum time
|
130
|
+
// spent in main thread and in child ones.
|
131
|
+
tms_t now; times(&now);
|
132
|
+
double tm = (now.tms_utime - mdl->timer.tms_utime )
|
133
|
+
+ (now.tms_cutime - mdl->timer.tms_cutime);
|
134
|
+
tm /= sysconf(_SC_CLK_TCK);
|
135
|
+
mdl->total += tm;
|
136
|
+
mdl->timer = now;
|
137
|
+
// And display progress report
|
138
|
+
info(" [%4d]", it);
|
139
|
+
info(obj >= 0.0 ? " obj=%-10.2f" : " obj=NA", obj);
|
140
|
+
info(" act=%-8zu", act);
|
141
|
+
info(" err=%5.2f%%/%5.2f%%", te, se);
|
142
|
+
info(" time=%.2fs/%.2fs", tm, mdl->total);
|
143
|
+
info("\n");
|
144
|
+
// If requested, check the error rate stoping criterion. We check if the
|
145
|
+
// error rate is stable enought over a few iterations.
|
146
|
+
bool res = true;
|
147
|
+
if (mdl->opt->stopwin != 0) {
|
148
|
+
mdl->werr[mdl->wpos] = te;
|
149
|
+
mdl->wpos = (mdl->wpos + 1) % mdl->opt->stopwin;
|
150
|
+
mdl->wcnt++;
|
151
|
+
if (mdl->wcnt >= mdl->opt->stopwin) {
|
152
|
+
double emin = 200.0, emax = -100.0;
|
153
|
+
for (int i = 0; i < mdl->opt->stopwin; i++) {
|
154
|
+
emin = min(emin, mdl->werr[i]);
|
155
|
+
emax = max(emax, mdl->werr[i]);
|
156
|
+
}
|
157
|
+
if (emax - emin < mdl->opt->stopeps)
|
158
|
+
res = false;
|
159
|
+
}
|
160
|
+
}
|
161
|
+
// And return
|
162
|
+
if (uit_stop)
|
163
|
+
return false;
|
164
|
+
return res;
|
165
|
+
}
|
166
|
+
|
167
|
+
|
@@ -0,0 +1,43 @@
|
|
1
|
+
/*
|
2
|
+
* Wapiti - A linear-chain CRF tool
|
3
|
+
*
|
4
|
+
* Copyright (c) 2009-2011 CNRS
|
5
|
+
* All rights reserved.
|
6
|
+
*
|
7
|
+
* Redistribution and use in source and binary forms, with or without
|
8
|
+
* modification, are permitted provided that the following conditions are met:
|
9
|
+
* * Redistributions of source code must retain the above copyright
|
10
|
+
* notice, this list of conditions and the following disclaimer.
|
11
|
+
* * Redistributions in binary form must reproduce the above copyright
|
12
|
+
* notice, this list of conditions and the following disclaimer in the
|
13
|
+
* documentation and/or other materials provided with the distribution.
|
14
|
+
*
|
15
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
16
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
18
|
+
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
19
|
+
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
20
|
+
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
21
|
+
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
22
|
+
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
23
|
+
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
24
|
+
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
25
|
+
* POSSIBILITY OF SUCH DAMAGE.
|
26
|
+
*/
|
27
|
+
|
28
|
+
#ifndef progress_h
|
29
|
+
#define progress_h
|
30
|
+
|
31
|
+
#include <stdbool.h>
|
32
|
+
|
33
|
+
#include "wapiti.h"
|
34
|
+
#include "model.h"
|
35
|
+
|
36
|
+
extern bool uit_stop;
|
37
|
+
|
38
|
+
void uit_setup(mdl_t *mdl);
|
39
|
+
void uit_cleanup(mdl_t *mdl);
|
40
|
+
bool uit_progress(mdl_t *mdl, int it, double obj);
|
41
|
+
|
42
|
+
#endif
|
43
|
+
|
data/ext/wapiti/quark.c
ADDED
@@ -0,0 +1,272 @@
|
|
1
|
+
/*
|
2
|
+
* Wapiti - A linear-chain CRF tool
|
3
|
+
*
|
4
|
+
* Copyright (c) 2009-2011 CNRS
|
5
|
+
* All rights reserved.
|
6
|
+
*
|
7
|
+
* Redistribution and use in source and binary forms, with or without
|
8
|
+
* modification, are permitted provided that the following conditions are met:
|
9
|
+
* * Redistributions of source code must retain the above copyright
|
10
|
+
* notice, this list of conditions and the following disclaimer.
|
11
|
+
* * Redistributions in binary form must reproduce the above copyright
|
12
|
+
* notice, this list of conditions and the following disclaimer in the
|
13
|
+
* documentation and/or other materials provided with the distribution.
|
14
|
+
*
|
15
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
16
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
18
|
+
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
19
|
+
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
20
|
+
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
21
|
+
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
22
|
+
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
23
|
+
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
24
|
+
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
25
|
+
* POSSIBILITY OF SUCH DAMAGE.
|
26
|
+
*/
|
27
|
+
#include <stdbool.h>
|
28
|
+
#include <stddef.h>
|
29
|
+
#include <stdlib.h>
|
30
|
+
#include <stdio.h>
|
31
|
+
#include <stdint.h>
|
32
|
+
#include <string.h>
|
33
|
+
|
34
|
+
#include "quark.h"
|
35
|
+
#include "tools.h"
|
36
|
+
|
37
|
+
/******************************************************************************
|
38
|
+
* Map object
|
39
|
+
*
|
40
|
+
* Implement quark object for mapping strings to identifiers through crit-bit
|
41
|
+
* tree (also known as PATRICIA tries). In fact it only stores a compressed
|
42
|
+
* version of the trie to reduce memory footprint. The special trick of using
|
43
|
+
* the last bit of the reference to differentiate between nodes and leaves comes
|
44
|
+
* from Daniel J. Bernstein implementation of crit-bit tree that can be found
|
45
|
+
* on his web site.
|
46
|
+
* [1] Morrison, Donald R. ; PATRICIA-Practical Algorithm To Retrieve
|
47
|
+
* Information Coded in Alphanumeric, Journal of the ACM 15 (4): pp. 514--534,
|
48
|
+
* 1968. DOI:10.1145/321479.321481
|
49
|
+
*
|
50
|
+
* This code is copyright 2002-2011 Thomas Lavergne and licenced under the BSD
|
51
|
+
* Licence like the remaining of Wapiti.
|
52
|
+
******************************************************************************/
|
53
|
+
|
54
|
+
typedef struct node_s node_t;
typedef struct leaf_s leaf_t;

/* node_s:
 *   Internal node of the trie. `pos` is the index of the byte tested at this
 *   node and `byte` the associated mask: all bits set except the critical
 *   one. A child reference is either a real node pointer or a leaf pointer
 *   tagged with qrk_lf2nd.
 */
struct node_s {
	node_t   *child[2];
	uint32_t  pos;
	uint8_t   byte;
};

/* leaf_s:
 *   Leaf of the trie: the identifier followed by the NUL-terminated key
 *   stored inline as a flexible array.
 */
struct leaf_s {
	uint64_t  id;
	char      key[];
};

/* qrk_s:
 *   The quark object. `root` is the trie (only meaningful when count != 0),
 *   `leafs` a table of `size` slots giving the leaf of each identifier,
 *   `count` the number of keys stored, and `lock` prevents the insertion of
 *   new keys when true.
 */
struct qrk_s {
	node_t    *root;
	leaf_t   **leafs;
	bool       lock;
	uint64_t   count;
	uint64_t   size;
};

#define qrk_none ((uint64_t)-1)

/* Leaf references are distinguished from node references by setting the
 * lowest bit of the pointer; these macros tag, untag and test it.
 */
#define qrk_lf2nd(lf)  ((node_t *)((intptr_t)(lf) |  (intptr_t)1))
#define qrk_nd2lf(nd)  ((leaf_t *)((intptr_t)(nd) & ~(intptr_t)1))
#define qrk_isleaf(nd) ((intptr_t)(nd) & (intptr_t)1)
|
76
|
+
|
77
|
+
/* qrk_new:
|
78
|
+
* This initialize the object for holding a new empty trie, with some pre-
|
79
|
+
* allocations. The returned object must be freed with a call to qrk_free when
|
80
|
+
* not needed anymore.
|
81
|
+
*/
|
82
|
+
qrk_t *qrk_new(void) {
|
83
|
+
const uint64_t size = 128;
|
84
|
+
qrk_t *qrk = wapiti_xmalloc(sizeof(qrk_t));
|
85
|
+
qrk->root = NULL;
|
86
|
+
qrk->count = 0;
|
87
|
+
qrk->lock = false;
|
88
|
+
qrk->size = size;
|
89
|
+
qrk->leafs = wapiti_xmalloc(sizeof(leaf_t) * size);
|
90
|
+
return qrk;
|
91
|
+
}
|
92
|
+
|
93
|
+
/* qrk_free:
|
94
|
+
* Release all the memory used by a qrk_t object allocated with qrk_new. This
|
95
|
+
* will release all key string stored internally so all key returned by
|
96
|
+
* qrk_unmap become invalid and must not be used anymore.
|
97
|
+
*/
|
98
|
+
void qrk_free(qrk_t *qrk) {
|
99
|
+
const size_t stkmax = 1024;
|
100
|
+
if (qrk->count != 0) {
|
101
|
+
node_t *stk[stkmax];
|
102
|
+
int cnt = 0;
|
103
|
+
stk[cnt++] = qrk->root;
|
104
|
+
while (cnt != 0) {
|
105
|
+
node_t *nd = stk[--cnt];
|
106
|
+
if (qrk_isleaf(nd)) {
|
107
|
+
free(qrk_nd2lf(nd));
|
108
|
+
continue;
|
109
|
+
}
|
110
|
+
stk[cnt++] = nd->child[0];
|
111
|
+
stk[cnt++] = nd->child[1];
|
112
|
+
free(nd);
|
113
|
+
}
|
114
|
+
}
|
115
|
+
free(qrk->leafs);
|
116
|
+
free(qrk);
|
117
|
+
}
|
118
|
+
|
119
|
+
/* qrk_insert:
|
120
|
+
* Map a key to a uniq identifier. If the key already exist in the map, return
|
121
|
+
* its identifier, else allocate a new identifier and insert the new (key,id)
|
122
|
+
* pair inside the quark. This function is not thread safe and should not be
|
123
|
+
* called on the same map from different thread without locking.
|
124
|
+
*/
|
125
|
+
size_t qrk_str2id(qrk_t *qrk, const char *key) {
|
126
|
+
const uint8_t *raw = (void *)key;
|
127
|
+
const size_t len = strlen(key);
|
128
|
+
// We first take care of the empty trie case so later we can safely
|
129
|
+
// assume that the trie is well formed and so there is no NULL pointers
|
130
|
+
// in it.
|
131
|
+
if (qrk->count == 0) {
|
132
|
+
if (qrk->lock == true)
|
133
|
+
return none;
|
134
|
+
const size_t size = sizeof(char) * (len + 1);
|
135
|
+
leaf_t *lf = wapiti_xmalloc(sizeof(leaf_t) + size);
|
136
|
+
memcpy(lf->key, key, size);
|
137
|
+
lf->id = 0;
|
138
|
+
qrk->root = qrk_lf2nd(lf);
|
139
|
+
qrk->leafs[0] = lf;
|
140
|
+
qrk->count = 1;
|
141
|
+
return 0;
|
142
|
+
}
|
143
|
+
// If the trie is not empty, we first go down the trie to the leaf like
|
144
|
+
// if we are searching for the key. When at leaf there is two case,
|
145
|
+
// either we have found our key or we have found another key with all
|
146
|
+
// its critical bit identical to our one. So we search for the first
|
147
|
+
// differing bit between them to know where we have to add the new node.
|
148
|
+
const node_t *nd = qrk->root;
|
149
|
+
while (!qrk_isleaf(nd)) {
|
150
|
+
const uint8_t chr = nd->pos < len ? raw[nd->pos] : 0;
|
151
|
+
const int side = ((chr | nd->byte) + 1) >> 8;
|
152
|
+
nd = nd->child[side];
|
153
|
+
}
|
154
|
+
const char *bst = qrk_nd2lf(nd)->key;
|
155
|
+
size_t pos;
|
156
|
+
for (pos = 0; pos < len; pos++)
|
157
|
+
if (key[pos] != bst[pos])
|
158
|
+
break;
|
159
|
+
uint8_t byte;
|
160
|
+
if (pos != len)
|
161
|
+
byte = key[pos] ^ bst[pos];
|
162
|
+
else if (bst[pos] != '\0')
|
163
|
+
byte = bst[pos];
|
164
|
+
else
|
165
|
+
return qrk_nd2lf(nd)->id;
|
166
|
+
if (qrk->lock == true)
|
167
|
+
return none;
|
168
|
+
// Now we known the two key are different and we know in which byte. It
|
169
|
+
// remain to build the mask for the new critical bit and build the new
|
170
|
+
// internal node and leaf.
|
171
|
+
while (byte & (byte - 1))
|
172
|
+
byte &= byte - 1;
|
173
|
+
byte ^= 255;
|
174
|
+
const uint8_t chr = bst[pos];
|
175
|
+
const int side = ((chr | byte) + 1) >> 8;
|
176
|
+
const size_t size = sizeof(char) * (len + 1);
|
177
|
+
node_t *nx = wapiti_xmalloc(sizeof(node_t));
|
178
|
+
leaf_t *lf = wapiti_xmalloc(sizeof(leaf_t) + size);
|
179
|
+
memcpy(lf->key, key, size);
|
180
|
+
lf->id = qrk->count++;
|
181
|
+
nx->pos = pos;
|
182
|
+
nx->byte = byte;
|
183
|
+
nx->child[1 - side] = qrk_lf2nd(lf);
|
184
|
+
if (lf->id == qrk->size) {
|
185
|
+
qrk->size *= 1.4;
|
186
|
+
const size_t size = sizeof(leaf_t *) * qrk->size;
|
187
|
+
qrk->leafs = wapiti_xrealloc(qrk->leafs, size);
|
188
|
+
}
|
189
|
+
qrk->leafs[lf->id] = lf;
|
190
|
+
// And last thing to do: inserting the new node in the trie. We have to
|
191
|
+
// walk down the trie again as we have to keep the ordering of nodes. So
|
192
|
+
// we search for the good position to insert it.
|
193
|
+
node_t **trg = &qrk->root;
|
194
|
+
while (true) {
|
195
|
+
node_t *nd = *trg;
|
196
|
+
if (qrk_isleaf(nd) || nd->pos > pos)
|
197
|
+
break;
|
198
|
+
if (nd->pos == pos && nd->byte > byte)
|
199
|
+
break;
|
200
|
+
const uint8_t chr = nd->pos < len ? raw[nd->pos] : 0;
|
201
|
+
const int side = ((chr | nd->byte) + 1) >> 8;
|
202
|
+
trg = &nd->child[side];
|
203
|
+
}
|
204
|
+
nx->child[side] = *trg;
|
205
|
+
*trg = nx;
|
206
|
+
return lf->id;
|
207
|
+
}
|
208
|
+
|
209
|
+
/* qrk_id2str:
|
210
|
+
* Retrieve the key associated to an identifier. The key is returned as a
|
211
|
+
* constant string that should not be modified or freed by the caller, it is
|
212
|
+
* a pointer to the internal copy of the key kept by the map object and
|
213
|
+
* remain valid only for the life time of the quark, a call to qrk_free will
|
214
|
+
* make this pointer invalid.
|
215
|
+
*/
|
216
|
+
const char *qrk_id2str(const qrk_t *qrk, size_t id) {
|
217
|
+
if (id >= qrk->count)
|
218
|
+
fatal("invalid identifier");
|
219
|
+
return qrk->leafs[id]->key;
|
220
|
+
}
|
221
|
+
|
222
|
+
/* qrk_save:
|
223
|
+
* Save list of keys present in the map object in the id order to the given
|
224
|
+
* file. We put one key per line so, if no key contains a new line, the line
|
225
|
+
* number correspond to the id.
|
226
|
+
*/
|
227
|
+
void qrk_save(const qrk_t *qrk, FILE *file) {
|
228
|
+
if (fprintf(file, "#qrk#%zu\n", (size_t)qrk->count) < 0)
|
229
|
+
pfatal("cannot write to file");
|
230
|
+
if (qrk->count == 0)
|
231
|
+
return;
|
232
|
+
for (uint64_t n = 0; n < qrk->count; n++)
|
233
|
+
ns_writestr(file, qrk->leafs[n]->key);
|
234
|
+
}
|
235
|
+
|
236
|
+
/* qrk_load:
|
237
|
+
* Load a list of key from the given file and add them to the map. Each lines
|
238
|
+
* of the file is taken as a single key and mapped to the next available id if
|
239
|
+
* not already present. If all keys are single lines and the given map is
|
240
|
+
* initilay empty, this will load a map exactly as saved by qrk_save.
|
241
|
+
*/
|
242
|
+
void qrk_load(qrk_t *qrk, FILE *file) {
|
243
|
+
size_t cnt = 0;
|
244
|
+
if (fscanf(file, "#qrk#%zu\n", &cnt) != 1) {
|
245
|
+
if (ferror(file) != 0)
|
246
|
+
pfatal("cannot read from file");
|
247
|
+
pfatal("invalid format");
|
248
|
+
}
|
249
|
+
for (size_t n = 0; n < cnt; ++n) {
|
250
|
+
char *str = ns_readstr(file);
|
251
|
+
qrk_str2id(qrk, str);
|
252
|
+
free(str);
|
253
|
+
}
|
254
|
+
}
|
255
|
+
|
256
|
+
/* qrk_count:
|
257
|
+
* Return the number of mappings stored in the quark.
|
258
|
+
*/
|
259
|
+
size_t qrk_count(const qrk_t *qrk) {
|
260
|
+
return qrk->count;
|
261
|
+
}
|
262
|
+
|
263
|
+
/* qrk_lock:
|
264
|
+
* Set the lock value of the quark and return the old one.
|
265
|
+
*/
|
266
|
+
bool qrk_lock(qrk_t *qrk, bool lock) {
|
267
|
+
bool old = qrk->lock;
|
268
|
+
qrk->lock = lock;
|
269
|
+
return old;
|
270
|
+
}
|
271
|
+
|
272
|
+
|