wapiti 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.autotest +13 -0
- data/.gitignore +5 -0
- data/.rspec +3 -0
- data/Gemfile +6 -0
- data/LICENSE +30 -0
- data/README.md +153 -0
- data/Rakefile +33 -0
- data/ext/wapiti/bcd.c +392 -0
- data/ext/wapiti/decoder.c +535 -0
- data/ext/wapiti/decoder.h +46 -0
- data/ext/wapiti/extconf.rb +8 -0
- data/ext/wapiti/gradient.c +818 -0
- data/ext/wapiti/gradient.h +81 -0
- data/ext/wapiti/lbfgs.c +294 -0
- data/ext/wapiti/model.c +296 -0
- data/ext/wapiti/model.h +100 -0
- data/ext/wapiti/native.c +1238 -0
- data/ext/wapiti/native.h +15 -0
- data/ext/wapiti/options.c +278 -0
- data/ext/wapiti/options.h +91 -0
- data/ext/wapiti/pattern.c +395 -0
- data/ext/wapiti/pattern.h +56 -0
- data/ext/wapiti/progress.c +167 -0
- data/ext/wapiti/progress.h +43 -0
- data/ext/wapiti/quark.c +272 -0
- data/ext/wapiti/quark.h +46 -0
- data/ext/wapiti/reader.c +553 -0
- data/ext/wapiti/reader.h +73 -0
- data/ext/wapiti/rprop.c +191 -0
- data/ext/wapiti/sequence.h +148 -0
- data/ext/wapiti/sgdl1.c +218 -0
- data/ext/wapiti/thread.c +171 -0
- data/ext/wapiti/thread.h +42 -0
- data/ext/wapiti/tools.c +202 -0
- data/ext/wapiti/tools.h +54 -0
- data/ext/wapiti/trainers.h +39 -0
- data/ext/wapiti/vmath.c +372 -0
- data/ext/wapiti/vmath.h +51 -0
- data/ext/wapiti/wapiti.c +288 -0
- data/ext/wapiti/wapiti.h +45 -0
- data/lib/wapiti.rb +30 -0
- data/lib/wapiti/errors.rb +17 -0
- data/lib/wapiti/model.rb +49 -0
- data/lib/wapiti/options.rb +113 -0
- data/lib/wapiti/utility.rb +15 -0
- data/lib/wapiti/version.rb +3 -0
- data/spec/fixtures/ch.mod +18550 -0
- data/spec/fixtures/chpattern.txt +52 -0
- data/spec/fixtures/chtest.txt +1973 -0
- data/spec/fixtures/chtrain.txt +19995 -0
- data/spec/fixtures/nppattern.txt +52 -0
- data/spec/fixtures/nptest.txt +1973 -0
- data/spec/fixtures/nptrain.txt +19995 -0
- data/spec/fixtures/pattern.txt +14 -0
- data/spec/fixtures/test.txt +60000 -0
- data/spec/fixtures/train.txt +1200 -0
- data/spec/spec_helper.rb +21 -0
- data/spec/wapiti/model_spec.rb +173 -0
- data/spec/wapiti/native_spec.rb +12 -0
- data/spec/wapiti/options_spec.rb +175 -0
- data/spec/wapiti/utility_spec.rb +22 -0
- data/wapiti.gemspec +35 -0
- metadata +178 -0
@@ -0,0 +1,56 @@
|
|
1
|
+
/*
|
2
|
+
* Wapiti - A linear-chain CRF tool
|
3
|
+
*
|
4
|
+
* Copyright (c) 2009-2011 CNRS
|
5
|
+
* All rights reserved.
|
6
|
+
*
|
7
|
+
* Redistribution and use in source and binary forms, with or without
|
8
|
+
* modification, are permitted provided that the following conditions are met:
|
9
|
+
* * Redistributions of source code must retain the above copyright
|
10
|
+
* notice, this list of conditions and the following disclaimer.
|
11
|
+
* * Redistributions in binary form must reproduce the above copyright
|
12
|
+
* notice, this list of conditions and the following disclaimer in the
|
13
|
+
* documentation and/or other materials provided with the distribution.
|
14
|
+
*
|
15
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
16
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
18
|
+
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
19
|
+
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
20
|
+
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
21
|
+
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
22
|
+
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
23
|
+
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
24
|
+
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
25
|
+
* POSSIBILITY OF SUCH DAMAGE.
|
26
|
+
*/
|
27
|
+
|
28
|
+
#ifndef pattern_h
|
29
|
+
#define pattern_h
|
30
|
+
|
31
|
+
#include <stdbool.h>
|
32
|
+
|
33
|
+
#include "sequence.h"
|
34
|
+
|
35
|
+
typedef struct pat_item_s pat_item_t;
typedef struct pat_s      pat_t;

/* pat_item_t:
 *   One element of a compiled pattern. How each field is interpreted is
 *   decided by pattern.c, which is not visible from this header.
 *   NOTE(review): offset/column presumably select a token and a column of the
 *   sequence, relative or absolute depending on `absolute` -- confirm there.
 */
struct pat_item_s {
	char  type;
	bool  caps;
	char *value;
	bool  absolute;
	int   offset;
	int   column;
};

/* pat_t:
 *   A pattern object as handled by pat_comp, pat_exec and pat_free. The items
 *   are stored in a flexible array member, so `items` must remain the last
 *   field of the structure.
 *   NOTE(review): nitems is presumably the number of entries in `items` and
 *   src the pattern text the object was compiled from -- confirm in pattern.c.
 */
struct pat_s {
	char       *src;
	int         ntoks;
	int         nitems;
	pat_item_t  items[];
};
|
50
|
+
|
51
|
+
pat_t *pat_comp(char *p);
|
52
|
+
char *pat_exec(const pat_t *pat, const tok_t *tok, int at);
|
53
|
+
void pat_free(pat_t *pat);
|
54
|
+
|
55
|
+
#endif
|
56
|
+
|
@@ -0,0 +1,167 @@
|
|
1
|
+
/*
|
2
|
+
* Wapiti - A linear-chain CRF tool
|
3
|
+
*
|
4
|
+
* Copyright (c) 2009-2011 CNRS
|
5
|
+
* All rights reserved.
|
6
|
+
*
|
7
|
+
* Redistribution and use in source and binary forms, with or without
|
8
|
+
* modification, are permitted provided that the following conditions are met:
|
9
|
+
* * Redistributions of source code must retain the above copyright
|
10
|
+
* notice, this list of conditions and the following disclaimer.
|
11
|
+
* * Redistributions in binary form must reproduce the above copyright
|
12
|
+
* notice, this list of conditions and the following disclaimer in the
|
13
|
+
* documentation and/or other materials provided with the distribution.
|
14
|
+
*
|
15
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
16
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
18
|
+
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
19
|
+
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
20
|
+
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
21
|
+
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
22
|
+
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
23
|
+
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
24
|
+
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
25
|
+
* POSSIBILITY OF SUCH DAMAGE.
|
26
|
+
*/
|
27
|
+
#include <signal.h>
|
28
|
+
#include <stdbool.h>
|
29
|
+
#include <stddef.h>
|
30
|
+
#include <stdlib.h>
|
31
|
+
#include <stdio.h>
|
32
|
+
|
33
|
+
#include <unistd.h>
|
34
|
+
#include <sys/times.h>
|
35
|
+
#include <sys/resource.h>
|
36
|
+
|
37
|
+
#include "wapiti.h"
|
38
|
+
#include "decoder.h"
|
39
|
+
#include "model.h"
|
40
|
+
#include "options.h"
|
41
|
+
#include "progress.h"
|
42
|
+
#include "tools.h"
|
43
|
+
|
44
|
+
/*******************************************************************************
|
45
|
+
* User interaction during training
|
46
|
+
*
|
47
|
+
* Handle progress reporting during training and clean early stopping. Trainers
|
48
|
+
* have to call uit_progress at the end of each iterations, this will display
|
49
|
+
* various informations for the user.
|
50
|
+
* Timing is also done here, an iteration is assumed to take all the time
|
51
|
+
* between two calls to the progress function, and evaluation on the devel data
|
52
|
+
* are included.
|
53
|
+
*
|
54
|
+
* This module sets up a signal handler for SIGINT. If this signal is caught,
|
55
|
+
* the uit_stop global variable is set to inform the trainer that it has to stop as
|
56
|
+
* early as possible, discarding the recent computations if they cannot be
|
57
|
+
* integrated very quickly. They must leave the model in a clean state. Any
|
58
|
+
* further signal will terminate the program. So it's simple :
|
59
|
+
* - 1 signal mean "I can wait a little so try to stop as soon as possible
|
60
|
+
* but leave me a working model"
|
61
|
+
* - 2 signal mean "Stop immediately what you are doing, I can't wait and
|
62
|
+
* don't care about getting a working model"
|
63
|
+
******************************************************************************/
|
64
|
+
|
65
|
+
/* uit_stop:
 *   Stop request flag shared between this module and the trainers. It starts
 *   out false, is reset by uit_setup, raised by the SIGINT handler installed
 *   there, and makes uit_progress return false once it is set. A trainer
 *   seeing it true has to stop as soon as possible while leaving the model in
 *   a clean state, dropping its latest computations if they cannot be
 *   integrated quickly.
 *   NOTE(review): the flag is written from a signal handler; the portable type
 *   for that is `volatile sig_atomic_t`, but the extern declaration in
 *   progress.h would have to change at the same time.
 */
bool uit_stop = false;
|
71
|
+
|
72
|
+
/* uit_signal:
|
73
|
+
* Signal handler to catch interupt signal. When a signal is received, the
|
74
|
+
* trainer is aksed to stop as soon as possible leaving the model in a clean
|
75
|
+
* state. We don't reinstall the handler so if user send a second interupt
|
76
|
+
* signal, the program will stop imediatly. (to cope with BSD system, we even
|
77
|
+
* reinstall explicitly the default handler)
|
78
|
+
*/
|
79
|
+
static void uit_signal(int sig) {
|
80
|
+
signal(sig, SIG_DFL);
|
81
|
+
uit_stop = true;
|
82
|
+
}
|
83
|
+
|
84
|
+
/* uit_setup:
|
85
|
+
* Install the signal handler for clean early stop from the user if possible
|
86
|
+
* and start the timer.
|
87
|
+
*/
|
88
|
+
void uit_setup(mdl_t *mdl) {
|
89
|
+
uit_stop = false;
|
90
|
+
if (signal(SIGINT, uit_signal) == SIG_ERR)
|
91
|
+
warning("failed to set signal handler, no clean early stop");
|
92
|
+
times(&mdl->timer);
|
93
|
+
if (mdl->opt->stopwin != 0)
|
94
|
+
mdl->werr = wapiti_xmalloc(sizeof(double) * mdl->opt->stopwin);
|
95
|
+
mdl->wcnt = mdl->wpos = 0;
|
96
|
+
}
|
97
|
+
|
98
|
+
/* uit_cleanup:
|
99
|
+
* Remove the signal handler restoring the defaul behavior in case of
|
100
|
+
* interrupt.
|
101
|
+
*/
|
102
|
+
void uit_cleanup(mdl_t *mdl) {
|
103
|
+
unused(mdl);
|
104
|
+
if (mdl->opt->stopwin != 0) {
|
105
|
+
free(mdl->werr);
|
106
|
+
mdl->werr = NULL;
|
107
|
+
}
|
108
|
+
signal(SIGINT, SIG_DFL);
|
109
|
+
}
|
110
|
+
|
111
|
+
/* uit_progress:
|
112
|
+
* Display a progress repport to the user consisting of some informations
|
113
|
+
* provided by the trainer: iteration count and objective function value, and
|
114
|
+
* some informations computed here on the current model performances.
|
115
|
+
* This function return true if the trainer have to keep training the model
|
116
|
+
* and false if he must stop, so this is were we will implement the trainer
|
117
|
+
* independant stoping criterion.
|
118
|
+
*/
|
119
|
+
bool uit_progress(mdl_t *mdl, int it, double obj) {
|
120
|
+
// First we just compute the error rate on devel or train data
|
121
|
+
double te, se;
|
122
|
+
tag_eval(mdl, &te, &se);
|
123
|
+
// Next, we compute the number of active features
|
124
|
+
size_t act = 0;
|
125
|
+
for (size_t f = 0; f < mdl->nftr; f++)
|
126
|
+
if (mdl->theta[f] != 0.0)
|
127
|
+
act++;
|
128
|
+
// Compute timings. As some training algorithms are multi-threaded, we
|
129
|
+
// cannot use ansi/c function and must rely on posix one to sum time
|
130
|
+
// spent in main thread and in child ones.
|
131
|
+
tms_t now; times(&now);
|
132
|
+
double tm = (now.tms_utime - mdl->timer.tms_utime )
|
133
|
+
+ (now.tms_cutime - mdl->timer.tms_cutime);
|
134
|
+
tm /= sysconf(_SC_CLK_TCK);
|
135
|
+
mdl->total += tm;
|
136
|
+
mdl->timer = now;
|
137
|
+
// And display progress report
|
138
|
+
info(" [%4d]", it);
|
139
|
+
info(obj >= 0.0 ? " obj=%-10.2f" : " obj=NA", obj);
|
140
|
+
info(" act=%-8zu", act);
|
141
|
+
info(" err=%5.2f%%/%5.2f%%", te, se);
|
142
|
+
info(" time=%.2fs/%.2fs", tm, mdl->total);
|
143
|
+
info("\n");
|
144
|
+
// If requested, check the error rate stoping criterion. We check if the
|
145
|
+
// error rate is stable enought over a few iterations.
|
146
|
+
bool res = true;
|
147
|
+
if (mdl->opt->stopwin != 0) {
|
148
|
+
mdl->werr[mdl->wpos] = te;
|
149
|
+
mdl->wpos = (mdl->wpos + 1) % mdl->opt->stopwin;
|
150
|
+
mdl->wcnt++;
|
151
|
+
if (mdl->wcnt >= mdl->opt->stopwin) {
|
152
|
+
double emin = 200.0, emax = -100.0;
|
153
|
+
for (int i = 0; i < mdl->opt->stopwin; i++) {
|
154
|
+
emin = min(emin, mdl->werr[i]);
|
155
|
+
emax = max(emax, mdl->werr[i]);
|
156
|
+
}
|
157
|
+
if (emax - emin < mdl->opt->stopeps)
|
158
|
+
res = false;
|
159
|
+
}
|
160
|
+
}
|
161
|
+
// And return
|
162
|
+
if (uit_stop)
|
163
|
+
return false;
|
164
|
+
return res;
|
165
|
+
}
|
166
|
+
|
167
|
+
|
@@ -0,0 +1,43 @@
|
|
1
|
+
/*
|
2
|
+
* Wapiti - A linear-chain CRF tool
|
3
|
+
*
|
4
|
+
* Copyright (c) 2009-2011 CNRS
|
5
|
+
* All rights reserved.
|
6
|
+
*
|
7
|
+
* Redistribution and use in source and binary forms, with or without
|
8
|
+
* modification, are permitted provided that the following conditions are met:
|
9
|
+
* * Redistributions of source code must retain the above copyright
|
10
|
+
* notice, this list of conditions and the following disclaimer.
|
11
|
+
* * Redistributions in binary form must reproduce the above copyright
|
12
|
+
* notice, this list of conditions and the following disclaimer in the
|
13
|
+
* documentation and/or other materials provided with the distribution.
|
14
|
+
*
|
15
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
16
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
18
|
+
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
19
|
+
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
20
|
+
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
21
|
+
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
22
|
+
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
23
|
+
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
24
|
+
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
25
|
+
* POSSIBILITY OF SUCH DAMAGE.
|
26
|
+
*/
|
27
|
+
|
28
|
+
#ifndef progress_h
|
29
|
+
#define progress_h
|
30
|
+
|
31
|
+
#include <stdbool.h>
|
32
|
+
|
33
|
+
#include "wapiti.h"
|
34
|
+
#include "model.h"
|
35
|
+
|
36
|
+
extern bool uit_stop;
|
37
|
+
|
38
|
+
void uit_setup(mdl_t *mdl);
|
39
|
+
void uit_cleanup(mdl_t *mdl);
|
40
|
+
bool uit_progress(mdl_t *mdl, int it, double obj);
|
41
|
+
|
42
|
+
#endif
|
43
|
+
|
data/ext/wapiti/quark.c
ADDED
@@ -0,0 +1,272 @@
|
|
1
|
+
/*
|
2
|
+
* Wapiti - A linear-chain CRF tool
|
3
|
+
*
|
4
|
+
* Copyright (c) 2009-2011 CNRS
|
5
|
+
* All rights reserved.
|
6
|
+
*
|
7
|
+
* Redistribution and use in source and binary forms, with or without
|
8
|
+
* modification, are permitted provided that the following conditions are met:
|
9
|
+
* * Redistributions of source code must retain the above copyright
|
10
|
+
* notice, this list of conditions and the following disclaimer.
|
11
|
+
* * Redistributions in binary form must reproduce the above copyright
|
12
|
+
* notice, this list of conditions and the following disclaimer in the
|
13
|
+
* documentation and/or other materials provided with the distribution.
|
14
|
+
*
|
15
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
16
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
18
|
+
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
19
|
+
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
20
|
+
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
21
|
+
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
22
|
+
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
23
|
+
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
24
|
+
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
25
|
+
* POSSIBILITY OF SUCH DAMAGE.
|
26
|
+
*/
|
27
|
+
#include <stdbool.h>
|
28
|
+
#include <stddef.h>
|
29
|
+
#include <stdlib.h>
|
30
|
+
#include <stdio.h>
|
31
|
+
#include <stdint.h>
|
32
|
+
#include <string.h>
|
33
|
+
|
34
|
+
#include "quark.h"
|
35
|
+
#include "tools.h"
|
36
|
+
|
37
|
+
/******************************************************************************
|
38
|
+
* Map object
|
39
|
+
*
|
40
|
+
* Implement quark object for mapping strings to identifiers through crit-bit
|
41
|
+
* tree (also known as PATRICIA tries). In fact it only store a compressed
|
42
|
+
* version of the trie to reduce memory footprint. The special trick of using
|
43
|
+
* the last bit of the reference to differentiate between nodes and leafs comes
|
44
|
+
* from Daniel J. Bernstein implementation of crit-bit tree that can be found
|
45
|
+
* on his web site.
|
46
|
+
* [1] Morrison, Donald R. ; PATRICIA-Practical Algorithm To Retrieve
|
47
|
+
* Information Coded in Alphanumeric, Journal of the ACM 15 (4): pp. 514--534,
|
48
|
+
* 1968. DOI:10.1145/321479.321481
|
49
|
+
*
|
50
|
+
* This code is copyright 2002-2011 Thomas Lavergne and licenced under the BSD
|
51
|
+
* Licence like the remaining of Wapiti.
|
52
|
+
******************************************************************************/
|
53
|
+
|
54
|
+
typedef struct node_s node_t;
typedef struct leaf_s leaf_t;

/* node_t:
 *   Internal node of the trie. Each child is either another node or a leaf
 *   whose pointer has been tagged by setting its lowest bit.
 */
struct node_s {
	node_t   *child[2];
	uint32_t  pos;   // index in the key of the byte holding the critical bit
	uint8_t   byte;  // mask with every bit set except the critical one
};

/* leaf_t:
 *   A (key, id) pair. The key is a NUL-terminated string stored right after
 *   the identifier in the same allocation, so `key` must stay the last field.
 */
struct leaf_s {
	uint64_t id;
	char     key[];
};

/* qrk_s:
 *   The quark itself: the trie used to go from keys to identifiers and a
 *   table of leafs indexed by identifier for the reverse mapping.
 */
struct qrk_s {
	node_t   *root;   // NULL while the quark is empty
	leaf_t  **leafs;  // leafs[id] is the leaf holding identifier id
	bool      lock;   // when true, unknown keys are not inserted anymore
	uint64_t  count;  // number of keys stored, also the next free id
	uint64_t  size;   // number of slots allocated in leafs
};
|
70
|
+
|
71
|
+
/* Value reserved for "no identifier". It is not referenced by the code of
 * this file that is visible here, which returns `none` instead.
 */
#define qrk_none ((uint64_t)-1)

/* Pointer tagging: the lowest bit of a child reference tells a leaf (bit set)
 * from an internal node (bit clear).
 *   qrk_lf2nd   tag a leaf pointer so it can be stored as a child reference
 *   qrk_nd2lf   remove the tag and get the leaf pointer back
 *   qrk_isleaf  non-zero if the reference is a tagged leaf
 */
#define qrk_lf2nd(lf)  ((node_t *)((intptr_t)(lf) | (intptr_t)1))
#define qrk_nd2lf(nd)  ((leaf_t *)((intptr_t)(nd) & ~(intptr_t)1))
#define qrk_isleaf(nd) ((intptr_t)(nd) & (intptr_t)1)
|
76
|
+
|
77
|
+
/* qrk_new:
|
78
|
+
* This initialize the object for holding a new empty trie, with some pre-
|
79
|
+
* allocations. The returned object must be freed with a call to qrk_free when
|
80
|
+
* not needed anymore.
|
81
|
+
*/
|
82
|
+
qrk_t *qrk_new(void) {
|
83
|
+
const uint64_t size = 128;
|
84
|
+
qrk_t *qrk = wapiti_xmalloc(sizeof(qrk_t));
|
85
|
+
qrk->root = NULL;
|
86
|
+
qrk->count = 0;
|
87
|
+
qrk->lock = false;
|
88
|
+
qrk->size = size;
|
89
|
+
qrk->leafs = wapiti_xmalloc(sizeof(leaf_t) * size);
|
90
|
+
return qrk;
|
91
|
+
}
|
92
|
+
|
93
|
+
/* qrk_free:
|
94
|
+
* Release all the memory used by a qrk_t object allocated with qrk_new. This
|
95
|
+
* will release all key string stored internally so all key returned by
|
96
|
+
* qrk_unmap become invalid and must not be used anymore.
|
97
|
+
*/
|
98
|
+
void qrk_free(qrk_t *qrk) {
|
99
|
+
const size_t stkmax = 1024;
|
100
|
+
if (qrk->count != 0) {
|
101
|
+
node_t *stk[stkmax];
|
102
|
+
int cnt = 0;
|
103
|
+
stk[cnt++] = qrk->root;
|
104
|
+
while (cnt != 0) {
|
105
|
+
node_t *nd = stk[--cnt];
|
106
|
+
if (qrk_isleaf(nd)) {
|
107
|
+
free(qrk_nd2lf(nd));
|
108
|
+
continue;
|
109
|
+
}
|
110
|
+
stk[cnt++] = nd->child[0];
|
111
|
+
stk[cnt++] = nd->child[1];
|
112
|
+
free(nd);
|
113
|
+
}
|
114
|
+
}
|
115
|
+
free(qrk->leafs);
|
116
|
+
free(qrk);
|
117
|
+
}
|
118
|
+
|
119
|
+
/* qrk_insert:
|
120
|
+
* Map a key to a uniq identifier. If the key already exist in the map, return
|
121
|
+
* its identifier, else allocate a new identifier and insert the new (key,id)
|
122
|
+
* pair inside the quark. This function is not thread safe and should not be
|
123
|
+
* called on the same map from different thread without locking.
|
124
|
+
*/
|
125
|
+
size_t qrk_str2id(qrk_t *qrk, const char *key) {
|
126
|
+
const uint8_t *raw = (void *)key;
|
127
|
+
const size_t len = strlen(key);
|
128
|
+
// We first take care of the empty trie case so later we can safely
|
129
|
+
// assume that the trie is well formed and so there is no NULL pointers
|
130
|
+
// in it.
|
131
|
+
if (qrk->count == 0) {
|
132
|
+
if (qrk->lock == true)
|
133
|
+
return none;
|
134
|
+
const size_t size = sizeof(char) * (len + 1);
|
135
|
+
leaf_t *lf = wapiti_xmalloc(sizeof(leaf_t) + size);
|
136
|
+
memcpy(lf->key, key, size);
|
137
|
+
lf->id = 0;
|
138
|
+
qrk->root = qrk_lf2nd(lf);
|
139
|
+
qrk->leafs[0] = lf;
|
140
|
+
qrk->count = 1;
|
141
|
+
return 0;
|
142
|
+
}
|
143
|
+
// If the trie is not empty, we first go down the trie to the leaf like
|
144
|
+
// if we are searching for the key. When at leaf there is two case,
|
145
|
+
// either we have found our key or we have found another key with all
|
146
|
+
// its critical bit identical to our one. So we search for the first
|
147
|
+
// differing bit between them to know where we have to add the new node.
|
148
|
+
const node_t *nd = qrk->root;
|
149
|
+
while (!qrk_isleaf(nd)) {
|
150
|
+
const uint8_t chr = nd->pos < len ? raw[nd->pos] : 0;
|
151
|
+
const int side = ((chr | nd->byte) + 1) >> 8;
|
152
|
+
nd = nd->child[side];
|
153
|
+
}
|
154
|
+
const char *bst = qrk_nd2lf(nd)->key;
|
155
|
+
size_t pos;
|
156
|
+
for (pos = 0; pos < len; pos++)
|
157
|
+
if (key[pos] != bst[pos])
|
158
|
+
break;
|
159
|
+
uint8_t byte;
|
160
|
+
if (pos != len)
|
161
|
+
byte = key[pos] ^ bst[pos];
|
162
|
+
else if (bst[pos] != '\0')
|
163
|
+
byte = bst[pos];
|
164
|
+
else
|
165
|
+
return qrk_nd2lf(nd)->id;
|
166
|
+
if (qrk->lock == true)
|
167
|
+
return none;
|
168
|
+
// Now we known the two key are different and we know in which byte. It
|
169
|
+
// remain to build the mask for the new critical bit and build the new
|
170
|
+
// internal node and leaf.
|
171
|
+
while (byte & (byte - 1))
|
172
|
+
byte &= byte - 1;
|
173
|
+
byte ^= 255;
|
174
|
+
const uint8_t chr = bst[pos];
|
175
|
+
const int side = ((chr | byte) + 1) >> 8;
|
176
|
+
const size_t size = sizeof(char) * (len + 1);
|
177
|
+
node_t *nx = wapiti_xmalloc(sizeof(node_t));
|
178
|
+
leaf_t *lf = wapiti_xmalloc(sizeof(leaf_t) + size);
|
179
|
+
memcpy(lf->key, key, size);
|
180
|
+
lf->id = qrk->count++;
|
181
|
+
nx->pos = pos;
|
182
|
+
nx->byte = byte;
|
183
|
+
nx->child[1 - side] = qrk_lf2nd(lf);
|
184
|
+
if (lf->id == qrk->size) {
|
185
|
+
qrk->size *= 1.4;
|
186
|
+
const size_t size = sizeof(leaf_t *) * qrk->size;
|
187
|
+
qrk->leafs = wapiti_xrealloc(qrk->leafs, size);
|
188
|
+
}
|
189
|
+
qrk->leafs[lf->id] = lf;
|
190
|
+
// And last thing to do: inserting the new node in the trie. We have to
|
191
|
+
// walk down the trie again as we have to keep the ordering of nodes. So
|
192
|
+
// we search for the good position to insert it.
|
193
|
+
node_t **trg = &qrk->root;
|
194
|
+
while (true) {
|
195
|
+
node_t *nd = *trg;
|
196
|
+
if (qrk_isleaf(nd) || nd->pos > pos)
|
197
|
+
break;
|
198
|
+
if (nd->pos == pos && nd->byte > byte)
|
199
|
+
break;
|
200
|
+
const uint8_t chr = nd->pos < len ? raw[nd->pos] : 0;
|
201
|
+
const int side = ((chr | nd->byte) + 1) >> 8;
|
202
|
+
trg = &nd->child[side];
|
203
|
+
}
|
204
|
+
nx->child[side] = *trg;
|
205
|
+
*trg = nx;
|
206
|
+
return lf->id;
|
207
|
+
}
|
208
|
+
|
209
|
+
/* qrk_id2str:
|
210
|
+
* Retrieve the key associated to an identifier. The key is returned as a
|
211
|
+
* constant string that should not be modified or freed by the caller, it is
|
212
|
+
* a pointer to the internal copy of the key kept by the map object and
|
213
|
+
* remain valid only for the life time of the quark, a call to qrk_free will
|
214
|
+
* make this pointer invalid.
|
215
|
+
*/
|
216
|
+
const char *qrk_id2str(const qrk_t *qrk, size_t id) {
|
217
|
+
if (id >= qrk->count)
|
218
|
+
fatal("invalid identifier");
|
219
|
+
return qrk->leafs[id]->key;
|
220
|
+
}
|
221
|
+
|
222
|
+
/* qrk_save:
|
223
|
+
* Save list of keys present in the map object in the id order to the given
|
224
|
+
* file. We put one key per line so, if no key contains a new line, the line
|
225
|
+
* number correspond to the id.
|
226
|
+
*/
|
227
|
+
void qrk_save(const qrk_t *qrk, FILE *file) {
|
228
|
+
if (fprintf(file, "#qrk#%zu\n", (size_t)qrk->count) < 0)
|
229
|
+
pfatal("cannot write to file");
|
230
|
+
if (qrk->count == 0)
|
231
|
+
return;
|
232
|
+
for (uint64_t n = 0; n < qrk->count; n++)
|
233
|
+
ns_writestr(file, qrk->leafs[n]->key);
|
234
|
+
}
|
235
|
+
|
236
|
+
/* qrk_load:
|
237
|
+
* Load a list of key from the given file and add them to the map. Each lines
|
238
|
+
* of the file is taken as a single key and mapped to the next available id if
|
239
|
+
* not already present. If all keys are single lines and the given map is
|
240
|
+
* initilay empty, this will load a map exactly as saved by qrk_save.
|
241
|
+
*/
|
242
|
+
void qrk_load(qrk_t *qrk, FILE *file) {
|
243
|
+
size_t cnt = 0;
|
244
|
+
if (fscanf(file, "#qrk#%zu\n", &cnt) != 1) {
|
245
|
+
if (ferror(file) != 0)
|
246
|
+
pfatal("cannot read from file");
|
247
|
+
pfatal("invalid format");
|
248
|
+
}
|
249
|
+
for (size_t n = 0; n < cnt; ++n) {
|
250
|
+
char *str = ns_readstr(file);
|
251
|
+
qrk_str2id(qrk, str);
|
252
|
+
free(str);
|
253
|
+
}
|
254
|
+
}
|
255
|
+
|
256
|
+
/* qrk_count:
|
257
|
+
* Return the number of mappings stored in the quark.
|
258
|
+
*/
|
259
|
+
size_t qrk_count(const qrk_t *qrk) {
|
260
|
+
return qrk->count;
|
261
|
+
}
|
262
|
+
|
263
|
+
/* qrk_lock:
|
264
|
+
* Set the lock value of the quark and return the old one.
|
265
|
+
*/
|
266
|
+
bool qrk_lock(qrk_t *qrk, bool lock) {
|
267
|
+
bool old = qrk->lock;
|
268
|
+
qrk->lock = lock;
|
269
|
+
return old;
|
270
|
+
}
|
271
|
+
|
272
|
+
|