wapiti 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (63) hide show
  1. data/.autotest +13 -0
  2. data/.gitignore +5 -0
  3. data/.rspec +3 -0
  4. data/Gemfile +6 -0
  5. data/LICENSE +30 -0
  6. data/README.md +153 -0
  7. data/Rakefile +33 -0
  8. data/ext/wapiti/bcd.c +392 -0
  9. data/ext/wapiti/decoder.c +535 -0
  10. data/ext/wapiti/decoder.h +46 -0
  11. data/ext/wapiti/extconf.rb +8 -0
  12. data/ext/wapiti/gradient.c +818 -0
  13. data/ext/wapiti/gradient.h +81 -0
  14. data/ext/wapiti/lbfgs.c +294 -0
  15. data/ext/wapiti/model.c +296 -0
  16. data/ext/wapiti/model.h +100 -0
  17. data/ext/wapiti/native.c +1238 -0
  18. data/ext/wapiti/native.h +15 -0
  19. data/ext/wapiti/options.c +278 -0
  20. data/ext/wapiti/options.h +91 -0
  21. data/ext/wapiti/pattern.c +395 -0
  22. data/ext/wapiti/pattern.h +56 -0
  23. data/ext/wapiti/progress.c +167 -0
  24. data/ext/wapiti/progress.h +43 -0
  25. data/ext/wapiti/quark.c +272 -0
  26. data/ext/wapiti/quark.h +46 -0
  27. data/ext/wapiti/reader.c +553 -0
  28. data/ext/wapiti/reader.h +73 -0
  29. data/ext/wapiti/rprop.c +191 -0
  30. data/ext/wapiti/sequence.h +148 -0
  31. data/ext/wapiti/sgdl1.c +218 -0
  32. data/ext/wapiti/thread.c +171 -0
  33. data/ext/wapiti/thread.h +42 -0
  34. data/ext/wapiti/tools.c +202 -0
  35. data/ext/wapiti/tools.h +54 -0
  36. data/ext/wapiti/trainers.h +39 -0
  37. data/ext/wapiti/vmath.c +372 -0
  38. data/ext/wapiti/vmath.h +51 -0
  39. data/ext/wapiti/wapiti.c +288 -0
  40. data/ext/wapiti/wapiti.h +45 -0
  41. data/lib/wapiti.rb +30 -0
  42. data/lib/wapiti/errors.rb +17 -0
  43. data/lib/wapiti/model.rb +49 -0
  44. data/lib/wapiti/options.rb +113 -0
  45. data/lib/wapiti/utility.rb +15 -0
  46. data/lib/wapiti/version.rb +3 -0
  47. data/spec/fixtures/ch.mod +18550 -0
  48. data/spec/fixtures/chpattern.txt +52 -0
  49. data/spec/fixtures/chtest.txt +1973 -0
  50. data/spec/fixtures/chtrain.txt +19995 -0
  51. data/spec/fixtures/nppattern.txt +52 -0
  52. data/spec/fixtures/nptest.txt +1973 -0
  53. data/spec/fixtures/nptrain.txt +19995 -0
  54. data/spec/fixtures/pattern.txt +14 -0
  55. data/spec/fixtures/test.txt +60000 -0
  56. data/spec/fixtures/train.txt +1200 -0
  57. data/spec/spec_helper.rb +21 -0
  58. data/spec/wapiti/model_spec.rb +173 -0
  59. data/spec/wapiti/native_spec.rb +12 -0
  60. data/spec/wapiti/options_spec.rb +175 -0
  61. data/spec/wapiti/utility_spec.rb +22 -0
  62. data/wapiti.gemspec +35 -0
  63. metadata +178 -0
@@ -0,0 +1,56 @@
1
+ /*
2
+ * Wapiti - A linear-chain CRF tool
3
+ *
4
+ * Copyright (c) 2009-2011 CNRS
5
+ * All rights reserved.
6
+ *
7
+ * Redistribution and use in source and binary forms, with or without
8
+ * modification, are permitted provided that the following conditions are met:
9
+ * * Redistributions of source code must retain the above copyright
10
+ * notice, this list of conditions and the following disclaimer.
11
+ * * Redistributions in binary form must reproduce the above copyright
12
+ * notice, this list of conditions and the following disclaimer in the
13
+ * documentation and/or other materials provided with the distribution.
14
+ *
15
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25
+ * POSSIBILITY OF SUCH DAMAGE.
26
+ */
27
+
28
+ #ifndef pattern_h
29
+ #define pattern_h
30
+
31
+ #include <stdbool.h>
32
+
33
+ #include "sequence.h"
34
+
35
/* Type names for a pattern object and for one of its items. */
typedef struct pat_s      pat_t;
typedef struct pat_item_s pat_item_t;

/* pat_item_s:
 *   One element of a pattern. The field meanings below are inferred from
 *   their names only -- TODO confirm against pattern.c:
 *     type      presumably the kind of item (one character code)
 *     caps      presumably a case-related flag
 *     value     string attached to the item
 *     absolute  presumably selects absolute vs relative addressing
 *     offset    presumably a token offset
 *     column    presumably a column index inside a token
 */
struct pat_item_s {
	char  type;
	bool  caps;
	char *value;
	bool  absolute;
	int   offset;
	int   column;
};

/* pat_s:
 *   A pattern: a string pointer, two counters and a trailing flexible array
 *   of items. Because of the flexible array member, objects of this type
 *   have to be allocated dynamically with room for the items.
 *   (src is presumably the pattern source text and nitems the length of
 *   items[]; ntoks is not explained by this header -- TODO confirm.)
 */
struct pat_s {
	char       *src;
	int         ntoks;
	int         nitems;
	pat_item_t  items[];
};
50
+
51
+ pat_t *pat_comp(char *p); /* presumably builds a pat_t from the pattern text p; implementation not in this header -- TODO confirm ownership of p */
52
+ char *pat_exec(const pat_t *pat, const tok_t *tok, int at); /* presumably applies pat to token sequence tok at position at and returns a string -- TODO confirm who frees the result */
53
+ void pat_free(pat_t *pat); /* presumably releases a pat_t obtained from pat_comp -- TODO confirm */
54
+
55
+ #endif
56
+
@@ -0,0 +1,167 @@
1
+ /*
2
+ * Wapiti - A linear-chain CRF tool
3
+ *
4
+ * Copyright (c) 2009-2011 CNRS
5
+ * All rights reserved.
6
+ *
7
+ * Redistribution and use in source and binary forms, with or without
8
+ * modification, are permitted provided that the following conditions are met:
9
+ * * Redistributions of source code must retain the above copyright
10
+ * notice, this list of conditions and the following disclaimer.
11
+ * * Redistributions in binary form must reproduce the above copyright
12
+ * notice, this list of conditions and the following disclaimer in the
13
+ * documentation and/or other materials provided with the distribution.
14
+ *
15
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25
+ * POSSIBILITY OF SUCH DAMAGE.
26
+ */
27
+ #include <signal.h>
28
+ #include <stdbool.h>
29
+ #include <stddef.h>
30
+ #include <stdlib.h>
31
+ #include <stdio.h>
32
+
33
+ #include <unistd.h>
34
+ #include <sys/times.h>
35
+ #include <sys/resource.h>
36
+
37
+ #include "wapiti.h"
38
+ #include "decoder.h"
39
+ #include "model.h"
40
+ #include "options.h"
41
+ #include "progress.h"
42
+ #include "tools.h"
43
+
44
+ /*******************************************************************************
45
+ * User interaction during training
46
+ *
47
+ * Handle progress reporting during training and clean early stoping. Trainers
48
+ * have to call uit_progress at the end of each iterations, this will display
49
+ * various informations for the user.
50
+ * Timing is also done here, an iteration is assumed to take all the time
51
+ * between to call to the progress function and evualtion on the devel data
52
+ * are included.
53
+ *
54
+ * This module setup a signal handler for SIGINT. If this signal is catched,
55
+ * the uit_stop global variable to inform the trainer that it have to stop as
56
+ * early as possible, discarding the recent computations if they cannot be
57
+ * integrated very quickly. They must leave the model in a clean state. Any
58
+ * further signal will terminate the program. So it's simple :
59
+ * - 1 signal mean "I can wait a little so try to stop as soon as possible
60
+ * but leave me a working model"
61
+ * - 2 signal mean "Stop immediatly what you are doing, I can't wait and
62
+ * don't care about getting a working model"
63
+ ******************************************************************************/
64
+
65
+ /* uit_stop:
66
+ * This value is set to true when the user request the trainer to stop. In
67
+ * this case, the trainer have to stop as soon as possible in a clean state,
68
+ * discarding the lasts computations if it cannot integrate them quickly.
69
+ */
70
+ bool uit_stop = false;
71
+
72
+ /* uit_signal:
73
+ * Signal handler to catch interupt signal. When a signal is received, the
74
+ * trainer is aksed to stop as soon as possible leaving the model in a clean
75
+ * state. We don't reinstall the handler so if user send a second interupt
76
+ * signal, the program will stop imediatly. (to cope with BSD system, we even
77
+ * reinstall explicitly the default handler)
78
+ */
79
+ static void uit_signal(int sig) {
80
+ signal(sig, SIG_DFL);
81
+ uit_stop = true;
82
+ }
83
+
84
+ /* uit_setup:
85
+ * Install the signal handler for clean early stop from the user if possible
86
+ * and start the timer.
87
+ */
88
+ void uit_setup(mdl_t *mdl) {
89
+ uit_stop = false;
90
+ if (signal(SIGINT, uit_signal) == SIG_ERR)
91
+ warning("failed to set signal handler, no clean early stop");
92
+ times(&mdl->timer);
93
+ if (mdl->opt->stopwin != 0)
94
+ mdl->werr = wapiti_xmalloc(sizeof(double) * mdl->opt->stopwin);
95
+ mdl->wcnt = mdl->wpos = 0;
96
+ }
97
+
98
+ /* uit_cleanup:
99
+ * Remove the signal handler restoring the defaul behavior in case of
100
+ * interrupt.
101
+ */
102
+ void uit_cleanup(mdl_t *mdl) {
103
+ unused(mdl);
104
+ if (mdl->opt->stopwin != 0) {
105
+ free(mdl->werr);
106
+ mdl->werr = NULL;
107
+ }
108
+ signal(SIGINT, SIG_DFL);
109
+ }
110
+
111
+ /* uit_progress:
112
+ * Display a progress repport to the user consisting of some informations
113
+ * provided by the trainer: iteration count and objective function value, and
114
+ * some informations computed here on the current model performances.
115
+ * This function return true if the trainer have to keep training the model
116
+ * and false if he must stop, so this is were we will implement the trainer
117
+ * independant stoping criterion.
118
+ */
119
+ bool uit_progress(mdl_t *mdl, int it, double obj) {
120
+ // First we just compute the error rate on devel or train data
121
+ double te, se;
122
+ tag_eval(mdl, &te, &se);
123
+ // Next, we compute the number of active features
124
+ size_t act = 0;
125
+ for (size_t f = 0; f < mdl->nftr; f++)
126
+ if (mdl->theta[f] != 0.0)
127
+ act++;
128
+ // Compute timings. As some training algorithms are multi-threaded, we
129
+ // cannot use ansi/c function and must rely on posix one to sum time
130
+ // spent in main thread and in child ones.
131
+ tms_t now; times(&now);
132
+ double tm = (now.tms_utime - mdl->timer.tms_utime )
133
+ + (now.tms_cutime - mdl->timer.tms_cutime);
134
+ tm /= sysconf(_SC_CLK_TCK);
135
+ mdl->total += tm;
136
+ mdl->timer = now;
137
+ // And display progress report
138
+ info(" [%4d]", it);
139
+ info(obj >= 0.0 ? " obj=%-10.2f" : " obj=NA", obj);
140
+ info(" act=%-8zu", act);
141
+ info(" err=%5.2f%%/%5.2f%%", te, se);
142
+ info(" time=%.2fs/%.2fs", tm, mdl->total);
143
+ info("\n");
144
+ // If requested, check the error rate stoping criterion. We check if the
145
+ // error rate is stable enought over a few iterations.
146
+ bool res = true;
147
+ if (mdl->opt->stopwin != 0) {
148
+ mdl->werr[mdl->wpos] = te;
149
+ mdl->wpos = (mdl->wpos + 1) % mdl->opt->stopwin;
150
+ mdl->wcnt++;
151
+ if (mdl->wcnt >= mdl->opt->stopwin) {
152
+ double emin = 200.0, emax = -100.0;
153
+ for (int i = 0; i < mdl->opt->stopwin; i++) {
154
+ emin = min(emin, mdl->werr[i]);
155
+ emax = max(emax, mdl->werr[i]);
156
+ }
157
+ if (emax - emin < mdl->opt->stopeps)
158
+ res = false;
159
+ }
160
+ }
161
+ // And return
162
+ if (uit_stop)
163
+ return false;
164
+ return res;
165
+ }
166
+
167
+
@@ -0,0 +1,43 @@
1
+ /*
2
+ * Wapiti - A linear-chain CRF tool
3
+ *
4
+ * Copyright (c) 2009-2011 CNRS
5
+ * All rights reserved.
6
+ *
7
+ * Redistribution and use in source and binary forms, with or without
8
+ * modification, are permitted provided that the following conditions are met:
9
+ * * Redistributions of source code must retain the above copyright
10
+ * notice, this list of conditions and the following disclaimer.
11
+ * * Redistributions in binary form must reproduce the above copyright
12
+ * notice, this list of conditions and the following disclaimer in the
13
+ * documentation and/or other materials provided with the distribution.
14
+ *
15
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25
+ * POSSIBILITY OF SUCH DAMAGE.
26
+ */
27
+
28
+ #ifndef progress_h
29
+ #define progress_h
30
+
31
+ #include <stdbool.h>
32
+
33
+ #include "wapiti.h"
34
+ #include "model.h"
35
+
36
+ extern bool uit_stop; /* true once the user asked the trainer to stop; defined in progress.c */
37
+
38
+ void uit_setup(mdl_t *mdl); /* install the SIGINT handler, start the timer and allocate the error window if opt->stopwin != 0 */
39
+ void uit_cleanup(mdl_t *mdl); /* free the error window (if any) and restore the default SIGINT behavior */
40
+ bool uit_progress(mdl_t *mdl, int it, double obj); /* print the per-iteration report; returns false when training must stop */
41
+
42
+ #endif
43
+
@@ -0,0 +1,272 @@
1
+ /*
2
+ * Wapiti - A linear-chain CRF tool
3
+ *
4
+ * Copyright (c) 2009-2011 CNRS
5
+ * All rights reserved.
6
+ *
7
+ * Redistribution and use in source and binary forms, with or without
8
+ * modification, are permitted provided that the following conditions are met:
9
+ * * Redistributions of source code must retain the above copyright
10
+ * notice, this list of conditions and the following disclaimer.
11
+ * * Redistributions in binary form must reproduce the above copyright
12
+ * notice, this list of conditions and the following disclaimer in the
13
+ * documentation and/or other materials provided with the distribution.
14
+ *
15
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25
+ * POSSIBILITY OF SUCH DAMAGE.
26
+ */
27
+ #include <stdbool.h>
28
+ #include <stddef.h>
29
+ #include <stdlib.h>
30
+ #include <stdio.h>
31
+ #include <stdint.h>
32
+ #include <string.h>
33
+
34
+ #include "quark.h"
35
+ #include "tools.h"
36
+
37
+ /******************************************************************************
38
+ * Map object
39
+ *
40
+ * Implement quark object for mapping strings to identifiers through crit-bit
41
+ * tree (also known as PATRICIA tries). In fact it only store a compressed
42
+ * version of the trie to reduce memory footprint. The special trick of using
43
+ * the last bit of the reference to differenciate between nodes and leafs come
44
+ * from Daniel J. Bernstein implementation of crit-bit tree that can be found
45
+ * on his web site.
46
+ * [1] Morrison, Donald R. ; PATRICIA-Practical Algorithm To Retrieve
47
+ * Information Coded in Alphanumeric, Journal of the ACM 15 (4): pp. 514--534,
48
+ * 1968. DOI:10.1145/321479.321481
49
+ *
50
+ * This code is copyright 2002-2011 Thomas Lavergne and licenced under the BSD
51
+ * Licence like the remaining of Wapiti.
52
+ ******************************************************************************/
53
+
54
+ typedef struct node_s node_t;
55
+ typedef struct leaf_s leaf_t;
56
+ struct qrk_s {
57
+ struct node_s {
58
+ node_t *child[2];
59
+ uint32_t pos;
60
+ uint8_t byte;
61
+ } *root;
62
+ struct leaf_s {
63
+ uint64_t id;
64
+ char key[];
65
+ } **leafs;
66
+ bool lock;
67
+ uint64_t count;
68
+ uint64_t size;
69
+ };
70
+
71
+ #define qrk_none ((uint64_t)-1)
72
+
73
+ #define qrk_lf2nd(lf) ((node_t *)((intptr_t)(lf) | 1))
74
+ #define qrk_nd2lf(nd) ((leaf_t *)((intptr_t)(nd) & ~1))
75
+ #define qrk_isleaf(nd) ((intptr_t)(nd) & 1)
76
+
77
+ /* qrk_new:
78
+ * This initialize the object for holding a new empty trie, with some pre-
79
+ * allocations. The returned object must be freed with a call to qrk_free when
80
+ * not needed anymore.
81
+ */
82
+ qrk_t *qrk_new(void) {
83
+ const uint64_t size = 128;
84
+ qrk_t *qrk = wapiti_xmalloc(sizeof(qrk_t));
85
+ qrk->root = NULL;
86
+ qrk->count = 0;
87
+ qrk->lock = false;
88
+ qrk->size = size;
89
+ qrk->leafs = wapiti_xmalloc(sizeof(leaf_t) * size);
90
+ return qrk;
91
+ }
92
+
93
+ /* qrk_free:
94
+ * Release all the memory used by a qrk_t object allocated with qrk_new. This
95
+ * will release all key string stored internally so all key returned by
96
+ * qrk_unmap become invalid and must not be used anymore.
97
+ */
98
+ void qrk_free(qrk_t *qrk) {
99
+ const size_t stkmax = 1024;
100
+ if (qrk->count != 0) {
101
+ node_t *stk[stkmax];
102
+ int cnt = 0;
103
+ stk[cnt++] = qrk->root;
104
+ while (cnt != 0) {
105
+ node_t *nd = stk[--cnt];
106
+ if (qrk_isleaf(nd)) {
107
+ free(qrk_nd2lf(nd));
108
+ continue;
109
+ }
110
+ stk[cnt++] = nd->child[0];
111
+ stk[cnt++] = nd->child[1];
112
+ free(nd);
113
+ }
114
+ }
115
+ free(qrk->leafs);
116
+ free(qrk);
117
+ }
118
+
119
+ /* qrk_insert:
120
+ * Map a key to a uniq identifier. If the key already exist in the map, return
121
+ * its identifier, else allocate a new identifier and insert the new (key,id)
122
+ * pair inside the quark. This function is not thread safe and should not be
123
+ * called on the same map from different thread without locking.
124
+ */
125
+ size_t qrk_str2id(qrk_t *qrk, const char *key) {
126
+ const uint8_t *raw = (void *)key;
127
+ const size_t len = strlen(key);
128
+ // We first take care of the empty trie case so later we can safely
129
+ // assume that the trie is well formed and so there is no NULL pointers
130
+ // in it.
131
+ if (qrk->count == 0) {
132
+ if (qrk->lock == true)
133
+ return none;
134
+ const size_t size = sizeof(char) * (len + 1);
135
+ leaf_t *lf = wapiti_xmalloc(sizeof(leaf_t) + size);
136
+ memcpy(lf->key, key, size);
137
+ lf->id = 0;
138
+ qrk->root = qrk_lf2nd(lf);
139
+ qrk->leafs[0] = lf;
140
+ qrk->count = 1;
141
+ return 0;
142
+ }
143
+ // If the trie is not empty, we first go down the trie to the leaf like
144
+ // if we are searching for the key. When at leaf there is two case,
145
+ // either we have found our key or we have found another key with all
146
+ // its critical bit identical to our one. So we search for the first
147
+ // differing bit between them to know where we have to add the new node.
148
+ const node_t *nd = qrk->root;
149
+ while (!qrk_isleaf(nd)) {
150
+ const uint8_t chr = nd->pos < len ? raw[nd->pos] : 0;
151
+ const int side = ((chr | nd->byte) + 1) >> 8;
152
+ nd = nd->child[side];
153
+ }
154
+ const char *bst = qrk_nd2lf(nd)->key;
155
+ size_t pos;
156
+ for (pos = 0; pos < len; pos++)
157
+ if (key[pos] != bst[pos])
158
+ break;
159
+ uint8_t byte;
160
+ if (pos != len)
161
+ byte = key[pos] ^ bst[pos];
162
+ else if (bst[pos] != '\0')
163
+ byte = bst[pos];
164
+ else
165
+ return qrk_nd2lf(nd)->id;
166
+ if (qrk->lock == true)
167
+ return none;
168
+ // Now we known the two key are different and we know in which byte. It
169
+ // remain to build the mask for the new critical bit and build the new
170
+ // internal node and leaf.
171
+ while (byte & (byte - 1))
172
+ byte &= byte - 1;
173
+ byte ^= 255;
174
+ const uint8_t chr = bst[pos];
175
+ const int side = ((chr | byte) + 1) >> 8;
176
+ const size_t size = sizeof(char) * (len + 1);
177
+ node_t *nx = wapiti_xmalloc(sizeof(node_t));
178
+ leaf_t *lf = wapiti_xmalloc(sizeof(leaf_t) + size);
179
+ memcpy(lf->key, key, size);
180
+ lf->id = qrk->count++;
181
+ nx->pos = pos;
182
+ nx->byte = byte;
183
+ nx->child[1 - side] = qrk_lf2nd(lf);
184
+ if (lf->id == qrk->size) {
185
+ qrk->size *= 1.4;
186
+ const size_t size = sizeof(leaf_t *) * qrk->size;
187
+ qrk->leafs = wapiti_xrealloc(qrk->leafs, size);
188
+ }
189
+ qrk->leafs[lf->id] = lf;
190
+ // And last thing to do: inserting the new node in the trie. We have to
191
+ // walk down the trie again as we have to keep the ordering of nodes. So
192
+ // we search for the good position to insert it.
193
+ node_t **trg = &qrk->root;
194
+ while (true) {
195
+ node_t *nd = *trg;
196
+ if (qrk_isleaf(nd) || nd->pos > pos)
197
+ break;
198
+ if (nd->pos == pos && nd->byte > byte)
199
+ break;
200
+ const uint8_t chr = nd->pos < len ? raw[nd->pos] : 0;
201
+ const int side = ((chr | nd->byte) + 1) >> 8;
202
+ trg = &nd->child[side];
203
+ }
204
+ nx->child[side] = *trg;
205
+ *trg = nx;
206
+ return lf->id;
207
+ }
208
+
209
+ /* qrk_id2str:
210
+ * Retrieve the key associated to an identifier. The key is returned as a
211
+ * constant string that should not be modified or freed by the caller, it is
212
+ * a pointer to the internal copy of the key kept by the map object and
213
+ * remain valid only for the life time of the quark, a call to qrk_free will
214
+ * make this pointer invalid.
215
+ */
216
+ const char *qrk_id2str(const qrk_t *qrk, size_t id) {
217
+ if (id >= qrk->count)
218
+ fatal("invalid identifier");
219
+ return qrk->leafs[id]->key;
220
+ }
221
+
222
+ /* qrk_save:
223
+ * Save list of keys present in the map object in the id order to the given
224
+ * file. We put one key per line so, if no key contains a new line, the line
225
+ * number correspond to the id.
226
+ */
227
+ void qrk_save(const qrk_t *qrk, FILE *file) {
228
+ if (fprintf(file, "#qrk#%zu\n", (size_t)qrk->count) < 0)
229
+ pfatal("cannot write to file");
230
+ if (qrk->count == 0)
231
+ return;
232
+ for (uint64_t n = 0; n < qrk->count; n++)
233
+ ns_writestr(file, qrk->leafs[n]->key);
234
+ }
235
+
236
+ /* qrk_load:
237
+ * Load a list of key from the given file and add them to the map. Each lines
238
+ * of the file is taken as a single key and mapped to the next available id if
239
+ * not already present. If all keys are single lines and the given map is
240
+ * initilay empty, this will load a map exactly as saved by qrk_save.
241
+ */
242
+ void qrk_load(qrk_t *qrk, FILE *file) {
243
+ size_t cnt = 0;
244
+ if (fscanf(file, "#qrk#%zu\n", &cnt) != 1) {
245
+ if (ferror(file) != 0)
246
+ pfatal("cannot read from file");
247
+ pfatal("invalid format");
248
+ }
249
+ for (size_t n = 0; n < cnt; ++n) {
250
+ char *str = ns_readstr(file);
251
+ qrk_str2id(qrk, str);
252
+ free(str);
253
+ }
254
+ }
255
+
256
+ /* qrk_count:
257
+ * Return the number of mappings stored in the quark.
258
+ */
259
+ size_t qrk_count(const qrk_t *qrk) {
260
+ return qrk->count;
261
+ }
262
+
263
+ /* qrk_lock:
264
+ * Set the lock value of the quark and return the old one.
265
+ */
266
+ bool qrk_lock(qrk_t *qrk, bool lock) {
267
+ bool old = qrk->lock;
268
+ qrk->lock = lock;
269
+ return old;
270
+ }
271
+
272
+