wapiti 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. data/.autotest +13 -0
  2. data/.gitignore +5 -0
  3. data/.rspec +3 -0
  4. data/Gemfile +6 -0
  5. data/LICENSE +30 -0
  6. data/README.md +153 -0
  7. data/Rakefile +33 -0
  8. data/ext/wapiti/bcd.c +392 -0
  9. data/ext/wapiti/decoder.c +535 -0
  10. data/ext/wapiti/decoder.h +46 -0
  11. data/ext/wapiti/extconf.rb +8 -0
  12. data/ext/wapiti/gradient.c +818 -0
  13. data/ext/wapiti/gradient.h +81 -0
  14. data/ext/wapiti/lbfgs.c +294 -0
  15. data/ext/wapiti/model.c +296 -0
  16. data/ext/wapiti/model.h +100 -0
  17. data/ext/wapiti/native.c +1238 -0
  18. data/ext/wapiti/native.h +15 -0
  19. data/ext/wapiti/options.c +278 -0
  20. data/ext/wapiti/options.h +91 -0
  21. data/ext/wapiti/pattern.c +395 -0
  22. data/ext/wapiti/pattern.h +56 -0
  23. data/ext/wapiti/progress.c +167 -0
  24. data/ext/wapiti/progress.h +43 -0
  25. data/ext/wapiti/quark.c +272 -0
  26. data/ext/wapiti/quark.h +46 -0
  27. data/ext/wapiti/reader.c +553 -0
  28. data/ext/wapiti/reader.h +73 -0
  29. data/ext/wapiti/rprop.c +191 -0
  30. data/ext/wapiti/sequence.h +148 -0
  31. data/ext/wapiti/sgdl1.c +218 -0
  32. data/ext/wapiti/thread.c +171 -0
  33. data/ext/wapiti/thread.h +42 -0
  34. data/ext/wapiti/tools.c +202 -0
  35. data/ext/wapiti/tools.h +54 -0
  36. data/ext/wapiti/trainers.h +39 -0
  37. data/ext/wapiti/vmath.c +372 -0
  38. data/ext/wapiti/vmath.h +51 -0
  39. data/ext/wapiti/wapiti.c +288 -0
  40. data/ext/wapiti/wapiti.h +45 -0
  41. data/lib/wapiti.rb +30 -0
  42. data/lib/wapiti/errors.rb +17 -0
  43. data/lib/wapiti/model.rb +49 -0
  44. data/lib/wapiti/options.rb +113 -0
  45. data/lib/wapiti/utility.rb +15 -0
  46. data/lib/wapiti/version.rb +3 -0
  47. data/spec/fixtures/ch.mod +18550 -0
  48. data/spec/fixtures/chpattern.txt +52 -0
  49. data/spec/fixtures/chtest.txt +1973 -0
  50. data/spec/fixtures/chtrain.txt +19995 -0
  51. data/spec/fixtures/nppattern.txt +52 -0
  52. data/spec/fixtures/nptest.txt +1973 -0
  53. data/spec/fixtures/nptrain.txt +19995 -0
  54. data/spec/fixtures/pattern.txt +14 -0
  55. data/spec/fixtures/test.txt +60000 -0
  56. data/spec/fixtures/train.txt +1200 -0
  57. data/spec/spec_helper.rb +21 -0
  58. data/spec/wapiti/model_spec.rb +173 -0
  59. data/spec/wapiti/native_spec.rb +12 -0
  60. data/spec/wapiti/options_spec.rb +175 -0
  61. data/spec/wapiti/utility_spec.rb +22 -0
  62. data/wapiti.gemspec +35 -0
  63. metadata +178 -0
@@ -0,0 +1,56 @@
1
+ /*
2
+ * Wapiti - A linear-chain CRF tool
3
+ *
4
+ * Copyright (c) 2009-2011 CNRS
5
+ * All rights reserved.
6
+ *
7
+ * Redistribution and use in source and binary forms, with or without
8
+ * modification, are permitted provided that the following conditions are met:
9
+ * * Redistributions of source code must retain the above copyright
10
+ * notice, this list of conditions and the following disclaimer.
11
+ * * Redistributions in binary form must reproduce the above copyright
12
+ * notice, this list of conditions and the following disclaimer in the
13
+ * documentation and/or other materials provided with the distribution.
14
+ *
15
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25
+ * POSSIBILITY OF SUCH DAMAGE.
26
+ */
27
+
28
+ #ifndef pattern_h
29
+ #define pattern_h
30
+
31
+ #include <stdbool.h>
32
+
33
+ #include "sequence.h"
34
+
35
typedef struct pat_s pat_t;
typedef struct pat_item_s pat_item_t;

/* pat_item_t:
 *   One element of a compiled pattern. This header only fixes the member
 *   names and types; how each member is interpreted is decided by the
 *   pattern implementation (pat_comp / pat_exec).
 */
struct pat_item_s {
	char  type;
	bool  caps;
	char *value;
	bool  absolute;
	int   offset;
	int   column;
};

/* pat_t:
 *   A compiled pattern: a small header followed by a flexible array of items,
 *   so the whole pattern lives in a single allocation.
 *   NOTE(review): presumably `src` is the pattern source text and `nitems` the
 *   number of entries in `items` -- confirm against pat_comp.
 */
struct pat_s {
	char       *src;
	int         ntoks;
	int         nitems;
	pat_item_t  items[];
};
50
+
51
+ pat_t *pat_comp(char *p);
52
+ char *pat_exec(const pat_t *pat, const tok_t *tok, int at);
53
+ void pat_free(pat_t *pat);
54
+
55
+ #endif
56
+
@@ -0,0 +1,167 @@
1
+ /*
2
+ * Wapiti - A linear-chain CRF tool
3
+ *
4
+ * Copyright (c) 2009-2011 CNRS
5
+ * All rights reserved.
6
+ *
7
+ * Redistribution and use in source and binary forms, with or without
8
+ * modification, are permitted provided that the following conditions are met:
9
+ * * Redistributions of source code must retain the above copyright
10
+ * notice, this list of conditions and the following disclaimer.
11
+ * * Redistributions in binary form must reproduce the above copyright
12
+ * notice, this list of conditions and the following disclaimer in the
13
+ * documentation and/or other materials provided with the distribution.
14
+ *
15
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25
+ * POSSIBILITY OF SUCH DAMAGE.
26
+ */
27
+ #include <signal.h>
28
+ #include <stdbool.h>
29
+ #include <stddef.h>
30
+ #include <stdlib.h>
31
+ #include <stdio.h>
32
+
33
+ #include <unistd.h>
34
+ #include <sys/times.h>
35
+ #include <sys/resource.h>
36
+
37
+ #include "wapiti.h"
38
+ #include "decoder.h"
39
+ #include "model.h"
40
+ #include "options.h"
41
+ #include "progress.h"
42
+ #include "tools.h"
43
+
44
+ /*******************************************************************************
45
+ * User interaction during training
46
+ *
47
+ * Handle progress reporting during training and clean early stopping. Trainers
48
+ * have to call uit_progress at the end of each iteration; this will display
49
+ * various information for the user.
50
+ * Timing is also done here: an iteration is assumed to take all the time
51
+ * between two calls to the progress function, so evaluation on the devel data
52
+ * is included.
53
+ *
54
+ * This module sets up a signal handler for SIGINT. If this signal is caught,
55
+ * the uit_stop global variable is set to inform the trainer that it has to
56
+ * stop as early as possible, discarding the recent computations if they
57
+ * cannot be integrated very quickly. It must leave the model in a clean state.
58
+ * Any further signal will terminate the program. So it's simple:
59
+ * - 1 signal means "I can wait a little so try to stop as soon as possible
60
+ * but leave me a working model"
61
+ * - 2 signals mean "Stop immediately what you are doing, I can't wait and
62
+ * don't care about getting a working model"
63
+ ******************************************************************************/
64
+
65
+ /* uit_stop:
66
+ * Stop request flag shared with the trainers. It is reset to false by
67
+ * uit_setup and set to true by the SIGINT handler uit_signal; once it is
68
+ * true, uit_progress returns false so the trainer can stop in a clean state.
+ * NOTE(review): the flag is written from a signal handler but is a plain
+ * bool; ISO C only guarantees such writes for volatile sig_atomic_t or
+ * lock-free atomics. Changing the type also means changing the extern
+ * declaration seen by the trainers.
69
+ */
70
+ bool uit_stop = false;
71
+
72
+ /* uit_signal:
73
+ * Handler installed on SIGINT by uit_setup. It first restores the default
74
+ * disposition for the received signal, so that a second interrupt stops the
75
+ * program immediately, then raises uit_stop to ask the trainer to stop as
76
+ * soon as possible while leaving the model in a clean state. The default
77
+ * handler is reinstalled explicitly to cope with BSD systems.
78
+ */
79
+ static void uit_signal(int sig) {
80
+ signal(sig, SIG_DFL); // from now on this signal gets its default action
81
+ uit_stop = true; // checked by uit_progress before it returns
82
+ }
83
+
84
+ /* uit_setup:
85
+ * Install the signal handler for clean early stop from the user if possible
86
+ * and start the timer.
87
+ */
88
+ void uit_setup(mdl_t *mdl) {
89
+ uit_stop = false;
90
+ if (signal(SIGINT, uit_signal) == SIG_ERR)
91
+ warning("failed to set signal handler, no clean early stop");
92
+ times(&mdl->timer);
93
+ if (mdl->opt->stopwin != 0)
94
+ mdl->werr = wapiti_xmalloc(sizeof(double) * mdl->opt->stopwin);
95
+ mdl->wcnt = mdl->wpos = 0;
96
+ }
97
+
98
+ /* uit_cleanup:
99
+ * Remove the signal handler restoring the defaul behavior in case of
100
+ * interrupt.
101
+ */
102
+ void uit_cleanup(mdl_t *mdl) {
103
+ unused(mdl);
104
+ if (mdl->opt->stopwin != 0) {
105
+ free(mdl->werr);
106
+ mdl->werr = NULL;
107
+ }
108
+ signal(SIGINT, SIG_DFL);
109
+ }
110
+
111
+ /* uit_progress:
112
+ * Display a progress repport to the user consisting of some informations
113
+ * provided by the trainer: iteration count and objective function value, and
114
+ * some informations computed here on the current model performances.
115
+ * This function return true if the trainer have to keep training the model
116
+ * and false if he must stop, so this is were we will implement the trainer
117
+ * independant stoping criterion.
118
+ */
119
+ bool uit_progress(mdl_t *mdl, int it, double obj) {
120
+ // First we just compute the error rate on devel or train data
121
+ double te, se;
122
+ tag_eval(mdl, &te, &se);
123
+ // Next, we compute the number of active features
124
+ size_t act = 0;
125
+ for (size_t f = 0; f < mdl->nftr; f++)
126
+ if (mdl->theta[f] != 0.0)
127
+ act++;
128
+ // Compute timings. As some training algorithms are multi-threaded, we
129
+ // cannot use ansi/c function and must rely on posix one to sum time
130
+ // spent in main thread and in child ones.
131
+ tms_t now; times(&now);
132
+ double tm = (now.tms_utime - mdl->timer.tms_utime )
133
+ + (now.tms_cutime - mdl->timer.tms_cutime);
134
+ tm /= sysconf(_SC_CLK_TCK);
135
+ mdl->total += tm;
136
+ mdl->timer = now;
137
+ // And display progress report
138
+ info(" [%4d]", it);
139
+ info(obj >= 0.0 ? " obj=%-10.2f" : " obj=NA", obj);
140
+ info(" act=%-8zu", act);
141
+ info(" err=%5.2f%%/%5.2f%%", te, se);
142
+ info(" time=%.2fs/%.2fs", tm, mdl->total);
143
+ info("\n");
144
+ // If requested, check the error rate stoping criterion. We check if the
145
+ // error rate is stable enought over a few iterations.
146
+ bool res = true;
147
+ if (mdl->opt->stopwin != 0) {
148
+ mdl->werr[mdl->wpos] = te;
149
+ mdl->wpos = (mdl->wpos + 1) % mdl->opt->stopwin;
150
+ mdl->wcnt++;
151
+ if (mdl->wcnt >= mdl->opt->stopwin) {
152
+ double emin = 200.0, emax = -100.0;
153
+ for (int i = 0; i < mdl->opt->stopwin; i++) {
154
+ emin = min(emin, mdl->werr[i]);
155
+ emax = max(emax, mdl->werr[i]);
156
+ }
157
+ if (emax - emin < mdl->opt->stopeps)
158
+ res = false;
159
+ }
160
+ }
161
+ // And return
162
+ if (uit_stop)
163
+ return false;
164
+ return res;
165
+ }
166
+
167
+
@@ -0,0 +1,43 @@
1
+ /*
2
+ * Wapiti - A linear-chain CRF tool
3
+ *
4
+ * Copyright (c) 2009-2011 CNRS
5
+ * All rights reserved.
6
+ *
7
+ * Redistribution and use in source and binary forms, with or without
8
+ * modification, are permitted provided that the following conditions are met:
9
+ * * Redistributions of source code must retain the above copyright
10
+ * notice, this list of conditions and the following disclaimer.
11
+ * * Redistributions in binary form must reproduce the above copyright
12
+ * notice, this list of conditions and the following disclaimer in the
13
+ * documentation and/or other materials provided with the distribution.
14
+ *
15
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25
+ * POSSIBILITY OF SUCH DAMAGE.
26
+ */
27
+
28
+ #ifndef progress_h
29
+ #define progress_h
30
+
31
+ #include <stdbool.h>
32
+
33
+ #include "wapiti.h"
34
+ #include "model.h"
35
+
36
+ extern bool uit_stop;
37
+
38
+ void uit_setup(mdl_t *mdl);
39
+ void uit_cleanup(mdl_t *mdl);
40
+ bool uit_progress(mdl_t *mdl, int it, double obj);
41
+
42
+ #endif
43
+
@@ -0,0 +1,272 @@
1
+ /*
2
+ * Wapiti - A linear-chain CRF tool
3
+ *
4
+ * Copyright (c) 2009-2011 CNRS
5
+ * All rights reserved.
6
+ *
7
+ * Redistribution and use in source and binary forms, with or without
8
+ * modification, are permitted provided that the following conditions are met:
9
+ * * Redistributions of source code must retain the above copyright
10
+ * notice, this list of conditions and the following disclaimer.
11
+ * * Redistributions in binary form must reproduce the above copyright
12
+ * notice, this list of conditions and the following disclaimer in the
13
+ * documentation and/or other materials provided with the distribution.
14
+ *
15
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25
+ * POSSIBILITY OF SUCH DAMAGE.
26
+ */
27
+ #include <stdbool.h>
28
+ #include <stddef.h>
29
+ #include <stdlib.h>
30
+ #include <stdio.h>
31
+ #include <stdint.h>
32
+ #include <string.h>
33
+
34
+ #include "quark.h"
35
+ #include "tools.h"
36
+
37
+ /******************************************************************************
38
+ * Map object
39
+ *
40
+ * Implement the quark object for mapping strings to identifiers through a
40
+ * crit-bit tree (also known as a PATRICIA trie). In fact it only stores a
41
+ * compressed version of the trie to reduce the memory footprint. The trick of
42
+ * using the lowest bit of a reference to differentiate between nodes and leaves
43
+ * comes from Daniel J. Bernstein's implementation of crit-bit trees, which can
44
+ * be found on his web site.
46
+ * [1] Morrison, Donald R. ; PATRICIA-Practical Algorithm To Retrieve
47
+ * Information Coded in Alphanumeric, Journal of the ACM 15 (4): pp. 514--534,
48
+ * 1968. DOI:10.1145/321479.321481
49
+ *
50
+ * This code is copyright 2002-2011 Thomas Lavergne and licenced under the BSD
51
+ * Licence like the remaining of Wapiti.
52
+ ******************************************************************************/
53
+
54
typedef struct node_s node_t;
typedef struct leaf_s leaf_t;

/* node_t:
 *   Internal node of the trie. `pos` is the index of the key byte tested at
 *   this node and `byte` the mask applied to it: the child followed is
 *   ((chr | byte) + 1) >> 8. A child slot holds either another node or a leaf
 *   pointer tagged with qrk_lf2nd.
 */
struct node_s {
	node_t   *child[2];
	uint32_t  pos;
	uint8_t   byte;
};

/* leaf_t:
 *   A stored key: its identifier followed by the NUL-terminated key itself,
 *   kept in the same allocation through a flexible array member.
 */
struct leaf_s {
	uint64_t id;
	char     key[];
};

/* qrk_s:
 *   The quark itself: the trie root, the table giving the leaf of each
 *   identifier (`size` slots allocated, `count` in use), and the lock flag
 *   which, when set, makes qrk_str2id refuse to insert unknown keys.
 */
struct qrk_s {
	node_t    *root;
	leaf_t   **leafs;
	bool       lock;
	uint64_t   count;
	uint64_t   size;
};
70
+
71
+ #define qrk_none ((uint64_t)-1) /* all-ones identifier. NOTE(review): not referenced in this file, which returns `none` instead -- confirm both denote the same value */
72
+
73
+ #define qrk_lf2nd(lf) ((node_t *)((intptr_t)(lf) | 1)) /* tag a leaf pointer (set bit 0) so it can be stored in a node_t * slot */
74
+ #define qrk_nd2lf(nd) ((leaf_t *)((intptr_t)(nd) & ~1)) /* clear the tag bit to recover the leaf pointer */
75
+ #define qrk_isleaf(nd) ((intptr_t)(nd) & 1) /* non-zero when the reference is a tagged leaf rather than a node */
76
+
77
+ /* qrk_new:
78
+ * This initialize the object for holding a new empty trie, with some pre-
79
+ * allocations. The returned object must be freed with a call to qrk_free when
80
+ * not needed anymore.
81
+ */
82
+ qrk_t *qrk_new(void) {
83
+ const uint64_t size = 128;
84
+ qrk_t *qrk = wapiti_xmalloc(sizeof(qrk_t));
85
+ qrk->root = NULL;
86
+ qrk->count = 0;
87
+ qrk->lock = false;
88
+ qrk->size = size;
89
+ qrk->leafs = wapiti_xmalloc(sizeof(leaf_t) * size);
90
+ return qrk;
91
+ }
92
+
93
+ /* qrk_free:
94
+ * Release all the memory used by a qrk_t object allocated with qrk_new. This
95
+ * will release all key string stored internally so all key returned by
96
+ * qrk_unmap become invalid and must not be used anymore.
97
+ */
98
+ void qrk_free(qrk_t *qrk) {
99
+ const size_t stkmax = 1024;
100
+ if (qrk->count != 0) {
101
+ node_t *stk[stkmax];
102
+ int cnt = 0;
103
+ stk[cnt++] = qrk->root;
104
+ while (cnt != 0) {
105
+ node_t *nd = stk[--cnt];
106
+ if (qrk_isleaf(nd)) {
107
+ free(qrk_nd2lf(nd));
108
+ continue;
109
+ }
110
+ stk[cnt++] = nd->child[0];
111
+ stk[cnt++] = nd->child[1];
112
+ free(nd);
113
+ }
114
+ }
115
+ free(qrk->leafs);
116
+ free(qrk);
117
+ }
118
+
119
+ /* qrk_str2id:
120
+ * Map a key to a unique identifier: return the id of a key already present,
121
+ * otherwise give the key the next id (the current count) and insert the pair.
122
+ * On a locked quark unknown keys are not inserted and `none` is returned.
123
+ * Not thread safe: do not call on the same map concurrently without locking.
124
+ */
125
+ size_t qrk_str2id(qrk_t *qrk, const char *key) {
126
+ const uint8_t *raw = (void *)key;
127
+ const size_t len = strlen(key);
128
+ // Handle the empty trie first so that the code below can assume a well
129
+ // formed trie without NULL pointers: the first key becomes the root leaf
130
+ // and gets identifier 0.
131
+ if (qrk->count == 0) {
132
+ if (qrk->lock == true)
133
+ return none;
134
+ const size_t size = sizeof(char) * (len + 1);
135
+ leaf_t *lf = wapiti_xmalloc(sizeof(leaf_t) + size);
136
+ memcpy(lf->key, key, size);
137
+ lf->id = 0;
138
+ qrk->root = qrk_lf2nd(lf);
139
+ qrk->leafs[0] = lf;
140
+ qrk->count = 1;
141
+ return 0;
142
+ }
143
+ // Otherwise walk down the trie to a leaf as if searching for the key. The
144
+ // leaf reached holds either our key or another key that agrees with it on
145
+ // every critical bit tested along the path. Compare the two keys to find
146
+ // the first differing byte, which tells where the new node has to go; if
147
+ // there is none, the key is already known and its id is returned.
148
+ const node_t *nd = qrk->root;
149
+ while (!qrk_isleaf(nd)) {
150
+ const uint8_t chr = nd->pos < len ? raw[nd->pos] : 0; // bytes past the end of the key count as 0
151
+ const int side = ((chr | nd->byte) + 1) >> 8; // 1 only when chr has the bit left clear in the mask
152
+ nd = nd->child[side];
153
+ }
154
+ const char *bst = qrk_nd2lf(nd)->key;
155
+ size_t pos;
156
+ for (pos = 0; pos < len; pos++)
157
+ if (key[pos] != bst[pos])
158
+ break;
159
+ uint8_t byte;
160
+ if (pos != len)
161
+ byte = key[pos] ^ bst[pos];
162
+ else if (bst[pos] != '\0') // our key is a strict prefix of the stored one
163
+ byte = bst[pos];
164
+ else
165
+ return qrk_nd2lf(nd)->id;
166
+ if (qrk->lock == true)
167
+ return none;
168
+ // The keys differ and we know in which byte. Keep only the highest set bit
169
+ // of the difference and invert it to get the mask of the new critical bit,
170
+ // then build the new internal node and leaf.
171
+ while (byte & (byte - 1))
172
+ byte &= byte - 1;
173
+ byte ^= 255;
174
+ const uint8_t chr = bst[pos];
175
+ const int side = ((chr | byte) + 1) >> 8; // side of the existing key; the new leaf takes the other one
176
+ const size_t size = sizeof(char) * (len + 1);
177
+ node_t *nx = wapiti_xmalloc(sizeof(node_t));
178
+ leaf_t *lf = wapiti_xmalloc(sizeof(leaf_t) + size);
179
+ memcpy(lf->key, key, size);
180
+ lf->id = qrk->count++;
181
+ nx->pos = pos;
182
+ nx->byte = byte;
183
+ nx->child[1 - side] = qrk_lf2nd(lf);
184
+ if (lf->id == qrk->size) { // identifier table is full
185
+ qrk->size *= 1.4;
186
+ const size_t size = sizeof(leaf_t *) * qrk->size;
187
+ qrk->leafs = wapiti_xrealloc(qrk->leafs, size);
188
+ }
189
+ qrk->leafs[lf->id] = lf;
190
+ // Last step: link the new node in the trie. Nodes are kept ordered along a
191
+ // path, so walk down again from the root and stop at the first link whose
192
+ // node tests a later position (or the same byte with a greater mask).
193
+ node_t **trg = &qrk->root;
194
+ while (true) {
195
+ node_t *nd = *trg;
196
+ if (qrk_isleaf(nd) || nd->pos > pos)
197
+ break;
198
+ if (nd->pos == pos && nd->byte > byte)
199
+ break;
200
+ const uint8_t chr = nd->pos < len ? raw[nd->pos] : 0;
201
+ const int side = ((chr | nd->byte) + 1) >> 8;
202
+ trg = &nd->child[side];
203
+ }
204
+ nx->child[side] = *trg; // the displaced subtree goes on the existing key's side
205
+ *trg = nx;
206
+ return lf->id;
207
+ }
208
+
209
+ /* qrk_id2str:
210
+ * Retrieve the key associated to an identifier. The key is returned as a
211
+ * constant string that should not be modified or freed by the caller, it is
212
+ * a pointer to the internal copy of the key kept by the map object and
213
+ * remain valid only for the life time of the quark, a call to qrk_free will
214
+ * make this pointer invalid.
215
+ */
216
+ const char *qrk_id2str(const qrk_t *qrk, size_t id) {
217
+ if (id >= qrk->count)
218
+ fatal("invalid identifier");
219
+ return qrk->leafs[id]->key;
220
+ }
221
+
222
+ /* qrk_save:
223
+ * Save list of keys present in the map object in the id order to the given
224
+ * file. We put one key per line so, if no key contains a new line, the line
225
+ * number correspond to the id.
226
+ */
227
+ void qrk_save(const qrk_t *qrk, FILE *file) {
228
+ if (fprintf(file, "#qrk#%zu\n", (size_t)qrk->count) < 0)
229
+ pfatal("cannot write to file");
230
+ if (qrk->count == 0)
231
+ return;
232
+ for (uint64_t n = 0; n < qrk->count; n++)
233
+ ns_writestr(file, qrk->leafs[n]->key);
234
+ }
235
+
236
+ /* qrk_load:
237
+ * Load a list of key from the given file and add them to the map. Each lines
238
+ * of the file is taken as a single key and mapped to the next available id if
239
+ * not already present. If all keys are single lines and the given map is
240
+ * initilay empty, this will load a map exactly as saved by qrk_save.
241
+ */
242
+ void qrk_load(qrk_t *qrk, FILE *file) {
243
+ size_t cnt = 0;
244
+ if (fscanf(file, "#qrk#%zu\n", &cnt) != 1) {
245
+ if (ferror(file) != 0)
246
+ pfatal("cannot read from file");
247
+ pfatal("invalid format");
248
+ }
249
+ for (size_t n = 0; n < cnt; ++n) {
250
+ char *str = ns_readstr(file);
251
+ qrk_str2id(qrk, str);
252
+ free(str);
253
+ }
254
+ }
255
+
256
+ /* qrk_count:
257
+ * Return the number of mappings stored in the quark.
258
+ */
259
+ size_t qrk_count(const qrk_t *qrk) {
260
+ return qrk->count;
261
+ }
262
+
263
+ /* qrk_lock:
264
+ * Set the lock value of the quark and return the old one.
265
+ */
266
+ bool qrk_lock(qrk_t *qrk, bool lock) {
267
+ bool old = qrk->lock;
268
+ qrk->lock = lock;
269
+ return old;
270
+ }
271
+
272
+