melisa 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (88) hide show
  1. data/README.md +11 -0
  2. data/ext/marisa/bindings/marisa-swig.cxx +253 -0
  3. data/ext/marisa/bindings/marisa-swig.h +183 -0
  4. data/ext/marisa/bindings/perl/marisa-swig.cxx +253 -0
  5. data/ext/marisa/bindings/perl/marisa-swig.h +183 -0
  6. data/ext/marisa/bindings/perl/marisa-swig_wrap.cxx +5160 -0
  7. data/ext/marisa/bindings/python/marisa-swig.cxx +253 -0
  8. data/ext/marisa/bindings/python/marisa-swig.h +183 -0
  9. data/ext/marisa/bindings/python/marisa-swig_wrap.cxx +6090 -0
  10. data/ext/marisa/bindings/ruby/extconf.rb +5 -0
  11. data/ext/marisa/bindings/ruby/marisa-swig.cxx +253 -0
  12. data/ext/marisa/bindings/ruby/marisa-swig.h +183 -0
  13. data/ext/marisa/bindings/ruby/marisa-swig_wrap.cxx +4708 -0
  14. data/ext/marisa/lib/marisa.h +14 -0
  15. data/ext/marisa/lib/marisa/agent.cc +51 -0
  16. data/ext/marisa/lib/marisa/agent.h +73 -0
  17. data/ext/marisa/lib/marisa/base.h +193 -0
  18. data/ext/marisa/lib/marisa/exception.h +82 -0
  19. data/ext/marisa/lib/marisa/grimoire/algorithm.h +26 -0
  20. data/ext/marisa/lib/marisa/grimoire/algorithm/sort.h +196 -0
  21. data/ext/marisa/lib/marisa/grimoire/intrin.h +115 -0
  22. data/ext/marisa/lib/marisa/grimoire/io.h +18 -0
  23. data/ext/marisa/lib/marisa/grimoire/io/mapper.cc +163 -0
  24. data/ext/marisa/lib/marisa/grimoire/io/mapper.h +67 -0
  25. data/ext/marisa/lib/marisa/grimoire/io/reader.cc +147 -0
  26. data/ext/marisa/lib/marisa/grimoire/io/reader.h +66 -0
  27. data/ext/marisa/lib/marisa/grimoire/io/writer.cc +148 -0
  28. data/ext/marisa/lib/marisa/grimoire/io/writer.h +65 -0
  29. data/ext/marisa/lib/marisa/grimoire/trie.h +16 -0
  30. data/ext/marisa/lib/marisa/grimoire/trie/cache.h +81 -0
  31. data/ext/marisa/lib/marisa/grimoire/trie/config.h +155 -0
  32. data/ext/marisa/lib/marisa/grimoire/trie/entry.h +82 -0
  33. data/ext/marisa/lib/marisa/grimoire/trie/header.h +61 -0
  34. data/ext/marisa/lib/marisa/grimoire/trie/history.h +65 -0
  35. data/ext/marisa/lib/marisa/grimoire/trie/key.h +228 -0
  36. data/ext/marisa/lib/marisa/grimoire/trie/louds-trie.cc +876 -0
  37. data/ext/marisa/lib/marisa/grimoire/trie/louds-trie.h +134 -0
  38. data/ext/marisa/lib/marisa/grimoire/trie/range.h +115 -0
  39. data/ext/marisa/lib/marisa/grimoire/trie/state.h +117 -0
  40. data/ext/marisa/lib/marisa/grimoire/trie/tail.cc +218 -0
  41. data/ext/marisa/lib/marisa/grimoire/trie/tail.h +72 -0
  42. data/ext/marisa/lib/marisa/grimoire/vector.h +18 -0
  43. data/ext/marisa/lib/marisa/grimoire/vector/bit-vector.cc +826 -0
  44. data/ext/marisa/lib/marisa/grimoire/vector/bit-vector.h +179 -0
  45. data/ext/marisa/lib/marisa/grimoire/vector/flat-vector.h +205 -0
  46. data/ext/marisa/lib/marisa/grimoire/vector/pop-count.h +110 -0
  47. data/ext/marisa/lib/marisa/grimoire/vector/rank-index.h +82 -0
  48. data/ext/marisa/lib/marisa/grimoire/vector/vector.h +256 -0
  49. data/ext/marisa/lib/marisa/iostream.h +18 -0
  50. data/ext/marisa/lib/marisa/key.h +85 -0
  51. data/ext/marisa/lib/marisa/keyset.cc +181 -0
  52. data/ext/marisa/lib/marisa/keyset.h +80 -0
  53. data/ext/marisa/lib/marisa/query.h +71 -0
  54. data/ext/marisa/lib/marisa/scoped-array.h +48 -0
  55. data/ext/marisa/lib/marisa/scoped-ptr.h +52 -0
  56. data/ext/marisa/lib/marisa/stdio.h +15 -0
  57. data/ext/marisa/lib/marisa/trie.cc +249 -0
  58. data/ext/marisa/lib/marisa/trie.h +64 -0
  59. data/ext/marisa/tests/base-test.cc +309 -0
  60. data/ext/marisa/tests/io-test.cc +252 -0
  61. data/ext/marisa/tests/marisa-assert.h +26 -0
  62. data/ext/marisa/tests/marisa-test.cc +388 -0
  63. data/ext/marisa/tests/trie-test.cc +507 -0
  64. data/ext/marisa/tests/vector-test.cc +466 -0
  65. data/ext/marisa/tools/cmdopt.cc +298 -0
  66. data/ext/marisa/tools/cmdopt.h +58 -0
  67. data/ext/marisa/tools/marisa-benchmark.cc +418 -0
  68. data/ext/marisa/tools/marisa-build.cc +206 -0
  69. data/ext/marisa/tools/marisa-common-prefix-search.cc +143 -0
  70. data/ext/marisa/tools/marisa-dump.cc +151 -0
  71. data/ext/marisa/tools/marisa-lookup.cc +110 -0
  72. data/ext/marisa/tools/marisa-predictive-search.cc +143 -0
  73. data/ext/marisa/tools/marisa-reverse-lookup.cc +110 -0
  74. data/lib/melisa.rb +7 -0
  75. data/lib/melisa/base_config_flags.rb +76 -0
  76. data/lib/melisa/bytes_trie.rb +55 -0
  77. data/lib/melisa/int_trie.rb +14 -0
  78. data/lib/melisa/search.rb +55 -0
  79. data/lib/melisa/trie.rb +96 -0
  80. data/lib/melisa/version.rb +3 -0
  81. data/melisa.gemspec +36 -0
  82. data/spec/base_config_flags_spec.rb +73 -0
  83. data/spec/bytes_trie_spec.rb +16 -0
  84. data/spec/int_trie_spec.rb +16 -0
  85. data/spec/search_spec.rb +29 -0
  86. data/spec/spec_helper.rb +1 -0
  87. data/spec/trie_spec.rb +30 -0
  88. metadata +207 -0
@@ -0,0 +1,26 @@
1
+ #ifndef MARISA_ASSERT_H_
2
+ #define MARISA_ASSERT_H_
3
+
4
+ #include <iostream>
5
+ #include <cstdlib>
6
+ // Evaluates cond; if it is false, prints the line number and the condition text to std::cout and calls std::exit(-1).
7
+ #define ASSERT(cond) (void)((!!(cond)) || \
8
+ ((std::cout << __LINE__ << ": Assertion `" << #cond << "' failed." \
9
+ << std::endl), std::exit(-1), 0))
10
+
11
+ #define EXCEPT(code, expected_error_code) try { \
12
+ code; \
13
+ std::cout << __LINE__ << ": Exception `" << #code << "' failed." \
14
+ << std::endl; \
15
+ std::exit(-1); \
16
+ } catch (const marisa::Exception &ex) { \
17
+ ASSERT(ex.error_code() == expected_error_code); \
18
+ }
19
+ // Prints "<file>:<line>: <function>(): " to std::cout, without a newline, at the start of a test.
20
+ #define TEST_START() \
21
+ (std::cout << __FILE__ << ":" << __LINE__ << ": " << __FUNCTION__ << "(): ")
22
+ // Prints "ok" followed by std::endl to std::cout; used as the last statement of a test.
23
+ #define TEST_END() \
24
+ (std::cout << "ok" << std::endl)
25
+
26
+ #endif // MARISA_ASSERT_H_
@@ -0,0 +1,388 @@
1
+ #include <cstdlib>
2
+ #include <cstring>
3
+ #include <ctime>
4
+ #include <sstream>
5
+
6
+ #include <marisa.h>
7
+
8
+ #include "marisa-assert.h"
9
+
10
+ namespace {
11
+ // Checks that an unbuilt Trie throws MARISA_STATE_ERROR from its I/O, search and accessor calls, then checks a trie built from an empty keyset and one built from a keyset holding only "".
12
+ void TestEmptyTrie() {
13
+ TEST_START();
14
+
15
+ marisa::Trie trie;
16
+ // Before build(): every save/write path is expected to throw MARISA_STATE_ERROR.
17
+ EXCEPT(trie.save("marisa-test.dat"), MARISA_STATE_ERROR);
18
+ #ifdef _MSC_VER
19
+ EXCEPT(trie.write(::_fileno(stdout)), MARISA_STATE_ERROR);
20
+ #else // _MSC_VER
21
+ EXCEPT(trie.write(::fileno(stdout)), MARISA_STATE_ERROR);
22
+ #endif // _MSC_VER
23
+ EXCEPT(std::cout << trie, MARISA_STATE_ERROR);
24
+ EXCEPT(marisa::fwrite(stdout, trie), MARISA_STATE_ERROR);
25
+
26
+ marisa::Agent agent;
27
+ // Searches on the unbuilt trie are expected to throw the same error.
28
+ EXCEPT(trie.lookup(agent), MARISA_STATE_ERROR);
29
+ EXCEPT(trie.reverse_lookup(agent), MARISA_STATE_ERROR);
30
+ EXCEPT(trie.common_prefix_search(agent), MARISA_STATE_ERROR);
31
+ EXCEPT(trie.predictive_search(agent), MARISA_STATE_ERROR);
32
+ // So are the count, configuration and size accessors.
33
+ EXCEPT(trie.num_tries(), MARISA_STATE_ERROR);
34
+ EXCEPT(trie.num_keys(), MARISA_STATE_ERROR);
35
+ EXCEPT(trie.num_nodes(), MARISA_STATE_ERROR);
36
+
37
+ EXCEPT(trie.tail_mode(), MARISA_STATE_ERROR);
38
+ EXCEPT(trie.node_order(), MARISA_STATE_ERROR);
39
+
40
+ EXCEPT(trie.empty(), MARISA_STATE_ERROR);
41
+ EXCEPT(trie.size(), MARISA_STATE_ERROR);
42
+ EXCEPT(trie.total_size(), MARISA_STATE_ERROR);
43
+ EXCEPT(trie.io_size(), MARISA_STATE_ERROR);
44
+ // Build from an empty keyset: calls stop throwing MARISA_STATE_ERROR, but the trie holds no keys.
45
+ marisa::Keyset keyset;
46
+ trie.build(keyset);
47
+ // With the default-constructed agent nothing matches; reverse_lookup is expected to throw MARISA_BOUND_ERROR.
48
+ ASSERT(!trie.lookup(agent));
49
+ EXCEPT(trie.reverse_lookup(agent), MARISA_BOUND_ERROR);
50
+ ASSERT(!trie.common_prefix_search(agent));
51
+ ASSERT(!trie.predictive_search(agent));
52
+
53
+ ASSERT(trie.num_tries() == 1);
54
+ ASSERT(trie.num_keys() == 0);
55
+ ASSERT(trie.num_nodes() == 1);
56
+
57
+ ASSERT(trie.tail_mode() == MARISA_DEFAULT_TAIL);
58
+ ASSERT(trie.node_order() == MARISA_DEFAULT_ORDER);
59
+
60
+ ASSERT(trie.empty());
61
+ ASSERT(trie.size() == 0);
62
+ ASSERT(trie.total_size() != 0);
63
+ ASSERT(trie.io_size() != 0);
64
+ // Rebuild with the empty string as the only key.
65
+ keyset.push_back("");
66
+ trie.build(keyset);
67
+ // Each incremental search is expected to succeed once and then return false.
68
+ ASSERT(trie.lookup(agent));
69
+ trie.reverse_lookup(agent);
70
+ ASSERT(trie.common_prefix_search(agent));
71
+ ASSERT(!trie.common_prefix_search(agent));
72
+ ASSERT(trie.predictive_search(agent));
73
+ ASSERT(!trie.predictive_search(agent));
74
+
75
+ ASSERT(trie.num_keys() == 1);
76
+ ASSERT(trie.num_nodes() == 1);
77
+
78
+ ASSERT(!trie.empty());
79
+ ASSERT(trie.size() == 1);
80
+ ASSERT(trie.total_size() != 0);
81
+ ASSERT(trie.io_size() != 0);
82
+
83
+ TEST_END();
84
+ }
85
+ // Builds a trie from five pushed keys ("check" is pushed twice) and checks counts, assigned ids, lookups and incremental searches, first with flags 1 and then with 1 | MARISA_LABEL_ORDER.
86
+ void TestTinyTrie() {
87
+ TEST_START();
88
+
89
+ marisa::Keyset keyset;
90
+ keyset.push_back("bach");
91
+ keyset.push_back("bet");
92
+ keyset.push_back("chat");
93
+ keyset.push_back("check");
94
+ keyset.push_back("check");
95
+ // Build with flag value 1; num_tries() is asserted to be 1 and num_keys() to be 4 below.
96
+ marisa::Trie trie;
97
+ trie.build(keyset, 1);
98
+
99
+ ASSERT(trie.num_tries() == 1);
100
+ ASSERT(trie.num_keys() == 4);
101
+ ASSERT(trie.num_nodes() == 7);
102
+
103
+ ASSERT(trie.tail_mode() == MARISA_DEFAULT_TAIL);
104
+ ASSERT(trie.node_order() == MARISA_DEFAULT_ORDER);
105
+ // Ids read back from the keyset after build(); both "check" entries are expected to share id 0.
106
+ ASSERT(keyset[0].id() == 2);
107
+ ASSERT(keyset[1].id() == 3);
108
+ ASSERT(keyset[2].id() == 1);
109
+ ASSERT(keyset[3].id() == 0);
110
+ ASSERT(keyset[4].id() == 0);
111
+ // Round trip for every key: key -> id via lookup(), then id -> key bytes via reverse_lookup().
112
+ marisa::Agent agent;
113
+ for (std::size_t i = 0; i < keyset.size(); ++i) {
114
+ agent.set_query(keyset[i].ptr(), keyset[i].length());
115
+ ASSERT(trie.lookup(agent));
116
+ ASSERT(agent.key().id() == keyset[i].id());
117
+
118
+ agent.set_query(keyset[i].id());
119
+ trie.reverse_lookup(agent);
120
+ ASSERT(agent.key().length() == keyset[i].length());
121
+ ASSERT(std::memcmp(agent.key().ptr(), keyset[i].ptr(),
122
+ agent.key().length()) == 0);
123
+ }
124
+ // common_prefix_search: "be" and "beX" are expected to find nothing; "bet" and "betX" exactly one result each.
125
+ agent.set_query("be");
126
+ ASSERT(!trie.common_prefix_search(agent));
127
+ agent.set_query("beX");
128
+ ASSERT(!trie.common_prefix_search(agent));
129
+ agent.set_query("bet");
130
+ ASSERT(trie.common_prefix_search(agent));
131
+ ASSERT(!trie.common_prefix_search(agent));
132
+ agent.set_query("betX");
133
+ ASSERT(trie.common_prefix_search(agent));
134
+ ASSERT(!trie.common_prefix_search(agent));
135
+ // predictive_search: repeated calls on the same agent yield successive matches until false is returned.
136
+ agent.set_query("chatX");
137
+ ASSERT(!trie.predictive_search(agent));
138
+ agent.set_query("chat");
139
+ ASSERT(trie.predictive_search(agent));
140
+ ASSERT(agent.key().length() == 4);
141
+ ASSERT(!trie.predictive_search(agent));
142
+
143
+ agent.set_query("cha");
144
+ ASSERT(trie.predictive_search(agent));
145
+ ASSERT(agent.key().length() == 4);
146
+ ASSERT(!trie.predictive_search(agent));
147
+ // For "c" and "ch" the expected result order is "check" first, then "chat".
148
+ agent.set_query("c");
149
+ ASSERT(trie.predictive_search(agent));
150
+ ASSERT(agent.key().length() == 5);
151
+ ASSERT(std::memcmp(agent.key().ptr(), "check", 5) == 0);
152
+ ASSERT(trie.predictive_search(agent));
153
+ ASSERT(agent.key().length() == 4);
154
+ ASSERT(std::memcmp(agent.key().ptr(), "chat", 4) == 0);
155
+ ASSERT(!trie.predictive_search(agent));
156
+
157
+ agent.set_query("ch");
158
+ ASSERT(trie.predictive_search(agent));
159
+ ASSERT(agent.key().length() == 5);
160
+ ASSERT(std::memcmp(agent.key().ptr(), "check", 5) == 0);
161
+ ASSERT(trie.predictive_search(agent));
162
+ ASSERT(agent.key().length() == 4);
163
+ ASSERT(std::memcmp(agent.key().ptr(), "chat", 4) == 0);
164
+ ASSERT(!trie.predictive_search(agent));
165
+ // Rebuild the same keyset with MARISA_LABEL_ORDER and re-check counts, ids and round trips.
166
+ trie.build(keyset, 1 | MARISA_LABEL_ORDER);
167
+
168
+ ASSERT(trie.num_tries() == 1);
169
+ ASSERT(trie.num_keys() == 4);
170
+ ASSERT(trie.num_nodes() == 7);
171
+
172
+ ASSERT(trie.tail_mode() == MARISA_DEFAULT_TAIL);
173
+ ASSERT(trie.node_order() == MARISA_LABEL_ORDER);
174
+
175
+ ASSERT(keyset[0].id() == 0);
176
+ ASSERT(keyset[1].id() == 1);
177
+ ASSERT(keyset[2].id() == 2);
178
+ ASSERT(keyset[3].id() == 3);
179
+ ASSERT(keyset[4].id() == 3);
180
+
181
+ for (std::size_t i = 0; i < keyset.size(); ++i) {
182
+ agent.set_query(keyset[i].ptr(), keyset[i].length());
183
+ ASSERT(trie.lookup(agent));
184
+ ASSERT(agent.key().id() == keyset[i].id());
185
+
186
+ agent.set_query(keyset[i].id());
187
+ trie.reverse_lookup(agent);
188
+ ASSERT(agent.key().length() == keyset[i].length());
189
+ ASSERT(std::memcmp(agent.key().ptr(), keyset[i].ptr(),
190
+ agent.key().length()) == 0);
191
+ }
192
+ // With an empty query, predictive_search is expected to return trie.size() keys whose ids are 0, 1, 2, ... in sequence.
193
+ agent.set_query("");
194
+ for (std::size_t i = 0; i < trie.size(); ++i) {
195
+ ASSERT(trie.predictive_search(agent));
196
+ ASSERT(agent.key().id() == i);
197
+ }
198
+ ASSERT(!trie.predictive_search(agent));
199
+
200
+ TEST_END();
201
+ }
202
+ // Appends num_keys random keys of length 0..15 to *keyset; each byte is rand() % 10, shifted by '0' into the digits '0'..'9' when tail_mode is MARISA_TEXT_TAIL.
203
+ void MakeKeyset(std::size_t num_keys, marisa::TailMode tail_mode,
204
+ marisa::Keyset *keyset) {
205
+ char key_buf[16];
206
+ for (std::size_t i = 0; i < num_keys; ++i) {
207
+ const std::size_t length = std::rand() % sizeof(key_buf);
208
+ for (std::size_t j = 0; j < length; ++j) {
209
+ key_buf[j] = (char)(std::rand() % 10);
210
+ if (tail_mode == MARISA_TEXT_TAIL) {
211
+ key_buf[j] += '0';
212
+ }
213
+ }
214
+ keyset->push_back(key_buf, length);
215
+ }
216
+ }
217
+ // For every key in keyset, asserts that lookup() finds it with the keyset's id and that reverse_lookup() on that id returns the same length and bytes.
218
+ void TestLookup(const marisa::Trie &trie, const marisa::Keyset &keyset) {
219
+ marisa::Agent agent;
220
+ for (std::size_t i = 0; i < keyset.size(); ++i) {
221
+ agent.set_query(keyset[i].ptr(), keyset[i].length());
222
+ ASSERT(trie.lookup(agent));
223
+ ASSERT(agent.key().id() == keyset[i].id());
224
+ // Reverse direction: query by id and compare the returned key bytes.
225
+ agent.set_query(keyset[i].id());
226
+ trie.reverse_lookup(agent);
227
+ ASSERT(agent.key().length() == keyset[i].length());
228
+ ASSERT(std::memcmp(agent.key().ptr(), keyset[i].ptr(),
229
+ agent.key().length()) == 0);
230
+ }
231
+ }
232
+ // For every key, iterates common_prefix_search(): at least one result must exist, every result id must be <= the key's id, and the id left in the agent when the search ends must equal the key's id.
233
+ void TestCommonPrefixSearch(const marisa::Trie &trie,
234
+ const marisa::Keyset &keyset) {
235
+ marisa::Agent agent;
236
+ for (std::size_t i = 0; i < keyset.size(); ++i) {
237
+ agent.set_query(keyset[i].ptr(), keyset[i].length());
238
+ ASSERT(trie.common_prefix_search(agent));
239
+ ASSERT(agent.key().id() <= keyset[i].id());
240
+ while (trie.common_prefix_search(agent)) {
241
+ ASSERT(agent.key().id() <= keyset[i].id());
242
+ }
243
+ ASSERT(agent.key().id() == keyset[i].id());
244
+ }
245
+ }
246
+ // For every key, the first predictive_search() result must carry the key's own id and every later result a strictly greater id.
247
+ void TestPredictiveSearch(const marisa::Trie &trie,
248
+ const marisa::Keyset &keyset) {
249
+ marisa::Agent agent;
250
+ for (std::size_t i = 0; i < keyset.size(); ++i) {
251
+ agent.set_query(keyset[i].ptr(), keyset[i].length());
252
+ ASSERT(trie.predictive_search(agent));
253
+ ASSERT(agent.key().id() == keyset[i].id());
254
+ while (trie.predictive_search(agent)) {
255
+ ASSERT(agent.key().id() > keyset[i].id());
256
+ }
257
+ }
258
+ }
259
+ // Sets every key weight to 1.0F, builds a trie with num_tries | tail_mode | node_order, and re-checks its properties and lookups after each of: save/load, fwrite/fread, mmap, and stream << / >>.
260
+ void TestTrie(int num_tries, marisa::TailMode tail_mode,
261
+ marisa::NodeOrder node_order, marisa::Keyset &keyset) {
262
+ for (std::size_t i = 0; i < keyset.size(); ++i) {
263
+ keyset[i].set_weight(1.0F);
264
+ }
265
+ // Build and validate the in-memory trie, including both incremental searches.
266
+ marisa::Trie trie;
267
+ trie.build(keyset, num_tries | tail_mode | node_order);
268
+
269
+ ASSERT(trie.num_tries() == (std::size_t)num_tries);
270
+ ASSERT(trie.num_keys() <= keyset.size());
271
+
272
+ ASSERT(trie.tail_mode() == tail_mode);
273
+ ASSERT(trie.node_order() == node_order);
274
+
275
+ TestLookup(trie, keyset);
276
+ TestCommonPrefixSearch(trie, keyset);
277
+ TestPredictiveSearch(trie, keyset);
278
+ // Round trip through save() and load() on marisa-test.dat.
279
+ trie.save("marisa-test.dat");
280
+
281
+ trie.clear();
282
+ trie.load("marisa-test.dat");
283
+
284
+ ASSERT(trie.num_tries() == (std::size_t)num_tries);
285
+ ASSERT(trie.num_keys() <= keyset.size());
286
+
287
+ ASSERT(trie.tail_mode() == tail_mode);
288
+ ASSERT(trie.node_order() == node_order);
289
+
290
+ TestLookup(trie, keyset);
291
+ // Round trip through a std::FILE using marisa::fwrite and marisa::fread.
292
+ {
293
+ std::FILE *file;
294
+ #ifdef _MSC_VER
295
+ ASSERT(::fopen_s(&file, "marisa-test.dat", "wb") == 0);
296
+ #else // _MSC_VER
297
+ file = std::fopen("marisa-test.dat", "wb");
298
+ ASSERT(file != NULL);
299
+ #endif // _MSC_VER
300
+ marisa::fwrite(file, trie);
301
+ std::fclose(file);
302
+ trie.clear();
303
+ #ifdef _MSC_VER
304
+ ASSERT(::fopen_s(&file, "marisa-test.dat", "rb") == 0);
305
+ #else // _MSC_VER
306
+ file = std::fopen("marisa-test.dat", "rb");
307
+ ASSERT(file != NULL);
308
+ #endif // _MSC_VER
309
+ marisa::fread(file, &trie);
310
+ std::fclose(file);
311
+ }
312
+
313
+ ASSERT(trie.num_tries() == (std::size_t)num_tries);
314
+ ASSERT(trie.num_keys() <= keyset.size());
315
+
316
+ ASSERT(trie.tail_mode() == tail_mode);
317
+ ASSERT(trie.node_order() == node_order);
318
+
319
+ TestLookup(trie, keyset);
320
+ // Reopen the same file with mmap().
321
+ trie.clear();
322
+ trie.mmap("marisa-test.dat");
323
+
324
+ ASSERT(trie.num_tries() == (std::size_t)num_tries);
325
+ ASSERT(trie.num_keys() <= keyset.size());
326
+
327
+ ASSERT(trie.tail_mode() == tail_mode);
328
+ ASSERT(trie.node_order() == node_order);
329
+
330
+ TestLookup(trie, keyset);
331
+ // Round trip through a std::stringstream with operator<< and operator>>.
332
+ {
333
+ std::stringstream stream;
334
+ stream << trie;
335
+ trie.clear();
336
+ stream >> trie;
337
+ }
338
+
339
+ ASSERT(trie.num_tries() == (std::size_t)num_tries);
340
+ ASSERT(trie.num_keys() <= keyset.size());
341
+
342
+ ASSERT(trie.tail_mode() == tail_mode);
343
+ ASSERT(trie.node_order() == node_order);
344
+
345
+ TestLookup(trie, keyset);
346
+ }
347
+ // Prints the tail mode and node order under test, then runs the four-argument TestTrie with num_tries = 1, 2, 3 and 4.
348
+ void TestTrie(marisa::TailMode tail_mode, marisa::NodeOrder node_order,
349
+ marisa::Keyset &keyset) {
350
+ TEST_START();
351
+ std::cout << ((tail_mode == MARISA_TEXT_TAIL) ? "TEXT" : "BINARY") << ", ";
352
+ std::cout << ((node_order == MARISA_WEIGHT_ORDER) ?
353
+ "WEIGHT" : "LABEL") << ": ";
354
+
355
+ for (int i = 1; i < 5; ++i) {
356
+ TestTrie(i, tail_mode, node_order, keyset);
357
+ }
358
+
359
+ TEST_END();
360
+ }
361
+ // Generates 1000 random keys for tail_mode and tests them under MARISA_WEIGHT_ORDER and then MARISA_LABEL_ORDER.
362
+ void TestTrie(marisa::TailMode tail_mode) {
363
+ marisa::Keyset keyset;
364
+ MakeKeyset(1000, tail_mode, &keyset);
365
+
366
+ TestTrie(tail_mode, MARISA_WEIGHT_ORDER, keyset);
367
+ TestTrie(tail_mode, MARISA_LABEL_ORDER, keyset);
368
+ }
369
+ // Runs the randomized trie tests for MARISA_TEXT_TAIL and then MARISA_BINARY_TAIL.
370
+ void TestTrie() {
371
+ TestTrie(MARISA_TEXT_TAIL);
372
+ TestTrie(MARISA_BINARY_TAIL);
373
+ }
374
+
375
+ } // namespace
376
+ // Seeds std::rand() from the current time and runs the three test groups; a marisa::Exception that escapes is printed to std::cerr via what() and then rethrown.
377
+ int main() try {
378
+ std::srand((unsigned int)std::time(NULL));
379
+
380
+ TestEmptyTrie();
381
+ TestTinyTrie();
382
+ TestTrie();
383
+
384
+ return 0;
385
+ } catch (const marisa::Exception &ex) {
386
+ std::cerr << ex.what() << std::endl;
387
+ throw;
388
+ }
@@ -0,0 +1,507 @@
1
+ #include <algorithm>
2
+ #include <cstring>
3
+ #include <sstream>
4
+
5
+ #include <marisa/grimoire/trie/config.h>
6
+ #include <marisa/grimoire/trie/header.h>
7
+ #include <marisa/grimoire/trie/key.h>
8
+ #include <marisa/grimoire/trie/range.h>
9
+ #include <marisa/grimoire/trie/tail.h>
10
+ #include <marisa/grimoire/trie/state.h>
11
+
12
+ #include "marisa-assert.h"
13
+
14
+ namespace {
15
+ // Checks the defaults of a fresh Config, the values parsed from a combined flag word, and that parse(0) yields the defaults again.
16
+ void TestConfig() {
17
+ TEST_START();
18
+
19
+ marisa::grimoire::trie::Config config;
20
+
21
+ ASSERT(config.num_tries() == MARISA_DEFAULT_NUM_TRIES);
22
+ ASSERT(config.tail_mode() == MARISA_DEFAULT_TAIL);
23
+ ASSERT(config.node_order() == MARISA_DEFAULT_ORDER);
24
+ ASSERT(config.cache_level() == MARISA_DEFAULT_CACHE);
25
+ // Parse an explicit combination: 10 tries, binary tail, label order, tiny cache.
26
+ config.parse(10 | MARISA_BINARY_TAIL | MARISA_LABEL_ORDER |
27
+ MARISA_TINY_CACHE);
28
+
29
+ ASSERT(config.num_tries() == 10);
30
+ ASSERT(config.tail_mode() == MARISA_BINARY_TAIL);
31
+ ASSERT(config.node_order() == MARISA_LABEL_ORDER);
32
+ ASSERT(config.cache_level() == MARISA_TINY_CACHE);
33
+ // A flag word of 0 is expected to select every default.
34
+ config.parse(0);
35
+
36
+ ASSERT(config.num_tries() == MARISA_DEFAULT_NUM_TRIES);
37
+ ASSERT(config.tail_mode() == MARISA_DEFAULT_TAIL);
38
+ ASSERT(config.node_order() == MARISA_DEFAULT_ORDER);
39
+ ASSERT(config.cache_level() == MARISA_DEFAULT_CACHE);
40
+
41
+ TEST_END();
42
+ }
43
+ // Writes a default-constructed Header to trie-test.dat, then maps it and reads it back; there are no assertions, so the test passes if none of the calls throws.
44
+ void TestHeader() {
45
+ TEST_START();
46
+
47
+ marisa::grimoire::trie::Header header;
48
+ // Each step uses its own Writer/Mapper/Reader in a separate scope.
49
+ {
50
+ marisa::grimoire::Writer writer;
51
+ writer.open("trie-test.dat");
52
+ header.write(writer);
53
+ }
54
+
55
+ {
56
+ marisa::grimoire::Mapper mapper;
57
+ mapper.open("trie-test.dat");
58
+ header.map(mapper);
59
+ }
60
+
61
+ {
62
+ marisa::grimoire::Reader reader;
63
+ reader.open("trie-test.dat");
64
+ header.read(reader);
65
+ }
66
+
67
+ TEST_END();
68
+ }
69
+ // Exercises Key and ReverseKey: default state, setters and getters, indexing, substr() and the comparison operators.
70
+ void TestKey() {
71
+ TEST_START();
72
+
73
+ marisa::grimoire::trie::Key key;
74
+
75
+ ASSERT(key.ptr() == NULL);
76
+ ASSERT(key.length() == 0);
77
+ ASSERT(key.id() == 0);
78
+ ASSERT(key.terminal() == 0);
79
+
80
+ const char *str = "xyz";
81
+
82
+ key.set_str(str, 3);
83
+ key.set_weight(10.0F);
84
+ key.set_id(20);
85
+
86
+
87
+ ASSERT(key.ptr() == str);
88
+ ASSERT(key.length() == 3);
89
+ ASSERT(key[0] == 'x');
90
+ ASSERT(key[1] == 'y');
91
+ ASSERT(key[2] == 'z');
92
+ ASSERT(key.weight() == 10.0F);
93
+ ASSERT(key.id() == 20);
94
+
95
+ key.set_terminal(30);
96
+ ASSERT(key.terminal() == 30);
97
+ // substr(1, 2) is expected to narrow the key to the two bytes starting at str + 1 ("yz").
98
+ key.substr(1, 2);
99
+
100
+ ASSERT(key.ptr() == str + 1);
101
+ ASSERT(key.length() == 2);
102
+ ASSERT(key[0] == 'y');
103
+ ASSERT(key[1] == 'z');
104
+ // Comparisons: key is "yz" at this point and key2 is "abc".
105
+ marisa::grimoire::trie::Key key2;
106
+ key2.set_str("abc", 3);
107
+
108
+ ASSERT(key == key);
109
+ ASSERT(key != key2);
110
+ ASSERT(key > key2);
111
+ ASSERT(key2 < key);
112
+ // ReverseKey: the assertions below expect operator[] to index from the end of the string ([0] is the last byte).
113
+ marisa::grimoire::trie::ReverseKey r_key;
114
+
115
+ ASSERT(r_key.ptr() == NULL);
116
+ ASSERT(r_key.length() == 0);
117
+ ASSERT(r_key.id() == 0);
118
+ ASSERT(r_key.terminal() == 0);
119
+
120
+ r_key.set_str(str, 3);
121
+ r_key.set_weight(100.0F);
122
+ r_key.set_id(200);
123
+
124
+ ASSERT(r_key.ptr() == str);
125
+ ASSERT(r_key.length() == 3);
126
+ ASSERT(r_key[0] == 'z');
127
+ ASSERT(r_key[1] == 'y');
128
+ ASSERT(r_key[2] == 'x');
129
+ ASSERT(r_key.weight() == 100.0F);
130
+ ASSERT(r_key.id() == 200);
131
+
132
+ r_key.set_terminal(300);
133
+ ASSERT(r_key.terminal() == 300);
134
+ // For ReverseKey, substr(1, 2) is expected to leave ptr() at str and expose 'y' then 'x'.
135
+ r_key.substr(1, 2);
136
+
137
+ ASSERT(r_key.ptr() == str);
138
+ ASSERT(r_key.length() == 2);
139
+ ASSERT(r_key[0] == 'y');
140
+ ASSERT(r_key[1] == 'x');
141
+
142
+ marisa::grimoire::trie::ReverseKey r_key2;
143
+ r_key2.set_str("abc", 3);
144
+
145
+ ASSERT(r_key == r_key);
146
+ ASSERT(r_key != r_key2);
147
+ ASSERT(r_key > r_key2);
148
+ ASSERT(r_key2 < r_key);
149
+
150
+ TEST_END();
151
+ }
152
+ // Exercises Range and WeightedRange: default values, setters and getters, the make_range / make_weighted_range helpers, and WeightedRange ordering.
153
+ void TestRange() {
154
+ TEST_START();
155
+
156
+ marisa::grimoire::trie::Range range;
157
+
158
+ ASSERT(range.begin() == 0);
159
+ ASSERT(range.end() == 0);
160
+ ASSERT(range.key_pos() == 0);
161
+
162
+ range.set_begin(1);
163
+ range.set_end(2);
164
+ range.set_key_pos(3);
165
+
166
+ ASSERT(range.begin() == 1);
167
+ ASSERT(range.end() == 2);
168
+ ASSERT(range.key_pos() == 3);
169
+ // make_range(10, 20, 30) is expected to fill begin, end and key_pos in that order.
170
+ range = marisa::grimoire::trie::make_range(10, 20, 30);
171
+
172
+ ASSERT(range.begin() == 10);
173
+ ASSERT(range.end() == 20);
174
+ ASSERT(range.key_pos() == 30);
175
+ // WeightedRange repeats the same checks with an additional weight field.
176
+ marisa::grimoire::trie::WeightedRange w_range;
177
+
178
+ ASSERT(w_range.begin() == 0);
179
+ ASSERT(w_range.end() == 0);
180
+ ASSERT(w_range.key_pos() == 0);
181
+ ASSERT(w_range.weight() == 0.0F);
182
+
183
+ w_range.set_begin(10);
184
+ w_range.set_end(20);
185
+ w_range.set_key_pos(30);
186
+ w_range.set_weight(40.0F);
187
+
188
+ ASSERT(w_range.begin() == 10);
189
+ ASSERT(w_range.end() == 20);
190
+ ASSERT(w_range.key_pos() == 30);
191
+ ASSERT(w_range.weight() == 40.0F);
192
+
193
+ marisa::grimoire::trie::WeightedRange w_range2 =
194
+ marisa::grimoire::trie::make_weighted_range(100, 200, 300, 400.0F);
195
+
196
+ ASSERT(w_range2.begin() == 100);
197
+ ASSERT(w_range2.end() == 200);
198
+ ASSERT(w_range2.key_pos() == 300);
199
+ ASSERT(w_range2.weight() == 400.0F);
200
+ // w_range (weight 40.0F) must order before w_range2 (weight 400.0F); presumably the comparison is by weight -- confirm in range.h.
201
+ ASSERT(w_range < w_range2);
202
+ ASSERT(w_range2 > w_range);
203
+
204
+ TEST_END();
205
+ }
206
+ // Checks Entry defaults and setters; the indexing assertions expect operator[] to count from the end of the string ([0] == 'Z' for "XYZ").
207
+ void TestEntry() {
208
+ TEST_START();
209
+
210
+ marisa::grimoire::trie::Entry entry;
211
+
212
+ ASSERT(entry.ptr() == NULL);
213
+ ASSERT(entry.length() == 0);
214
+ ASSERT(entry.id() == 0);
215
+
216
+ const char *str = "XYZ";
217
+
218
+ entry.set_str(str, 3);
219
+ entry.set_id(123);
220
+
221
+ ASSERT(entry.ptr() == str);
222
+ ASSERT(entry.length() == 3);
223
+ ASSERT(entry[0] == 'Z');
224
+ ASSERT(entry[1] == 'Y');
225
+ ASSERT(entry[2] == 'X');
226
+ ASSERT(entry.id() == 123);
227
+
228
+ TEST_END();
229
+ }
230
+ // Builds MARISA_TEXT_TAIL tails from zero, one and six entries, checking sizes, offsets and contents, then round-trips the tail through file write/map/read and through a stringstream.
231
+ void TestTextTail() {
232
+ TEST_START();
233
+
234
+ marisa::grimoire::trie::Tail tail;
235
+ marisa::grimoire::Vector<marisa::grimoire::trie::Entry> entries;
236
+ marisa::grimoire::Vector<marisa::UInt32> offsets;
237
+ tail.build(entries, &offsets, MARISA_TEXT_TAIL);
238
+
239
+ ASSERT(tail.mode() == MARISA_TEXT_TAIL);
240
+ ASSERT(tail.size() == 0);
241
+ ASSERT(tail.empty());
242
+ ASSERT(tail.total_size() == tail.size());
243
+ ASSERT(tail.io_size() == (sizeof(marisa::UInt64) * 6));
244
+
245
+ ASSERT(offsets.empty());
246
+ // One entry "X": the tail is expected to hold 'X' followed by '\0' (size 2) at offset 0.
247
+ marisa::grimoire::trie::Entry entry;
248
+ entry.set_str("X", 1);
249
+ entries.push_back(entry);
250
+
251
+ tail.build(entries, &offsets, MARISA_TEXT_TAIL);
252
+
253
+ ASSERT(tail.mode() == MARISA_TEXT_TAIL);
254
+ ASSERT(tail.size() == 2);
255
+ ASSERT(!tail.empty());
256
+ ASSERT(tail.total_size() == tail.size());
257
+ ASSERT(tail.io_size() == (sizeof(marisa::UInt64) * 7));
258
+
259
+ ASSERT(offsets.size() == entries.size());
260
+ ASSERT(offsets[0] == 0);
261
+ ASSERT(tail[offsets[0]] == 'X');
262
+ ASSERT(tail[offsets[0] + 1] == '\0');
263
+ // Six entries, including a duplicate "abc"; the built tail is expected to be 11 bytes.
264
+ entries.clear();
265
+ entry.set_str("abc", 3);
266
+ entries.push_back(entry);
267
+ entry.set_str("bc", 2);
268
+ entries.push_back(entry);
269
+ entry.set_str("abc", 3);
270
+ entries.push_back(entry);
271
+ entry.set_str("c", 1);
272
+ entries.push_back(entry);
273
+ entry.set_str("ABC", 3);
274
+ entries.push_back(entry);
275
+ entry.set_str("AB", 2);
276
+ entries.push_back(entry);
277
+ // Entries are re-sorted with IDComparer after build(); presumably this restores the order that offsets[i] refers to -- confirm in tail.cc.
278
+ tail.build(entries, &offsets, MARISA_TEXT_TAIL);
279
+ std::sort(entries.begin(), entries.end(),
280
+ marisa::grimoire::trie::Entry::IDComparer());
281
+
282
+ ASSERT(tail.size() == 11);
283
+ ASSERT(offsets.size() == entries.size());
284
+ for (std::size_t i = 0; i < entries.size(); ++i) {
285
+ const char * const ptr = &tail[offsets[i]];
286
+ ASSERT(std::strlen(ptr) == entries[i].length());
287
+ ASSERT(std::strcmp(ptr, entries[i].ptr()) == 0);
288
+ }
289
+ // Write the tail to trie-test.dat.
290
+ {
291
+ marisa::grimoire::Writer writer;
292
+ writer.open("trie-test.dat");
293
+ tail.write(writer);
294
+ }
295
+
296
+ tail.clear();
297
+
298
+ ASSERT(tail.size() == 0);
299
+ ASSERT(tail.total_size() == tail.size());
300
+ // Map the file and verify the contents while the Mapper is still in scope; the tail is cleared before the scope ends.
301
+ {
302
+ marisa::grimoire::Mapper mapper;
303
+ mapper.open("trie-test.dat");
304
+ tail.map(mapper);
305
+
306
+ ASSERT(tail.mode() == MARISA_TEXT_TAIL);
307
+ ASSERT(tail.size() == 11);
308
+ for (std::size_t i = 0; i < entries.size(); ++i) {
309
+ const char * const ptr = &tail[offsets[i]];
310
+ ASSERT(std::strlen(ptr) == entries[i].length());
311
+ ASSERT(std::strcmp(ptr, entries[i].ptr()) == 0);
312
+ }
313
+ tail.clear();
314
+ }
315
+ // Read the same file back through a Reader.
316
+ {
317
+ marisa::grimoire::Reader reader;
318
+ reader.open("trie-test.dat");
319
+ tail.read(reader);
320
+ }
321
+
322
+ ASSERT(tail.size() == 11);
323
+ ASSERT(offsets.size() == entries.size());
324
+ for (std::size_t i = 0; i < entries.size(); ++i) {
325
+ const char * const ptr = &tail[offsets[i]];
326
+ ASSERT(std::strlen(ptr) == entries[i].length());
327
+ ASSERT(std::strcmp(ptr, entries[i].ptr()) == 0);
328
+ }
329
+ // Round trip through a Writer and Reader opened on the same std::stringstream.
330
+ {
331
+ std::stringstream stream;
332
+ marisa::grimoire::Writer writer;
333
+ writer.open(stream);
334
+ tail.write(writer);
335
+ tail.clear();
336
+ marisa::grimoire::Reader reader;
337
+ reader.open(stream);
338
+ tail.read(reader);
339
+ }
340
+
341
+ ASSERT(tail.size() == 11);
342
+ ASSERT(offsets.size() == entries.size());
343
+ for (std::size_t i = 0; i < entries.size(); ++i) {
344
+ const char * const ptr = &tail[offsets[i]];
345
+ ASSERT(std::strlen(ptr) == entries[i].length());
346
+ ASSERT(std::strcmp(ptr, entries[i].ptr()) == 0);
347
+ }
348
+
349
+ TEST_END();
350
+ }
351
+ // Builds tails in MARISA_BINARY_TAIL mode and checks sizes, offsets and contents; also covers what mode() reports for an empty build and for a text-mode build whose entry contains '\0'.
352
+ void TestBinaryTail() {
353
+ TEST_START();
354
+
355
+ marisa::grimoire::trie::Tail tail;
356
+ marisa::grimoire::Vector<marisa::grimoire::trie::Entry> entries;
357
+ marisa::grimoire::Vector<marisa::UInt32> offsets;
358
+ tail.build(entries, &offsets, MARISA_BINARY_TAIL);
359
+ // NOTE: an empty build requested as MARISA_BINARY_TAIL is asserted to report MARISA_TEXT_TAIL from mode().
360
+ ASSERT(tail.mode() == MARISA_TEXT_TAIL);
361
+ ASSERT(tail.size() == 0);
362
+ ASSERT(tail.empty());
363
+ ASSERT(tail.total_size() == tail.size());
364
+ ASSERT(tail.io_size() == (sizeof(marisa::UInt64) * 6));
365
+
366
+ ASSERT(offsets.empty());
367
+ // One entry "X" in binary mode: size is 1 (no terminator byte), and total_size() is one UInt64 larger than size().
368
+ marisa::grimoire::trie::Entry entry;
369
+ entry.set_str("X", 1);
370
+ entries.push_back(entry);
371
+
372
+ tail.build(entries, &offsets, MARISA_BINARY_TAIL);
373
+
374
+ ASSERT(tail.mode() == MARISA_BINARY_TAIL);
375
+ ASSERT(tail.size() == 1);
376
+ ASSERT(!tail.empty());
377
+ ASSERT(tail.total_size() == (tail.size() + sizeof(marisa::UInt64)));
378
+ ASSERT(tail.io_size() == (sizeof(marisa::UInt64) * 8));
379
+
380
+ ASSERT(offsets.size() == entries.size());
381
+ ASSERT(offsets[0] == 0);
382
+ // An entry with an embedded '\0', built as MARISA_TEXT_TAIL, is asserted to come out as MARISA_BINARY_TAIL.
383
+ const char binary_entry[] = { 'N', 'P', '\0', 'T', 'r', 'i', 'e' };
384
+ entries[0].set_str(binary_entry, sizeof(binary_entry));
385
+
386
+ tail.build(entries, &offsets, MARISA_TEXT_TAIL);
387
+
388
+ ASSERT(tail.mode() == MARISA_BINARY_TAIL);
389
+ ASSERT(tail.size() == entries[0].length());
390
+
391
+ ASSERT(offsets.size() == entries.size());
392
+ ASSERT(offsets[0] == 0);
393
+ // Six entries, including a duplicate "abc"; the binary tail is expected to be 8 bytes.
394
+ entries.clear();
395
+ entry.set_str("abc", 3);
396
+ entries.push_back(entry);
397
+ entry.set_str("bc", 2);
398
+ entries.push_back(entry);
399
+ entry.set_str("abc", 3);
400
+ entries.push_back(entry);
401
+ entry.set_str("c", 1);
402
+ entries.push_back(entry);
403
+ entry.set_str("ABC", 3);
404
+ entries.push_back(entry);
405
+ entry.set_str("AB", 2);
406
+ entries.push_back(entry);
407
+
408
+ tail.build(entries, &offsets, MARISA_BINARY_TAIL);
409
+ std::sort(entries.begin(), entries.end(),
410
+ marisa::grimoire::trie::Entry::IDComparer());
411
+
412
+ ASSERT(tail.mode() == MARISA_BINARY_TAIL);
413
+ ASSERT(tail.size() == 8);
414
+ ASSERT(offsets.size() == entries.size());
415
+ for (std::size_t i = 0; i < entries.size(); ++i) {
416
+ const char * const ptr = &tail[offsets[i]];
417
+ ASSERT(std::memcmp(ptr, entries[i].ptr(), entries[i].length()) == 0);
418
+ }
419
+
420
+ TEST_END();
421
+ }
422
+ // Checks History defaults (link_id and key_id start as the MARISA_INVALID_* constants) and that each setter is reflected by its getter.
423
+ void TestHistory() {
424
+ TEST_START();
425
+
426
+ marisa::grimoire::trie::History history;
427
+
428
+ ASSERT(history.node_id() == 0);
429
+ ASSERT(history.louds_pos() == 0);
430
+ ASSERT(history.key_pos() == 0);
431
+ ASSERT(history.link_id() == MARISA_INVALID_LINK_ID);
432
+ ASSERT(history.key_id() == MARISA_INVALID_KEY_ID);
433
+
434
+ history.set_node_id(100);
435
+ history.set_louds_pos(200);
436
+ history.set_key_pos(300);
437
+ history.set_link_id(400);
438
+ history.set_key_id(500);
439
+
440
+ ASSERT(history.node_id() == 100);
441
+ ASSERT(history.louds_pos() == 200);
442
+ ASSERT(history.key_pos() == 300);
443
+ ASSERT(history.link_id() == 400);
444
+ ASSERT(history.key_id() == 500);
445
+
446
+ TEST_END();
447
+ }
448
+ // Checks State defaults, setter/getter pairs, and the status code that each *_init() call leaves behind.
449
+ void TestState() {
450
+ TEST_START();
451
+
452
+ marisa::grimoire::trie::State state;
453
+
454
+ ASSERT(state.key_buf().empty());
455
+ ASSERT(state.history().empty());
456
+ ASSERT(state.node_id() == 0);
457
+ ASSERT(state.query_pos() == 0);
458
+ ASSERT(state.history_pos() == 0);
459
+ ASSERT(state.status_code() == marisa::grimoire::trie::MARISA_READY_TO_ALL);
460
+
461
+ state.set_node_id(10);
462
+ state.set_query_pos(100);
463
+ state.set_history_pos(1000);
464
+ state.set_status_code(
465
+ marisa::grimoire::trie::MARISA_END_OF_PREDICTIVE_SEARCH);
466
+
467
+ ASSERT(state.node_id() == 10);
468
+ ASSERT(state.query_pos() == 100);
469
+ ASSERT(state.history_pos() == 1000);
470
+ ASSERT(state.status_code() ==
471
+ marisa::grimoire::trie::MARISA_END_OF_PREDICTIVE_SEARCH);
472
+ // lookup_init() and reverse_lookup_init() are both expected to set the status to MARISA_READY_TO_ALL.
473
+ state.lookup_init();
474
+ ASSERT(state.status_code() == marisa::grimoire::trie::MARISA_READY_TO_ALL);
475
+
476
+ state.reverse_lookup_init();
477
+ ASSERT(state.status_code() == marisa::grimoire::trie::MARISA_READY_TO_ALL);
478
+ // The two search initializers are expected to set their own READY_TO_* status codes.
479
+ state.common_prefix_search_init();
480
+ ASSERT(state.status_code() ==
481
+ marisa::grimoire::trie::MARISA_READY_TO_COMMON_PREFIX_SEARCH);
482
+
483
+ state.predictive_search_init();
484
+ ASSERT(state.status_code() ==
485
+ marisa::grimoire::trie::MARISA_READY_TO_PREDICTIVE_SEARCH);
486
+
487
+ TEST_END();
488
+ }
489
+
490
+ } // namespace
491
+ // Runs every test in this file in order; a marisa::Exception that escapes is printed to std::cerr via what() and then rethrown.
492
+ int main() try {
493
+ TestConfig();
494
+ TestHeader();
495
+ TestKey();
496
+ TestRange();
497
+ TestEntry();
498
+ TestTextTail();
499
+ TestBinaryTail();
500
+ TestHistory();
501
+ TestState();
502
+
503
+ return 0;
504
+ } catch (const marisa::Exception &ex) {
505
+ std::cerr << ex.what() << std::endl;
506
+ throw;
507
+ }