melisa 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. data/README.md +11 -0
  2. data/ext/marisa/bindings/marisa-swig.cxx +253 -0
  3. data/ext/marisa/bindings/marisa-swig.h +183 -0
  4. data/ext/marisa/bindings/perl/marisa-swig.cxx +253 -0
  5. data/ext/marisa/bindings/perl/marisa-swig.h +183 -0
  6. data/ext/marisa/bindings/perl/marisa-swig_wrap.cxx +5160 -0
  7. data/ext/marisa/bindings/python/marisa-swig.cxx +253 -0
  8. data/ext/marisa/bindings/python/marisa-swig.h +183 -0
  9. data/ext/marisa/bindings/python/marisa-swig_wrap.cxx +6090 -0
  10. data/ext/marisa/bindings/ruby/extconf.rb +5 -0
  11. data/ext/marisa/bindings/ruby/marisa-swig.cxx +253 -0
  12. data/ext/marisa/bindings/ruby/marisa-swig.h +183 -0
  13. data/ext/marisa/bindings/ruby/marisa-swig_wrap.cxx +4708 -0
  14. data/ext/marisa/lib/marisa.h +14 -0
  15. data/ext/marisa/lib/marisa/agent.cc +51 -0
  16. data/ext/marisa/lib/marisa/agent.h +73 -0
  17. data/ext/marisa/lib/marisa/base.h +193 -0
  18. data/ext/marisa/lib/marisa/exception.h +82 -0
  19. data/ext/marisa/lib/marisa/grimoire/algorithm.h +26 -0
  20. data/ext/marisa/lib/marisa/grimoire/algorithm/sort.h +196 -0
  21. data/ext/marisa/lib/marisa/grimoire/intrin.h +115 -0
  22. data/ext/marisa/lib/marisa/grimoire/io.h +18 -0
  23. data/ext/marisa/lib/marisa/grimoire/io/mapper.cc +163 -0
  24. data/ext/marisa/lib/marisa/grimoire/io/mapper.h +67 -0
  25. data/ext/marisa/lib/marisa/grimoire/io/reader.cc +147 -0
  26. data/ext/marisa/lib/marisa/grimoire/io/reader.h +66 -0
  27. data/ext/marisa/lib/marisa/grimoire/io/writer.cc +148 -0
  28. data/ext/marisa/lib/marisa/grimoire/io/writer.h +65 -0
  29. data/ext/marisa/lib/marisa/grimoire/trie.h +16 -0
  30. data/ext/marisa/lib/marisa/grimoire/trie/cache.h +81 -0
  31. data/ext/marisa/lib/marisa/grimoire/trie/config.h +155 -0
  32. data/ext/marisa/lib/marisa/grimoire/trie/entry.h +82 -0
  33. data/ext/marisa/lib/marisa/grimoire/trie/header.h +61 -0
  34. data/ext/marisa/lib/marisa/grimoire/trie/history.h +65 -0
  35. data/ext/marisa/lib/marisa/grimoire/trie/key.h +228 -0
  36. data/ext/marisa/lib/marisa/grimoire/trie/louds-trie.cc +876 -0
  37. data/ext/marisa/lib/marisa/grimoire/trie/louds-trie.h +134 -0
  38. data/ext/marisa/lib/marisa/grimoire/trie/range.h +115 -0
  39. data/ext/marisa/lib/marisa/grimoire/trie/state.h +117 -0
  40. data/ext/marisa/lib/marisa/grimoire/trie/tail.cc +218 -0
  41. data/ext/marisa/lib/marisa/grimoire/trie/tail.h +72 -0
  42. data/ext/marisa/lib/marisa/grimoire/vector.h +18 -0
  43. data/ext/marisa/lib/marisa/grimoire/vector/bit-vector.cc +826 -0
  44. data/ext/marisa/lib/marisa/grimoire/vector/bit-vector.h +179 -0
  45. data/ext/marisa/lib/marisa/grimoire/vector/flat-vector.h +205 -0
  46. data/ext/marisa/lib/marisa/grimoire/vector/pop-count.h +110 -0
  47. data/ext/marisa/lib/marisa/grimoire/vector/rank-index.h +82 -0
  48. data/ext/marisa/lib/marisa/grimoire/vector/vector.h +256 -0
  49. data/ext/marisa/lib/marisa/iostream.h +18 -0
  50. data/ext/marisa/lib/marisa/key.h +85 -0
  51. data/ext/marisa/lib/marisa/keyset.cc +181 -0
  52. data/ext/marisa/lib/marisa/keyset.h +80 -0
  53. data/ext/marisa/lib/marisa/query.h +71 -0
  54. data/ext/marisa/lib/marisa/scoped-array.h +48 -0
  55. data/ext/marisa/lib/marisa/scoped-ptr.h +52 -0
  56. data/ext/marisa/lib/marisa/stdio.h +15 -0
  57. data/ext/marisa/lib/marisa/trie.cc +249 -0
  58. data/ext/marisa/lib/marisa/trie.h +64 -0
  59. data/ext/marisa/tests/base-test.cc +309 -0
  60. data/ext/marisa/tests/io-test.cc +252 -0
  61. data/ext/marisa/tests/marisa-assert.h +26 -0
  62. data/ext/marisa/tests/marisa-test.cc +388 -0
  63. data/ext/marisa/tests/trie-test.cc +507 -0
  64. data/ext/marisa/tests/vector-test.cc +466 -0
  65. data/ext/marisa/tools/cmdopt.cc +298 -0
  66. data/ext/marisa/tools/cmdopt.h +58 -0
  67. data/ext/marisa/tools/marisa-benchmark.cc +418 -0
  68. data/ext/marisa/tools/marisa-build.cc +206 -0
  69. data/ext/marisa/tools/marisa-common-prefix-search.cc +143 -0
  70. data/ext/marisa/tools/marisa-dump.cc +151 -0
  71. data/ext/marisa/tools/marisa-lookup.cc +110 -0
  72. data/ext/marisa/tools/marisa-predictive-search.cc +143 -0
  73. data/ext/marisa/tools/marisa-reverse-lookup.cc +110 -0
  74. data/lib/melisa.rb +7 -0
  75. data/lib/melisa/base_config_flags.rb +76 -0
  76. data/lib/melisa/bytes_trie.rb +55 -0
  77. data/lib/melisa/int_trie.rb +14 -0
  78. data/lib/melisa/search.rb +55 -0
  79. data/lib/melisa/trie.rb +96 -0
  80. data/lib/melisa/version.rb +3 -0
  81. data/melisa.gemspec +36 -0
  82. data/spec/base_config_flags_spec.rb +73 -0
  83. data/spec/bytes_trie_spec.rb +16 -0
  84. data/spec/int_trie_spec.rb +16 -0
  85. data/spec/search_spec.rb +29 -0
  86. data/spec/spec_helper.rb +1 -0
  87. data/spec/trie_spec.rb +30 -0
  88. metadata +207 -0
@@ -0,0 +1,298 @@
1
+ #include <stdio.h>
2
+
3
+ #include "cmdopt.h"
4
+
5
+ #ifdef __cplusplus
6
+ extern "C" {
7
+ #endif // __cplusplus
8
+
9
+ // Moves `optind' to the end and shifts other arguments.
10
+ static void cmdopt_shift(cmdopt_t *h) {
11
+ int i;
12
+ char *tmp;
13
+
14
+ tmp = h->argv[h->optind];
15
+ for (i = h->optind; i < h->argc - 1; i++) {
16
+ h->argv[i] = h->argv[i + 1];
17
+ }
18
+ h->argv[i] = tmp;
19
+
20
+ h->nextchar = NULL;
21
+ h->optnum--;
22
+ }
23
+
24
+ // Moves to the next argument.
25
+ static void cmdopt_next(cmdopt_t *h) {
26
+ h->optind++;
27
+ h->nextchar = NULL;
28
+ }
29
+
30
+ // Checks if the current argument is an option or not.
31
+ static int cmdopt_check(cmdopt_t *h) {
32
+ int ret = 1;
33
+ const char *arg = h->argv[h->optind];
34
+
35
+ if (*arg++ != '-') {
36
+ return 0;
37
+ }
38
+
39
+ if (*arg == '-') {
40
+ arg++;
41
+ ret++;
42
+ }
43
+
44
+ return ret - (*arg == '\0');
45
+ }
46
+
47
+ // Gets an argument of the current option.
48
+ static void cmdopt_getopt(cmdopt_t *h) {
49
+ // Moves to the next argument if the current argument has no more characters.
50
+ if (*h->nextchar == '\0') {
51
+ cmdopt_next(h);
52
+ h->nextchar = h->argv[h->optind];
53
+ }
54
+
55
+ // Checks whether the current option has an argument or not.
56
+ if (h->optind < h->optnum) {
57
+ h->optarg = h->nextchar;
58
+ cmdopt_next(h);
59
+ } else {
60
+ h->optarg = NULL;
61
+ }
62
+ }
63
+
64
+ // Searches an option.
65
+ static int cmdopt_search(cmdopt_t *h) {
66
+ const char *ptr;
67
+
68
+ // Updates an option character.
69
+ h->optopt = *h->nextchar++;
70
+
71
+ for (ptr = h->optstring; *ptr != '\0'; ptr++) {
72
+ if (*ptr == h->optopt) {
73
+ // Gets an option argument if required.
74
+ if (ptr[1] == ':') {
75
+ cmdopt_getopt(h);
76
+
77
+ // Returns ':' if there is no argument.
78
+ if (h->optarg == NULL && ptr[2] != ':') {
79
+ return ':';
80
+ }
81
+ }
82
+ return h->optopt;
83
+ }
84
+ }
85
+
86
+ if (h->optopt == '-') {
87
+ cmdopt_next(h);
88
+ while (h->optind < h->optnum) {
89
+ cmdopt_shift(h);
90
+ }
91
+ return -1;
92
+ }
93
+
94
+ // Returns '?' if the option character is undefined.
95
+ return '?';
96
+ }
97
+
98
// Compares a long option name `opt' with the command line text `arg'
// (which ends at '\0' or '=').
//
// Returns 0 if `arg' is not a prefix of `opt', the negated length if `arg'
// spells `opt' completely, and the positive length if `arg' is only a proper
// prefix of `opt'.
static int cmdopt_match_len(const char *opt, const char *arg) {
  int len = 0;

  // Returns 0 if there is a mismatch. Running past the end of `opt' is a
  // mismatch too, because its terminator differs from any remaining
  // character of `arg'.
  while ((*arg != '\0') && (*arg != '=')) {
    if (*arg++ != *opt++) {
      return 0;
    }
    len++;
  }

  // The match is perfect only if the option name is used up as well. The
  // loop always leaves `arg' at '\0' or '=', so `opt' is what must be tested.
  if (*opt == '\0') {
    return -len;
  }

  return len;
}
118
+
119
+ // Checks long options.
120
+ static int cmdopt_match(cmdopt_t *h) {
121
+ int i, len;
122
+ int max = 0, max_optind = -1;
123
+
124
+ // Returns -1 if there are no long options.
125
+ if (h->longopts == NULL) {
126
+ return max_optind;
127
+ }
128
+
129
+ for (i = 0; h->longopts[i].name != NULL; i++) {
130
+ len = cmdopt_match_len(h->longopts[i].name, h->nextchar);
131
+ if (len < 0) {
132
+ // In case of a perfect match.
133
+ h->nextchar -= len;
134
+ return i;
135
+ } else if (len > max) {
136
+ // In case of a prefix match.
137
+ max = len;
138
+ max_optind = i;
139
+ } else if (len == max) {
140
+ // There are other candidates.
141
+ max_optind = -1;
142
+ }
143
+ }
144
+
145
+ // If there is no perfect match, adopts the longest one.
146
+ h->nextchar += max;
147
+ return max_optind;
148
+ }
149
+
150
+ // Gets an argument of a long option.
151
+ static void cmdopt_getopt_long(cmdopt_t *h) {
152
+ if (*h->nextchar == '=') {
153
+ h->optarg = h->nextchar + 1;
154
+ cmdopt_next(h);
155
+ } else {
156
+ cmdopt_next(h);
157
+
158
+ // Checks whether there are more options or not.
159
+ if (h->optind < h->optnum) {
160
+ h->optarg = h->argv[h->optind];
161
+ cmdopt_next(h);
162
+ } else {
163
+ h->optarg = NULL;
164
+ }
165
+ }
166
+ }
167
+
168
+ // Searches long options.
169
+ static int cmdopt_search_long(cmdopt_t *h) {
170
+ const cmdopt_option *option;
171
+
172
+ // Keeps the long option.
173
+ h->optlong = h->argv[h->optind];
174
+
175
+ // Gets the next option.
176
+ h->longindex = cmdopt_match(h);
177
+ if (h->longindex < 0) {
178
+ cmdopt_next(h);
179
+ return '?';
180
+ }
181
+
182
+ // Gets an argument if required.
183
+ option = h->longopts + h->longindex;
184
+ if (option->has_arg) {
185
+ cmdopt_getopt_long(h);
186
+
187
+ // Return ':' if there are no more arguments.
188
+ if (h->optarg == NULL) {
189
+ return ':';
190
+ }
191
+ } else if (*h->nextchar == '=') {
192
+ // Returns '?' for an extra option argument.
193
+ cmdopt_getopt_long(h);
194
+ return '?';
195
+ }
196
+
197
+ // Overwrites a variable if specified in settings.
198
+ if (option->flag != NULL) {
199
+ *option->flag = option->val;
200
+ return 0;
201
+ }
202
+
203
+ return option->val;
204
+ }
205
+
206
+ // Analyze command line option.
207
+ static int cmdopt_main(cmdopt_t *h) {
208
+ int type;
209
+
210
+ // Initializes the internal state.
211
+ h->optopt = 0;
212
+ h->optlong = NULL;
213
+ h->optarg = NULL;
214
+ h->longindex = 0;
215
+
216
+ while (h->optind < h->optnum) {
217
+ if (h->nextchar == NULL) {
218
+ // Checks whether the next argument is an option or not.
219
+ type = cmdopt_check(h);
220
+ if (type == 0) {
221
+ cmdopt_shift(h);
222
+ } else {
223
+ h->nextchar = h->argv[h->optind] + type;
224
+ if (type == 2) {
225
+ return cmdopt_search_long(h);
226
+ }
227
+ }
228
+ } else {
229
+ if (*h->nextchar == '\0') {
230
+ cmdopt_next(h);
231
+ continue;
232
+ }
233
+ // Searches an option string.
234
+ return cmdopt_search(h);
235
+ }
236
+ }
237
+
238
+ return -1;
239
+ }
240
+
241
+ // cmdopt_init() initializes a cmdopt_t for successive cmdopt_get()s.
242
+ void cmdopt_init(cmdopt_t *h, int argc, char **argv,
243
+ const char *optstring, const cmdopt_option *longopts) {
244
+ static const char empty_optstring[] = "";
245
+
246
+ h->argc = argc;
247
+ h->argv = argv;
248
+ h->optnum = h->argc;
249
+
250
+ h->longopts = longopts;
251
+ h->optstring = (optstring != NULL) ? optstring : empty_optstring;
252
+
253
+ h->optind = 1;
254
+ h->nextchar = NULL;
255
+ h->optarg = NULL;
256
+ h->optopt = 0;
257
+ h->optlong = NULL;
258
+ h->opterr = 1;
259
+ h->longindex = 0;
260
+ }
261
+
262
+ // cmdopt_get() analyzes command line arguments and gets the next option.
263
+ int cmdopt_get(cmdopt_t *h) {
264
+ int value = cmdopt_main(h);
265
+
266
+ // Prints a warning to the standard error stream if enabled.
267
+ if (h->opterr) {
268
+ if (value == ':') {
269
+ // Warning for a lack of an option argument.
270
+ if (h->optlong == NULL) {
271
+ fprintf(stderr, "option requires an argument -- %c\n", h->optopt);
272
+ } else {
273
+ fprintf(stderr, "option `--%s' requires an argument\n",
274
+ h->longopts[h->longindex].name);
275
+ }
276
+ } else if (value == '?') {
277
+ // Warning for an invalid option.
278
+ if (h->optlong == NULL) {
279
+ fprintf(stderr, "invalid option -- %c\n", h->optopt);
280
+ } else {
281
+ fprintf(stderr, "unrecognized option `%s'\n", h->optlong);
282
+ }
283
+ } else if ((value != -1) && (h->opterr == 2)) {
284
+ // Actually this is not for warning, but for debugging.
285
+ if (h->optlong == NULL) {
286
+ fprintf(stderr, "option with `%s' -- %c\n", h->optarg, h->optopt);
287
+ } else {
288
+ fprintf(stderr, "option `--%s' with `%s'\n",
289
+ h->longopts[h->longindex].name, h->optarg);
290
+ }
291
+ }
292
+ }
293
+ return value;
294
+ }
295
+
296
+ #ifdef __cplusplus
297
+ } // extern "C"
298
+ #endif // __cplusplus
@@ -0,0 +1,58 @@
1
+ #ifndef MARISA_CMDOPT_H_
2
+ #define MARISA_CMDOPT_H_
3
+
4
+ #ifdef __cplusplus
5
+ extern "C" {
6
+ #endif
7
+
8
+ typedef struct cmdopt_option_ {
9
+ // `name' specifies the name of this option (without the leading "--").
10
+ // An array of options must be terminated with an option whose name == NULL.
11
+ const char *name;
12
+
13
+ // `has_arg' specifies whether an option takes an argument or not.
14
+ // 0 specifies that this option does not have any argument.
15
+ // 1 specifies that this option has an argument.
16
+ // 2 specifies that this option may have an argument.
17
+ int has_arg;
18
+
19
+ // `flag', if not NULL, points to an integer that cmdopt_get() overwrites with
20
+ // `val'; in that case cmdopt_get() returns 0 instead of `val'.
21
+ int *flag;
22
+
23
+ // `val' specifies a return value of cmdopt_get(). This value is returned
24
+ // when cmdopt_get() finds this option and `flag' is NULL.
25
+ int val;
26
+ } cmdopt_option;
27
+
28
+ typedef struct cmdopt_t_ {
29
+ // Command line arguments (argv is reordered: non-options move to the end).
30
+ int argc;
31
+ char **argv;
32
+
33
+ // Option settings.
34
+ const cmdopt_option *longopts;
35
+ const char *optstring;
36
+
37
+ int optind; // Index of the next argument to scan.
38
+ char *nextchar; // Next character in the current argument (NULL: none yet).
39
+ char *optarg; // Argument of the last option (NULL if none).
40
+ int optopt; // Character of the last short option.
41
+ char *optlong; // Last long option as written in argv (NULL if short).
42
+ int opterr; // Message level (0: nothing, 1: warning, 2: all).
43
+ int longindex; // Index of the last long option in `longopts'.
44
+ int optnum; // End of the argv range scanned (argc minus shifted ones).
45
+ } cmdopt_t;
46
+
47
+ // cmdopt_init() initializes a cmdopt_t for successive cmdopt_get()s.
48
+ void cmdopt_init(cmdopt_t *h, int argc, char **argv,
49
+ const char *optstring, const cmdopt_option *longopts);
50
+
51
+ // cmdopt_get() analyzes command line arguments and gets the next option.
52
+ int cmdopt_get(cmdopt_t *h);
53
+
54
+ #ifdef __cplusplus
55
+ } // extern "C"
56
+ #endif
57
+
58
+ #endif // MARISA_CMDOPT_H_
@@ -0,0 +1,418 @@
1
+ #include <cstdlib>
2
+ #include <cstring>
3
+ #include <ctime>
4
+ #include <fstream>
5
+ #include <iostream>
6
+ #include <string>
7
+ #include <vector>
8
+
9
+ #include <marisa.h>
10
+
11
+ #include "cmdopt.h"
12
+
13
+ namespace {
14
+
15
+ int param_min_num_tries = 1; // First number of tries benchmarked (-N).
16
+ int param_max_num_tries = 5; // Last number of tries benchmarked (-n).
17
+ marisa::TailMode param_tail_mode = MARISA_DEFAULT_TAIL; // Set by -t / -b.
18
+ marisa::NodeOrder param_node_order = MARISA_DEFAULT_ORDER; // Set by -w / -l.
19
+ marisa::CacheLevel param_cache_level = MARISA_DEFAULT_CACHE; // Set by -c.
20
+ bool param_with_predict = true; // false: skip predictive search (-p).
21
+ bool param_print_speed = true; // false: print time per key instead (-s).
22
+
23
// Stopwatch based on std::clock().
class Clock {
 public:
  // Starts measuring at construction.
  Clock() : start_(std::clock()) {}

  // Restarts the measurement from now.
  void reset() {
    start_ = std::clock();
  }

  // Returns the seconds counted by std::clock() since construction or the
  // last reset().
  double elasped() const {
    const std::clock_t now = std::clock();
    const double ticks = 1.0 * (now - start_);
    return ticks / CLOCKS_PER_SEC;
  }

 private:
  std::clock_t start_;
};
39
+
40
+ void print_help(const char *cmd) {
41
+ std::cerr << "Usage: " << cmd << " [OPTION]... [FILE]...\n\n"
42
+ "Options:\n"
43
+ " -N, --min-num-tries=[N] limit the number of tries"
44
+ " [" << MARISA_MIN_NUM_TRIES << ", " << MARISA_MAX_NUM_TRIES
45
+ << "] (default: 1)\n"
46
+ " -n, --max-num-tries=[N] limit the number of tries"
47
+ " [" << MARISA_MIN_NUM_TRIES << ", " << MARISA_MAX_NUM_TRIES
48
+ << "] (default: 10)\n"
49
+ " -t, --text-tail build a dictionary with text TAIL (default)\n"
50
+ " -b, --binary-tail build a dictionary with binary TAIL\n"
51
+ " -w, --weight-order arrange siblings in weight order (default)\n"
52
+ " -l, --label-order arrange siblings in label order\n"
53
+ " -c, --cache-level=[N] specify the cache size"
54
+ " [1, 5] (default: 3)\n"
55
+ " -P, --with-predict include predictive search (default)\n"
56
+ " -p, --without-predict skip predictive search\n"
57
+ " -S, --print-speed print speed [1000 keys/s] (default)\n"
58
+ " -s, --print-time print time [ns/key]\n"
59
+ " -h, --help print this help\n"
60
+ << std::endl;
61
+ }
62
+
63
+ void print_config() {
64
+ std::cout << "Number of tries: " << param_min_num_tries
65
+ << " - " << param_max_num_tries << std::endl;
66
+
67
+ std::cout << "TAIL mode: ";
68
+ switch (param_tail_mode) {
69
+ case MARISA_TEXT_TAIL: {
70
+ std::cout << "Text mode" << std::endl;
71
+ break;
72
+ }
73
+ case MARISA_BINARY_TAIL: {
74
+ std::cout << "Binary mode" << std::endl;
75
+ break;
76
+ }
77
+ }
78
+
79
+ std::cout << "Node order: ";
80
+ switch (param_node_order) {
81
+ case MARISA_LABEL_ORDER: {
82
+ std::cout << "Ascending label order" << std::endl;
83
+ break;
84
+ }
85
+ case MARISA_WEIGHT_ORDER: {
86
+ std::cout << "Descending weight order" << std::endl;
87
+ break;
88
+ }
89
+ }
90
+
91
+ std::cout << "Cache level: ";
92
+ switch (param_cache_level) {
93
+ case MARISA_HUGE_CACHE: {
94
+ std::cout << "Huge cache" << std::endl;
95
+ break;
96
+ }
97
+ case MARISA_LARGE_CACHE: {
98
+ std::cout << "Large cache" << std::endl;
99
+ break;
100
+ }
101
+ case MARISA_NORMAL_CACHE: {
102
+ std::cout << "Normal cache" << std::endl;
103
+ break;
104
+ }
105
+ case MARISA_SMALL_CACHE: {
106
+ std::cout << "Small cache" << std::endl;
107
+ break;
108
+ }
109
+ case MARISA_TINY_CACHE: {
110
+ std::cout << "Tiny cache" << std::endl;
111
+ break;
112
+ }
113
+ }
114
+ }
115
+
116
+ void print_time_info(std::size_t num_keys, double elasped) {
117
+ if (param_print_speed) {
118
+ if (elasped == 0.0) {
119
+ std::printf(" %8s", "-");
120
+ } else {
121
+ std::printf(" %8.2f", num_keys / elasped / 1000.0);
122
+ }
123
+ } else {
124
+ if ((elasped == 0.0) || (num_keys == 0)) {
125
+ std::printf(" %8s", "-");
126
+ } else {
127
+ std::printf(" %8.1f", 1000000000.0 * elasped / num_keys);
128
+ }
129
+ }
130
+ }
131
+
132
+ void read_keys(std::istream &input, marisa::Keyset *keyset,
133
+ std::vector<float> *weights) {
134
+ std::string line;
135
+ while (std::getline(input, line)) {
136
+ const std::string::size_type delim_pos = line.find_last_of('\t');
137
+ float weight = 1.0F;
138
+ if (delim_pos != line.npos) {
139
+ char *end_of_value;
140
+ weight = (float)std::strtod(&line[delim_pos + 1], &end_of_value);
141
+ if (*end_of_value == '\0') {
142
+ line.resize(delim_pos);
143
+ }
144
+ }
145
+ keyset->push_back(line.c_str(), line.length());
146
+ weights->push_back(weight);
147
+ }
148
+ }
149
+
150
+ int read_keys(const char * const *args, std::size_t num_args,
151
+ marisa::Keyset *keyset, std::vector<float> *weights) {
152
+ if (num_args == 0) {
153
+ read_keys(std::cin, keyset, weights);
154
+ }
155
+ for (std::size_t i = 0; i < num_args; ++i) {
156
+ std::ifstream input_file(args[i], std::ios::binary);
157
+ if (!input_file) {
158
+ std::cerr << "error: failed to open: " << args[i] << std::endl;
159
+ return 10;
160
+ }
161
+ read_keys(input_file, keyset, weights);
162
+ }
163
+ std::cout << "Number of keys: " << keyset->size() << std::endl;
164
+ std::cout << "Total length: " << keyset->total_length() << std::endl;
165
+ return 0;
166
+ }
167
+
168
+ void benchmark_build(marisa::Keyset &keyset,
169
+ const std::vector<float> &weights, int num_tries, marisa::Trie *trie) {
170
+ for (std::size_t i = 0; i < keyset.size(); ++i) {
171
+ keyset[i].set_weight(weights[i]);
172
+ }
173
+ Clock cl;
174
+ trie->build(keyset, num_tries | param_tail_mode | param_node_order |
175
+ param_cache_level);
176
+ std::printf(" %10lu", (unsigned long)trie->io_size());
177
+ print_time_info(keyset.size(), cl.elasped());
178
+ }
179
+
180
+ void benchmark_lookup(const marisa::Trie &trie,
181
+ const marisa::Keyset &keyset) {
182
+ Clock cl;
183
+ marisa::Agent agent;
184
+ for (std::size_t i = 0; i < keyset.size(); ++i) {
185
+ agent.set_query(keyset[i].ptr(), keyset[i].length());
186
+ if (!trie.lookup(agent) || (agent.key().id() != keyset[i].id())) {
187
+ std::cerr << "error: lookup() failed" << std::endl;
188
+ return;
189
+ }
190
+ }
191
+ print_time_info(keyset.size(), cl.elasped());
192
+ }
193
+
194
+ void benchmark_reverse_lookup(const marisa::Trie &trie,
195
+ const marisa::Keyset &keyset) {
196
+ Clock cl;
197
+ marisa::Agent agent;
198
+ for (std::size_t i = 0; i < keyset.size(); ++i) {
199
+ agent.set_query(keyset[i].id());
200
+ trie.reverse_lookup(agent);
201
+ if ((agent.key().id() != keyset[i].id()) ||
202
+ (agent.key().length() != keyset[i].length()) ||
203
+ (std::memcmp(agent.key().ptr(), keyset[i].ptr(),
204
+ agent.key().length()) != 0)) {
205
+ std::cerr << "error: reverse_lookup() failed" << std::endl;
206
+ return;
207
+ }
208
+ }
209
+ print_time_info(keyset.size(), cl.elasped());
210
+ }
211
+
212
+ void benchmark_common_prefix_search(const marisa::Trie &trie,
213
+ const marisa::Keyset &keyset) {
214
+ Clock cl;
215
+ marisa::Agent agent;
216
+ for (std::size_t i = 0; i < keyset.size(); ++i) {
217
+ agent.set_query(keyset[i].ptr(), keyset[i].length());
218
+ while (trie.common_prefix_search(agent)) {
219
+ if (agent.key().id() > keyset[i].id()) {
220
+ std::cerr << "error: common_prefix_search() failed" << std::endl;
221
+ return;
222
+ }
223
+ }
224
+ if (agent.key().id() != keyset[i].id()) {
225
+ std::cerr << "error: common_prefix_search() failed" << std::endl;
226
+ return;
227
+ }
228
+ }
229
+ print_time_info(keyset.size(), cl.elasped());
230
+ }
231
+
232
+ void benchmark_predictive_search(const marisa::Trie &trie,
233
+ const marisa::Keyset &keyset) {
234
+ if (!param_with_predict) {
235
+ print_time_info(keyset.size(), 0.0);
236
+ return;
237
+ }
238
+
239
+ Clock cl;
240
+ marisa::Agent agent;
241
+ for (std::size_t i = 0; i < keyset.size(); ++i) {
242
+ agent.set_query(keyset[i].ptr(), keyset[i].length());
243
+ if (!trie.predictive_search(agent) ||
244
+ (agent.key().id() != keyset[i].id())) {
245
+ std::cerr << "error: predictive_search() failed" << std::endl;
246
+ return;
247
+ }
248
+ while (trie.predictive_search(agent)) {
249
+ if (agent.key().id() <= keyset[i].id()) {
250
+ std::cerr << "error: predictive_search() failed" << std::endl;
251
+ return;
252
+ }
253
+ }
254
+ }
255
+ print_time_info(keyset.size(), cl.elasped());
256
+ }
257
+
258
+ void benchmark(marisa::Keyset &keyset, const std::vector<float> &weights,
259
+ int num_tries) {
260
+ std::printf("%6d", num_tries);
261
+ marisa::Trie trie;
262
+ benchmark_build(keyset, weights, num_tries, &trie);
263
+ if (!trie.empty()) {
264
+ benchmark_lookup(trie, keyset);
265
+ benchmark_reverse_lookup(trie, keyset);
266
+ benchmark_common_prefix_search(trie, keyset);
267
+ benchmark_predictive_search(trie, keyset);
268
+ }
269
+ std::printf("\n");
270
+ }
271
+
272
+ int benchmark(const char * const *args, std::size_t num_args) try {
273
+ marisa::Keyset keyset;
274
+ std::vector<float> weights;
275
+ const int ret = read_keys(args, num_args, &keyset, &weights);
276
+ if (ret != 0) {
277
+ return ret;
278
+ }
279
+ std::printf("------+----------+--------+--------+"
280
+ "--------+--------+--------\n");
281
+ std::printf("%6s %10s %8s %8s %8s %8s %8s\n",
282
+ "#tries", "size", "build", "lookup", "reverse", "prefix", "predict");
283
+ std::printf("%6s %10s %8s %8s %8s %8s %8s\n",
284
+ "", "", "", "", "lookup", "search", "search");
285
+ if (param_print_speed) {
286
+ std::printf("%6s %10s %8s %8s %8s %8s %8s\n",
287
+ "", "[bytes]",
288
+ "[K/s]", "[K/s]", "[K/s]", "[K/s]", "[K/s]");
289
+ } else {
290
+ std::printf("%6s %10s %8s %8s %8s %8s %8s\n",
291
+ "", "[bytes]", "[ns]", "[ns]", "[ns]", "[ns]", "[ns]");
292
+ }
293
+ std::printf("------+----------+--------+--------+"
294
+ "--------+--------+--------\n");
295
+ for (int i = param_min_num_tries; i <= param_max_num_tries; ++i) {
296
+ benchmark(keyset, weights, i);
297
+ }
298
+ std::printf("------+----------+--------+--------+"
299
+ "--------+--------+--------\n");
300
+ return 0;
301
+ } catch (const marisa::Exception &ex) {
302
+ std::cerr << ex.what() << std::endl;
303
+ return -1;
304
+ }
305
+
306
+ } // namespace
307
+
308
+ int main(int argc, char *argv[]) {
309
+ std::ios::sync_with_stdio(false);
310
+
311
+ ::cmdopt_option long_options[] = {
312
+ { "min-num-tries", 1, NULL, 'N' },
313
+ { "max-num-tries", 1, NULL, 'n' },
314
+ { "text-tail", 0, NULL, 't' },
315
+ { "binary-tail", 0, NULL, 'b' },
316
+ { "weight-order", 0, NULL, 'w' },
317
+ { "label-order", 0, NULL, 'l' },
318
+ { "cache-level", 1, NULL, 'c' },
319
+ { "predict-on", 0, NULL, 'P' },
320
+ { "predict-off", 0, NULL, 'p' },
321
+ { "print-speed", 0, NULL, 'S' },
322
+ { "print-time", 0, NULL, 's' },
323
+ { "help", 0, NULL, 'h' },
324
+ { NULL, 0, NULL, 0 }
325
+ };
326
+ ::cmdopt_t cmdopt;
327
+ ::cmdopt_init(&cmdopt, argc, argv, "N:n:tbwlc:PpSsh", long_options);
328
+ int label;
329
+ while ((label = ::cmdopt_get(&cmdopt)) != -1) {
330
+ switch (label) {
331
+ case 'N': {
332
+ char *end_of_value;
333
+ const long value = std::strtol(cmdopt.optarg, &end_of_value, 10);
334
+ if ((*end_of_value != '\0') || (value <= 0) ||
335
+ (value > MARISA_MAX_NUM_TRIES)) {
336
+ std::cerr << "error: option `-n' with an invalid argument: "
337
+ << cmdopt.optarg << std::endl;
338
+ return 1;
339
+ }
340
+ param_min_num_tries = (int)value;
341
+ break;
342
+ }
343
+ case 'n': {
344
+ char *end_of_value;
345
+ const long value = std::strtol(cmdopt.optarg, &end_of_value, 10);
346
+ if ((*end_of_value != '\0') || (value <= 0) ||
347
+ (value > MARISA_MAX_NUM_TRIES)) {
348
+ std::cerr << "error: option `-n' with an invalid argument: "
349
+ << cmdopt.optarg << std::endl;
350
+ return 2;
351
+ }
352
+ param_max_num_tries = (int)value;
353
+ break;
354
+ }
355
+ case 't': {
356
+ param_tail_mode = MARISA_TEXT_TAIL;
357
+ break;
358
+ }
359
+ case 'b': {
360
+ param_tail_mode = MARISA_BINARY_TAIL;
361
+ break;
362
+ }
363
+ case 'w': {
364
+ param_node_order = MARISA_WEIGHT_ORDER;
365
+ break;
366
+ }
367
+ case 'l': {
368
+ param_node_order = MARISA_LABEL_ORDER;
369
+ break;
370
+ }
371
+ case 'c': {
372
+ char *end_of_value;
373
+ const long value = std::strtol(cmdopt.optarg, &end_of_value, 10);
374
+ if ((*end_of_value != '\0') || (value < 1) || (value > 5)) {
375
+ std::cerr << "error: option `-c' with an invalid argument: "
376
+ << cmdopt.optarg << std::endl;
377
+ return 3;
378
+ } else if (value == 1) {
379
+ param_cache_level = MARISA_TINY_CACHE;
380
+ } else if (value == 2) {
381
+ param_cache_level = MARISA_SMALL_CACHE;
382
+ } else if (value == 3) {
383
+ param_cache_level = MARISA_NORMAL_CACHE;
384
+ } else if (value == 4) {
385
+ param_cache_level = MARISA_LARGE_CACHE;
386
+ } else if (value == 5) {
387
+ param_cache_level = MARISA_HUGE_CACHE;
388
+ }
389
+ break;
390
+ }
391
+ case 'P': {
392
+ param_with_predict = true;
393
+ break;
394
+ }
395
+ case 'p': {
396
+ param_with_predict = false;
397
+ break;
398
+ }
399
+ case 'S': {
400
+ param_print_speed = true;
401
+ break;
402
+ }
403
+ case 's': {
404
+ param_print_speed = false;
405
+ break;
406
+ }
407
+ case 'h': {
408
+ print_help(argv[0]);
409
+ return 0;
410
+ }
411
+ default: {
412
+ return 1;
413
+ }
414
+ }
415
+ }
416
+ print_config();
417
+ return benchmark(cmdopt.argv + cmdopt.optind, cmdopt.argc - cmdopt.optind);
418
+ }