melisa 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (88) hide show
  1. data/README.md +11 -0
  2. data/ext/marisa/bindings/marisa-swig.cxx +253 -0
  3. data/ext/marisa/bindings/marisa-swig.h +183 -0
  4. data/ext/marisa/bindings/perl/marisa-swig.cxx +253 -0
  5. data/ext/marisa/bindings/perl/marisa-swig.h +183 -0
  6. data/ext/marisa/bindings/perl/marisa-swig_wrap.cxx +5160 -0
  7. data/ext/marisa/bindings/python/marisa-swig.cxx +253 -0
  8. data/ext/marisa/bindings/python/marisa-swig.h +183 -0
  9. data/ext/marisa/bindings/python/marisa-swig_wrap.cxx +6090 -0
  10. data/ext/marisa/bindings/ruby/extconf.rb +5 -0
  11. data/ext/marisa/bindings/ruby/marisa-swig.cxx +253 -0
  12. data/ext/marisa/bindings/ruby/marisa-swig.h +183 -0
  13. data/ext/marisa/bindings/ruby/marisa-swig_wrap.cxx +4708 -0
  14. data/ext/marisa/lib/marisa.h +14 -0
  15. data/ext/marisa/lib/marisa/agent.cc +51 -0
  16. data/ext/marisa/lib/marisa/agent.h +73 -0
  17. data/ext/marisa/lib/marisa/base.h +193 -0
  18. data/ext/marisa/lib/marisa/exception.h +82 -0
  19. data/ext/marisa/lib/marisa/grimoire/algorithm.h +26 -0
  20. data/ext/marisa/lib/marisa/grimoire/algorithm/sort.h +196 -0
  21. data/ext/marisa/lib/marisa/grimoire/intrin.h +115 -0
  22. data/ext/marisa/lib/marisa/grimoire/io.h +18 -0
  23. data/ext/marisa/lib/marisa/grimoire/io/mapper.cc +163 -0
  24. data/ext/marisa/lib/marisa/grimoire/io/mapper.h +67 -0
  25. data/ext/marisa/lib/marisa/grimoire/io/reader.cc +147 -0
  26. data/ext/marisa/lib/marisa/grimoire/io/reader.h +66 -0
  27. data/ext/marisa/lib/marisa/grimoire/io/writer.cc +148 -0
  28. data/ext/marisa/lib/marisa/grimoire/io/writer.h +65 -0
  29. data/ext/marisa/lib/marisa/grimoire/trie.h +16 -0
  30. data/ext/marisa/lib/marisa/grimoire/trie/cache.h +81 -0
  31. data/ext/marisa/lib/marisa/grimoire/trie/config.h +155 -0
  32. data/ext/marisa/lib/marisa/grimoire/trie/entry.h +82 -0
  33. data/ext/marisa/lib/marisa/grimoire/trie/header.h +61 -0
  34. data/ext/marisa/lib/marisa/grimoire/trie/history.h +65 -0
  35. data/ext/marisa/lib/marisa/grimoire/trie/key.h +228 -0
  36. data/ext/marisa/lib/marisa/grimoire/trie/louds-trie.cc +876 -0
  37. data/ext/marisa/lib/marisa/grimoire/trie/louds-trie.h +134 -0
  38. data/ext/marisa/lib/marisa/grimoire/trie/range.h +115 -0
  39. data/ext/marisa/lib/marisa/grimoire/trie/state.h +117 -0
  40. data/ext/marisa/lib/marisa/grimoire/trie/tail.cc +218 -0
  41. data/ext/marisa/lib/marisa/grimoire/trie/tail.h +72 -0
  42. data/ext/marisa/lib/marisa/grimoire/vector.h +18 -0
  43. data/ext/marisa/lib/marisa/grimoire/vector/bit-vector.cc +826 -0
  44. data/ext/marisa/lib/marisa/grimoire/vector/bit-vector.h +179 -0
  45. data/ext/marisa/lib/marisa/grimoire/vector/flat-vector.h +205 -0
  46. data/ext/marisa/lib/marisa/grimoire/vector/pop-count.h +110 -0
  47. data/ext/marisa/lib/marisa/grimoire/vector/rank-index.h +82 -0
  48. data/ext/marisa/lib/marisa/grimoire/vector/vector.h +256 -0
  49. data/ext/marisa/lib/marisa/iostream.h +18 -0
  50. data/ext/marisa/lib/marisa/key.h +85 -0
  51. data/ext/marisa/lib/marisa/keyset.cc +181 -0
  52. data/ext/marisa/lib/marisa/keyset.h +80 -0
  53. data/ext/marisa/lib/marisa/query.h +71 -0
  54. data/ext/marisa/lib/marisa/scoped-array.h +48 -0
  55. data/ext/marisa/lib/marisa/scoped-ptr.h +52 -0
  56. data/ext/marisa/lib/marisa/stdio.h +15 -0
  57. data/ext/marisa/lib/marisa/trie.cc +249 -0
  58. data/ext/marisa/lib/marisa/trie.h +64 -0
  59. data/ext/marisa/tests/base-test.cc +309 -0
  60. data/ext/marisa/tests/io-test.cc +252 -0
  61. data/ext/marisa/tests/marisa-assert.h +26 -0
  62. data/ext/marisa/tests/marisa-test.cc +388 -0
  63. data/ext/marisa/tests/trie-test.cc +507 -0
  64. data/ext/marisa/tests/vector-test.cc +466 -0
  65. data/ext/marisa/tools/cmdopt.cc +298 -0
  66. data/ext/marisa/tools/cmdopt.h +58 -0
  67. data/ext/marisa/tools/marisa-benchmark.cc +418 -0
  68. data/ext/marisa/tools/marisa-build.cc +206 -0
  69. data/ext/marisa/tools/marisa-common-prefix-search.cc +143 -0
  70. data/ext/marisa/tools/marisa-dump.cc +151 -0
  71. data/ext/marisa/tools/marisa-lookup.cc +110 -0
  72. data/ext/marisa/tools/marisa-predictive-search.cc +143 -0
  73. data/ext/marisa/tools/marisa-reverse-lookup.cc +110 -0
  74. data/lib/melisa.rb +7 -0
  75. data/lib/melisa/base_config_flags.rb +76 -0
  76. data/lib/melisa/bytes_trie.rb +55 -0
  77. data/lib/melisa/int_trie.rb +14 -0
  78. data/lib/melisa/search.rb +55 -0
  79. data/lib/melisa/trie.rb +96 -0
  80. data/lib/melisa/version.rb +3 -0
  81. data/melisa.gemspec +36 -0
  82. data/spec/base_config_flags_spec.rb +73 -0
  83. data/spec/bytes_trie_spec.rb +16 -0
  84. data/spec/int_trie_spec.rb +16 -0
  85. data/spec/search_spec.rb +29 -0
  86. data/spec/spec_helper.rb +1 -0
  87. data/spec/trie_spec.rb +30 -0
  88. metadata +207 -0
@@ -0,0 +1,298 @@
1
+ #include <stdio.h>
2
+
3
+ #include "cmdopt.h"
4
+
5
+ #ifdef __cplusplus
6
+ extern "C" {
7
+ #endif // __cplusplus
8
+
9
+ // Moves `optind' to the end and shifts other arguments.
10
+ static void cmdopt_shift(cmdopt_t *h) {
11
+ int i;
12
+ char *tmp;
13
+
14
+ tmp = h->argv[h->optind];
15
+ for (i = h->optind; i < h->argc - 1; i++) {
16
+ h->argv[i] = h->argv[i + 1];
17
+ }
18
+ h->argv[i] = tmp;
19
+
20
+ h->nextchar = NULL;
21
+ h->optnum--;
22
+ }
23
+
24
+ // Moves to the next argument.
25
+ static void cmdopt_next(cmdopt_t *h) {
26
+ h->optind++;
27
+ h->nextchar = NULL;
28
+ }
29
+
30
+ // Checks if the current argument is an option or not.
31
+ static int cmdopt_check(cmdopt_t *h) {
32
+ int ret = 1;
33
+ const char *arg = h->argv[h->optind];
34
+
35
+ if (*arg++ != '-') {
36
+ return 0;
37
+ }
38
+
39
+ if (*arg == '-') {
40
+ arg++;
41
+ ret++;
42
+ }
43
+
44
+ return ret - (*arg == '\0');
45
+ }
46
+
47
+ // Gets an argument of the current option.
48
+ static void cmdopt_getopt(cmdopt_t *h) {
49
+ // Moves to the next argument if the current argument has no more characters.
50
+ if (*h->nextchar == '\0') {
51
+ cmdopt_next(h);
52
+ h->nextchar = h->argv[h->optind];
53
+ }
54
+
55
+ // Checks whether the current option has an argument or not.
56
+ if (h->optind < h->optnum) {
57
+ h->optarg = h->nextchar;
58
+ cmdopt_next(h);
59
+ } else {
60
+ h->optarg = NULL;
61
+ }
62
+ }
63
+
64
+ // Searches an option.
65
+ static int cmdopt_search(cmdopt_t *h) {
66
+ const char *ptr;
67
+
68
+ // Updates an option character.
69
+ h->optopt = *h->nextchar++;
70
+
71
+ for (ptr = h->optstring; *ptr != '\0'; ptr++) {
72
+ if (*ptr == h->optopt) {
73
+ // Gets an option argument if required.
74
+ if (ptr[1] == ':') {
75
+ cmdopt_getopt(h);
76
+
77
+ // Returns ':' if there is no argument.
78
+ if (h->optarg == NULL && ptr[2] != ':') {
79
+ return ':';
80
+ }
81
+ }
82
+ return h->optopt;
83
+ }
84
+ }
85
+
86
+ if (h->optopt == '-') {
87
+ cmdopt_next(h);
88
+ while (h->optind < h->optnum) {
89
+ cmdopt_shift(h);
90
+ }
91
+ return -1;
92
+ }
93
+
94
+ // Returns '?' if the option character is undefined.
95
+ return '?';
96
+ }
97
+
98
// Compares a long option name `opt' with the name part of `arg' (everything
// up to the end of the string or the first '=').
// Returns 0 if the name part is empty or is not a prefix of `opt', the
// negated length if it equals `opt' exactly, and the positive length if it
// is a proper prefix of `opt'.
static int cmdopt_match_len(const char *opt, const char *arg) {
  int len = 0;

  // Returns 0 if there is a mismatch. Running off the end of `opt' is also
  // a mismatch because its terminating '\0' differs from the argument char.
  while ((*arg != '\0') && (*arg != '=')) {
    if (*arg++ != *opt++) {
      return 0;
    }
    len++;
  }

  // The argument name is exhausted here, so the match is perfect only when
  // the option name is exhausted as well.
  if (*opt == '\0') {
    return -len;
  }

  return len;
}
118
+
119
+ // Checks long options.
120
+ static int cmdopt_match(cmdopt_t *h) {
121
+ int i, len;
122
+ int max = 0, max_optind = -1;
123
+
124
+ // Returns -1 if there are no long options.
125
+ if (h->longopts == NULL) {
126
+ return max_optind;
127
+ }
128
+
129
+ for (i = 0; h->longopts[i].name != NULL; i++) {
130
+ len = cmdopt_match_len(h->longopts[i].name, h->nextchar);
131
+ if (len < 0) {
132
+ // In case of a perfect match.
133
+ h->nextchar -= len;
134
+ return i;
135
+ } else if (len > max) {
136
+ // In case of a prefix match.
137
+ max = len;
138
+ max_optind = i;
139
+ } else if (len == max) {
140
+ // There are other candidates.
141
+ max_optind = -1;
142
+ }
143
+ }
144
+
145
+ // If there is no perfect match, adopts the longest one.
146
+ h->nextchar += max;
147
+ return max_optind;
148
+ }
149
+
150
+ // Gets an argument of a long option.
151
+ static void cmdopt_getopt_long(cmdopt_t *h) {
152
+ if (*h->nextchar == '=') {
153
+ h->optarg = h->nextchar + 1;
154
+ cmdopt_next(h);
155
+ } else {
156
+ cmdopt_next(h);
157
+
158
+ // Checks whether there are more options or not.
159
+ if (h->optind < h->optnum) {
160
+ h->optarg = h->argv[h->optind];
161
+ cmdopt_next(h);
162
+ } else {
163
+ h->optarg = NULL;
164
+ }
165
+ }
166
+ }
167
+
168
+ // Searches long options.
169
+ static int cmdopt_search_long(cmdopt_t *h) {
170
+ const cmdopt_option *option;
171
+
172
+ // Keeps the long option.
173
+ h->optlong = h->argv[h->optind];
174
+
175
+ // Gets the next option.
176
+ h->longindex = cmdopt_match(h);
177
+ if (h->longindex < 0) {
178
+ cmdopt_next(h);
179
+ return '?';
180
+ }
181
+
182
+ // Gets an argument if required.
183
+ option = h->longopts + h->longindex;
184
+ if (option->has_arg) {
185
+ cmdopt_getopt_long(h);
186
+
187
+ // Return ':' if there are no more arguments.
188
+ if (h->optarg == NULL) {
189
+ return ':';
190
+ }
191
+ } else if (*h->nextchar == '=') {
192
+ // Returns '?' for an extra option argument.
193
+ cmdopt_getopt_long(h);
194
+ return '?';
195
+ }
196
+
197
+ // Overwrites a variable if specified in settings.
198
+ if (option->flag != NULL) {
199
+ *option->flag = option->val;
200
+ return 0;
201
+ }
202
+
203
+ return option->val;
204
+ }
205
+
206
+ // Analyze command line option.
207
+ static int cmdopt_main(cmdopt_t *h) {
208
+ int type;
209
+
210
+ // Initializes the internal state.
211
+ h->optopt = 0;
212
+ h->optlong = NULL;
213
+ h->optarg = NULL;
214
+ h->longindex = 0;
215
+
216
+ while (h->optind < h->optnum) {
217
+ if (h->nextchar == NULL) {
218
+ // Checks whether the next argument is an option or not.
219
+ type = cmdopt_check(h);
220
+ if (type == 0) {
221
+ cmdopt_shift(h);
222
+ } else {
223
+ h->nextchar = h->argv[h->optind] + type;
224
+ if (type == 2) {
225
+ return cmdopt_search_long(h);
226
+ }
227
+ }
228
+ } else {
229
+ if (*h->nextchar == '\0') {
230
+ cmdopt_next(h);
231
+ continue;
232
+ }
233
+ // Searches an option string.
234
+ return cmdopt_search(h);
235
+ }
236
+ }
237
+
238
+ return -1;
239
+ }
240
+
241
+ // cmdopt_init() initializes a cmdopt_t for successive cmdopt_get()s.
242
+ void cmdopt_init(cmdopt_t *h, int argc, char **argv,
243
+ const char *optstring, const cmdopt_option *longopts) {
244
+ static const char empty_optstring[] = "";
245
+
246
+ h->argc = argc;
247
+ h->argv = argv;
248
+ h->optnum = h->argc;
249
+
250
+ h->longopts = longopts;
251
+ h->optstring = (optstring != NULL) ? optstring : empty_optstring;
252
+
253
+ h->optind = 1;
254
+ h->nextchar = NULL;
255
+ h->optarg = NULL;
256
+ h->optopt = 0;
257
+ h->optlong = NULL;
258
+ h->opterr = 1;
259
+ h->longindex = 0;
260
+ }
261
+
262
+ // cmdopt_get() analyzes command line arguments and gets the next option.
263
+ int cmdopt_get(cmdopt_t *h) {
264
+ int value = cmdopt_main(h);
265
+
266
+ // Prints a warning to the standard error stream if enabled.
267
+ if (h->opterr) {
268
+ if (value == ':') {
269
+ // Warning for a lack of an option argument.
270
+ if (h->optlong == NULL) {
271
+ fprintf(stderr, "option requires an argument -- %c\n", h->optopt);
272
+ } else {
273
+ fprintf(stderr, "option `--%s' requires an argument\n",
274
+ h->longopts[h->longindex].name);
275
+ }
276
+ } else if (value == '?') {
277
+ // Warning for an invalid option.
278
+ if (h->optlong == NULL) {
279
+ fprintf(stderr, "invalid option -- %c\n", h->optopt);
280
+ } else {
281
+ fprintf(stderr, "unrecognized option `%s'\n", h->optlong);
282
+ }
283
+ } else if ((value != -1) && (h->opterr == 2)) {
284
+ // Actually this is not for warning, but for debugging.
285
+ if (h->optlong == NULL) {
286
+ fprintf(stderr, "option with `%s' -- %c\n", h->optarg, h->optopt);
287
+ } else {
288
+ fprintf(stderr, "option `--%s' with `%s'\n",
289
+ h->longopts[h->longindex].name, h->optarg);
290
+ }
291
+ }
292
+ }
293
+ return value;
294
+ }
295
+
296
+ #ifdef __cplusplus
297
+ } // extern "C"
298
+ #endif // __cplusplus
@@ -0,0 +1,58 @@
1
+ #ifndef MARISA_CMDOPT_H_
2
+ #define MARISA_CMDOPT_H_
3
+
4
+ #ifdef __cplusplus
5
+ extern "C" {
6
+ #endif
7
+
8
+ typedef struct cmdopt_option_ {
9
+ // `name' specifies the name of this option.
10
+ // An array of options must be terminated with an option whose name == NULL.
11
+ const char *name;
12
+
13
+ // `has_arg' specifies whether an option takes an argument or not.
14
+ // 0 specifies that this option does not have any argument.
15
+ // 1 specifies that this option has an argument.
16
+ // 2 specifies that this option may have an argument.
17
+ int has_arg;
18
+
19
+ // `flag' specifies an integer variable which cmdopt_get() overwrites with
20
+ // `val'. When `flag' is not NULL, cmdopt_get() returns 0 instead of `val'.
21
+ int *flag;
22
+
23
+ // `val' specifies a return value of cmdopt_get(). This value is returned
24
+ // when cmdopt_get() finds this option and `flag' is NULL.
25
+ int val;
26
+ } cmdopt_option;
27
+
28
+ typedef struct cmdopt_t_ {
29
+ // Command line arguments.
30
+ int argc;
31
+ char **argv;
32
+
33
+ // Option settings.
34
+ const cmdopt_option *longopts;
35
+ const char *optstring;
36
+
37
+ int optind; // Index of the next argument.
38
+ char *nextchar; // Next character.
39
+ char *optarg; // Argument of the last option.
40
+ int optopt; // Label of the last option.
41
+ char *optlong; // Long option.
42
+ int opterr; // Warning level (0: nothing, 1: warning, 2: all).
43
+ int longindex; // Index of the last long option.
44
+ int optnum; // Number of options.
45
+ } cmdopt_t;
46
+
47
+ // cmdopt_init() initializes a cmdopt_t for successive cmdopt_get()s.
48
+ void cmdopt_init(cmdopt_t *h, int argc, char **argv,
49
+ const char *optstring, const cmdopt_option *longopts);
50
+
51
+ // cmdopt_get() analyzes command line arguments and gets the next option.
52
+ int cmdopt_get(cmdopt_t *h);
53
+
54
+ #ifdef __cplusplus
55
+ } // extern "C"
56
+ #endif
57
+
58
+ #endif // MARISA_CMDOPT_H_
@@ -0,0 +1,418 @@
1
+ #include <cstdlib>
2
+ #include <cstring>
3
+ #include <ctime>
4
+ #include <fstream>
5
+ #include <iostream>
6
+ #include <string>
7
+ #include <vector>
8
+
9
+ #include <marisa.h>
10
+
11
+ #include "cmdopt.h"
12
+
13
+ namespace {
14
+
15
+ int param_min_num_tries = 1;
16
+ int param_max_num_tries = 5;
17
+ marisa::TailMode param_tail_mode = MARISA_DEFAULT_TAIL;
18
+ marisa::NodeOrder param_node_order = MARISA_DEFAULT_ORDER;
19
+ marisa::CacheLevel param_cache_level = MARISA_DEFAULT_CACHE;
20
+ bool param_with_predict = true;
21
+ bool param_print_speed = true;
22
+
23
// Stopwatch based on std::clock(): it starts on construction and can be
// restarted with reset().
class Clock {
 public:
  Clock() : start_(std::clock()) {}

  // Restarts the measurement from now.
  void reset() {
    start_ = std::clock();
  }

  // Returns the time in seconds since construction or the last reset().
  double elasped() const {
    const std::clock_t ticks = std::clock() - start_;
    return 1.0 * ticks / CLOCKS_PER_SEC;
  }

 private:
  std::clock_t start_;
};
39
+
40
+ void print_help(const char *cmd) {
41
+ std::cerr << "Usage: " << cmd << " [OPTION]... [FILE]...\n\n"
42
+ "Options:\n"
43
+ " -N, --min-num-tries=[N] limit the number of tries"
44
+ " [" << MARISA_MIN_NUM_TRIES << ", " << MARISA_MAX_NUM_TRIES
45
+ << "] (default: 1)\n"
46
+ " -n, --max-num-tries=[N] limit the number of tries"
47
+ " [" << MARISA_MIN_NUM_TRIES << ", " << MARISA_MAX_NUM_TRIES
48
+ << "] (default: 10)\n"
49
+ " -t, --text-tail build a dictionary with text TAIL (default)\n"
50
+ " -b, --binary-tail build a dictionary with binary TAIL\n"
51
+ " -w, --weight-order arrange siblings in weight order (default)\n"
52
+ " -l, --label-order arrange siblings in label order\n"
53
+ " -c, --cache-level=[N] specify the cache size"
54
+ " [1, 5] (default: 3)\n"
55
+ " -P, --with-predict include predictive search (default)\n"
56
+ " -p, --without-predict skip predictive search\n"
57
+ " -S, --print-speed print speed [1000 keys/s] (default)\n"
58
+ " -s, --print-time print time [ns/key]\n"
59
+ " -h, --help print this help\n"
60
+ << std::endl;
61
+ }
62
+
63
+ void print_config() {
64
+ std::cout << "Number of tries: " << param_min_num_tries
65
+ << " - " << param_max_num_tries << std::endl;
66
+
67
+ std::cout << "TAIL mode: ";
68
+ switch (param_tail_mode) {
69
+ case MARISA_TEXT_TAIL: {
70
+ std::cout << "Text mode" << std::endl;
71
+ break;
72
+ }
73
+ case MARISA_BINARY_TAIL: {
74
+ std::cout << "Binary mode" << std::endl;
75
+ break;
76
+ }
77
+ }
78
+
79
+ std::cout << "Node order: ";
80
+ switch (param_node_order) {
81
+ case MARISA_LABEL_ORDER: {
82
+ std::cout << "Ascending label order" << std::endl;
83
+ break;
84
+ }
85
+ case MARISA_WEIGHT_ORDER: {
86
+ std::cout << "Descending weight order" << std::endl;
87
+ break;
88
+ }
89
+ }
90
+
91
+ std::cout << "Cache level: ";
92
+ switch (param_cache_level) {
93
+ case MARISA_HUGE_CACHE: {
94
+ std::cout << "Huge cache" << std::endl;
95
+ break;
96
+ }
97
+ case MARISA_LARGE_CACHE: {
98
+ std::cout << "Large cache" << std::endl;
99
+ break;
100
+ }
101
+ case MARISA_NORMAL_CACHE: {
102
+ std::cout << "Normal cache" << std::endl;
103
+ break;
104
+ }
105
+ case MARISA_SMALL_CACHE: {
106
+ std::cout << "Small cache" << std::endl;
107
+ break;
108
+ }
109
+ case MARISA_TINY_CACHE: {
110
+ std::cout << "Tiny cache" << std::endl;
111
+ break;
112
+ }
113
+ }
114
+ }
115
+
116
+ void print_time_info(std::size_t num_keys, double elasped) {
117
+ if (param_print_speed) {
118
+ if (elasped == 0.0) {
119
+ std::printf(" %8s", "-");
120
+ } else {
121
+ std::printf(" %8.2f", num_keys / elasped / 1000.0);
122
+ }
123
+ } else {
124
+ if ((elasped == 0.0) || (num_keys == 0)) {
125
+ std::printf(" %8s", "-");
126
+ } else {
127
+ std::printf(" %8.1f", 1000000000.0 * elasped / num_keys);
128
+ }
129
+ }
130
+ }
131
+
132
+ void read_keys(std::istream &input, marisa::Keyset *keyset,
133
+ std::vector<float> *weights) {
134
+ std::string line;
135
+ while (std::getline(input, line)) {
136
+ const std::string::size_type delim_pos = line.find_last_of('\t');
137
+ float weight = 1.0F;
138
+ if (delim_pos != line.npos) {
139
+ char *end_of_value;
140
+ weight = (float)std::strtod(&line[delim_pos + 1], &end_of_value);
141
+ if (*end_of_value == '\0') {
142
+ line.resize(delim_pos);
143
+ }
144
+ }
145
+ keyset->push_back(line.c_str(), line.length());
146
+ weights->push_back(weight);
147
+ }
148
+ }
149
+
150
+ int read_keys(const char * const *args, std::size_t num_args,
151
+ marisa::Keyset *keyset, std::vector<float> *weights) {
152
+ if (num_args == 0) {
153
+ read_keys(std::cin, keyset, weights);
154
+ }
155
+ for (std::size_t i = 0; i < num_args; ++i) {
156
+ std::ifstream input_file(args[i], std::ios::binary);
157
+ if (!input_file) {
158
+ std::cerr << "error: failed to open: " << args[i] << std::endl;
159
+ return 10;
160
+ }
161
+ read_keys(input_file, keyset, weights);
162
+ }
163
+ std::cout << "Number of keys: " << keyset->size() << std::endl;
164
+ std::cout << "Total length: " << keyset->total_length() << std::endl;
165
+ return 0;
166
+ }
167
+
168
+ void benchmark_build(marisa::Keyset &keyset,
169
+ const std::vector<float> &weights, int num_tries, marisa::Trie *trie) {
170
+ for (std::size_t i = 0; i < keyset.size(); ++i) {
171
+ keyset[i].set_weight(weights[i]);
172
+ }
173
+ Clock cl;
174
+ trie->build(keyset, num_tries | param_tail_mode | param_node_order |
175
+ param_cache_level);
176
+ std::printf(" %10lu", (unsigned long)trie->io_size());
177
+ print_time_info(keyset.size(), cl.elasped());
178
+ }
179
+
180
+ void benchmark_lookup(const marisa::Trie &trie,
181
+ const marisa::Keyset &keyset) {
182
+ Clock cl;
183
+ marisa::Agent agent;
184
+ for (std::size_t i = 0; i < keyset.size(); ++i) {
185
+ agent.set_query(keyset[i].ptr(), keyset[i].length());
186
+ if (!trie.lookup(agent) || (agent.key().id() != keyset[i].id())) {
187
+ std::cerr << "error: lookup() failed" << std::endl;
188
+ return;
189
+ }
190
+ }
191
+ print_time_info(keyset.size(), cl.elasped());
192
+ }
193
+
194
+ void benchmark_reverse_lookup(const marisa::Trie &trie,
195
+ const marisa::Keyset &keyset) {
196
+ Clock cl;
197
+ marisa::Agent agent;
198
+ for (std::size_t i = 0; i < keyset.size(); ++i) {
199
+ agent.set_query(keyset[i].id());
200
+ trie.reverse_lookup(agent);
201
+ if ((agent.key().id() != keyset[i].id()) ||
202
+ (agent.key().length() != keyset[i].length()) ||
203
+ (std::memcmp(agent.key().ptr(), keyset[i].ptr(),
204
+ agent.key().length()) != 0)) {
205
+ std::cerr << "error: reverse_lookup() failed" << std::endl;
206
+ return;
207
+ }
208
+ }
209
+ print_time_info(keyset.size(), cl.elasped());
210
+ }
211
+
212
+ void benchmark_common_prefix_search(const marisa::Trie &trie,
213
+ const marisa::Keyset &keyset) {
214
+ Clock cl;
215
+ marisa::Agent agent;
216
+ for (std::size_t i = 0; i < keyset.size(); ++i) {
217
+ agent.set_query(keyset[i].ptr(), keyset[i].length());
218
+ while (trie.common_prefix_search(agent)) {
219
+ if (agent.key().id() > keyset[i].id()) {
220
+ std::cerr << "error: common_prefix_search() failed" << std::endl;
221
+ return;
222
+ }
223
+ }
224
+ if (agent.key().id() != keyset[i].id()) {
225
+ std::cerr << "error: common_prefix_search() failed" << std::endl;
226
+ return;
227
+ }
228
+ }
229
+ print_time_info(keyset.size(), cl.elasped());
230
+ }
231
+
232
+ void benchmark_predictive_search(const marisa::Trie &trie,
233
+ const marisa::Keyset &keyset) {
234
+ if (!param_with_predict) {
235
+ print_time_info(keyset.size(), 0.0);
236
+ return;
237
+ }
238
+
239
+ Clock cl;
240
+ marisa::Agent agent;
241
+ for (std::size_t i = 0; i < keyset.size(); ++i) {
242
+ agent.set_query(keyset[i].ptr(), keyset[i].length());
243
+ if (!trie.predictive_search(agent) ||
244
+ (agent.key().id() != keyset[i].id())) {
245
+ std::cerr << "error: predictive_search() failed" << std::endl;
246
+ return;
247
+ }
248
+ while (trie.predictive_search(agent)) {
249
+ if (agent.key().id() <= keyset[i].id()) {
250
+ std::cerr << "error: predictive_search() failed" << std::endl;
251
+ return;
252
+ }
253
+ }
254
+ }
255
+ print_time_info(keyset.size(), cl.elasped());
256
+ }
257
+
258
+ void benchmark(marisa::Keyset &keyset, const std::vector<float> &weights,
259
+ int num_tries) {
260
+ std::printf("%6d", num_tries);
261
+ marisa::Trie trie;
262
+ benchmark_build(keyset, weights, num_tries, &trie);
263
+ if (!trie.empty()) {
264
+ benchmark_lookup(trie, keyset);
265
+ benchmark_reverse_lookup(trie, keyset);
266
+ benchmark_common_prefix_search(trie, keyset);
267
+ benchmark_predictive_search(trie, keyset);
268
+ }
269
+ std::printf("\n");
270
+ }
271
+
272
+ int benchmark(const char * const *args, std::size_t num_args) try {
273
+ marisa::Keyset keyset;
274
+ std::vector<float> weights;
275
+ const int ret = read_keys(args, num_args, &keyset, &weights);
276
+ if (ret != 0) {
277
+ return ret;
278
+ }
279
+ std::printf("------+----------+--------+--------+"
280
+ "--------+--------+--------\n");
281
+ std::printf("%6s %10s %8s %8s %8s %8s %8s\n",
282
+ "#tries", "size", "build", "lookup", "reverse", "prefix", "predict");
283
+ std::printf("%6s %10s %8s %8s %8s %8s %8s\n",
284
+ "", "", "", "", "lookup", "search", "search");
285
+ if (param_print_speed) {
286
+ std::printf("%6s %10s %8s %8s %8s %8s %8s\n",
287
+ "", "[bytes]",
288
+ "[K/s]", "[K/s]", "[K/s]", "[K/s]", "[K/s]");
289
+ } else {
290
+ std::printf("%6s %10s %8s %8s %8s %8s %8s\n",
291
+ "", "[bytes]", "[ns]", "[ns]", "[ns]", "[ns]", "[ns]");
292
+ }
293
+ std::printf("------+----------+--------+--------+"
294
+ "--------+--------+--------\n");
295
+ for (int i = param_min_num_tries; i <= param_max_num_tries; ++i) {
296
+ benchmark(keyset, weights, i);
297
+ }
298
+ std::printf("------+----------+--------+--------+"
299
+ "--------+--------+--------\n");
300
+ return 0;
301
+ } catch (const marisa::Exception &ex) {
302
+ std::cerr << ex.what() << std::endl;
303
+ return -1;
304
+ }
305
+
306
+ } // namespace
307
+
308
+ int main(int argc, char *argv[]) {
309
+ std::ios::sync_with_stdio(false);
310
+
311
+ ::cmdopt_option long_options[] = {
312
+ { "min-num-tries", 1, NULL, 'N' },
313
+ { "max-num-tries", 1, NULL, 'n' },
314
+ { "text-tail", 0, NULL, 't' },
315
+ { "binary-tail", 0, NULL, 'b' },
316
+ { "weight-order", 0, NULL, 'w' },
317
+ { "label-order", 0, NULL, 'l' },
318
+ { "cache-level", 1, NULL, 'c' },
319
+ { "predict-on", 0, NULL, 'P' },
320
+ { "predict-off", 0, NULL, 'p' },
321
+ { "print-speed", 0, NULL, 'S' },
322
+ { "print-time", 0, NULL, 's' },
323
+ { "help", 0, NULL, 'h' },
324
+ { NULL, 0, NULL, 0 }
325
+ };
326
+ ::cmdopt_t cmdopt;
327
+ ::cmdopt_init(&cmdopt, argc, argv, "N:n:tbwlc:PpSsh", long_options);
328
+ int label;
329
+ while ((label = ::cmdopt_get(&cmdopt)) != -1) {
330
+ switch (label) {
331
+ case 'N': {
332
+ char *end_of_value;
333
+ const long value = std::strtol(cmdopt.optarg, &end_of_value, 10);
334
+ if ((*end_of_value != '\0') || (value <= 0) ||
335
+ (value > MARISA_MAX_NUM_TRIES)) {
336
+ std::cerr << "error: option `-n' with an invalid argument: "
337
+ << cmdopt.optarg << std::endl;
338
+ return 1;
339
+ }
340
+ param_min_num_tries = (int)value;
341
+ break;
342
+ }
343
+ case 'n': {
344
+ char *end_of_value;
345
+ const long value = std::strtol(cmdopt.optarg, &end_of_value, 10);
346
+ if ((*end_of_value != '\0') || (value <= 0) ||
347
+ (value > MARISA_MAX_NUM_TRIES)) {
348
+ std::cerr << "error: option `-n' with an invalid argument: "
349
+ << cmdopt.optarg << std::endl;
350
+ return 2;
351
+ }
352
+ param_max_num_tries = (int)value;
353
+ break;
354
+ }
355
+ case 't': {
356
+ param_tail_mode = MARISA_TEXT_TAIL;
357
+ break;
358
+ }
359
+ case 'b': {
360
+ param_tail_mode = MARISA_BINARY_TAIL;
361
+ break;
362
+ }
363
+ case 'w': {
364
+ param_node_order = MARISA_WEIGHT_ORDER;
365
+ break;
366
+ }
367
+ case 'l': {
368
+ param_node_order = MARISA_LABEL_ORDER;
369
+ break;
370
+ }
371
+ case 'c': {
372
+ char *end_of_value;
373
+ const long value = std::strtol(cmdopt.optarg, &end_of_value, 10);
374
+ if ((*end_of_value != '\0') || (value < 1) || (value > 5)) {
375
+ std::cerr << "error: option `-c' with an invalid argument: "
376
+ << cmdopt.optarg << std::endl;
377
+ return 3;
378
+ } else if (value == 1) {
379
+ param_cache_level = MARISA_TINY_CACHE;
380
+ } else if (value == 2) {
381
+ param_cache_level = MARISA_SMALL_CACHE;
382
+ } else if (value == 3) {
383
+ param_cache_level = MARISA_NORMAL_CACHE;
384
+ } else if (value == 4) {
385
+ param_cache_level = MARISA_LARGE_CACHE;
386
+ } else if (value == 5) {
387
+ param_cache_level = MARISA_HUGE_CACHE;
388
+ }
389
+ break;
390
+ }
391
+ case 'P': {
392
+ param_with_predict = true;
393
+ break;
394
+ }
395
+ case 'p': {
396
+ param_with_predict = false;
397
+ break;
398
+ }
399
+ case 'S': {
400
+ param_print_speed = true;
401
+ break;
402
+ }
403
+ case 's': {
404
+ param_print_speed = false;
405
+ break;
406
+ }
407
+ case 'h': {
408
+ print_help(argv[0]);
409
+ return 0;
410
+ }
411
+ default: {
412
+ return 1;
413
+ }
414
+ }
415
+ }
416
+ print_config();
417
+ return benchmark(cmdopt.argv + cmdopt.optind, cmdopt.argc - cmdopt.optind);
418
+ }