melisa 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +11 -0
- data/ext/marisa/bindings/marisa-swig.cxx +253 -0
- data/ext/marisa/bindings/marisa-swig.h +183 -0
- data/ext/marisa/bindings/perl/marisa-swig.cxx +253 -0
- data/ext/marisa/bindings/perl/marisa-swig.h +183 -0
- data/ext/marisa/bindings/perl/marisa-swig_wrap.cxx +5160 -0
- data/ext/marisa/bindings/python/marisa-swig.cxx +253 -0
- data/ext/marisa/bindings/python/marisa-swig.h +183 -0
- data/ext/marisa/bindings/python/marisa-swig_wrap.cxx +6090 -0
- data/ext/marisa/bindings/ruby/extconf.rb +5 -0
- data/ext/marisa/bindings/ruby/marisa-swig.cxx +253 -0
- data/ext/marisa/bindings/ruby/marisa-swig.h +183 -0
- data/ext/marisa/bindings/ruby/marisa-swig_wrap.cxx +4708 -0
- data/ext/marisa/lib/marisa.h +14 -0
- data/ext/marisa/lib/marisa/agent.cc +51 -0
- data/ext/marisa/lib/marisa/agent.h +73 -0
- data/ext/marisa/lib/marisa/base.h +193 -0
- data/ext/marisa/lib/marisa/exception.h +82 -0
- data/ext/marisa/lib/marisa/grimoire/algorithm.h +26 -0
- data/ext/marisa/lib/marisa/grimoire/algorithm/sort.h +196 -0
- data/ext/marisa/lib/marisa/grimoire/intrin.h +115 -0
- data/ext/marisa/lib/marisa/grimoire/io.h +18 -0
- data/ext/marisa/lib/marisa/grimoire/io/mapper.cc +163 -0
- data/ext/marisa/lib/marisa/grimoire/io/mapper.h +67 -0
- data/ext/marisa/lib/marisa/grimoire/io/reader.cc +147 -0
- data/ext/marisa/lib/marisa/grimoire/io/reader.h +66 -0
- data/ext/marisa/lib/marisa/grimoire/io/writer.cc +148 -0
- data/ext/marisa/lib/marisa/grimoire/io/writer.h +65 -0
- data/ext/marisa/lib/marisa/grimoire/trie.h +16 -0
- data/ext/marisa/lib/marisa/grimoire/trie/cache.h +81 -0
- data/ext/marisa/lib/marisa/grimoire/trie/config.h +155 -0
- data/ext/marisa/lib/marisa/grimoire/trie/entry.h +82 -0
- data/ext/marisa/lib/marisa/grimoire/trie/header.h +61 -0
- data/ext/marisa/lib/marisa/grimoire/trie/history.h +65 -0
- data/ext/marisa/lib/marisa/grimoire/trie/key.h +228 -0
- data/ext/marisa/lib/marisa/grimoire/trie/louds-trie.cc +876 -0
- data/ext/marisa/lib/marisa/grimoire/trie/louds-trie.h +134 -0
- data/ext/marisa/lib/marisa/grimoire/trie/range.h +115 -0
- data/ext/marisa/lib/marisa/grimoire/trie/state.h +117 -0
- data/ext/marisa/lib/marisa/grimoire/trie/tail.cc +218 -0
- data/ext/marisa/lib/marisa/grimoire/trie/tail.h +72 -0
- data/ext/marisa/lib/marisa/grimoire/vector.h +18 -0
- data/ext/marisa/lib/marisa/grimoire/vector/bit-vector.cc +826 -0
- data/ext/marisa/lib/marisa/grimoire/vector/bit-vector.h +179 -0
- data/ext/marisa/lib/marisa/grimoire/vector/flat-vector.h +205 -0
- data/ext/marisa/lib/marisa/grimoire/vector/pop-count.h +110 -0
- data/ext/marisa/lib/marisa/grimoire/vector/rank-index.h +82 -0
- data/ext/marisa/lib/marisa/grimoire/vector/vector.h +256 -0
- data/ext/marisa/lib/marisa/iostream.h +18 -0
- data/ext/marisa/lib/marisa/key.h +85 -0
- data/ext/marisa/lib/marisa/keyset.cc +181 -0
- data/ext/marisa/lib/marisa/keyset.h +80 -0
- data/ext/marisa/lib/marisa/query.h +71 -0
- data/ext/marisa/lib/marisa/scoped-array.h +48 -0
- data/ext/marisa/lib/marisa/scoped-ptr.h +52 -0
- data/ext/marisa/lib/marisa/stdio.h +15 -0
- data/ext/marisa/lib/marisa/trie.cc +249 -0
- data/ext/marisa/lib/marisa/trie.h +64 -0
- data/ext/marisa/tests/base-test.cc +309 -0
- data/ext/marisa/tests/io-test.cc +252 -0
- data/ext/marisa/tests/marisa-assert.h +26 -0
- data/ext/marisa/tests/marisa-test.cc +388 -0
- data/ext/marisa/tests/trie-test.cc +507 -0
- data/ext/marisa/tests/vector-test.cc +466 -0
- data/ext/marisa/tools/cmdopt.cc +298 -0
- data/ext/marisa/tools/cmdopt.h +58 -0
- data/ext/marisa/tools/marisa-benchmark.cc +418 -0
- data/ext/marisa/tools/marisa-build.cc +206 -0
- data/ext/marisa/tools/marisa-common-prefix-search.cc +143 -0
- data/ext/marisa/tools/marisa-dump.cc +151 -0
- data/ext/marisa/tools/marisa-lookup.cc +110 -0
- data/ext/marisa/tools/marisa-predictive-search.cc +143 -0
- data/ext/marisa/tools/marisa-reverse-lookup.cc +110 -0
- data/lib/melisa.rb +7 -0
- data/lib/melisa/base_config_flags.rb +76 -0
- data/lib/melisa/bytes_trie.rb +55 -0
- data/lib/melisa/int_trie.rb +14 -0
- data/lib/melisa/search.rb +55 -0
- data/lib/melisa/trie.rb +96 -0
- data/lib/melisa/version.rb +3 -0
- data/melisa.gemspec +36 -0
- data/spec/base_config_flags_spec.rb +73 -0
- data/spec/bytes_trie_spec.rb +16 -0
- data/spec/int_trie_spec.rb +16 -0
- data/spec/search_spec.rb +29 -0
- data/spec/spec_helper.rb +1 -0
- data/spec/trie_spec.rb +30 -0
- metadata +207 -0
@@ -0,0 +1,298 @@
|
|
1
|
+
#include <stdio.h>
|
2
|
+
|
3
|
+
#include "cmdopt.h"
|
4
|
+
|
5
|
+
#ifdef __cplusplus
|
6
|
+
extern "C" {
|
7
|
+
#endif // __cplusplus
|
8
|
+
|
9
|
+
// Moves `optind' to the end and shifts other arguments.
|
10
|
+
static void cmdopt_shift(cmdopt_t *h) {
|
11
|
+
int i;
|
12
|
+
char *tmp;
|
13
|
+
|
14
|
+
tmp = h->argv[h->optind];
|
15
|
+
for (i = h->optind; i < h->argc - 1; i++) {
|
16
|
+
h->argv[i] = h->argv[i + 1];
|
17
|
+
}
|
18
|
+
h->argv[i] = tmp;
|
19
|
+
|
20
|
+
h->nextchar = NULL;
|
21
|
+
h->optnum--;
|
22
|
+
}
|
23
|
+
|
24
|
+
// Moves to the next argument.
|
25
|
+
static void cmdopt_next(cmdopt_t *h) {
|
26
|
+
h->optind++;
|
27
|
+
h->nextchar = NULL;
|
28
|
+
}
|
29
|
+
|
30
|
+
// Checks if the current argument is an option or not.
|
31
|
+
static int cmdopt_check(cmdopt_t *h) {
|
32
|
+
int ret = 1;
|
33
|
+
const char *arg = h->argv[h->optind];
|
34
|
+
|
35
|
+
if (*arg++ != '-') {
|
36
|
+
return 0;
|
37
|
+
}
|
38
|
+
|
39
|
+
if (*arg == '-') {
|
40
|
+
arg++;
|
41
|
+
ret++;
|
42
|
+
}
|
43
|
+
|
44
|
+
return ret - (*arg == '\0');
|
45
|
+
}
|
46
|
+
|
47
|
+
// Gets an argument of the current option.
|
48
|
+
static void cmdopt_getopt(cmdopt_t *h) {
|
49
|
+
// Moves to the next argument if the current argument has no more characters.
|
50
|
+
if (*h->nextchar == '\0') {
|
51
|
+
cmdopt_next(h);
|
52
|
+
h->nextchar = h->argv[h->optind];
|
53
|
+
}
|
54
|
+
|
55
|
+
// Checks whether the current option has an argument or not.
|
56
|
+
if (h->optind < h->optnum) {
|
57
|
+
h->optarg = h->nextchar;
|
58
|
+
cmdopt_next(h);
|
59
|
+
} else {
|
60
|
+
h->optarg = NULL;
|
61
|
+
}
|
62
|
+
}
|
63
|
+
|
64
|
+
// Searches an option.
|
65
|
+
static int cmdopt_search(cmdopt_t *h) {
|
66
|
+
const char *ptr;
|
67
|
+
|
68
|
+
// Updates an option character.
|
69
|
+
h->optopt = *h->nextchar++;
|
70
|
+
|
71
|
+
for (ptr = h->optstring; *ptr != '\0'; ptr++) {
|
72
|
+
if (*ptr == h->optopt) {
|
73
|
+
// Gets an option argument if required.
|
74
|
+
if (ptr[1] == ':') {
|
75
|
+
cmdopt_getopt(h);
|
76
|
+
|
77
|
+
// Returns ':' if there is no argument.
|
78
|
+
if (h->optarg == NULL && ptr[2] != ':') {
|
79
|
+
return ':';
|
80
|
+
}
|
81
|
+
}
|
82
|
+
return h->optopt;
|
83
|
+
}
|
84
|
+
}
|
85
|
+
|
86
|
+
if (h->optopt == '-') {
|
87
|
+
cmdopt_next(h);
|
88
|
+
while (h->optind < h->optnum) {
|
89
|
+
cmdopt_shift(h);
|
90
|
+
}
|
91
|
+
return -1;
|
92
|
+
}
|
93
|
+
|
94
|
+
// Returns '?' if the option character is undefined.
|
95
|
+
return '?';
|
96
|
+
}
|
97
|
+
|
98
|
+
// Compares a long option name `opt' with an argument `arg' and returns the
// length of the matched prefix. `arg' ends at '\0' or at '='.
//   0   : `arg' is not a prefix of `opt' (or `arg' is empty).
//   n>0 : `arg' is a proper prefix of `opt' of length n.
//   n<0 : `arg' equals `opt'; -n is the length of the name.
static int cmdopt_match_len(const char *opt, const char *arg) {
  int len = 0;

  // Returns 0 if there is a mismatch. A name shorter than `arg' mismatches
  // on its terminating '\0', so `opt' is never read past its end.
  while ((*arg != '\0') && (*arg != '=')) {
    if (*arg++ != *opt++) {
      return 0;
    }
    len++;
  }

  // Returns a negative value in case of a perfect match. The loop always
  // stops at the end of `arg', so the match is perfect only when the option
  // name has been consumed as well.
  if (*opt == '\0') {
    return -len;
  }

  return len;
}
|
118
|
+
|
119
|
+
// Checks long options.
|
120
|
+
static int cmdopt_match(cmdopt_t *h) {
|
121
|
+
int i, len;
|
122
|
+
int max = 0, max_optind = -1;
|
123
|
+
|
124
|
+
// Returns -1 if there are no long options.
|
125
|
+
if (h->longopts == NULL) {
|
126
|
+
return max_optind;
|
127
|
+
}
|
128
|
+
|
129
|
+
for (i = 0; h->longopts[i].name != NULL; i++) {
|
130
|
+
len = cmdopt_match_len(h->longopts[i].name, h->nextchar);
|
131
|
+
if (len < 0) {
|
132
|
+
// In case of a perfect match.
|
133
|
+
h->nextchar -= len;
|
134
|
+
return i;
|
135
|
+
} else if (len > max) {
|
136
|
+
// In case of a prefix match.
|
137
|
+
max = len;
|
138
|
+
max_optind = i;
|
139
|
+
} else if (len == max) {
|
140
|
+
// There are other candidates.
|
141
|
+
max_optind = -1;
|
142
|
+
}
|
143
|
+
}
|
144
|
+
|
145
|
+
// If there is no perfect match, adopts the longest one.
|
146
|
+
h->nextchar += max;
|
147
|
+
return max_optind;
|
148
|
+
}
|
149
|
+
|
150
|
+
// Gets an argument of a long option.
|
151
|
+
static void cmdopt_getopt_long(cmdopt_t *h) {
|
152
|
+
if (*h->nextchar == '=') {
|
153
|
+
h->optarg = h->nextchar + 1;
|
154
|
+
cmdopt_next(h);
|
155
|
+
} else {
|
156
|
+
cmdopt_next(h);
|
157
|
+
|
158
|
+
// Checks whether there are more options or not.
|
159
|
+
if (h->optind < h->optnum) {
|
160
|
+
h->optarg = h->argv[h->optind];
|
161
|
+
cmdopt_next(h);
|
162
|
+
} else {
|
163
|
+
h->optarg = NULL;
|
164
|
+
}
|
165
|
+
}
|
166
|
+
}
|
167
|
+
|
168
|
+
// Searches long options.
|
169
|
+
static int cmdopt_search_long(cmdopt_t *h) {
|
170
|
+
const cmdopt_option *option;
|
171
|
+
|
172
|
+
// Keeps the long option.
|
173
|
+
h->optlong = h->argv[h->optind];
|
174
|
+
|
175
|
+
// Gets the next option.
|
176
|
+
h->longindex = cmdopt_match(h);
|
177
|
+
if (h->longindex < 0) {
|
178
|
+
cmdopt_next(h);
|
179
|
+
return '?';
|
180
|
+
}
|
181
|
+
|
182
|
+
// Gets an argument if required.
|
183
|
+
option = h->longopts + h->longindex;
|
184
|
+
if (option->has_arg) {
|
185
|
+
cmdopt_getopt_long(h);
|
186
|
+
|
187
|
+
// Return ':' if there are no more arguments.
|
188
|
+
if (h->optarg == NULL) {
|
189
|
+
return ':';
|
190
|
+
}
|
191
|
+
} else if (*h->nextchar == '=') {
|
192
|
+
// Returns '?' for an extra option argument.
|
193
|
+
cmdopt_getopt_long(h);
|
194
|
+
return '?';
|
195
|
+
}
|
196
|
+
|
197
|
+
// Overwrites a variable if specified in settings.
|
198
|
+
if (option->flag != NULL) {
|
199
|
+
*option->flag = option->val;
|
200
|
+
return 0;
|
201
|
+
}
|
202
|
+
|
203
|
+
return option->val;
|
204
|
+
}
|
205
|
+
|
206
|
+
// Analyze command line option.
|
207
|
+
static int cmdopt_main(cmdopt_t *h) {
|
208
|
+
int type;
|
209
|
+
|
210
|
+
// Initializes the internal state.
|
211
|
+
h->optopt = 0;
|
212
|
+
h->optlong = NULL;
|
213
|
+
h->optarg = NULL;
|
214
|
+
h->longindex = 0;
|
215
|
+
|
216
|
+
while (h->optind < h->optnum) {
|
217
|
+
if (h->nextchar == NULL) {
|
218
|
+
// Checks whether the next argument is an option or not.
|
219
|
+
type = cmdopt_check(h);
|
220
|
+
if (type == 0) {
|
221
|
+
cmdopt_shift(h);
|
222
|
+
} else {
|
223
|
+
h->nextchar = h->argv[h->optind] + type;
|
224
|
+
if (type == 2) {
|
225
|
+
return cmdopt_search_long(h);
|
226
|
+
}
|
227
|
+
}
|
228
|
+
} else {
|
229
|
+
if (*h->nextchar == '\0') {
|
230
|
+
cmdopt_next(h);
|
231
|
+
continue;
|
232
|
+
}
|
233
|
+
// Searches an option string.
|
234
|
+
return cmdopt_search(h);
|
235
|
+
}
|
236
|
+
}
|
237
|
+
|
238
|
+
return -1;
|
239
|
+
}
|
240
|
+
|
241
|
+
// cmdopt_init() initializes a cmdopt_t for successive cmdopt_get()s.
|
242
|
+
void cmdopt_init(cmdopt_t *h, int argc, char **argv,
|
243
|
+
const char *optstring, const cmdopt_option *longopts) {
|
244
|
+
static const char empty_optstring[] = "";
|
245
|
+
|
246
|
+
h->argc = argc;
|
247
|
+
h->argv = argv;
|
248
|
+
h->optnum = h->argc;
|
249
|
+
|
250
|
+
h->longopts = longopts;
|
251
|
+
h->optstring = (optstring != NULL) ? optstring : empty_optstring;
|
252
|
+
|
253
|
+
h->optind = 1;
|
254
|
+
h->nextchar = NULL;
|
255
|
+
h->optarg = NULL;
|
256
|
+
h->optopt = 0;
|
257
|
+
h->optlong = NULL;
|
258
|
+
h->opterr = 1;
|
259
|
+
h->longindex = 0;
|
260
|
+
}
|
261
|
+
|
262
|
+
// cmdopt_get() analyzes command line arguments and gets the next option.
|
263
|
+
int cmdopt_get(cmdopt_t *h) {
|
264
|
+
int value = cmdopt_main(h);
|
265
|
+
|
266
|
+
// Prints a warning to the standard error stream if enabled.
|
267
|
+
if (h->opterr) {
|
268
|
+
if (value == ':') {
|
269
|
+
// Warning for a lack of an option argument.
|
270
|
+
if (h->optlong == NULL) {
|
271
|
+
fprintf(stderr, "option requires an argument -- %c\n", h->optopt);
|
272
|
+
} else {
|
273
|
+
fprintf(stderr, "option `--%s' requires an argument\n",
|
274
|
+
h->longopts[h->longindex].name);
|
275
|
+
}
|
276
|
+
} else if (value == '?') {
|
277
|
+
// Warning for an invalid option.
|
278
|
+
if (h->optlong == NULL) {
|
279
|
+
fprintf(stderr, "invalid option -- %c\n", h->optopt);
|
280
|
+
} else {
|
281
|
+
fprintf(stderr, "unrecognized option `%s'\n", h->optlong);
|
282
|
+
}
|
283
|
+
} else if ((value != -1) && (h->opterr == 2)) {
|
284
|
+
// Actually this is not for warning, but for debugging.
|
285
|
+
if (h->optlong == NULL) {
|
286
|
+
fprintf(stderr, "option with `%s' -- %c\n", h->optarg, h->optopt);
|
287
|
+
} else {
|
288
|
+
fprintf(stderr, "option `--%s' with `%s'\n",
|
289
|
+
h->longopts[h->longindex].name, h->optarg);
|
290
|
+
}
|
291
|
+
}
|
292
|
+
}
|
293
|
+
return value;
|
294
|
+
}
|
295
|
+
|
296
|
+
#ifdef __cplusplus
|
297
|
+
} // extern "C"
|
298
|
+
#endif // __cplusplus
|
@@ -0,0 +1,58 @@
|
|
1
|
+
#ifndef MARISA_CMDOPT_H_
|
2
|
+
#define MARISA_CMDOPT_H_
|
3
|
+
|
4
|
+
#ifdef __cplusplus
|
5
|
+
extern "C" {
|
6
|
+
#endif
|
7
|
+
|
8
|
+
// Description of one long option, modelled on `struct option' of
// getopt_long().
typedef struct cmdopt_option_ {
  // `name' specifies the name of this option.
  // An array of options must be terminated with an option whose name == NULL.
  const char *name;

  // `has_arg' specifies whether an option takes an argument or not.
  // 0 specifies that this option does not have any argument.
  // 1 specifies that this option has an argument.
  // 2 specifies that this option may have an argument.
  int has_arg;

  // `flag' specifies an integer variable which is overwritten with `val' by
  // cmdopt_get() when it finds this option. cmdopt_get() then returns 0.
  int *flag;

  // `val' specifies a return value of cmdopt_get(). This value is returned
  // when cmdopt_get() finds this option and `flag' is NULL.
  int val;
} cmdopt_option;
|
27
|
+
|
28
|
+
typedef struct cmdopt_t_ {
|
29
|
+
// Command line arguments.
|
30
|
+
int argc;
|
31
|
+
char **argv;
|
32
|
+
|
33
|
+
// Option settings.
|
34
|
+
const cmdopt_option *longopts;
|
35
|
+
const char *optstring;
|
36
|
+
|
37
|
+
int optind; // Index of the next argument.
|
38
|
+
char *nextchar; // Next character.
|
39
|
+
char *optarg; // Argument of the last option.
|
40
|
+
int optopt; // Label of the last option.
|
41
|
+
char *optlong; // Long option.
|
42
|
+
int opterr; // Warning level (0: nothing, 1: warning, 2: all).
|
43
|
+
int longindex; // Index of the last long option.
|
44
|
+
int optnum; // Number of options.
|
45
|
+
} cmdopt_t;
|
46
|
+
|
47
|
+
// cmdopt_init() initializes a cmdopt_t for successive cmdopt_get()s.
|
48
|
+
void cmdopt_init(cmdopt_t *h, int argc, char **argv,
|
49
|
+
const char *optstring, const cmdopt_option *longopts);
|
50
|
+
|
51
|
+
// cmdopt_get() analyzes command line arguments and gets the next option.
|
52
|
+
int cmdopt_get(cmdopt_t *h);
|
53
|
+
|
54
|
+
#ifdef __cplusplus
|
55
|
+
} // extern "C"
|
56
|
+
#endif
|
57
|
+
|
58
|
+
#endif // MARISA_CMDOPT_H_
|
@@ -0,0 +1,418 @@
|
|
1
|
+
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

#include <marisa.h>

#include "cmdopt.h"
|
12
|
+
|
13
|
+
namespace {
|
14
|
+
|
15
|
+
int param_min_num_tries = 1;
|
16
|
+
int param_max_num_tries = 5;
|
17
|
+
marisa::TailMode param_tail_mode = MARISA_DEFAULT_TAIL;
|
18
|
+
marisa::NodeOrder param_node_order = MARISA_DEFAULT_ORDER;
|
19
|
+
marisa::CacheLevel param_cache_level = MARISA_DEFAULT_CACHE;
|
20
|
+
bool param_with_predict = true;
|
21
|
+
bool param_print_speed = true;
|
22
|
+
|
23
|
+
// Stopwatch based on std::clock(). It starts when constructed.
class Clock {
 public:
  Clock() : cl_(std::clock()) {}

  // Restarts the measurement from now.
  void reset() {
    cl_ = std::clock();
  }

  // Returns the std::clock() time in seconds since construction or the last
  // reset(). The misspelled name is kept because callers use it.
  double elasped() const {
    std::clock_t cur = std::clock();
    return 1.0 * (cur - cl_) / CLOCKS_PER_SEC;
  }

 private:
  std::clock_t cl_;
};
|
39
|
+
|
40
|
+
void print_help(const char *cmd) {
|
41
|
+
std::cerr << "Usage: " << cmd << " [OPTION]... [FILE]...\n\n"
|
42
|
+
"Options:\n"
|
43
|
+
" -N, --min-num-tries=[N] limit the number of tries"
|
44
|
+
" [" << MARISA_MIN_NUM_TRIES << ", " << MARISA_MAX_NUM_TRIES
|
45
|
+
<< "] (default: 1)\n"
|
46
|
+
" -n, --max-num-tries=[N] limit the number of tries"
|
47
|
+
" [" << MARISA_MIN_NUM_TRIES << ", " << MARISA_MAX_NUM_TRIES
|
48
|
+
<< "] (default: 10)\n"
|
49
|
+
" -t, --text-tail build a dictionary with text TAIL (default)\n"
|
50
|
+
" -b, --binary-tail build a dictionary with binary TAIL\n"
|
51
|
+
" -w, --weight-order arrange siblings in weight order (default)\n"
|
52
|
+
" -l, --label-order arrange siblings in label order\n"
|
53
|
+
" -c, --cache-level=[N] specify the cache size"
|
54
|
+
" [1, 5] (default: 3)\n"
|
55
|
+
" -P, --with-predict include predictive search (default)\n"
|
56
|
+
" -p, --without-predict skip predictive search\n"
|
57
|
+
" -S, --print-speed print speed [1000 keys/s] (default)\n"
|
58
|
+
" -s, --print-time print time [ns/key]\n"
|
59
|
+
" -h, --help print this help\n"
|
60
|
+
<< std::endl;
|
61
|
+
}
|
62
|
+
|
63
|
+
void print_config() {
|
64
|
+
std::cout << "Number of tries: " << param_min_num_tries
|
65
|
+
<< " - " << param_max_num_tries << std::endl;
|
66
|
+
|
67
|
+
std::cout << "TAIL mode: ";
|
68
|
+
switch (param_tail_mode) {
|
69
|
+
case MARISA_TEXT_TAIL: {
|
70
|
+
std::cout << "Text mode" << std::endl;
|
71
|
+
break;
|
72
|
+
}
|
73
|
+
case MARISA_BINARY_TAIL: {
|
74
|
+
std::cout << "Binary mode" << std::endl;
|
75
|
+
break;
|
76
|
+
}
|
77
|
+
}
|
78
|
+
|
79
|
+
std::cout << "Node order: ";
|
80
|
+
switch (param_node_order) {
|
81
|
+
case MARISA_LABEL_ORDER: {
|
82
|
+
std::cout << "Ascending label order" << std::endl;
|
83
|
+
break;
|
84
|
+
}
|
85
|
+
case MARISA_WEIGHT_ORDER: {
|
86
|
+
std::cout << "Descending weight order" << std::endl;
|
87
|
+
break;
|
88
|
+
}
|
89
|
+
}
|
90
|
+
|
91
|
+
std::cout << "Cache level: ";
|
92
|
+
switch (param_cache_level) {
|
93
|
+
case MARISA_HUGE_CACHE: {
|
94
|
+
std::cout << "Huge cache" << std::endl;
|
95
|
+
break;
|
96
|
+
}
|
97
|
+
case MARISA_LARGE_CACHE: {
|
98
|
+
std::cout << "Large cache" << std::endl;
|
99
|
+
break;
|
100
|
+
}
|
101
|
+
case MARISA_NORMAL_CACHE: {
|
102
|
+
std::cout << "Normal cache" << std::endl;
|
103
|
+
break;
|
104
|
+
}
|
105
|
+
case MARISA_SMALL_CACHE: {
|
106
|
+
std::cout << "Small cache" << std::endl;
|
107
|
+
break;
|
108
|
+
}
|
109
|
+
case MARISA_TINY_CACHE: {
|
110
|
+
std::cout << "Tiny cache" << std::endl;
|
111
|
+
break;
|
112
|
+
}
|
113
|
+
}
|
114
|
+
}
|
115
|
+
|
116
|
+
void print_time_info(std::size_t num_keys, double elasped) {
|
117
|
+
if (param_print_speed) {
|
118
|
+
if (elasped == 0.0) {
|
119
|
+
std::printf(" %8s", "-");
|
120
|
+
} else {
|
121
|
+
std::printf(" %8.2f", num_keys / elasped / 1000.0);
|
122
|
+
}
|
123
|
+
} else {
|
124
|
+
if ((elasped == 0.0) || (num_keys == 0)) {
|
125
|
+
std::printf(" %8s", "-");
|
126
|
+
} else {
|
127
|
+
std::printf(" %8.1f", 1000000000.0 * elasped / num_keys);
|
128
|
+
}
|
129
|
+
}
|
130
|
+
}
|
131
|
+
|
132
|
+
void read_keys(std::istream &input, marisa::Keyset *keyset,
|
133
|
+
std::vector<float> *weights) {
|
134
|
+
std::string line;
|
135
|
+
while (std::getline(input, line)) {
|
136
|
+
const std::string::size_type delim_pos = line.find_last_of('\t');
|
137
|
+
float weight = 1.0F;
|
138
|
+
if (delim_pos != line.npos) {
|
139
|
+
char *end_of_value;
|
140
|
+
weight = (float)std::strtod(&line[delim_pos + 1], &end_of_value);
|
141
|
+
if (*end_of_value == '\0') {
|
142
|
+
line.resize(delim_pos);
|
143
|
+
}
|
144
|
+
}
|
145
|
+
keyset->push_back(line.c_str(), line.length());
|
146
|
+
weights->push_back(weight);
|
147
|
+
}
|
148
|
+
}
|
149
|
+
|
150
|
+
int read_keys(const char * const *args, std::size_t num_args,
|
151
|
+
marisa::Keyset *keyset, std::vector<float> *weights) {
|
152
|
+
if (num_args == 0) {
|
153
|
+
read_keys(std::cin, keyset, weights);
|
154
|
+
}
|
155
|
+
for (std::size_t i = 0; i < num_args; ++i) {
|
156
|
+
std::ifstream input_file(args[i], std::ios::binary);
|
157
|
+
if (!input_file) {
|
158
|
+
std::cerr << "error: failed to open: " << args[i] << std::endl;
|
159
|
+
return 10;
|
160
|
+
}
|
161
|
+
read_keys(input_file, keyset, weights);
|
162
|
+
}
|
163
|
+
std::cout << "Number of keys: " << keyset->size() << std::endl;
|
164
|
+
std::cout << "Total length: " << keyset->total_length() << std::endl;
|
165
|
+
return 0;
|
166
|
+
}
|
167
|
+
|
168
|
+
// Builds `trie' from `keyset' with `num_tries' tries and the current
// settings, then prints the dictionary size and the build time cell.
// weights[i] is assigned to keyset[i] before the clock starts, so `weights'
// needs at least keyset.size() elements.
void benchmark_build(marisa::Keyset &keyset,
    const std::vector<float> &weights, int num_tries, marisa::Trie *trie) {
  for (std::size_t i = 0; i < keyset.size(); ++i) {
    keyset[i].set_weight(weights[i]);
  }
  Clock cl;
  trie->build(keyset, num_tries | param_tail_mode | param_node_order |
      param_cache_level);
  std::printf(" %10lu", (unsigned long)trie->io_size());
  print_time_info(keyset.size(), cl.elasped());
}
|
179
|
+
|
180
|
+
// Looks up every key of `keyset' in `trie' and prints the time cell.
// Reports an error and returns without printing the cell when a lookup
// fails or yields an ID other than the one stored in the keyset.
void benchmark_lookup(const marisa::Trie &trie,
    const marisa::Keyset &keyset) {
  Clock cl;
  marisa::Agent agent;
  for (std::size_t i = 0; i < keyset.size(); ++i) {
    agent.set_query(keyset[i].ptr(), keyset[i].length());
    if (!trie.lookup(agent) || (agent.key().id() != keyset[i].id())) {
      std::cerr << "error: lookup() failed" << std::endl;
      return;
    }
  }
  print_time_info(keyset.size(), cl.elasped());
}
|
193
|
+
|
194
|
+
// Restores every key of `keyset' from its ID and prints the time cell.
// Reports an error and returns without printing the cell when the restored
// ID, length or bytes differ from the key stored in the keyset.
void benchmark_reverse_lookup(const marisa::Trie &trie,
    const marisa::Keyset &keyset) {
  Clock cl;
  marisa::Agent agent;
  for (std::size_t i = 0; i < keyset.size(); ++i) {
    agent.set_query(keyset[i].id());
    trie.reverse_lookup(agent);
    if ((agent.key().id() != keyset[i].id()) ||
        (agent.key().length() != keyset[i].length()) ||
        (std::memcmp(agent.key().ptr(), keyset[i].ptr(),
            agent.key().length()) != 0)) {
      std::cerr << "error: reverse_lookup() failed" << std::endl;
      return;
    }
  }
  print_time_info(keyset.size(), cl.elasped());
}
|
211
|
+
|
212
|
+
// Runs a common prefix search for every key of `keyset' and prints the time
// cell. Reports an error and returns without printing the cell when a result
// has an ID greater than the query key's ID, or when the agent's key after
// the search loop does not carry the query key's ID.
void benchmark_common_prefix_search(const marisa::Trie &trie,
    const marisa::Keyset &keyset) {
  Clock cl;
  marisa::Agent agent;
  for (std::size_t i = 0; i < keyset.size(); ++i) {
    agent.set_query(keyset[i].ptr(), keyset[i].length());
    while (trie.common_prefix_search(agent)) {
      if (agent.key().id() > keyset[i].id()) {
        std::cerr << "error: common_prefix_search() failed" << std::endl;
        return;
      }
    }
    if (agent.key().id() != keyset[i].id()) {
      std::cerr << "error: common_prefix_search() failed" << std::endl;
      return;
    }
  }
  print_time_info(keyset.size(), cl.elasped());
}
|
231
|
+
|
232
|
+
// Runs a predictive search for every key of `keyset' and prints the time
// cell. An empty cell ("-") is printed instead when predictive search is
// disabled. Reports an error and returns without printing the cell when the
// first result is not the query key itself or when a later result has an ID
// not greater than the query key's ID.
void benchmark_predictive_search(const marisa::Trie &trie,
    const marisa::Keyset &keyset) {
  if (!param_with_predict) {
    // An elapsed time of 0.0 makes print_time_info() print "-".
    print_time_info(keyset.size(), 0.0);
    return;
  }

  Clock cl;
  marisa::Agent agent;
  for (std::size_t i = 0; i < keyset.size(); ++i) {
    agent.set_query(keyset[i].ptr(), keyset[i].length());
    if (!trie.predictive_search(agent) ||
        (agent.key().id() != keyset[i].id())) {
      std::cerr << "error: predictive_search() failed" << std::endl;
      return;
    }
    while (trie.predictive_search(agent)) {
      if (agent.key().id() <= keyset[i].id()) {
        std::cerr << "error: predictive_search() failed" << std::endl;
        return;
      }
    }
  }
  print_time_info(keyset.size(), cl.elasped());
}
|
257
|
+
|
258
|
+
// Prints one table row for `num_tries' tries: builds a trie from `keyset'
// and, unless the trie is empty, runs the four search benchmarks on it.
void benchmark(marisa::Keyset &keyset, const std::vector<float> &weights,
    int num_tries) {
  std::printf("%6d", num_tries);
  marisa::Trie trie;
  benchmark_build(keyset, weights, num_tries, &trie);
  if (!trie.empty()) {
    benchmark_lookup(trie, keyset);
    benchmark_reverse_lookup(trie, keyset);
    benchmark_common_prefix_search(trie, keyset);
    benchmark_predictive_search(trie, keyset);
  }
  std::printf("\n");
}
|
271
|
+
|
272
|
+
int benchmark(const char * const *args, std::size_t num_args) try {
|
273
|
+
marisa::Keyset keyset;
|
274
|
+
std::vector<float> weights;
|
275
|
+
const int ret = read_keys(args, num_args, &keyset, &weights);
|
276
|
+
if (ret != 0) {
|
277
|
+
return ret;
|
278
|
+
}
|
279
|
+
std::printf("------+----------+--------+--------+"
|
280
|
+
"--------+--------+--------\n");
|
281
|
+
std::printf("%6s %10s %8s %8s %8s %8s %8s\n",
|
282
|
+
"#tries", "size", "build", "lookup", "reverse", "prefix", "predict");
|
283
|
+
std::printf("%6s %10s %8s %8s %8s %8s %8s\n",
|
284
|
+
"", "", "", "", "lookup", "search", "search");
|
285
|
+
if (param_print_speed) {
|
286
|
+
std::printf("%6s %10s %8s %8s %8s %8s %8s\n",
|
287
|
+
"", "[bytes]",
|
288
|
+
"[K/s]", "[K/s]", "[K/s]", "[K/s]", "[K/s]");
|
289
|
+
} else {
|
290
|
+
std::printf("%6s %10s %8s %8s %8s %8s %8s\n",
|
291
|
+
"", "[bytes]", "[ns]", "[ns]", "[ns]", "[ns]", "[ns]");
|
292
|
+
}
|
293
|
+
std::printf("------+----------+--------+--------+"
|
294
|
+
"--------+--------+--------\n");
|
295
|
+
for (int i = param_min_num_tries; i <= param_max_num_tries; ++i) {
|
296
|
+
benchmark(keyset, weights, i);
|
297
|
+
}
|
298
|
+
std::printf("------+----------+--------+--------+"
|
299
|
+
"--------+--------+--------\n");
|
300
|
+
return 0;
|
301
|
+
} catch (const marisa::Exception &ex) {
|
302
|
+
std::cerr << ex.what() << std::endl;
|
303
|
+
return -1;
|
304
|
+
}
|
305
|
+
|
306
|
+
} // namespace
|
307
|
+
|
308
|
+
int main(int argc, char *argv[]) {
|
309
|
+
std::ios::sync_with_stdio(false);
|
310
|
+
|
311
|
+
::cmdopt_option long_options[] = {
|
312
|
+
{ "min-num-tries", 1, NULL, 'N' },
|
313
|
+
{ "max-num-tries", 1, NULL, 'n' },
|
314
|
+
{ "text-tail", 0, NULL, 't' },
|
315
|
+
{ "binary-tail", 0, NULL, 'b' },
|
316
|
+
{ "weight-order", 0, NULL, 'w' },
|
317
|
+
{ "label-order", 0, NULL, 'l' },
|
318
|
+
{ "cache-level", 1, NULL, 'c' },
|
319
|
+
{ "predict-on", 0, NULL, 'P' },
|
320
|
+
{ "predict-off", 0, NULL, 'p' },
|
321
|
+
{ "print-speed", 0, NULL, 'S' },
|
322
|
+
{ "print-time", 0, NULL, 's' },
|
323
|
+
{ "help", 0, NULL, 'h' },
|
324
|
+
{ NULL, 0, NULL, 0 }
|
325
|
+
};
|
326
|
+
::cmdopt_t cmdopt;
|
327
|
+
::cmdopt_init(&cmdopt, argc, argv, "N:n:tbwlc:PpSsh", long_options);
|
328
|
+
int label;
|
329
|
+
while ((label = ::cmdopt_get(&cmdopt)) != -1) {
|
330
|
+
switch (label) {
|
331
|
+
case 'N': {
|
332
|
+
char *end_of_value;
|
333
|
+
const long value = std::strtol(cmdopt.optarg, &end_of_value, 10);
|
334
|
+
if ((*end_of_value != '\0') || (value <= 0) ||
|
335
|
+
(value > MARISA_MAX_NUM_TRIES)) {
|
336
|
+
std::cerr << "error: option `-n' with an invalid argument: "
|
337
|
+
<< cmdopt.optarg << std::endl;
|
338
|
+
return 1;
|
339
|
+
}
|
340
|
+
param_min_num_tries = (int)value;
|
341
|
+
break;
|
342
|
+
}
|
343
|
+
case 'n': {
|
344
|
+
char *end_of_value;
|
345
|
+
const long value = std::strtol(cmdopt.optarg, &end_of_value, 10);
|
346
|
+
if ((*end_of_value != '\0') || (value <= 0) ||
|
347
|
+
(value > MARISA_MAX_NUM_TRIES)) {
|
348
|
+
std::cerr << "error: option `-n' with an invalid argument: "
|
349
|
+
<< cmdopt.optarg << std::endl;
|
350
|
+
return 2;
|
351
|
+
}
|
352
|
+
param_max_num_tries = (int)value;
|
353
|
+
break;
|
354
|
+
}
|
355
|
+
case 't': {
|
356
|
+
param_tail_mode = MARISA_TEXT_TAIL;
|
357
|
+
break;
|
358
|
+
}
|
359
|
+
case 'b': {
|
360
|
+
param_tail_mode = MARISA_BINARY_TAIL;
|
361
|
+
break;
|
362
|
+
}
|
363
|
+
case 'w': {
|
364
|
+
param_node_order = MARISA_WEIGHT_ORDER;
|
365
|
+
break;
|
366
|
+
}
|
367
|
+
case 'l': {
|
368
|
+
param_node_order = MARISA_LABEL_ORDER;
|
369
|
+
break;
|
370
|
+
}
|
371
|
+
case 'c': {
|
372
|
+
char *end_of_value;
|
373
|
+
const long value = std::strtol(cmdopt.optarg, &end_of_value, 10);
|
374
|
+
if ((*end_of_value != '\0') || (value < 1) || (value > 5)) {
|
375
|
+
std::cerr << "error: option `-c' with an invalid argument: "
|
376
|
+
<< cmdopt.optarg << std::endl;
|
377
|
+
return 3;
|
378
|
+
} else if (value == 1) {
|
379
|
+
param_cache_level = MARISA_TINY_CACHE;
|
380
|
+
} else if (value == 2) {
|
381
|
+
param_cache_level = MARISA_SMALL_CACHE;
|
382
|
+
} else if (value == 3) {
|
383
|
+
param_cache_level = MARISA_NORMAL_CACHE;
|
384
|
+
} else if (value == 4) {
|
385
|
+
param_cache_level = MARISA_LARGE_CACHE;
|
386
|
+
} else if (value == 5) {
|
387
|
+
param_cache_level = MARISA_HUGE_CACHE;
|
388
|
+
}
|
389
|
+
break;
|
390
|
+
}
|
391
|
+
case 'P': {
|
392
|
+
param_with_predict = true;
|
393
|
+
break;
|
394
|
+
}
|
395
|
+
case 'p': {
|
396
|
+
param_with_predict = false;
|
397
|
+
break;
|
398
|
+
}
|
399
|
+
case 'S': {
|
400
|
+
param_print_speed = true;
|
401
|
+
break;
|
402
|
+
}
|
403
|
+
case 's': {
|
404
|
+
param_print_speed = false;
|
405
|
+
break;
|
406
|
+
}
|
407
|
+
case 'h': {
|
408
|
+
print_help(argv[0]);
|
409
|
+
return 0;
|
410
|
+
}
|
411
|
+
default: {
|
412
|
+
return 1;
|
413
|
+
}
|
414
|
+
}
|
415
|
+
}
|
416
|
+
print_config();
|
417
|
+
return benchmark(cmdopt.argv + cmdopt.optind, cmdopt.argc - cmdopt.optind);
|
418
|
+
}
|