melisa 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +11 -0
- data/ext/marisa/bindings/marisa-swig.cxx +253 -0
- data/ext/marisa/bindings/marisa-swig.h +183 -0
- data/ext/marisa/bindings/perl/marisa-swig.cxx +253 -0
- data/ext/marisa/bindings/perl/marisa-swig.h +183 -0
- data/ext/marisa/bindings/perl/marisa-swig_wrap.cxx +5160 -0
- data/ext/marisa/bindings/python/marisa-swig.cxx +253 -0
- data/ext/marisa/bindings/python/marisa-swig.h +183 -0
- data/ext/marisa/bindings/python/marisa-swig_wrap.cxx +6090 -0
- data/ext/marisa/bindings/ruby/extconf.rb +5 -0
- data/ext/marisa/bindings/ruby/marisa-swig.cxx +253 -0
- data/ext/marisa/bindings/ruby/marisa-swig.h +183 -0
- data/ext/marisa/bindings/ruby/marisa-swig_wrap.cxx +4708 -0
- data/ext/marisa/lib/marisa.h +14 -0
- data/ext/marisa/lib/marisa/agent.cc +51 -0
- data/ext/marisa/lib/marisa/agent.h +73 -0
- data/ext/marisa/lib/marisa/base.h +193 -0
- data/ext/marisa/lib/marisa/exception.h +82 -0
- data/ext/marisa/lib/marisa/grimoire/algorithm.h +26 -0
- data/ext/marisa/lib/marisa/grimoire/algorithm/sort.h +196 -0
- data/ext/marisa/lib/marisa/grimoire/intrin.h +115 -0
- data/ext/marisa/lib/marisa/grimoire/io.h +18 -0
- data/ext/marisa/lib/marisa/grimoire/io/mapper.cc +163 -0
- data/ext/marisa/lib/marisa/grimoire/io/mapper.h +67 -0
- data/ext/marisa/lib/marisa/grimoire/io/reader.cc +147 -0
- data/ext/marisa/lib/marisa/grimoire/io/reader.h +66 -0
- data/ext/marisa/lib/marisa/grimoire/io/writer.cc +148 -0
- data/ext/marisa/lib/marisa/grimoire/io/writer.h +65 -0
- data/ext/marisa/lib/marisa/grimoire/trie.h +16 -0
- data/ext/marisa/lib/marisa/grimoire/trie/cache.h +81 -0
- data/ext/marisa/lib/marisa/grimoire/trie/config.h +155 -0
- data/ext/marisa/lib/marisa/grimoire/trie/entry.h +82 -0
- data/ext/marisa/lib/marisa/grimoire/trie/header.h +61 -0
- data/ext/marisa/lib/marisa/grimoire/trie/history.h +65 -0
- data/ext/marisa/lib/marisa/grimoire/trie/key.h +228 -0
- data/ext/marisa/lib/marisa/grimoire/trie/louds-trie.cc +876 -0
- data/ext/marisa/lib/marisa/grimoire/trie/louds-trie.h +134 -0
- data/ext/marisa/lib/marisa/grimoire/trie/range.h +115 -0
- data/ext/marisa/lib/marisa/grimoire/trie/state.h +117 -0
- data/ext/marisa/lib/marisa/grimoire/trie/tail.cc +218 -0
- data/ext/marisa/lib/marisa/grimoire/trie/tail.h +72 -0
- data/ext/marisa/lib/marisa/grimoire/vector.h +18 -0
- data/ext/marisa/lib/marisa/grimoire/vector/bit-vector.cc +826 -0
- data/ext/marisa/lib/marisa/grimoire/vector/bit-vector.h +179 -0
- data/ext/marisa/lib/marisa/grimoire/vector/flat-vector.h +205 -0
- data/ext/marisa/lib/marisa/grimoire/vector/pop-count.h +110 -0
- data/ext/marisa/lib/marisa/grimoire/vector/rank-index.h +82 -0
- data/ext/marisa/lib/marisa/grimoire/vector/vector.h +256 -0
- data/ext/marisa/lib/marisa/iostream.h +18 -0
- data/ext/marisa/lib/marisa/key.h +85 -0
- data/ext/marisa/lib/marisa/keyset.cc +181 -0
- data/ext/marisa/lib/marisa/keyset.h +80 -0
- data/ext/marisa/lib/marisa/query.h +71 -0
- data/ext/marisa/lib/marisa/scoped-array.h +48 -0
- data/ext/marisa/lib/marisa/scoped-ptr.h +52 -0
- data/ext/marisa/lib/marisa/stdio.h +15 -0
- data/ext/marisa/lib/marisa/trie.cc +249 -0
- data/ext/marisa/lib/marisa/trie.h +64 -0
- data/ext/marisa/tests/base-test.cc +309 -0
- data/ext/marisa/tests/io-test.cc +252 -0
- data/ext/marisa/tests/marisa-assert.h +26 -0
- data/ext/marisa/tests/marisa-test.cc +388 -0
- data/ext/marisa/tests/trie-test.cc +507 -0
- data/ext/marisa/tests/vector-test.cc +466 -0
- data/ext/marisa/tools/cmdopt.cc +298 -0
- data/ext/marisa/tools/cmdopt.h +58 -0
- data/ext/marisa/tools/marisa-benchmark.cc +418 -0
- data/ext/marisa/tools/marisa-build.cc +206 -0
- data/ext/marisa/tools/marisa-common-prefix-search.cc +143 -0
- data/ext/marisa/tools/marisa-dump.cc +151 -0
- data/ext/marisa/tools/marisa-lookup.cc +110 -0
- data/ext/marisa/tools/marisa-predictive-search.cc +143 -0
- data/ext/marisa/tools/marisa-reverse-lookup.cc +110 -0
- data/lib/melisa.rb +7 -0
- data/lib/melisa/base_config_flags.rb +76 -0
- data/lib/melisa/bytes_trie.rb +55 -0
- data/lib/melisa/int_trie.rb +14 -0
- data/lib/melisa/search.rb +55 -0
- data/lib/melisa/trie.rb +96 -0
- data/lib/melisa/version.rb +3 -0
- data/melisa.gemspec +36 -0
- data/spec/base_config_flags_spec.rb +73 -0
- data/spec/bytes_trie_spec.rb +16 -0
- data/spec/int_trie_spec.rb +16 -0
- data/spec/search_spec.rb +29 -0
- data/spec/spec_helper.rb +1 -0
- data/spec/trie_spec.rb +30 -0
- metadata +207 -0
@@ -0,0 +1,206 @@
|
|
1
|
+
#ifdef _WIN32
|
2
|
+
#include <fcntl.h>
|
3
|
+
#include <io.h>
|
4
|
+
#include <stdio.h>
|
5
|
+
#endif // _WIN32
|
6
|
+
|
7
|
+
#include <cstdlib>
|
8
|
+
#include <fstream>
|
9
|
+
#include <iostream>
|
10
|
+
#include <string>
|
11
|
+
|
12
|
+
#include <marisa.h>
|
13
|
+
|
14
|
+
#include "cmdopt.h"
|
15
|
+
|
16
|
+
namespace {
|
17
|
+
|
18
|
+
// Build settings shared between main() and build() in this tool.
// main() overwrites them while parsing the command line; build() ORs the four
// param_* values together and passes the result to marisa::Trie::build().
// output_filename stays NULL unless -o is given, in which case build() saves
// the dictionary to that file instead of writing it to standard output.
int param_num_tries = MARISA_DEFAULT_NUM_TRIES;
|
19
|
+
marisa::TailMode param_tail_mode = MARISA_DEFAULT_TAIL;
|
20
|
+
marisa::NodeOrder param_node_order = MARISA_DEFAULT_ORDER;
|
21
|
+
marisa::CacheLevel param_cache_level = MARISA_DEFAULT_CACHE;
|
22
|
+
const char *output_filename = NULL;
|
23
|
+
|
24
|
+
void print_help(const char *cmd) {
|
25
|
+
std::cerr << "Usage: " << cmd << " [OPTION]... [FILE]...\n\n"
|
26
|
+
"Options:\n"
|
27
|
+
" -n, --num-tries=[N] limit the number of tries"
|
28
|
+
" [" << MARISA_MIN_NUM_TRIES << ", " << MARISA_MAX_NUM_TRIES
|
29
|
+
<< "] (default: 3)\n"
|
30
|
+
" -t, --text-tail build a dictionary with text TAIL (default)\n"
|
31
|
+
" -b, --binary-tail build a dictionary with binary TAIL\n"
|
32
|
+
" -w, --weight-order arrange siblings in weight order (default)\n"
|
33
|
+
" -l, --label-order arrange siblings in label order\n"
|
34
|
+
" -c, --cache-level=[N] specify the cache size"
|
35
|
+
" [1, 5] (default: 3)\n"
|
36
|
+
" -o, --output=[FILE] write tries to FILE (default: stdout)\n"
|
37
|
+
" -h, --help print this help\n"
|
38
|
+
<< std::endl;
|
39
|
+
}
|
40
|
+
|
41
|
+
void read_keys(std::istream &input, marisa::Keyset *keyset) {
|
42
|
+
std::string line;
|
43
|
+
while (std::getline(input, line)) {
|
44
|
+
const std::string::size_type delim_pos = line.find_last_of('\t');
|
45
|
+
float weight = 1.0F;
|
46
|
+
if (delim_pos != line.npos) {
|
47
|
+
char *end_of_value;
|
48
|
+
weight = (float)std::strtod(&line[delim_pos + 1], &end_of_value);
|
49
|
+
if (*end_of_value == '\0') {
|
50
|
+
line.resize(delim_pos);
|
51
|
+
}
|
52
|
+
}
|
53
|
+
keyset->push_back(line.c_str(), line.length(), weight);
|
54
|
+
}
|
55
|
+
}
|
56
|
+
|
57
|
+
int build(const char * const *args, std::size_t num_args) {
|
58
|
+
marisa::Keyset keyset;
|
59
|
+
if (num_args == 0) try {
|
60
|
+
read_keys(std::cin, &keyset);
|
61
|
+
} catch (const marisa::Exception &ex) {
|
62
|
+
std::cerr << ex.what() << ": failed to read keys" << std::endl;
|
63
|
+
return 10;
|
64
|
+
}
|
65
|
+
|
66
|
+
for (std::size_t i = 0; i < num_args; ++i) try {
|
67
|
+
std::ifstream input_file(args[i], std::ios::binary);
|
68
|
+
if (!input_file) {
|
69
|
+
std::cerr << "error: failed to open: " << args[i] << std::endl;
|
70
|
+
return 11;
|
71
|
+
}
|
72
|
+
read_keys(input_file, &keyset);
|
73
|
+
} catch (const marisa::Exception &ex) {
|
74
|
+
std::cerr << ex.what() << ": failed to read keys" << std::endl;
|
75
|
+
return 12;
|
76
|
+
}
|
77
|
+
|
78
|
+
marisa::Trie trie;
|
79
|
+
try {
|
80
|
+
trie.build(keyset, param_num_tries | param_tail_mode | param_node_order |
|
81
|
+
param_cache_level);
|
82
|
+
} catch (const marisa::Exception &ex) {
|
83
|
+
std::cerr << ex.what() << ": failed to build a dictionary" << std::endl;
|
84
|
+
return 20;
|
85
|
+
}
|
86
|
+
|
87
|
+
std::cerr << "#keys: " << trie.num_keys() << std::endl;
|
88
|
+
std::cerr << "#nodes: " << trie.num_nodes() << std::endl;
|
89
|
+
std::cerr << "size: " << trie.io_size() << std::endl;
|
90
|
+
|
91
|
+
if (output_filename != NULL) {
|
92
|
+
try {
|
93
|
+
trie.save(output_filename);
|
94
|
+
} catch (const marisa::Exception &ex) {
|
95
|
+
std::cerr << ex.what() << ": failed to write a dictionary to file: "
|
96
|
+
<< output_filename << std::endl;
|
97
|
+
return 30;
|
98
|
+
}
|
99
|
+
} else {
|
100
|
+
#ifdef _WIN32
|
101
|
+
const int stdout_fileno = ::_fileno(stdout);
|
102
|
+
if (stdout_fileno < 0) {
|
103
|
+
std::cerr << "error: failed to get the file descriptor of "
|
104
|
+
"standard output" << std::endl;
|
105
|
+
return 31;
|
106
|
+
}
|
107
|
+
if (::_setmode(stdout_fileno, _O_BINARY) == -1) {
|
108
|
+
std::cerr << "error: failed to set binary mode" << std::endl;
|
109
|
+
return 32;
|
110
|
+
}
|
111
|
+
#endif // _WIN32
|
112
|
+
try {
|
113
|
+
std::cout << trie;
|
114
|
+
} catch (const marisa::Exception &ex) {
|
115
|
+
std::cerr << ex.what()
|
116
|
+
<< ": failed to write a dictionary to standard output" << std::endl;
|
117
|
+
return 33;
|
118
|
+
}
|
119
|
+
}
|
120
|
+
return 0;
|
121
|
+
}
|
122
|
+
|
123
|
+
} // namespace
|
124
|
+
|
125
|
+
int main(int argc, char *argv[]) {
|
126
|
+
std::ios::sync_with_stdio(false);
|
127
|
+
|
128
|
+
::cmdopt_option long_options[] = {
|
129
|
+
{ "max-num-tries", 1, NULL, 'n' },
|
130
|
+
{ "text-tail", 0, NULL, 't' },
|
131
|
+
{ "binary-tail", 0, NULL, 'b' },
|
132
|
+
{ "weight-order", 0, NULL, 'w' },
|
133
|
+
{ "label-order", 0, NULL, 'l' },
|
134
|
+
{ "cache-level", 1, NULL, 'c' },
|
135
|
+
{ "output", 1, NULL, 'o' },
|
136
|
+
{ "help", 0, NULL, 'h' },
|
137
|
+
{ NULL, 0, NULL, 0 }
|
138
|
+
};
|
139
|
+
::cmdopt_t cmdopt;
|
140
|
+
::cmdopt_init(&cmdopt, argc, argv, "n:tbwlc:o:h", long_options);
|
141
|
+
int label;
|
142
|
+
while ((label = ::cmdopt_get(&cmdopt)) != -1) {
|
143
|
+
switch (label) {
|
144
|
+
case 'n': {
|
145
|
+
char *end_of_value;
|
146
|
+
const long value = std::strtol(cmdopt.optarg, &end_of_value, 10);
|
147
|
+
if ((*end_of_value != '\0') || (value <= 0) ||
|
148
|
+
(value > MARISA_MAX_NUM_TRIES)) {
|
149
|
+
std::cerr << "error: option `-n' with an invalid argument: "
|
150
|
+
<< cmdopt.optarg << std::endl;
|
151
|
+
return 1;
|
152
|
+
}
|
153
|
+
param_num_tries = (int)value;
|
154
|
+
break;
|
155
|
+
}
|
156
|
+
case 't': {
|
157
|
+
param_tail_mode = MARISA_TEXT_TAIL;
|
158
|
+
break;
|
159
|
+
}
|
160
|
+
case 'b': {
|
161
|
+
param_tail_mode = MARISA_BINARY_TAIL;
|
162
|
+
break;
|
163
|
+
}
|
164
|
+
case 'w': {
|
165
|
+
param_node_order = MARISA_WEIGHT_ORDER;
|
166
|
+
break;
|
167
|
+
}
|
168
|
+
case 'l': {
|
169
|
+
param_node_order = MARISA_LABEL_ORDER;
|
170
|
+
break;
|
171
|
+
}
|
172
|
+
case 'c': {
|
173
|
+
char *end_of_value;
|
174
|
+
const long value = std::strtol(cmdopt.optarg, &end_of_value, 10);
|
175
|
+
if ((*end_of_value != '\0') || (value < 1) || (value > 5)) {
|
176
|
+
std::cerr << "error: option `-c' with an invalid argument: "
|
177
|
+
<< cmdopt.optarg << std::endl;
|
178
|
+
return 2;
|
179
|
+
} else if (value == 1) {
|
180
|
+
param_cache_level = MARISA_TINY_CACHE;
|
181
|
+
} else if (value == 2) {
|
182
|
+
param_cache_level = MARISA_SMALL_CACHE;
|
183
|
+
} else if (value == 3) {
|
184
|
+
param_cache_level = MARISA_NORMAL_CACHE;
|
185
|
+
} else if (value == 4) {
|
186
|
+
param_cache_level = MARISA_LARGE_CACHE;
|
187
|
+
} else if (value == 5) {
|
188
|
+
param_cache_level = MARISA_HUGE_CACHE;
|
189
|
+
}
|
190
|
+
break;
|
191
|
+
}
|
192
|
+
case 'o': {
|
193
|
+
output_filename = cmdopt.optarg;
|
194
|
+
break;
|
195
|
+
}
|
196
|
+
case 'h': {
|
197
|
+
print_help(argv[0]);
|
198
|
+
return 0;
|
199
|
+
}
|
200
|
+
default: {
|
201
|
+
return 1;
|
202
|
+
}
|
203
|
+
}
|
204
|
+
}
|
205
|
+
return build(cmdopt.argv + cmdopt.optind, cmdopt.argc - cmdopt.optind);
|
206
|
+
}
|
@@ -0,0 +1,143 @@
|
|
1
|
+
#include <cstdlib>
|
2
|
+
#include <iostream>
|
3
|
+
#include <string>
|
4
|
+
|
5
|
+
#include <marisa.h>
|
6
|
+
|
7
|
+
#include "cmdopt.h"
|
8
|
+
|
9
|
+
namespace {
|
10
|
+
|
11
|
+
// Settings shared between main() and common_prefix_search() in this tool.
// max_num_results caps how many matches are printed per query; main() stores
// MARISA_SIZE_MAX here when -n 0 ("no limit") is given.
// mmap_flag selects how the dictionary is opened: Trie::mmap() when true,
// Trie::load() when false (-m / -r).
std::size_t max_num_results = 10;
|
12
|
+
bool mmap_flag = true;
|
13
|
+
|
14
|
+
// Prints the usage text of the common-prefix-search tool to standard error.
// cmd is the program name shown in the "Usage:" line.
void print_help(const char *cmd) {
  static const char options_text[] =
      "Options:\n"
      " -n, --max-num-results=[N] limit the number of results to N"
      " (default: 10)\n"
      " 0: no limit\n"
      " -m, --mmap-dictionary use memory-mapped I/O to load a dictionary"
      " (default)\n"
      " -r, --read-dictionary read an entire dictionary into memory\n"
      " -h, --help print this help\n";

  std::cerr << "Usage: " << cmd << " [OPTION]... DIC\n\n";
  std::cerr << options_text << std::endl;
}
|
26
|
+
|
27
|
+
int common_prefix_search(const char * const *args, std::size_t num_args) {
|
28
|
+
if (num_args == 0) {
|
29
|
+
std::cerr << "error: dictionary is not specified" << std::endl;
|
30
|
+
return 10;
|
31
|
+
} else if (num_args > 1) {
|
32
|
+
std::cerr << "error: more than one dictionaries are specified"
|
33
|
+
<< std::endl;
|
34
|
+
return 11;
|
35
|
+
}
|
36
|
+
|
37
|
+
marisa::Trie trie;
|
38
|
+
if (mmap_flag) {
|
39
|
+
try {
|
40
|
+
trie.mmap(args[0]);
|
41
|
+
} catch (const marisa::Exception &ex) {
|
42
|
+
std::cerr << ex.what() << ": failed to mmap a dictionary file: "
|
43
|
+
<< args[0] << std::endl;
|
44
|
+
return 20;
|
45
|
+
}
|
46
|
+
} else {
|
47
|
+
try {
|
48
|
+
trie.load(args[0]);
|
49
|
+
} catch (const marisa::Exception &ex) {
|
50
|
+
std::cerr << ex.what() << ": failed to load a dictionary file: "
|
51
|
+
<< args[0] << std::endl;
|
52
|
+
return 21;
|
53
|
+
}
|
54
|
+
}
|
55
|
+
|
56
|
+
marisa::Agent agent;
|
57
|
+
marisa::Keyset keyset;
|
58
|
+
std::string str;
|
59
|
+
while (std::getline(std::cin, str)) {
|
60
|
+
try {
|
61
|
+
agent.set_query(str.c_str(), str.length());
|
62
|
+
while (trie.common_prefix_search(agent)) {
|
63
|
+
keyset.push_back(agent.key());
|
64
|
+
}
|
65
|
+
if (keyset.empty()) {
|
66
|
+
std::cout << "not found" << std::endl;
|
67
|
+
} else {
|
68
|
+
std::cout << keyset.size() << " found" << std::endl;
|
69
|
+
const std::size_t end = std::min(max_num_results, keyset.size());
|
70
|
+
for (std::size_t i = 0; i < end; ++i) {
|
71
|
+
std::cout << keyset[i].id() << '\t';
|
72
|
+
std::cout.write(keyset[i].ptr(), keyset[i].length()) << '\t';
|
73
|
+
std::cout << str << '\n';
|
74
|
+
}
|
75
|
+
}
|
76
|
+
keyset.reset();
|
77
|
+
} catch (const marisa::Exception &ex) {
|
78
|
+
std::cerr << ex.what() << ": common_prefix_search() failed: "
|
79
|
+
<< str << std::endl;
|
80
|
+
return 30;
|
81
|
+
}
|
82
|
+
|
83
|
+
if (!std::cout) {
|
84
|
+
std::cerr << "error: failed to write results to standard output"
|
85
|
+
<< std::endl;
|
86
|
+
return 31;
|
87
|
+
}
|
88
|
+
}
|
89
|
+
|
90
|
+
return 0;
|
91
|
+
}
|
92
|
+
|
93
|
+
} // namespace
|
94
|
+
|
95
|
+
int main(int argc, char *argv[]) {
|
96
|
+
std::ios::sync_with_stdio(false);
|
97
|
+
|
98
|
+
::cmdopt_option long_options[] = {
|
99
|
+
{ "max-num-results", 1, NULL, 'n' },
|
100
|
+
{ "mmap-dictionary", 0, NULL, 'm' },
|
101
|
+
{ "read-dictionary", 0, NULL, 'r' },
|
102
|
+
{ "help", 0, NULL, 'h' },
|
103
|
+
{ NULL, 0, NULL, 0 }
|
104
|
+
};
|
105
|
+
::cmdopt_t cmdopt;
|
106
|
+
::cmdopt_init(&cmdopt, argc, argv, "n:mrh", long_options);
|
107
|
+
int label;
|
108
|
+
while ((label = ::cmdopt_get(&cmdopt)) != -1) {
|
109
|
+
switch (label) {
|
110
|
+
case 'n': {
|
111
|
+
char *end_of_value;
|
112
|
+
const long value = std::strtol(cmdopt.optarg, &end_of_value, 10);
|
113
|
+
if ((*end_of_value != '\0') || (value < 0)) {
|
114
|
+
std::cerr << "error: option `-n' with an invalid argument: "
|
115
|
+
<< cmdopt.optarg << std::endl;
|
116
|
+
}
|
117
|
+
if ((value == 0) || ((unsigned long long)value > MARISA_SIZE_MAX)) {
|
118
|
+
max_num_results = MARISA_SIZE_MAX;
|
119
|
+
} else {
|
120
|
+
max_num_results = (std::size_t)value;
|
121
|
+
}
|
122
|
+
break;
|
123
|
+
}
|
124
|
+
case 'm': {
|
125
|
+
mmap_flag = true;
|
126
|
+
break;
|
127
|
+
}
|
128
|
+
case 'r': {
|
129
|
+
mmap_flag = false;
|
130
|
+
break;
|
131
|
+
}
|
132
|
+
case 'h': {
|
133
|
+
print_help(argv[0]);
|
134
|
+
return 0;
|
135
|
+
}
|
136
|
+
default: {
|
137
|
+
return 1;
|
138
|
+
}
|
139
|
+
}
|
140
|
+
}
|
141
|
+
return common_prefix_search(cmdopt.argv + cmdopt.optind,
|
142
|
+
cmdopt.argc - cmdopt.optind);
|
143
|
+
}
|
@@ -0,0 +1,151 @@
|
|
1
|
+
#ifdef _WIN32
|
2
|
+
#include <fcntl.h>
|
3
|
+
#include <io.h>
|
4
|
+
#include <stdio.h>
|
5
|
+
#endif // _WIN32
|
6
|
+
|
7
|
+
#include <cstdlib>
|
8
|
+
#include <iostream>
|
9
|
+
#include <string>
|
10
|
+
|
11
|
+
#include <marisa.h>
|
12
|
+
|
13
|
+
#include "cmdopt.h"
|
14
|
+
|
15
|
+
namespace {
|
16
|
+
|
17
|
+
// Settings shared between main() and the dump() overloads in this tool.
// delimiter is written after every dumped key; -d replaces the default "\n".
// mmap_flag selects how a dictionary file is opened: Trie::mmap() when true,
// Trie::load() when false (-m / -r).
const char *delimiter = "\n";
|
18
|
+
bool mmap_flag = true;
|
19
|
+
|
20
|
+
// Prints the usage text of the dump tool to standard error.
// cmd is the program name shown in the "Usage:" line (main() passes argv[0]).
void print_help(const char *cmd) {
  std::cerr << "Usage: " << cmd << " [OPTION]... DIC...\n\n"
      "Options:\n"
      " -d, --delimiter=[S] specify the delimiter (default: \"\\n\")\n"
      " -m, --mmap-dictionary use memory-mapped I/O to load a dictionary"
      " (default)\n"
      " -r, --read-dictionary read an entire dictionary into memory\n"
      " -h, --help print this help\n"
      << std::endl;
}
|
30
|
+
|
31
|
+
// Writes the keys found by a predictive search with an empty query to
// standard output, each followed by `delimiter`, and reports how many keys
// were written on standard error.
//
// Returns 0 on success, 20 when standard output enters a failed state, and
// 21 when predictive_search() throws.
int dump(const marisa::Trie &trie) {
  marisa::Agent agent;
  agent.set_query("");

  std::size_t written = 0;
  try {
    for (; trie.predictive_search(agent); ++written) {
      std::cout.write(agent.key().ptr(), agent.key().length());
      std::cout << delimiter;
      if (!std::cout) {
        std::cerr << "error: failed to write results to standard output"
            << std::endl;
        return 20;
      }
    }
  } catch (const marisa::Exception &error) {
    std::cerr << error.what() << ": predictive_search() failed" << std::endl;
    return 21;
  }

  std::cerr << "#keys: " << written << std::endl;
  return 0;
}
|
52
|
+
|
53
|
+
int dump(const char *filename) {
|
54
|
+
marisa::Trie trie;
|
55
|
+
if (filename != NULL) {
|
56
|
+
std::cerr << "input: " << filename << std::endl;
|
57
|
+
if (mmap_flag) {
|
58
|
+
try {
|
59
|
+
trie.mmap(filename);
|
60
|
+
} catch (const marisa::Exception &ex) {
|
61
|
+
std::cerr << ex.what() << ": failed to mmap a dictionary file: "
|
62
|
+
<< filename << std::endl;
|
63
|
+
return 10;
|
64
|
+
}
|
65
|
+
} else {
|
66
|
+
try {
|
67
|
+
trie.load(filename);
|
68
|
+
} catch (const marisa::Exception &ex) {
|
69
|
+
std::cerr << ex.what() << ": failed to load a dictionary file: "
|
70
|
+
<< filename << std::endl;
|
71
|
+
return 11;
|
72
|
+
}
|
73
|
+
}
|
74
|
+
} else {
|
75
|
+
std::cerr << "input: <stdin>" << std::endl;
|
76
|
+
#ifdef _WIN32
|
77
|
+
const int stdin_fileno = ::_fileno(stdin);
|
78
|
+
if (stdin_fileno < 0) {
|
79
|
+
std::cerr << "error: failed to get the file descriptor of "
|
80
|
+
"standard input" << std::endl;
|
81
|
+
return 20;
|
82
|
+
}
|
83
|
+
if (::_setmode(stdin_fileno, _O_BINARY) == -1) {
|
84
|
+
std::cerr << "error: failed to set binary mode" << std::endl;
|
85
|
+
return 21;
|
86
|
+
}
|
87
|
+
#endif // _WIN32
|
88
|
+
try {
|
89
|
+
std::cin >> trie;
|
90
|
+
} catch (const marisa::Exception &ex) {
|
91
|
+
std::cerr << ex.what()
|
92
|
+
<< ": failed to read a dictionary from standard input" << std::endl;
|
93
|
+
return 22;
|
94
|
+
}
|
95
|
+
}
|
96
|
+
return dump(trie);
|
97
|
+
}
|
98
|
+
|
99
|
+
int dump(const char * const *args, std::size_t num_args) {
|
100
|
+
if (num_args == 0) {
|
101
|
+
return dump(NULL);
|
102
|
+
}
|
103
|
+
for (std::size_t i = 0; i < num_args; ++i) {
|
104
|
+
const int result = dump(args[i]);
|
105
|
+
if (result != 0) {
|
106
|
+
return result;
|
107
|
+
}
|
108
|
+
}
|
109
|
+
return 0;
|
110
|
+
}
|
111
|
+
|
112
|
+
} // namespace
|
113
|
+
|
114
|
+
int main(int argc, char *argv[]) {
|
115
|
+
std::ios::sync_with_stdio(false);
|
116
|
+
|
117
|
+
::cmdopt_option long_options[] = {
|
118
|
+
{ "delimiter", 1, NULL, 'd' },
|
119
|
+
{ "mmap-dictionary", 0, NULL, 'm' },
|
120
|
+
{ "read-dictionary", 0, NULL, 'r' },
|
121
|
+
{ "help", 0, NULL, 'h' },
|
122
|
+
{ NULL, 0, NULL, 0 }
|
123
|
+
};
|
124
|
+
::cmdopt_t cmdopt;
|
125
|
+
::cmdopt_init(&cmdopt, argc, argv, "d:mrh", long_options);
|
126
|
+
int label;
|
127
|
+
while ((label = ::cmdopt_get(&cmdopt)) != -1) {
|
128
|
+
switch (label) {
|
129
|
+
case 'd': {
|
130
|
+
delimiter = cmdopt.optarg;
|
131
|
+
break;
|
132
|
+
}
|
133
|
+
case 'm': {
|
134
|
+
mmap_flag = true;
|
135
|
+
break;
|
136
|
+
}
|
137
|
+
case 'r': {
|
138
|
+
mmap_flag = false;
|
139
|
+
break;
|
140
|
+
}
|
141
|
+
case 'h': {
|
142
|
+
print_help(argv[0]);
|
143
|
+
return 0;
|
144
|
+
}
|
145
|
+
default: {
|
146
|
+
return 1;
|
147
|
+
}
|
148
|
+
}
|
149
|
+
}
|
150
|
+
return dump(cmdopt.argv + cmdopt.optind, cmdopt.argc - cmdopt.optind);
|
151
|
+
}
|