melisa 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (88) hide show
  1. data/README.md +11 -0
  2. data/ext/marisa/bindings/marisa-swig.cxx +253 -0
  3. data/ext/marisa/bindings/marisa-swig.h +183 -0
  4. data/ext/marisa/bindings/perl/marisa-swig.cxx +253 -0
  5. data/ext/marisa/bindings/perl/marisa-swig.h +183 -0
  6. data/ext/marisa/bindings/perl/marisa-swig_wrap.cxx +5160 -0
  7. data/ext/marisa/bindings/python/marisa-swig.cxx +253 -0
  8. data/ext/marisa/bindings/python/marisa-swig.h +183 -0
  9. data/ext/marisa/bindings/python/marisa-swig_wrap.cxx +6090 -0
  10. data/ext/marisa/bindings/ruby/extconf.rb +5 -0
  11. data/ext/marisa/bindings/ruby/marisa-swig.cxx +253 -0
  12. data/ext/marisa/bindings/ruby/marisa-swig.h +183 -0
  13. data/ext/marisa/bindings/ruby/marisa-swig_wrap.cxx +4708 -0
  14. data/ext/marisa/lib/marisa.h +14 -0
  15. data/ext/marisa/lib/marisa/agent.cc +51 -0
  16. data/ext/marisa/lib/marisa/agent.h +73 -0
  17. data/ext/marisa/lib/marisa/base.h +193 -0
  18. data/ext/marisa/lib/marisa/exception.h +82 -0
  19. data/ext/marisa/lib/marisa/grimoire/algorithm.h +26 -0
  20. data/ext/marisa/lib/marisa/grimoire/algorithm/sort.h +196 -0
  21. data/ext/marisa/lib/marisa/grimoire/intrin.h +115 -0
  22. data/ext/marisa/lib/marisa/grimoire/io.h +18 -0
  23. data/ext/marisa/lib/marisa/grimoire/io/mapper.cc +163 -0
  24. data/ext/marisa/lib/marisa/grimoire/io/mapper.h +67 -0
  25. data/ext/marisa/lib/marisa/grimoire/io/reader.cc +147 -0
  26. data/ext/marisa/lib/marisa/grimoire/io/reader.h +66 -0
  27. data/ext/marisa/lib/marisa/grimoire/io/writer.cc +148 -0
  28. data/ext/marisa/lib/marisa/grimoire/io/writer.h +65 -0
  29. data/ext/marisa/lib/marisa/grimoire/trie.h +16 -0
  30. data/ext/marisa/lib/marisa/grimoire/trie/cache.h +81 -0
  31. data/ext/marisa/lib/marisa/grimoire/trie/config.h +155 -0
  32. data/ext/marisa/lib/marisa/grimoire/trie/entry.h +82 -0
  33. data/ext/marisa/lib/marisa/grimoire/trie/header.h +61 -0
  34. data/ext/marisa/lib/marisa/grimoire/trie/history.h +65 -0
  35. data/ext/marisa/lib/marisa/grimoire/trie/key.h +228 -0
  36. data/ext/marisa/lib/marisa/grimoire/trie/louds-trie.cc +876 -0
  37. data/ext/marisa/lib/marisa/grimoire/trie/louds-trie.h +134 -0
  38. data/ext/marisa/lib/marisa/grimoire/trie/range.h +115 -0
  39. data/ext/marisa/lib/marisa/grimoire/trie/state.h +117 -0
  40. data/ext/marisa/lib/marisa/grimoire/trie/tail.cc +218 -0
  41. data/ext/marisa/lib/marisa/grimoire/trie/tail.h +72 -0
  42. data/ext/marisa/lib/marisa/grimoire/vector.h +18 -0
  43. data/ext/marisa/lib/marisa/grimoire/vector/bit-vector.cc +826 -0
  44. data/ext/marisa/lib/marisa/grimoire/vector/bit-vector.h +179 -0
  45. data/ext/marisa/lib/marisa/grimoire/vector/flat-vector.h +205 -0
  46. data/ext/marisa/lib/marisa/grimoire/vector/pop-count.h +110 -0
  47. data/ext/marisa/lib/marisa/grimoire/vector/rank-index.h +82 -0
  48. data/ext/marisa/lib/marisa/grimoire/vector/vector.h +256 -0
  49. data/ext/marisa/lib/marisa/iostream.h +18 -0
  50. data/ext/marisa/lib/marisa/key.h +85 -0
  51. data/ext/marisa/lib/marisa/keyset.cc +181 -0
  52. data/ext/marisa/lib/marisa/keyset.h +80 -0
  53. data/ext/marisa/lib/marisa/query.h +71 -0
  54. data/ext/marisa/lib/marisa/scoped-array.h +48 -0
  55. data/ext/marisa/lib/marisa/scoped-ptr.h +52 -0
  56. data/ext/marisa/lib/marisa/stdio.h +15 -0
  57. data/ext/marisa/lib/marisa/trie.cc +249 -0
  58. data/ext/marisa/lib/marisa/trie.h +64 -0
  59. data/ext/marisa/tests/base-test.cc +309 -0
  60. data/ext/marisa/tests/io-test.cc +252 -0
  61. data/ext/marisa/tests/marisa-assert.h +26 -0
  62. data/ext/marisa/tests/marisa-test.cc +388 -0
  63. data/ext/marisa/tests/trie-test.cc +507 -0
  64. data/ext/marisa/tests/vector-test.cc +466 -0
  65. data/ext/marisa/tools/cmdopt.cc +298 -0
  66. data/ext/marisa/tools/cmdopt.h +58 -0
  67. data/ext/marisa/tools/marisa-benchmark.cc +418 -0
  68. data/ext/marisa/tools/marisa-build.cc +206 -0
  69. data/ext/marisa/tools/marisa-common-prefix-search.cc +143 -0
  70. data/ext/marisa/tools/marisa-dump.cc +151 -0
  71. data/ext/marisa/tools/marisa-lookup.cc +110 -0
  72. data/ext/marisa/tools/marisa-predictive-search.cc +143 -0
  73. data/ext/marisa/tools/marisa-reverse-lookup.cc +110 -0
  74. data/lib/melisa.rb +7 -0
  75. data/lib/melisa/base_config_flags.rb +76 -0
  76. data/lib/melisa/bytes_trie.rb +55 -0
  77. data/lib/melisa/int_trie.rb +14 -0
  78. data/lib/melisa/search.rb +55 -0
  79. data/lib/melisa/trie.rb +96 -0
  80. data/lib/melisa/version.rb +3 -0
  81. data/melisa.gemspec +36 -0
  82. data/spec/base_config_flags_spec.rb +73 -0
  83. data/spec/bytes_trie_spec.rb +16 -0
  84. data/spec/int_trie_spec.rb +16 -0
  85. data/spec/search_spec.rb +29 -0
  86. data/spec/spec_helper.rb +1 -0
  87. data/spec/trie_spec.rb +30 -0
  88. metadata +207 -0
@@ -0,0 +1,110 @@
1
+ #include <iostream>
2
+ #include <string>
3
+
4
+ #include <marisa.h>
5
+
6
+ #include "cmdopt.h"
7
+
8
+ namespace {
9
+
10
+ bool mmap_flag = true;
11
+
12
// Writes the usage line followed by the option summary to standard error.
//
// cmd: program name to show in the usage line (main() passes argv[0]).
void print_help(const char *cmd) {
  static const char options_text[] =
      "Options:\n"
      " -m, --mmap-dictionary use memory-mapped I/O to load a dictionary"
      " (default)\n"
      " -r, --read-dictionary read an entire dictionary into memory\n"
      " -h, --help print this help\n";

  std::cerr << "Usage: " << cmd << " [OPTION]... DIC\n\n";
  // std::endl appends the trailing blank line and flushes the stream.
  std::cerr << options_text << std::endl;
}
21
+
22
+ int lookup(const char * const *args, std::size_t num_args) {
23
+ if (num_args == 0) {
24
+ std::cerr << "error: dictionary is not specified" << std::endl;
25
+ return 10;
26
+ } else if (num_args > 1) {
27
+ std::cerr << "error: more than one dictionaries are specified"
28
+ << std::endl;
29
+ return 11;
30
+ }
31
+
32
+ marisa::Trie trie;
33
+ if (mmap_flag) {
34
+ try {
35
+ trie.mmap(args[0]);
36
+ } catch (const marisa::Exception &ex) {
37
+ std::cerr << ex.what() << ": failed to mmap a dictionary file: "
38
+ << args[0] << std::endl;
39
+ return 20;
40
+ }
41
+ } else {
42
+ try {
43
+ trie.load(args[0]);
44
+ } catch (const marisa::Exception &ex) {
45
+ std::cerr << ex.what() << ": failed to load a dictionary file: "
46
+ << args[0] << std::endl;
47
+ return 21;
48
+ }
49
+ }
50
+
51
+ marisa::Agent agent;
52
+ std::string str;
53
+ while (std::getline(std::cin, str)) {
54
+ try {
55
+ agent.set_query(str.c_str(), str.length());
56
+ if (trie.lookup(agent)) {
57
+ std::cout << agent.key().id() << '\t' << str << '\n';
58
+ } else {
59
+ std::cout << "-1\t" << str << '\n';
60
+ }
61
+ } catch (const marisa::Exception &ex) {
62
+ std::cerr << ex.what() << ": lookup() failed: " << str << std::endl;
63
+ return 30;
64
+ }
65
+
66
+ if (!std::cout) {
67
+ std::cerr << "error: failed to write results to standard output"
68
+ << std::endl;
69
+ return 30;
70
+ }
71
+ }
72
+
73
+ return 0;
74
+ }
75
+
76
+ } // namespace
77
+
78
+ int main(int argc, char *argv[]) {
79
+ std::ios::sync_with_stdio(false);
80
+
81
+ ::cmdopt_option long_options[] = {
82
+ { "mmap-dictionary", 0, NULL, 'm' },
83
+ { "read-dictionary", 0, NULL, 'r' },
84
+ { "help", 0, NULL, 'h' },
85
+ { NULL, 0, NULL, 0 }
86
+ };
87
+ ::cmdopt_t cmdopt;
88
+ ::cmdopt_init(&cmdopt, argc, argv, "mrh", long_options);
89
+ int label;
90
+ while ((label = ::cmdopt_get(&cmdopt)) != -1) {
91
+ switch (label) {
92
+ case 'm': {
93
+ mmap_flag = true;
94
+ break;
95
+ }
96
+ case 'r': {
97
+ mmap_flag = false;
98
+ break;
99
+ }
100
+ case 'h': {
101
+ print_help(argv[0]);
102
+ return 0;
103
+ }
104
+ default: {
105
+ return 1;
106
+ }
107
+ }
108
+ }
109
+ return lookup(cmdopt.argv + cmdopt.optind, cmdopt.argc - cmdopt.optind);
110
+ }
@@ -0,0 +1,143 @@
1
#include <algorithm>
#include <cstdlib>
#include <iostream>
#include <string>

#include <marisa.h>

#include "cmdopt.h"
8
+
9
+ namespace {
10
+
11
+ std::size_t max_num_results = 10;
12
+ bool mmap_flag = true;
13
+
14
// Writes the usage line followed by the option summary to standard error.
//
// cmd: program name to show in the usage line (main() passes argv[0]).
void print_help(const char *cmd) {
  static const char options_text[] =
      "Options:\n"
      " -n, --max-num-results=[N] limit the number of outputs to N"
      " (default: 10)\n"
      " 0: no limit\n"
      " -m, --mmap-dictionary use memory-mapped I/O to load a dictionary"
      " (default)\n"
      " -r, --read-dictionary read an entire dictionary into memory\n"
      " -h, --help print this help\n";

  std::cerr << "Usage: " << cmd << " [OPTION]... DIC\n\n";
  // std::endl appends the trailing blank line and flushes the stream.
  std::cerr << options_text << std::endl;
}
26
+
27
+ int predictive_search(const char * const *args, std::size_t num_args) {
28
+ if (num_args == 0) {
29
+ std::cerr << "error: dictionary is not specified" << std::endl;
30
+ return 10;
31
+ } else if (num_args > 1) {
32
+ std::cerr << "error: more than one dictionaries are specified"
33
+ << std::endl;
34
+ return 11;
35
+ }
36
+
37
+ marisa::Trie trie;
38
+ if (mmap_flag) {
39
+ try {
40
+ trie.mmap(args[0]);
41
+ } catch (const marisa::Exception &ex) {
42
+ std::cerr << ex.what() << ": failed to mmap a dictionary file: "
43
+ << args[0] << std::endl;
44
+ return 20;
45
+ }
46
+ } else {
47
+ try {
48
+ trie.load(args[0]);
49
+ } catch (const marisa::Exception &ex) {
50
+ std::cerr << ex.what() << ": failed to load a dictionary file: "
51
+ << args[0] << std::endl;
52
+ return 21;
53
+ }
54
+ }
55
+
56
+ marisa::Agent agent;
57
+ marisa::Keyset keyset;
58
+ std::string str;
59
+ while (std::getline(std::cin, str)) {
60
+ try {
61
+ agent.set_query(str.c_str(), str.length());
62
+ while (trie.predictive_search(agent)) {
63
+ keyset.push_back(agent.key());
64
+ }
65
+ if (keyset.empty()) {
66
+ std::cout << "not found" << std::endl;
67
+ } else {
68
+ std::cout << keyset.size() << " found" << std::endl;
69
+ const std::size_t end = std::min(max_num_results, keyset.size());
70
+ for (std::size_t i = 0; i < end; ++i) {
71
+ std::cout << keyset[i].id() << '\t';
72
+ std::cout.write(keyset[i].ptr(), keyset[i].length()) << '\t';
73
+ std::cout << str << '\n';
74
+ }
75
+ }
76
+ keyset.reset();
77
+ } catch (const marisa::Exception &ex) {
78
+ std::cerr << ex.what() << ": predictive_search() failed: "
79
+ << str << std::endl;
80
+ return 30;
81
+ }
82
+
83
+ if (!std::cout) {
84
+ std::cerr << "error: failed to write results to standard output"
85
+ << std::endl;
86
+ return 31;
87
+ }
88
+ }
89
+
90
+ return 0;
91
+ }
92
+
93
+ } // namespace
94
+
95
+ int main(int argc, char *argv[]) {
96
+ std::ios::sync_with_stdio(false);
97
+
98
+ ::cmdopt_option long_options[] = {
99
+ { "max-num-results", 1, NULL, 'n' },
100
+ { "mmap-dictionary", 0, NULL, 'm' },
101
+ { "read-dictionary", 0, NULL, 'r' },
102
+ { "help", 0, NULL, 'h' },
103
+ { NULL, 0, NULL, 0 }
104
+ };
105
+ ::cmdopt_t cmdopt;
106
+ ::cmdopt_init(&cmdopt, argc, argv, "n:mrh", long_options);
107
+ int label;
108
+ while ((label = ::cmdopt_get(&cmdopt)) != -1) {
109
+ switch (label) {
110
+ case 'n': {
111
+ char *end_of_value;
112
+ const long value = std::strtol(cmdopt.optarg, &end_of_value, 10);
113
+ if ((*end_of_value != '\0') || (value < 0)) {
114
+ std::cerr << "error: option `-n' with an invalid argument: "
115
+ << cmdopt.optarg << std::endl;
116
+ }
117
+ if ((value == 0) || ((unsigned long long)value > MARISA_SIZE_MAX)) {
118
+ max_num_results = MARISA_SIZE_MAX;
119
+ } else {
120
+ max_num_results = (std::size_t)value;
121
+ }
122
+ break;
123
+ }
124
+ case 'm': {
125
+ mmap_flag = true;
126
+ break;
127
+ }
128
+ case 'r': {
129
+ mmap_flag = false;
130
+ break;
131
+ }
132
+ case 'h': {
133
+ print_help(argv[0]);
134
+ return 0;
135
+ }
136
+ default: {
137
+ return 1;
138
+ }
139
+ }
140
+ }
141
+ return predictive_search(cmdopt.argv + cmdopt.optind,
142
+ cmdopt.argc - cmdopt.optind);
143
+ }
@@ -0,0 +1,110 @@
1
+ #include <iostream>
2
+ #include <string>
3
+
4
+ #include <marisa.h>
5
+
6
+ #include "cmdopt.h"
7
+
8
+ namespace {
9
+
10
+ bool mmap_flag = true;
11
+
12
// Writes the usage line followed by the option summary to standard error.
//
// cmd: program name to show in the usage line (main() passes argv[0]).
void print_help(const char *cmd) {
  static const char options_text[] =
      "Options:\n"
      " -m, --mmap-dictionary use memory-mapped I/O to load a dictionary"
      " (default)\n"
      " -r, --read-dictionary read an entire dictionary into memory\n"
      " -h, --help print this help\n";

  std::cerr << "Usage: " << cmd << " [OPTION]... DIC\n\n";
  // std::endl appends the trailing blank line and flushes the stream.
  std::cerr << options_text << std::endl;
}
21
+
22
+ int reverse_lookup(const char * const *args, std::size_t num_args) {
23
+ if (num_args == 0) {
24
+ std::cerr << "error: dictionary is not specified" << std::endl;
25
+ return 10;
26
+ } else if (num_args > 1) {
27
+ std::cerr << "error: more than one dictionaries are specified"
28
+ << std::endl;
29
+ return 11;
30
+ }
31
+
32
+ marisa::Trie trie;
33
+ if (mmap_flag) {
34
+ try {
35
+ trie.mmap(args[0]);
36
+ } catch (const marisa::Exception &ex) {
37
+ std::cerr << ex.what() << ": failed to mmap a dictionary file: "
38
+ << args[0] << std::endl;
39
+ return 20;
40
+ }
41
+ } else {
42
+ try {
43
+ trie.load(args[0]);
44
+ } catch (const marisa::Exception &ex) {
45
+ std::cerr << ex.what() << ": failed to load a dictionary file: "
46
+ << args[0] << std::endl;
47
+ return 21;
48
+ }
49
+ }
50
+
51
+ marisa::Agent agent;
52
+ std::size_t key_id;
53
+ while (std::cin >> key_id) {
54
+ try {
55
+ agent.set_query(key_id);
56
+ trie.reverse_lookup(agent);
57
+ std::cout << agent.key().id() << '\t';
58
+ std::cout.write(agent.key().ptr(), agent.key().length()) << '\n';
59
+ } catch (const marisa::Exception &ex) {
60
+ std::cerr << ex.what() << ": reverse_lookup() failed: "
61
+ << key_id << std::endl;
62
+ return 30;
63
+ }
64
+
65
+ if (!std::cout) {
66
+ std::cerr << "error: failed to write results to standard output"
67
+ << std::endl;
68
+ return 30;
69
+ }
70
+ }
71
+
72
+ return 0;
73
+ }
74
+
75
+ } // namespace
76
+
77
+ int main(int argc, char *argv[]) {
78
+ std::ios::sync_with_stdio(false);
79
+
80
+ ::cmdopt_option long_options[] = {
81
+ { "mmap-dictionary", 0, NULL, 'm' },
82
+ { "read-dictionary", 0, NULL, 'r' },
83
+ { "help", 0, NULL, 'h' },
84
+ { NULL, 0, NULL, 0 }
85
+ };
86
+ ::cmdopt_t cmdopt;
87
+ ::cmdopt_init(&cmdopt, argc, argv, "mrh", long_options);
88
+ int label;
89
+ while ((label = ::cmdopt_get(&cmdopt)) != -1) {
90
+ switch (label) {
91
+ case 'm': {
92
+ mmap_flag = true;
93
+ break;
94
+ }
95
+ case 'r': {
96
+ mmap_flag = false;
97
+ break;
98
+ }
99
+ case 'h': {
100
+ print_help(argv[0]);
101
+ return 0;
102
+ }
103
+ default: {
104
+ return 1;
105
+ }
106
+ }
107
+ }
108
+ return reverse_lookup(cmdopt.argv + cmdopt.optind,
109
+ cmdopt.argc - cmdopt.optind);
110
+ }
data/lib/melisa.rb ADDED
@@ -0,0 +1,7 @@
1
+ require "marisa"
2
+
3
+ require "melisa/version"
4
+ require "melisa/search"
5
+ require "melisa/trie"
6
+ require "melisa/bytes_trie"
7
+ require "melisa/int_trie"
@@ -0,0 +1,76 @@
1
+ module Melisa
2
# Raised by BaseConfigFlags when an option value is not one of the accepted
# values (num_tries out of range, unknown cache size or node order).
ConfigError = Class.new(StandardError)

# Symbolic cache-size names accepted for the :cache_size option, mapped to
# the corresponding Marisa constants. Frozen so the lookup table cannot be
# altered by accident at runtime.
CacheSizes = {
  :huge    => Marisa::HUGE_CACHE,
  :large   => Marisa::LARGE_CACHE,
  :normal  => Marisa::NORMAL_CACHE,
  :small   => Marisa::SMALL_CACHE,
  :tiny    => Marisa::TINY_CACHE,
  :default => Marisa::NORMAL_CACHE
}.freeze

# Symbolic node-order names accepted for the :order option, mapped to the
# corresponding Marisa constants. Frozen for the same reason as CacheSizes.
NodeOrders = {
  :label   => Marisa::LABEL_ORDER,
  :weight  => Marisa::WEIGHT_ORDER,
  :default => Marisa::DEFAULT_ORDER
}.freeze
18
+
19
# Mixin that validates trie options and combines them into a single integer
# flag value by OR-ing the individual Marisa constants together.
module BaseConfigFlags
  # Builds the combined flag value.
  #
  # @param opts [Hash] any of :binary (true/false), :num_tries (Integer or
  #   :default), :cache_size (a key of CacheSizes) and :order (a key of
  #   NodeOrders); missing keys fall back to the defaults below
  # @return [Integer] bitwise OR of the four individual flags
  # @raise [ArgumentError] if :binary is neither true nor false
  # @raise [ConfigError] if any other option has an unsupported value
  def config_flags(opts={})
    opts = {
      :binary     => false,
      :num_tries  => :default,
      :cache_size => :default,
      :order      => :default
    }.merge(opts)

    binary_flag(opts[:binary]) |
      valid_num_tries(opts[:num_tries]) |
      lookup_cache_size(opts[:cache_size]) |
      valid_node_order(opts[:order])
  end

  # @param bool [Boolean] true selects the binary tail, false the text tail
  # @return [Integer] Marisa::BINARY_TAIL or Marisa::TEXT_TAIL
  # @raise [ArgumentError] for anything other than true or false
  def binary_flag(bool)
    case bool
    when true then Marisa::BINARY_TAIL
    when false then Marisa::TEXT_TAIL
    else
      raise ArgumentError, "binary_flag must be true or false (got #{bool.inspect})"
    end
  end

  # @param num_tries [Integer, Symbol] a count between Marisa::MIN_NUM_TRIES
  #   and Marisa::MAX_NUM_TRIES, or :default for Marisa::DEFAULT_NUM_TRIES
  # @return [Integer] the validated number of tries
  # @raise [ConfigError] if the value is not an Integer within the range
  def valid_num_tries(num_tries)
    num_tries = Marisa::DEFAULT_NUM_TRIES if num_tries == :default
    min = Marisa::MIN_NUM_TRIES
    max = Marisa::MAX_NUM_TRIES

    # The result is OR-ed into an integer flag value by config_flags, so a
    # Float such as 2.5 must be rejected here even though it lies inside
    # the numeric range.
    return num_tries if num_tries.is_a?(Integer) && (min..max).cover?(num_tries)

    raise ConfigError, "num_tries (#{num_tries}) must be between #{min} and #{max}"
  end

  # @param cache_size [Symbol] one of the keys of CacheSizes
  # @return [Integer] the matching Marisa cache constant
  # @raise [ConfigError] if the name is not a key of CacheSizes
  def lookup_cache_size(cache_size)
    CacheSizes.fetch(cache_size) do
      sizes = CacheSizes.keys
      raise ConfigError, "cache_size (#{cache_size}) must be one of: #{sizes.inspect}"
    end
  end

  # @param order [Symbol] one of the keys of NodeOrders
  # @return [Integer] the matching Marisa node-order constant
  # @raise [ConfigError] if the name is not a key of NodeOrders
  def valid_node_order(order)
    NodeOrders.fetch(order) do
      valid_options = NodeOrders.keys
      raise ConfigError, "node_order (#{order}) must be one of: #{valid_options.inspect}"
    end
  end
end
76
+ end