melisa 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. data/README.md +11 -0
  2. data/ext/marisa/bindings/marisa-swig.cxx +253 -0
  3. data/ext/marisa/bindings/marisa-swig.h +183 -0
  4. data/ext/marisa/bindings/perl/marisa-swig.cxx +253 -0
  5. data/ext/marisa/bindings/perl/marisa-swig.h +183 -0
  6. data/ext/marisa/bindings/perl/marisa-swig_wrap.cxx +5160 -0
  7. data/ext/marisa/bindings/python/marisa-swig.cxx +253 -0
  8. data/ext/marisa/bindings/python/marisa-swig.h +183 -0
  9. data/ext/marisa/bindings/python/marisa-swig_wrap.cxx +6090 -0
  10. data/ext/marisa/bindings/ruby/extconf.rb +5 -0
  11. data/ext/marisa/bindings/ruby/marisa-swig.cxx +253 -0
  12. data/ext/marisa/bindings/ruby/marisa-swig.h +183 -0
  13. data/ext/marisa/bindings/ruby/marisa-swig_wrap.cxx +4708 -0
  14. data/ext/marisa/lib/marisa.h +14 -0
  15. data/ext/marisa/lib/marisa/agent.cc +51 -0
  16. data/ext/marisa/lib/marisa/agent.h +73 -0
  17. data/ext/marisa/lib/marisa/base.h +193 -0
  18. data/ext/marisa/lib/marisa/exception.h +82 -0
  19. data/ext/marisa/lib/marisa/grimoire/algorithm.h +26 -0
  20. data/ext/marisa/lib/marisa/grimoire/algorithm/sort.h +196 -0
  21. data/ext/marisa/lib/marisa/grimoire/intrin.h +115 -0
  22. data/ext/marisa/lib/marisa/grimoire/io.h +18 -0
  23. data/ext/marisa/lib/marisa/grimoire/io/mapper.cc +163 -0
  24. data/ext/marisa/lib/marisa/grimoire/io/mapper.h +67 -0
  25. data/ext/marisa/lib/marisa/grimoire/io/reader.cc +147 -0
  26. data/ext/marisa/lib/marisa/grimoire/io/reader.h +66 -0
  27. data/ext/marisa/lib/marisa/grimoire/io/writer.cc +148 -0
  28. data/ext/marisa/lib/marisa/grimoire/io/writer.h +65 -0
  29. data/ext/marisa/lib/marisa/grimoire/trie.h +16 -0
  30. data/ext/marisa/lib/marisa/grimoire/trie/cache.h +81 -0
  31. data/ext/marisa/lib/marisa/grimoire/trie/config.h +155 -0
  32. data/ext/marisa/lib/marisa/grimoire/trie/entry.h +82 -0
  33. data/ext/marisa/lib/marisa/grimoire/trie/header.h +61 -0
  34. data/ext/marisa/lib/marisa/grimoire/trie/history.h +65 -0
  35. data/ext/marisa/lib/marisa/grimoire/trie/key.h +228 -0
  36. data/ext/marisa/lib/marisa/grimoire/trie/louds-trie.cc +876 -0
  37. data/ext/marisa/lib/marisa/grimoire/trie/louds-trie.h +134 -0
  38. data/ext/marisa/lib/marisa/grimoire/trie/range.h +115 -0
  39. data/ext/marisa/lib/marisa/grimoire/trie/state.h +117 -0
  40. data/ext/marisa/lib/marisa/grimoire/trie/tail.cc +218 -0
  41. data/ext/marisa/lib/marisa/grimoire/trie/tail.h +72 -0
  42. data/ext/marisa/lib/marisa/grimoire/vector.h +18 -0
  43. data/ext/marisa/lib/marisa/grimoire/vector/bit-vector.cc +826 -0
  44. data/ext/marisa/lib/marisa/grimoire/vector/bit-vector.h +179 -0
  45. data/ext/marisa/lib/marisa/grimoire/vector/flat-vector.h +205 -0
  46. data/ext/marisa/lib/marisa/grimoire/vector/pop-count.h +110 -0
  47. data/ext/marisa/lib/marisa/grimoire/vector/rank-index.h +82 -0
  48. data/ext/marisa/lib/marisa/grimoire/vector/vector.h +256 -0
  49. data/ext/marisa/lib/marisa/iostream.h +18 -0
  50. data/ext/marisa/lib/marisa/key.h +85 -0
  51. data/ext/marisa/lib/marisa/keyset.cc +181 -0
  52. data/ext/marisa/lib/marisa/keyset.h +80 -0
  53. data/ext/marisa/lib/marisa/query.h +71 -0
  54. data/ext/marisa/lib/marisa/scoped-array.h +48 -0
  55. data/ext/marisa/lib/marisa/scoped-ptr.h +52 -0
  56. data/ext/marisa/lib/marisa/stdio.h +15 -0
  57. data/ext/marisa/lib/marisa/trie.cc +249 -0
  58. data/ext/marisa/lib/marisa/trie.h +64 -0
  59. data/ext/marisa/tests/base-test.cc +309 -0
  60. data/ext/marisa/tests/io-test.cc +252 -0
  61. data/ext/marisa/tests/marisa-assert.h +26 -0
  62. data/ext/marisa/tests/marisa-test.cc +388 -0
  63. data/ext/marisa/tests/trie-test.cc +507 -0
  64. data/ext/marisa/tests/vector-test.cc +466 -0
  65. data/ext/marisa/tools/cmdopt.cc +298 -0
  66. data/ext/marisa/tools/cmdopt.h +58 -0
  67. data/ext/marisa/tools/marisa-benchmark.cc +418 -0
  68. data/ext/marisa/tools/marisa-build.cc +206 -0
  69. data/ext/marisa/tools/marisa-common-prefix-search.cc +143 -0
  70. data/ext/marisa/tools/marisa-dump.cc +151 -0
  71. data/ext/marisa/tools/marisa-lookup.cc +110 -0
  72. data/ext/marisa/tools/marisa-predictive-search.cc +143 -0
  73. data/ext/marisa/tools/marisa-reverse-lookup.cc +110 -0
  74. data/lib/melisa.rb +7 -0
  75. data/lib/melisa/base_config_flags.rb +76 -0
  76. data/lib/melisa/bytes_trie.rb +55 -0
  77. data/lib/melisa/int_trie.rb +14 -0
  78. data/lib/melisa/search.rb +55 -0
  79. data/lib/melisa/trie.rb +96 -0
  80. data/lib/melisa/version.rb +3 -0
  81. data/melisa.gemspec +36 -0
  82. data/spec/base_config_flags_spec.rb +73 -0
  83. data/spec/bytes_trie_spec.rb +16 -0
  84. data/spec/int_trie_spec.rb +16 -0
  85. data/spec/search_spec.rb +29 -0
  86. data/spec/spec_helper.rb +1 -0
  87. data/spec/trie_spec.rb +30 -0
  88. metadata +207 -0
@@ -0,0 +1,110 @@
1
+ #include <iostream>
2
+ #include <string>
3
+
4
+ #include <marisa.h>
5
+
6
+ #include "cmdopt.h"
7
+
8
+ namespace {
9
+
10
+ bool mmap_flag = true;
11
+
12
// Writes the usage line followed by the list of supported options to
// standard error. `cmd` is printed right after "Usage: " as the program name.
void print_help(const char *cmd) {
  // All option descriptions, kept as one literal so the text is easy to scan.
  static const char option_list[] =
      "Options:\n"
      " -m, --mmap-dictionary use memory-mapped I/O to load a dictionary"
      " (default)\n"
      " -r, --read-dictionary read an entire dictionary into memory\n"
      " -h, --help print this help\n";

  std::cerr << "Usage: " << cmd << " [OPTION]... DIC\n\n";
  std::cerr << option_list << std::endl;
}
21
+
22
+ int lookup(const char * const *args, std::size_t num_args) {
23
+ if (num_args == 0) {
24
+ std::cerr << "error: dictionary is not specified" << std::endl;
25
+ return 10;
26
+ } else if (num_args > 1) {
27
+ std::cerr << "error: more than one dictionaries are specified"
28
+ << std::endl;
29
+ return 11;
30
+ }
31
+
32
+ marisa::Trie trie;
33
+ if (mmap_flag) {
34
+ try {
35
+ trie.mmap(args[0]);
36
+ } catch (const marisa::Exception &ex) {
37
+ std::cerr << ex.what() << ": failed to mmap a dictionary file: "
38
+ << args[0] << std::endl;
39
+ return 20;
40
+ }
41
+ } else {
42
+ try {
43
+ trie.load(args[0]);
44
+ } catch (const marisa::Exception &ex) {
45
+ std::cerr << ex.what() << ": failed to load a dictionary file: "
46
+ << args[0] << std::endl;
47
+ return 21;
48
+ }
49
+ }
50
+
51
+ marisa::Agent agent;
52
+ std::string str;
53
+ while (std::getline(std::cin, str)) {
54
+ try {
55
+ agent.set_query(str.c_str(), str.length());
56
+ if (trie.lookup(agent)) {
57
+ std::cout << agent.key().id() << '\t' << str << '\n';
58
+ } else {
59
+ std::cout << "-1\t" << str << '\n';
60
+ }
61
+ } catch (const marisa::Exception &ex) {
62
+ std::cerr << ex.what() << ": lookup() failed: " << str << std::endl;
63
+ return 30;
64
+ }
65
+
66
+ if (!std::cout) {
67
+ std::cerr << "error: failed to write results to standard output"
68
+ << std::endl;
69
+ return 30;
70
+ }
71
+ }
72
+
73
+ return 0;
74
+ }
75
+
76
+ } // namespace
77
+
78
+ int main(int argc, char *argv[]) {
79
+ std::ios::sync_with_stdio(false);
80
+
81
+ ::cmdopt_option long_options[] = {
82
+ { "mmap-dictionary", 0, NULL, 'm' },
83
+ { "read-dictionary", 0, NULL, 'r' },
84
+ { "help", 0, NULL, 'h' },
85
+ { NULL, 0, NULL, 0 }
86
+ };
87
+ ::cmdopt_t cmdopt;
88
+ ::cmdopt_init(&cmdopt, argc, argv, "mrh", long_options);
89
+ int label;
90
+ while ((label = ::cmdopt_get(&cmdopt)) != -1) {
91
+ switch (label) {
92
+ case 'm': {
93
+ mmap_flag = true;
94
+ break;
95
+ }
96
+ case 'r': {
97
+ mmap_flag = false;
98
+ break;
99
+ }
100
+ case 'h': {
101
+ print_help(argv[0]);
102
+ return 0;
103
+ }
104
+ default: {
105
+ return 1;
106
+ }
107
+ }
108
+ }
109
+ return lookup(cmdopt.argv + cmdopt.optind, cmdopt.argc - cmdopt.optind);
110
+ }
@@ -0,0 +1,143 @@
1
+ #include <cstdlib>
2
+ #include <iostream>
3
+ #include <string>
4
+
5
+ #include <marisa.h>
6
+
7
+ #include "cmdopt.h"
8
+
9
+ namespace {
10
+
11
+ std::size_t max_num_results = 10;
12
+ bool mmap_flag = true;
13
+
14
// Prints the usage line and the description of every supported option to
// standard error. `cmd` is shown after "Usage: " as the program name.
void print_help(const char *cmd) {
  // Option descriptions, one entry per option (continuation pieces are
  // concatenated by the compiler).
  static const char option_list[] =
      "Options:\n"
      " -n, --max-num-results=[N] limit the number of outputs to N"
      " (default: 10)\n"
      " 0: no limit\n"
      " -m, --mmap-dictionary use memory-mapped I/O to load a dictionary"
      " (default)\n"
      " -r, --read-dictionary read an entire dictionary into memory\n"
      " -h, --help print this help\n";

  std::cerr << "Usage: " << cmd << " [OPTION]... DIC\n\n";
  std::cerr << option_list << std::endl;
}
26
+
27
+ int predictive_search(const char * const *args, std::size_t num_args) {
28
+ if (num_args == 0) {
29
+ std::cerr << "error: dictionary is not specified" << std::endl;
30
+ return 10;
31
+ } else if (num_args > 1) {
32
+ std::cerr << "error: more than one dictionaries are specified"
33
+ << std::endl;
34
+ return 11;
35
+ }
36
+
37
+ marisa::Trie trie;
38
+ if (mmap_flag) {
39
+ try {
40
+ trie.mmap(args[0]);
41
+ } catch (const marisa::Exception &ex) {
42
+ std::cerr << ex.what() << ": failed to mmap a dictionary file: "
43
+ << args[0] << std::endl;
44
+ return 20;
45
+ }
46
+ } else {
47
+ try {
48
+ trie.load(args[0]);
49
+ } catch (const marisa::Exception &ex) {
50
+ std::cerr << ex.what() << ": failed to load a dictionary file: "
51
+ << args[0] << std::endl;
52
+ return 21;
53
+ }
54
+ }
55
+
56
+ marisa::Agent agent;
57
+ marisa::Keyset keyset;
58
+ std::string str;
59
+ while (std::getline(std::cin, str)) {
60
+ try {
61
+ agent.set_query(str.c_str(), str.length());
62
+ while (trie.predictive_search(agent)) {
63
+ keyset.push_back(agent.key());
64
+ }
65
+ if (keyset.empty()) {
66
+ std::cout << "not found" << std::endl;
67
+ } else {
68
+ std::cout << keyset.size() << " found" << std::endl;
69
+ const std::size_t end = std::min(max_num_results, keyset.size());
70
+ for (std::size_t i = 0; i < end; ++i) {
71
+ std::cout << keyset[i].id() << '\t';
72
+ std::cout.write(keyset[i].ptr(), keyset[i].length()) << '\t';
73
+ std::cout << str << '\n';
74
+ }
75
+ }
76
+ keyset.reset();
77
+ } catch (const marisa::Exception &ex) {
78
+ std::cerr << ex.what() << ": predictive_search() failed: "
79
+ << str << std::endl;
80
+ return 30;
81
+ }
82
+
83
+ if (!std::cout) {
84
+ std::cerr << "error: failed to write results to standard output"
85
+ << std::endl;
86
+ return 31;
87
+ }
88
+ }
89
+
90
+ return 0;
91
+ }
92
+
93
+ } // namespace
94
+
95
+ int main(int argc, char *argv[]) {
96
+ std::ios::sync_with_stdio(false);
97
+
98
+ ::cmdopt_option long_options[] = {
99
+ { "max-num-results", 1, NULL, 'n' },
100
+ { "mmap-dictionary", 0, NULL, 'm' },
101
+ { "read-dictionary", 0, NULL, 'r' },
102
+ { "help", 0, NULL, 'h' },
103
+ { NULL, 0, NULL, 0 }
104
+ };
105
+ ::cmdopt_t cmdopt;
106
+ ::cmdopt_init(&cmdopt, argc, argv, "n:mrh", long_options);
107
+ int label;
108
+ while ((label = ::cmdopt_get(&cmdopt)) != -1) {
109
+ switch (label) {
110
+ case 'n': {
111
+ char *end_of_value;
112
+ const long value = std::strtol(cmdopt.optarg, &end_of_value, 10);
113
+ if ((*end_of_value != '\0') || (value < 0)) {
114
+ std::cerr << "error: option `-n' with an invalid argument: "
115
+ << cmdopt.optarg << std::endl;
116
+ }
117
+ if ((value == 0) || ((unsigned long long)value > MARISA_SIZE_MAX)) {
118
+ max_num_results = MARISA_SIZE_MAX;
119
+ } else {
120
+ max_num_results = (std::size_t)value;
121
+ }
122
+ break;
123
+ }
124
+ case 'm': {
125
+ mmap_flag = true;
126
+ break;
127
+ }
128
+ case 'r': {
129
+ mmap_flag = false;
130
+ break;
131
+ }
132
+ case 'h': {
133
+ print_help(argv[0]);
134
+ return 0;
135
+ }
136
+ default: {
137
+ return 1;
138
+ }
139
+ }
140
+ }
141
+ return predictive_search(cmdopt.argv + cmdopt.optind,
142
+ cmdopt.argc - cmdopt.optind);
143
+ }
@@ -0,0 +1,110 @@
1
+ #include <iostream>
2
+ #include <string>
3
+
4
+ #include <marisa.h>
5
+
6
+ #include "cmdopt.h"
7
+
8
+ namespace {
9
+
10
+ bool mmap_flag = true;
11
+
12
// Emits the usage line and the supported options on standard error.
// `cmd` is inserted after "Usage: " as the program name.
void print_help(const char *cmd) {
  // Static help text listing every option this tool understands.
  static const char option_list[] =
      "Options:\n"
      " -m, --mmap-dictionary use memory-mapped I/O to load a dictionary"
      " (default)\n"
      " -r, --read-dictionary read an entire dictionary into memory\n"
      " -h, --help print this help\n";

  std::cerr << "Usage: " << cmd << " [OPTION]... DIC\n\n";
  std::cerr << option_list << std::endl;
}
21
+
22
+ int reverse_lookup(const char * const *args, std::size_t num_args) {
23
+ if (num_args == 0) {
24
+ std::cerr << "error: dictionary is not specified" << std::endl;
25
+ return 10;
26
+ } else if (num_args > 1) {
27
+ std::cerr << "error: more than one dictionaries are specified"
28
+ << std::endl;
29
+ return 11;
30
+ }
31
+
32
+ marisa::Trie trie;
33
+ if (mmap_flag) {
34
+ try {
35
+ trie.mmap(args[0]);
36
+ } catch (const marisa::Exception &ex) {
37
+ std::cerr << ex.what() << ": failed to mmap a dictionary file: "
38
+ << args[0] << std::endl;
39
+ return 20;
40
+ }
41
+ } else {
42
+ try {
43
+ trie.load(args[0]);
44
+ } catch (const marisa::Exception &ex) {
45
+ std::cerr << ex.what() << ": failed to load a dictionary file: "
46
+ << args[0] << std::endl;
47
+ return 21;
48
+ }
49
+ }
50
+
51
+ marisa::Agent agent;
52
+ std::size_t key_id;
53
+ while (std::cin >> key_id) {
54
+ try {
55
+ agent.set_query(key_id);
56
+ trie.reverse_lookup(agent);
57
+ std::cout << agent.key().id() << '\t';
58
+ std::cout.write(agent.key().ptr(), agent.key().length()) << '\n';
59
+ } catch (const marisa::Exception &ex) {
60
+ std::cerr << ex.what() << ": reverse_lookup() failed: "
61
+ << key_id << std::endl;
62
+ return 30;
63
+ }
64
+
65
+ if (!std::cout) {
66
+ std::cerr << "error: failed to write results to standard output"
67
+ << std::endl;
68
+ return 30;
69
+ }
70
+ }
71
+
72
+ return 0;
73
+ }
74
+
75
+ } // namespace
76
+
77
+ int main(int argc, char *argv[]) {
78
+ std::ios::sync_with_stdio(false);
79
+
80
+ ::cmdopt_option long_options[] = {
81
+ { "mmap-dictionary", 0, NULL, 'm' },
82
+ { "read-dictionary", 0, NULL, 'r' },
83
+ { "help", 0, NULL, 'h' },
84
+ { NULL, 0, NULL, 0 }
85
+ };
86
+ ::cmdopt_t cmdopt;
87
+ ::cmdopt_init(&cmdopt, argc, argv, "mrh", long_options);
88
+ int label;
89
+ while ((label = ::cmdopt_get(&cmdopt)) != -1) {
90
+ switch (label) {
91
+ case 'm': {
92
+ mmap_flag = true;
93
+ break;
94
+ }
95
+ case 'r': {
96
+ mmap_flag = false;
97
+ break;
98
+ }
99
+ case 'h': {
100
+ print_help(argv[0]);
101
+ return 0;
102
+ }
103
+ default: {
104
+ return 1;
105
+ }
106
+ }
107
+ }
108
+ return reverse_lookup(cmdopt.argv + cmdopt.optind,
109
+ cmdopt.argc - cmdopt.optind);
110
+ }
data/lib/melisa.rb ADDED
@@ -0,0 +1,7 @@
1
+ require "marisa"
2
+
3
+ require "melisa/version"
4
+ require "melisa/search"
5
+ require "melisa/trie"
6
+ require "melisa/bytes_trie"
7
+ require "melisa/int_trie"
@@ -0,0 +1,76 @@
1
module Melisa
  # Raised when a trie configuration option is not one of the accepted values.
  ConfigError = Class.new(StandardError)

  # Accepted :cache_size names and the Marisa constant each one selects.
  # :default selects the same constant as :normal.
  CacheSizes = {
    :huge    => Marisa::HUGE_CACHE,
    :large   => Marisa::LARGE_CACHE,
    :normal  => Marisa::NORMAL_CACHE,
    :small   => Marisa::SMALL_CACHE,
    :tiny    => Marisa::TINY_CACHE,
    :default => Marisa::NORMAL_CACHE
  }.freeze

  # Accepted :order names and the Marisa constant each one selects.
  NodeOrders = {
    :label   => Marisa::LABEL_ORDER,
    :weight  => Marisa::WEIGHT_ORDER,
    :default => Marisa::DEFAULT_ORDER
  }.freeze

  # Mixin that turns a hash of readable options into one combined flag value
  # built from Marisa constants.
  module BaseConfigFlags
    # Validates each option and combines the resulting values with `|`.
    #
    # @param opts [Hash] any of :binary (true/false), :num_tries (Integer or
    #   :default), :cache_size (a key of CacheSizes), :order (a key of
    #   NodeOrders); omitted keys fall back to false / :default
    # @return the four validated values combined with `|`
    # @raise [ArgumentError] if :binary is neither true nor false
    # @raise [ConfigError] if any other option is not an accepted value
    def config_flags(opts={})
      defaults = {
        :binary     => false,
        :num_tries  => :default,
        :cache_size => :default,
        :order      => :default
      }
      opts = defaults.merge(opts)

      binary_flag(opts[:binary]) |
        valid_num_tries(opts[:num_tries]) |
        lookup_cache_size(opts[:cache_size]) |
        valid_node_order(opts[:order])
    end

    # Maps a boolean to the tail-storage constant.
    #
    # @param bool [Boolean] true selects Marisa::BINARY_TAIL, false selects
    #   Marisa::TEXT_TAIL
    # @raise [ArgumentError] for any value other than true or false
    def binary_flag(bool)
      case bool
      when true  then Marisa::BINARY_TAIL
      when false then Marisa::TEXT_TAIL
      else
        raise ArgumentError, "binary_flag must be true or false (got #{bool.inspect})"
      end
    end

    # Checks that +num_tries+ is an Integer within
    # Marisa::MIN_NUM_TRIES..Marisa::MAX_NUM_TRIES and returns it.
    #
    # @param num_tries [Integer, Symbol] the number of tries, or :default to
    #   use Marisa::DEFAULT_NUM_TRIES
    # @return [Integer] the validated value
    # @raise [ConfigError] if the value is not an Integer in range
    def valid_num_tries(num_tries)
      num_tries = Marisa::DEFAULT_NUM_TRIES if num_tries == :default
      min = Marisa::MIN_NUM_TRIES
      max = Marisa::MAX_NUM_TRIES

      # Range#include? is also true for in-range Floats (e.g. 2.5), which
      # cannot be combined with `|` in config_flags; insist on an Integer so
      # such values are reported as a ConfigError here.
      return num_tries if num_tries.is_a?(Integer) && (min..max).include?(num_tries)

      msg = "num_tries (#{num_tries}) must be between #{min} and #{max}"
      raise ConfigError, msg
    end

    # Looks up the Marisa constant for a cache-size name.
    #
    # @param cache_size [Symbol] one of the keys of CacheSizes
    # @raise [ConfigError] if +cache_size+ is not a key of CacheSizes
    def lookup_cache_size(cache_size)
      return CacheSizes[cache_size] if CacheSizes.key?(cache_size)

      sizes = CacheSizes.keys
      msg = "cache_size (#{cache_size}) must be one of: #{sizes.inspect}"
      raise ConfigError, msg
    end

    # Looks up the Marisa constant for a node-order name.
    #
    # @param order [Symbol] one of the keys of NodeOrders
    # @raise [ConfigError] if +order+ is not a key of NodeOrders
    def valid_node_order(order)
      return NodeOrders[order] if NodeOrders.key?(order)

      valid_options = NodeOrders.keys
      msg = "node_order (#{order}) must be one of: #{valid_options.inspect}"
      raise ConfigError, msg
    end
  end
end