melisa 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (88) hide show
  1. data/README.md +11 -0
  2. data/ext/marisa/bindings/marisa-swig.cxx +253 -0
  3. data/ext/marisa/bindings/marisa-swig.h +183 -0
  4. data/ext/marisa/bindings/perl/marisa-swig.cxx +253 -0
  5. data/ext/marisa/bindings/perl/marisa-swig.h +183 -0
  6. data/ext/marisa/bindings/perl/marisa-swig_wrap.cxx +5160 -0
  7. data/ext/marisa/bindings/python/marisa-swig.cxx +253 -0
  8. data/ext/marisa/bindings/python/marisa-swig.h +183 -0
  9. data/ext/marisa/bindings/python/marisa-swig_wrap.cxx +6090 -0
  10. data/ext/marisa/bindings/ruby/extconf.rb +5 -0
  11. data/ext/marisa/bindings/ruby/marisa-swig.cxx +253 -0
  12. data/ext/marisa/bindings/ruby/marisa-swig.h +183 -0
  13. data/ext/marisa/bindings/ruby/marisa-swig_wrap.cxx +4708 -0
  14. data/ext/marisa/lib/marisa.h +14 -0
  15. data/ext/marisa/lib/marisa/agent.cc +51 -0
  16. data/ext/marisa/lib/marisa/agent.h +73 -0
  17. data/ext/marisa/lib/marisa/base.h +193 -0
  18. data/ext/marisa/lib/marisa/exception.h +82 -0
  19. data/ext/marisa/lib/marisa/grimoire/algorithm.h +26 -0
  20. data/ext/marisa/lib/marisa/grimoire/algorithm/sort.h +196 -0
  21. data/ext/marisa/lib/marisa/grimoire/intrin.h +115 -0
  22. data/ext/marisa/lib/marisa/grimoire/io.h +18 -0
  23. data/ext/marisa/lib/marisa/grimoire/io/mapper.cc +163 -0
  24. data/ext/marisa/lib/marisa/grimoire/io/mapper.h +67 -0
  25. data/ext/marisa/lib/marisa/grimoire/io/reader.cc +147 -0
  26. data/ext/marisa/lib/marisa/grimoire/io/reader.h +66 -0
  27. data/ext/marisa/lib/marisa/grimoire/io/writer.cc +148 -0
  28. data/ext/marisa/lib/marisa/grimoire/io/writer.h +65 -0
  29. data/ext/marisa/lib/marisa/grimoire/trie.h +16 -0
  30. data/ext/marisa/lib/marisa/grimoire/trie/cache.h +81 -0
  31. data/ext/marisa/lib/marisa/grimoire/trie/config.h +155 -0
  32. data/ext/marisa/lib/marisa/grimoire/trie/entry.h +82 -0
  33. data/ext/marisa/lib/marisa/grimoire/trie/header.h +61 -0
  34. data/ext/marisa/lib/marisa/grimoire/trie/history.h +65 -0
  35. data/ext/marisa/lib/marisa/grimoire/trie/key.h +228 -0
  36. data/ext/marisa/lib/marisa/grimoire/trie/louds-trie.cc +876 -0
  37. data/ext/marisa/lib/marisa/grimoire/trie/louds-trie.h +134 -0
  38. data/ext/marisa/lib/marisa/grimoire/trie/range.h +115 -0
  39. data/ext/marisa/lib/marisa/grimoire/trie/state.h +117 -0
  40. data/ext/marisa/lib/marisa/grimoire/trie/tail.cc +218 -0
  41. data/ext/marisa/lib/marisa/grimoire/trie/tail.h +72 -0
  42. data/ext/marisa/lib/marisa/grimoire/vector.h +18 -0
  43. data/ext/marisa/lib/marisa/grimoire/vector/bit-vector.cc +826 -0
  44. data/ext/marisa/lib/marisa/grimoire/vector/bit-vector.h +179 -0
  45. data/ext/marisa/lib/marisa/grimoire/vector/flat-vector.h +205 -0
  46. data/ext/marisa/lib/marisa/grimoire/vector/pop-count.h +110 -0
  47. data/ext/marisa/lib/marisa/grimoire/vector/rank-index.h +82 -0
  48. data/ext/marisa/lib/marisa/grimoire/vector/vector.h +256 -0
  49. data/ext/marisa/lib/marisa/iostream.h +18 -0
  50. data/ext/marisa/lib/marisa/key.h +85 -0
  51. data/ext/marisa/lib/marisa/keyset.cc +181 -0
  52. data/ext/marisa/lib/marisa/keyset.h +80 -0
  53. data/ext/marisa/lib/marisa/query.h +71 -0
  54. data/ext/marisa/lib/marisa/scoped-array.h +48 -0
  55. data/ext/marisa/lib/marisa/scoped-ptr.h +52 -0
  56. data/ext/marisa/lib/marisa/stdio.h +15 -0
  57. data/ext/marisa/lib/marisa/trie.cc +249 -0
  58. data/ext/marisa/lib/marisa/trie.h +64 -0
  59. data/ext/marisa/tests/base-test.cc +309 -0
  60. data/ext/marisa/tests/io-test.cc +252 -0
  61. data/ext/marisa/tests/marisa-assert.h +26 -0
  62. data/ext/marisa/tests/marisa-test.cc +388 -0
  63. data/ext/marisa/tests/trie-test.cc +507 -0
  64. data/ext/marisa/tests/vector-test.cc +466 -0
  65. data/ext/marisa/tools/cmdopt.cc +298 -0
  66. data/ext/marisa/tools/cmdopt.h +58 -0
  67. data/ext/marisa/tools/marisa-benchmark.cc +418 -0
  68. data/ext/marisa/tools/marisa-build.cc +206 -0
  69. data/ext/marisa/tools/marisa-common-prefix-search.cc +143 -0
  70. data/ext/marisa/tools/marisa-dump.cc +151 -0
  71. data/ext/marisa/tools/marisa-lookup.cc +110 -0
  72. data/ext/marisa/tools/marisa-predictive-search.cc +143 -0
  73. data/ext/marisa/tools/marisa-reverse-lookup.cc +110 -0
  74. data/lib/melisa.rb +7 -0
  75. data/lib/melisa/base_config_flags.rb +76 -0
  76. data/lib/melisa/bytes_trie.rb +55 -0
  77. data/lib/melisa/int_trie.rb +14 -0
  78. data/lib/melisa/search.rb +55 -0
  79. data/lib/melisa/trie.rb +96 -0
  80. data/lib/melisa/version.rb +3 -0
  81. data/melisa.gemspec +36 -0
  82. data/spec/base_config_flags_spec.rb +73 -0
  83. data/spec/bytes_trie_spec.rb +16 -0
  84. data/spec/int_trie_spec.rb +16 -0
  85. data/spec/search_spec.rb +29 -0
  86. data/spec/spec_helper.rb +1 -0
  87. data/spec/trie_spec.rb +30 -0
  88. metadata +207 -0
@@ -0,0 +1,14 @@
1
+ #ifndef MARISA_H_
2
+ #define MARISA_H_
3
+
4
+ // "marisa/stdio.h" includes <cstdio> for I/O using std::FILE.
5
+ #include "marisa/stdio.h"
6
+
7
+ // "marisa/iostream.h" includes <iosfwd> for I/O using std::iostream.
8
+ #include "marisa/iostream.h"
9
+
10
+ // You can use <marisa/trie.h> instead of <marisa.h> if you don't need the
11
+ // above I/O interfaces and don't want to include the above I/O headers.
12
+ #include "marisa/trie.h"
13
+
14
+ #endif // MARISA_H_
@@ -0,0 +1,51 @@
1
+ #include <new>
2
+
3
+ #include "marisa/agent.h"
4
+ #include "marisa/grimoire/trie.h"
5
+
6
+ namespace marisa {
7
+
8
+ Agent::Agent() : query_(), key_(), state_() {}
9
+
10
+ Agent::~Agent() {}
11
+
12
+ void Agent::set_query(const char *str) {
13
+ MARISA_THROW_IF(str == NULL, MARISA_NULL_ERROR);
14
+ if (state_.get() != NULL) {
15
+ state_->reset();
16
+ }
17
+ query_.set_str(str);
18
+ }
19
+
20
+ void Agent::set_query(const char *ptr, std::size_t length) {
21
+ MARISA_THROW_IF((ptr == NULL) && (length != 0), MARISA_NULL_ERROR);
22
+ if (state_.get() != NULL) {
23
+ state_->reset();
24
+ }
25
+ query_.set_str(ptr, length);
26
+ }
27
+
28
+ void Agent::set_query(std::size_t key_id) {
29
+ if (state_.get() != NULL) {
30
+ state_->reset();
31
+ }
32
+ query_.set_id(key_id);
33
+ }
34
+
35
+ void Agent::init_state() {
36
+ MARISA_THROW_IF(state_.get() != NULL, MARISA_STATE_ERROR);
37
+ state_.reset(new (std::nothrow) grimoire::State);
38
+ MARISA_THROW_IF(state_.get() == NULL, MARISA_MEMORY_ERROR);
39
+ }
40
+
41
+ void Agent::clear() {
42
+ Agent().swap(*this);
43
+ }
44
+
45
+ void Agent::swap(Agent &rhs) {
46
+ query_.swap(rhs.query_);
47
+ key_.swap(rhs.key_);
48
+ state_.swap(rhs.state_);
49
+ }
50
+
51
+ } // namespace marisa
@@ -0,0 +1,73 @@
1
+ #ifndef MARISA_AGENT_H_
2
+ #define MARISA_AGENT_H_
3
+
4
+ #include "marisa/key.h"
5
+ #include "marisa/query.h"
6
+
7
+ namespace marisa {
8
+ namespace grimoire {
9
+ namespace trie {
10
+
11
+ class State;
12
+
13
+ } // namespace trie
14
+ } // namespace grimoire
15
+
16
+ class Agent {
17
+ public:
18
+ Agent();
19
+ ~Agent();
20
+
21
+ const Query &query() const {
22
+ return query_;
23
+ }
24
+ const Key &key() const {
25
+ return key_;
26
+ }
27
+
28
+ void set_query(const char *str);
29
+ void set_query(const char *ptr, std::size_t length);
30
+ void set_query(std::size_t key_id);
31
+
32
+ const grimoire::trie::State &state() const {
33
+ return *state_;
34
+ }
35
+ grimoire::trie::State &state() {
36
+ return *state_;
37
+ }
38
+
39
+ void set_key(const char *str) {
40
+ MARISA_DEBUG_IF(str == NULL, MARISA_NULL_ERROR);
41
+ key_.set_str(str);
42
+ }
43
+ void set_key(const char *ptr, std::size_t length) {
44
+ MARISA_DEBUG_IF((ptr == NULL) && (length != 0), MARISA_NULL_ERROR);
45
+ MARISA_DEBUG_IF(length > MARISA_UINT32_MAX, MARISA_SIZE_ERROR);
46
+ key_.set_str(ptr, length);
47
+ }
48
+ void set_key(std::size_t id) {
49
+ MARISA_DEBUG_IF(id > MARISA_UINT32_MAX, MARISA_SIZE_ERROR);
50
+ key_.set_id(id);
51
+ }
52
+
53
+ bool has_state() const {
54
+ return state_.get() != NULL;
55
+ }
56
+ void init_state();
57
+
58
+ void clear();
59
+ void swap(Agent &rhs);
60
+
61
+ private:
62
+ Query query_;
63
+ Key key_;
64
+ scoped_ptr<grimoire::trie::State> state_;
65
+
66
+ // Disallows copy and assignment.
67
+ Agent(const Agent &);
68
+ Agent &operator=(const Agent &);
69
+ };
70
+
71
+ } // namespace marisa
72
+
73
+ #endif // MARISA_AGENT_H_
@@ -0,0 +1,193 @@
1
+ #ifndef MARISA_BASE_H_
2
+ #define MARISA_BASE_H_
3
+
4
+ // Old Visual C++ does not provide stdint.h.
5
+ #ifndef _MSC_VER
6
+ #include <stdint.h>
7
+ #endif // _MSC_VER
8
+
9
+ #ifdef __cplusplus
10
+ #include <cstddef>
11
+ #else // __cplusplus
12
+ #include <stddef.h>
13
+ #endif // __cplusplus
14
+
15
+ #ifdef __cplusplus
16
+ extern "C" {
17
+ #endif // __cplusplus
18
+
19
+ #ifdef _MSC_VER
20
+ typedef unsigned __int8 marisa_uint8;
21
+ typedef unsigned __int16 marisa_uint16;
22
+ typedef unsigned __int32 marisa_uint32;
23
+ typedef unsigned __int64 marisa_uint64;
24
+ #else // _MSC_VER
25
+ typedef uint8_t marisa_uint8;
26
+ typedef uint16_t marisa_uint16;
27
+ typedef uint32_t marisa_uint32;
28
+ typedef uint64_t marisa_uint64;
29
+ #endif // _MSC_VER
30
+
31
+ #if defined(_WIN64) || defined(__amd64__) || defined(__x86_64__) || \
32
+ defined(__ia64__) || defined(__ppc64__) || defined(__powerpc64__) || \
33
+ defined(__sparc64__) || defined(__mips64__) || defined(__aarch64__)
34
+ #define MARISA_WORD_SIZE 64
35
+ #else // defined(_WIN64), etc.
36
+ #define MARISA_WORD_SIZE 32
37
+ #endif // defined(_WIN64), etc.
38
+
39
+ //#define MARISA_WORD_SIZE (sizeof(void *) * 8)
40
+
41
+ #define MARISA_UINT8_MAX ((marisa_uint8)~(marisa_uint8)0)
42
+ #define MARISA_UINT16_MAX ((marisa_uint16)~(marisa_uint16)0)
43
+ #define MARISA_UINT32_MAX ((marisa_uint32)~(marisa_uint32)0)
44
+ #define MARISA_UINT64_MAX ((marisa_uint64)~(marisa_uint64)0)
45
+ #define MARISA_SIZE_MAX ((size_t)~(size_t)0)
46
+
47
+ #define MARISA_INVALID_LINK_ID MARISA_UINT32_MAX
48
+ #define MARISA_INVALID_KEY_ID MARISA_UINT32_MAX
49
+ #define MARISA_INVALID_EXTRA (MARISA_UINT32_MAX >> 8)
50
+
51
+ // Error codes are defined as members of marisa_error_code. This library throws
52
+ // an exception with one of the error codes when an error occurs.
53
typedef enum marisa_error_code_ {
  // MARISA_OK means that a requested operation has succeeded. In practice, an
  // exception never has MARISA_OK because it is not an error.
  MARISA_OK = 0,

  // MARISA_STATE_ERROR means that an object was not ready for a requested
  // operation. For example, an operation to modify a fixed vector throws an
  // exception with MARISA_STATE_ERROR.
  MARISA_STATE_ERROR = 1,

  // MARISA_NULL_ERROR means that an invalid NULL pointer has been given.
  MARISA_NULL_ERROR = 2,

  // MARISA_BOUND_ERROR means that an operation has tried to access an out of
  // range address.
  MARISA_BOUND_ERROR = 3,

  // MARISA_RANGE_ERROR means that an out of range value has appeared in an
  // operation.
  MARISA_RANGE_ERROR = 4,

  // MARISA_CODE_ERROR means that an undefined code has appeared in an
  // operation.
  MARISA_CODE_ERROR = 5,

  // MARISA_RESET_ERROR means that a smart pointer has tried to reset itself.
  MARISA_RESET_ERROR = 6,

  // MARISA_SIZE_ERROR means that a size has exceeded a library limitation.
  MARISA_SIZE_ERROR = 7,

  // MARISA_MEMORY_ERROR means that a memory allocation has failed.
  MARISA_MEMORY_ERROR = 8,

  // MARISA_IO_ERROR means that an I/O operation has failed.
  MARISA_IO_ERROR = 9,

  // MARISA_FORMAT_ERROR means that input was in invalid format.
  // No comma follows the last enumerator: a trailing comma is rejected by
  // pre-C++11 and C89 compilers in pedantic mode.
  MARISA_FORMAT_ERROR = 10
} marisa_error_code;
92
+
93
+ // Min/max values, flags and masks for dictionary settings are defined below.
94
+ // Please note that unspecified settings will be replaced with the default
95
+ // settings. For example, 0 is equivalent to (MARISA_DEFAULT_NUM_TRIES |
96
+ // MARISA_DEFAULT_CACHE | MARISA_DEFAULT_TAIL | MARISA_DEFAULT_ORDER).
97
+
98
+ // A dictionary consists of 3 tries by default. Usually, more tries make a
99
+ // dictionary space-efficient but time-inefficient.
100
// Bounds and default for the number of tries in a dictionary. The values
// occupy the bits covered by MARISA_NUM_TRIES_MASK (0x0007F).
typedef enum marisa_num_tries_ {
  MARISA_MIN_NUM_TRIES     = 0x00001,
  MARISA_MAX_NUM_TRIES     = 0x0007F,
  // No comma follows the last enumerator: a trailing comma is rejected by
  // pre-C++11 and C89 compilers in pedantic mode.
  MARISA_DEFAULT_NUM_TRIES = 0x00003
} marisa_num_tries;
105
+
106
+ // This library uses a cache technique to accelerate search functions. The
107
+ // following enumerated type marisa_cache_level gives a list of available cache
108
+ // size options. A larger cache enables faster search but takes more space.
109
// Cache size options. Each level is a distinct single bit, bits 7 to 11.
typedef enum marisa_cache_level_ {
  MARISA_HUGE_CACHE    = 1 << 7,   // 0x00080
  MARISA_LARGE_CACHE   = 1 << 8,   // 0x00100
  MARISA_NORMAL_CACHE  = 1 << 9,   // 0x00200
  MARISA_SMALL_CACHE   = 1 << 10,  // 0x00400
  MARISA_TINY_CACHE    = 1 << 11,  // 0x00800
  MARISA_DEFAULT_CACHE = MARISA_NORMAL_CACHE
} marisa_cache_level;
117
+
118
+ // This library provides 2 kinds of TAIL implementations.
119
typedef enum marisa_tail_mode_ {
  // MARISA_TEXT_TAIL merges last labels as zero-terminated strings. So, it is
  // available if and only if the last labels do not contain a NUL character.
  // If MARISA_TEXT_TAIL is specified and a NUL character exists in the last
  // labels, the setting is automatically switched to MARISA_BINARY_TAIL.
  MARISA_TEXT_TAIL    = 0x01000,

  // MARISA_BINARY_TAIL also merges last labels but as byte sequences. It uses
  // a bit vector to detect the end of a sequence, instead of NUL characters.
  // So, MARISA_BINARY_TAIL requires a larger space if the average length of
  // labels is greater than 8.
  MARISA_BINARY_TAIL  = 0x02000,

  // No comma follows the last enumerator: a trailing comma is rejected by
  // pre-C++11 and C89 compilers in pedantic mode.
  MARISA_DEFAULT_TAIL = MARISA_TEXT_TAIL
} marisa_tail_mode;
134
+
135
+ // The arrangement of nodes affects the time cost of matching and the order of
136
+ // predictive search.
137
typedef enum marisa_node_order_ {
  // MARISA_LABEL_ORDER arranges nodes in ascending label order.
  // MARISA_LABEL_ORDER is useful if an application needs to predict keys in
  // label order.
  MARISA_LABEL_ORDER   = 0x10000,

  // MARISA_WEIGHT_ORDER arranges nodes in descending weight order.
  // MARISA_WEIGHT_ORDER is generally a better choice because it enables faster
  // matching.
  MARISA_WEIGHT_ORDER  = 0x20000,

  // No comma follows the last enumerator: a trailing comma is rejected by
  // pre-C++11 and C89 compilers in pedantic mode.
  MARISA_DEFAULT_ORDER = MARISA_WEIGHT_ORDER
} marisa_node_order;
150
+
151
// Bit masks that select one group of settings out of a combined
// configuration value. MARISA_CONFIG_MASK is the union of the four group
// masks (0xFFFFF).
typedef enum marisa_config_mask_ {
  MARISA_NUM_TRIES_MASK   = 0x7F,
  MARISA_CACHE_LEVEL_MASK = 0xF80,
  MARISA_TAIL_MODE_MASK   = 0xF000,
  MARISA_NODE_ORDER_MASK  = 0xF0000,
  MARISA_CONFIG_MASK      = MARISA_NUM_TRIES_MASK | MARISA_CACHE_LEVEL_MASK |
                            MARISA_TAIL_MODE_MASK | MARISA_NODE_ORDER_MASK
} marisa_config_mask;
158
+
159
+ #ifdef __cplusplus
160
+ } // extern "C"
161
+ #endif // __cplusplus
162
+
163
+ #ifdef __cplusplus
164
+ namespace marisa {
165
+
166
+ typedef ::marisa_uint8 UInt8;
167
+ typedef ::marisa_uint16 UInt16;
168
+ typedef ::marisa_uint32 UInt32;
169
+ typedef ::marisa_uint64 UInt64;
170
+
171
+ typedef ::marisa_error_code ErrorCode;
172
+
173
+ typedef ::marisa_cache_level CacheLevel;
174
+ typedef ::marisa_tail_mode TailMode;
175
+ typedef ::marisa_node_order NodeOrder;
176
+
177
// Exchanges the values of `lhs` and `rhs` by way of one temporary copy, so T
// has to be copy-constructible and copy-assignable.
template <typename T>
inline void swap(T &lhs, T &rhs) {
  T saved = lhs;
  lhs = rhs;
  rhs = saved;
}
183
+
184
+ } // namespace marisa
185
+ #endif // __cplusplus
186
+
187
+ #ifdef __cplusplus
188
+ #include "marisa/exception.h"
189
+ #include "marisa/scoped-ptr.h"
190
+ #include "marisa/scoped-array.h"
191
+ #endif // __cplusplus
192
+
193
+ #endif // MARISA_BASE_H_
@@ -0,0 +1,82 @@
1
+ #ifndef MARISA_EXCEPTION_H_
2
+ #define MARISA_EXCEPTION_H_
3
+
4
+ #include <exception>
5
+
6
+ #include "marisa/base.h"
7
+
8
+ namespace marisa {
9
+
10
+ // An exception object keeps a filename, a line number, an error code and an
11
+ // error message. The message format is as follows:
12
+ // "__FILE__:__LINE__: error_code: error_message"
13
+ class Exception : public std::exception {
14
+ public:
15
+ Exception(const char *filename, int line,
16
+ ErrorCode error_code, const char *error_message)
17
+ : std::exception(), filename_(filename), line_(line),
18
+ error_code_(error_code), error_message_(error_message) {}
19
+ Exception(const Exception &ex)
20
+ : std::exception(), filename_(ex.filename_), line_(ex.line_),
21
+ error_code_(ex.error_code_), error_message_(ex.error_message_) {}
22
+ virtual ~Exception() throw() {}
23
+
24
+ Exception &operator=(const Exception &rhs) {
25
+ filename_ = rhs.filename_;
26
+ line_ = rhs.line_;
27
+ error_code_ = rhs.error_code_;
28
+ error_message_ = rhs.error_message_;
29
+ return *this;
30
+ }
31
+
32
+ const char *filename() const {
33
+ return filename_;
34
+ }
35
+ int line() const {
36
+ return line_;
37
+ }
38
+ ErrorCode error_code() const {
39
+ return error_code_;
40
+ }
41
+ const char *error_message() const {
42
+ return error_message_;
43
+ }
44
+
45
+ virtual const char *what() const throw() {
46
+ return error_message_;
47
+ }
48
+
49
+ private:
50
+ const char *filename_;
51
+ int line_;
52
+ ErrorCode error_code_;
53
+ const char *error_message_;
54
+ };
55
+
56
+ // These macros are used to convert a line number to a string constant.
57
+ #define MARISA_INT_TO_STR(value) #value
58
+ #define MARISA_LINE_TO_STR(line) MARISA_INT_TO_STR(line)
59
+ #define MARISA_LINE_STR MARISA_LINE_TO_STR(__LINE__)
60
+
61
+ // MARISA_THROW throws an exception with a filename, a line number, an error
62
+ // code and an error message. The message format is as follows:
63
+ // "__FILE__:__LINE__: error_code: error_message"
64
+ #define MARISA_THROW(error_code, error_message) \
65
+ (throw marisa::Exception(__FILE__, __LINE__, error_code, \
66
+ __FILE__ ":" MARISA_LINE_STR ": " #error_code ": " error_message))
67
+
68
+ // MARISA_THROW_IF throws an exception if `condition' is true.
69
+ #define MARISA_THROW_IF(condition, error_code) \
70
+ (void)((!(condition)) || (MARISA_THROW(error_code, #condition), 0))
71
+
72
+ // MARISA_DEBUG_IF is ignored if _DEBUG is undefined. So, it is useful for
73
+ // debugging time-critical codes.
74
+ #ifdef _DEBUG
75
+ #define MARISA_DEBUG_IF(cond, error_code) MARISA_THROW_IF(cond, error_code)
76
+ #else
77
+ #define MARISA_DEBUG_IF(cond, error_code)
78
+ #endif
79
+
80
+ } // namespace marisa
81
+
82
+ #endif // MARISA_EXCEPTION_H_
@@ -0,0 +1,26 @@
1
+ #ifndef MARISA_GRIMOIRE_ALGORITHM_H_
2
+ #define MARISA_GRIMOIRE_ALGORITHM_H_
3
+
4
+ #include "marisa/grimoire/algorithm/sort.h"
5
+
6
+ namespace marisa {
7
+ namespace grimoire {
8
+
9
+ class Algorithm {
10
+ public:
11
+ Algorithm() {}
12
+
13
+ template <typename Iterator>
14
+ std::size_t sort(Iterator begin, Iterator end) const {
15
+ return algorithm::sort(begin, end);
16
+ }
17
+
18
+ private:
19
+ Algorithm(const Algorithm &);
20
+ Algorithm &operator=(const Algorithm &);
21
+ };
22
+
23
+ } // namespace grimoire
24
+ } // namespace marisa
25
+
26
+ #endif // MARISA_GRIMOIRE_ALGORITHM_H_
@@ -0,0 +1,196 @@
1
+ #ifndef MARISA_GRIMOIRE_ALGORITHM_SORT_H_
2
+ #define MARISA_GRIMOIRE_ALGORITHM_SORT_H_
3
+
4
+ #include "marisa/base.h"
5
+
6
+ namespace marisa {
7
+ namespace grimoire {
8
+ namespace algorithm {
9
+ namespace details {
10
+
11
+ enum {
12
+ MARISA_INSERTION_SORT_THRESHOLD = 10
13
+ };
14
+
15
+ template <typename T>
16
+ int get_label(const T &unit, std::size_t depth) {
17
+ MARISA_DEBUG_IF(depth > unit.length(), MARISA_BOUND_ERROR);
18
+
19
+ return (depth < unit.length()) ? (int)(UInt8)unit[depth] : -1;
20
+ }
21
+
22
// Returns the median of the labels that `a`, `b` and `c` have at `depth`.
template <typename T>
int median(const T &a, const T &b, const T &c, std::size_t depth) {
  const int x = get_label(a, depth);
  const int y = get_label(b, depth);
  const int z = get_label(c, depth);

  if (x < y) {
    // x < y: the median is y if y < z, otherwise the larger of x and z.
    if (y < z) {
      return y;
    }
    return (x < z) ? z : x;
  }

  // y <= x: the median is x if x < z, otherwise the larger of y and z.
  if (x < z) {
    return x;
  }
  return (y < z) ? z : y;
}
41
+
42
+ template <typename T>
43
+ int compare(const T &lhs, const T &rhs, std::size_t depth) {
44
+ for (std::size_t i = depth; i < lhs.length(); ++i) {
45
+ if (i == rhs.length()) {
46
+ return 1;
47
+ }
48
+ if (lhs[i] != rhs[i]) {
49
+ return (UInt8)lhs[i] - (UInt8)rhs[i];
50
+ }
51
+ }
52
+ if (lhs.length() == rhs.length()) {
53
+ return 0;
54
+ }
55
+ return (lhs.length() < rhs.length()) ? -1 : 1;
56
+ }
57
+
58
+ template <typename Iterator>
59
+ std::size_t insertion_sort(Iterator l, Iterator r, std::size_t depth) {
60
+ MARISA_DEBUG_IF(l > r, MARISA_BOUND_ERROR);
61
+
62
+ std::size_t count = 1;
63
+ for (Iterator i = l + 1; i < r; ++i) {
64
+ int result = 0;
65
+ for (Iterator j = i; j > l; --j) {
66
+ result = compare(*(j - 1), *j, depth);
67
+ if (result <= 0) {
68
+ break;
69
+ }
70
+ marisa::swap(*(j - 1), *j);
71
+ }
72
+ if (result != 0) {
73
+ ++count;
74
+ }
75
+ }
76
+ return count;
77
+ }
78
+
79
+ template <typename Iterator>
80
+ std::size_t sort(Iterator l, Iterator r, std::size_t depth) {
81
+ MARISA_DEBUG_IF(l > r, MARISA_BOUND_ERROR);
82
+
83
+ std::size_t count = 0;
84
+ while ((r - l) > MARISA_INSERTION_SORT_THRESHOLD) {
85
+ Iterator pl = l;
86
+ Iterator pr = r;
87
+ Iterator pivot_l = l;
88
+ Iterator pivot_r = r;
89
+
90
+ const int pivot = median(*l, *(l + (r - l) / 2), *(r - 1), depth);
91
+ for ( ; ; ) {
92
+ while (pl < pr) {
93
+ const int label = get_label(*pl, depth);
94
+ if (label > pivot) {
95
+ break;
96
+ } else if (label == pivot) {
97
+ marisa::swap(*pl, *pivot_l);
98
+ ++pivot_l;
99
+ }
100
+ ++pl;
101
+ }
102
+ while (pl < pr) {
103
+ const int label = get_label(*--pr, depth);
104
+ if (label < pivot) {
105
+ break;
106
+ } else if (label == pivot) {
107
+ marisa::swap(*pr, *--pivot_r);
108
+ }
109
+ }
110
+ if (pl >= pr) {
111
+ break;
112
+ }
113
+ marisa::swap(*pl, *pr);
114
+ ++pl;
115
+ }
116
+ while (pivot_l > l) {
117
+ marisa::swap(*--pivot_l, *--pl);
118
+ }
119
+ while (pivot_r < r) {
120
+ marisa::swap(*pivot_r, *pr);
121
+ ++pivot_r;
122
+ ++pr;
123
+ }
124
+
125
+ if (((pl - l) > (pr - pl)) || ((r - pr) > (pr - pl))) {
126
+ if ((pr - pl) == 1) {
127
+ ++count;
128
+ } else if ((pr - pl) > 1) {
129
+ if (pivot == -1) {
130
+ ++count;
131
+ } else {
132
+ count += sort(pl, pr, depth + 1);
133
+ }
134
+ }
135
+
136
+ if ((pl - l) < (r - pr)) {
137
+ if ((pl - l) == 1) {
138
+ ++count;
139
+ } else if ((pl - l) > 1) {
140
+ count += sort(l, pl, depth);
141
+ }
142
+ l = pr;
143
+ } else {
144
+ if ((r - pr) == 1) {
145
+ ++count;
146
+ } else if ((r - pr) > 1) {
147
+ count += sort(pr, r, depth);
148
+ }
149
+ r = pl;
150
+ }
151
+ } else {
152
+ if ((pl - l) == 1) {
153
+ ++count;
154
+ } else if ((pl - l) > 1) {
155
+ count += sort(l, pl, depth);
156
+ }
157
+
158
+ if ((r - pr) == 1) {
159
+ ++count;
160
+ } else if ((r - pr) > 1) {
161
+ count += sort(pr, r, depth);
162
+ }
163
+
164
+ l = pl, r = pr;
165
+ if ((pr - pl) == 1) {
166
+ ++count;
167
+ } else if ((pr - pl) > 1) {
168
+ if (pivot == -1) {
169
+ l = r;
170
+ ++count;
171
+ } else {
172
+ ++depth;
173
+ }
174
+ }
175
+ }
176
+ }
177
+
178
+ if ((r - l) > 1) {
179
+ count += insertion_sort(l, r, depth);
180
+ }
181
+ return count;
182
+ }
183
+
184
+ } // namespace details
185
+
186
+ template <typename Iterator>
187
+ std::size_t sort(Iterator begin, Iterator end) {
188
+ MARISA_DEBUG_IF(begin > end, MARISA_BOUND_ERROR);
189
+ return details::sort(begin, end, 0);
190
+ };
191
+
192
+ } // namespace algorithm
193
+ } // namespace grimoire
194
+ } // namespace marisa
195
+
196
+ #endif // MARISA_GRIMOIRE_ALGORITHM_SORT_H_