melisa 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. data/README.md +11 -0
  2. data/ext/marisa/bindings/marisa-swig.cxx +253 -0
  3. data/ext/marisa/bindings/marisa-swig.h +183 -0
  4. data/ext/marisa/bindings/perl/marisa-swig.cxx +253 -0
  5. data/ext/marisa/bindings/perl/marisa-swig.h +183 -0
  6. data/ext/marisa/bindings/perl/marisa-swig_wrap.cxx +5160 -0
  7. data/ext/marisa/bindings/python/marisa-swig.cxx +253 -0
  8. data/ext/marisa/bindings/python/marisa-swig.h +183 -0
  9. data/ext/marisa/bindings/python/marisa-swig_wrap.cxx +6090 -0
  10. data/ext/marisa/bindings/ruby/extconf.rb +5 -0
  11. data/ext/marisa/bindings/ruby/marisa-swig.cxx +253 -0
  12. data/ext/marisa/bindings/ruby/marisa-swig.h +183 -0
  13. data/ext/marisa/bindings/ruby/marisa-swig_wrap.cxx +4708 -0
  14. data/ext/marisa/lib/marisa.h +14 -0
  15. data/ext/marisa/lib/marisa/agent.cc +51 -0
  16. data/ext/marisa/lib/marisa/agent.h +73 -0
  17. data/ext/marisa/lib/marisa/base.h +193 -0
  18. data/ext/marisa/lib/marisa/exception.h +82 -0
  19. data/ext/marisa/lib/marisa/grimoire/algorithm.h +26 -0
  20. data/ext/marisa/lib/marisa/grimoire/algorithm/sort.h +196 -0
  21. data/ext/marisa/lib/marisa/grimoire/intrin.h +115 -0
  22. data/ext/marisa/lib/marisa/grimoire/io.h +18 -0
  23. data/ext/marisa/lib/marisa/grimoire/io/mapper.cc +163 -0
  24. data/ext/marisa/lib/marisa/grimoire/io/mapper.h +67 -0
  25. data/ext/marisa/lib/marisa/grimoire/io/reader.cc +147 -0
  26. data/ext/marisa/lib/marisa/grimoire/io/reader.h +66 -0
  27. data/ext/marisa/lib/marisa/grimoire/io/writer.cc +148 -0
  28. data/ext/marisa/lib/marisa/grimoire/io/writer.h +65 -0
  29. data/ext/marisa/lib/marisa/grimoire/trie.h +16 -0
  30. data/ext/marisa/lib/marisa/grimoire/trie/cache.h +81 -0
  31. data/ext/marisa/lib/marisa/grimoire/trie/config.h +155 -0
  32. data/ext/marisa/lib/marisa/grimoire/trie/entry.h +82 -0
  33. data/ext/marisa/lib/marisa/grimoire/trie/header.h +61 -0
  34. data/ext/marisa/lib/marisa/grimoire/trie/history.h +65 -0
  35. data/ext/marisa/lib/marisa/grimoire/trie/key.h +228 -0
  36. data/ext/marisa/lib/marisa/grimoire/trie/louds-trie.cc +876 -0
  37. data/ext/marisa/lib/marisa/grimoire/trie/louds-trie.h +134 -0
  38. data/ext/marisa/lib/marisa/grimoire/trie/range.h +115 -0
  39. data/ext/marisa/lib/marisa/grimoire/trie/state.h +117 -0
  40. data/ext/marisa/lib/marisa/grimoire/trie/tail.cc +218 -0
  41. data/ext/marisa/lib/marisa/grimoire/trie/tail.h +72 -0
  42. data/ext/marisa/lib/marisa/grimoire/vector.h +18 -0
  43. data/ext/marisa/lib/marisa/grimoire/vector/bit-vector.cc +826 -0
  44. data/ext/marisa/lib/marisa/grimoire/vector/bit-vector.h +179 -0
  45. data/ext/marisa/lib/marisa/grimoire/vector/flat-vector.h +205 -0
  46. data/ext/marisa/lib/marisa/grimoire/vector/pop-count.h +110 -0
  47. data/ext/marisa/lib/marisa/grimoire/vector/rank-index.h +82 -0
  48. data/ext/marisa/lib/marisa/grimoire/vector/vector.h +256 -0
  49. data/ext/marisa/lib/marisa/iostream.h +18 -0
  50. data/ext/marisa/lib/marisa/key.h +85 -0
  51. data/ext/marisa/lib/marisa/keyset.cc +181 -0
  52. data/ext/marisa/lib/marisa/keyset.h +80 -0
  53. data/ext/marisa/lib/marisa/query.h +71 -0
  54. data/ext/marisa/lib/marisa/scoped-array.h +48 -0
  55. data/ext/marisa/lib/marisa/scoped-ptr.h +52 -0
  56. data/ext/marisa/lib/marisa/stdio.h +15 -0
  57. data/ext/marisa/lib/marisa/trie.cc +249 -0
  58. data/ext/marisa/lib/marisa/trie.h +64 -0
  59. data/ext/marisa/tests/base-test.cc +309 -0
  60. data/ext/marisa/tests/io-test.cc +252 -0
  61. data/ext/marisa/tests/marisa-assert.h +26 -0
  62. data/ext/marisa/tests/marisa-test.cc +388 -0
  63. data/ext/marisa/tests/trie-test.cc +507 -0
  64. data/ext/marisa/tests/vector-test.cc +466 -0
  65. data/ext/marisa/tools/cmdopt.cc +298 -0
  66. data/ext/marisa/tools/cmdopt.h +58 -0
  67. data/ext/marisa/tools/marisa-benchmark.cc +418 -0
  68. data/ext/marisa/tools/marisa-build.cc +206 -0
  69. data/ext/marisa/tools/marisa-common-prefix-search.cc +143 -0
  70. data/ext/marisa/tools/marisa-dump.cc +151 -0
  71. data/ext/marisa/tools/marisa-lookup.cc +110 -0
  72. data/ext/marisa/tools/marisa-predictive-search.cc +143 -0
  73. data/ext/marisa/tools/marisa-reverse-lookup.cc +110 -0
  74. data/lib/melisa.rb +7 -0
  75. data/lib/melisa/base_config_flags.rb +76 -0
  76. data/lib/melisa/bytes_trie.rb +55 -0
  77. data/lib/melisa/int_trie.rb +14 -0
  78. data/lib/melisa/search.rb +55 -0
  79. data/lib/melisa/trie.rb +96 -0
  80. data/lib/melisa/version.rb +3 -0
  81. data/melisa.gemspec +36 -0
  82. data/spec/base_config_flags_spec.rb +73 -0
  83. data/spec/bytes_trie_spec.rb +16 -0
  84. data/spec/int_trie_spec.rb +16 -0
  85. data/spec/search_spec.rb +29 -0
  86. data/spec/spec_helper.rb +1 -0
  87. data/spec/trie_spec.rb +30 -0
  88. metadata +207 -0
@@ -0,0 +1,14 @@
1
+ #ifndef MARISA_H_
2
+ #define MARISA_H_
3
+
4
+ // "marisa/stdio.h" includes <cstdio> for I/O using std::FILE.
5
+ #include "marisa/stdio.h"
6
+
7
+ // "marisa/iostream.h" includes <iosfwd> for I/O using std::iostream.
8
+ #include "marisa/iostream.h"
9
+
10
+ // You can use <marisa/trie.h> instead of <marisa.h> if you don't need the
11
+ // above I/O interfaces and don't want to include the above I/O headers.
12
+ #include "marisa/trie.h"
13
+
14
+ #endif // MARISA_H_
@@ -0,0 +1,51 @@
1
+ #include <new>
2
+
3
+ #include "marisa/agent.h"
4
+ #include "marisa/grimoire/trie.h"
5
+
6
+ namespace marisa {
7
+
8
+ Agent::Agent() : query_(), key_(), state_() {}
9
+
10
+ Agent::~Agent() {}
11
+
12
+ void Agent::set_query(const char *str) {
13
+ MARISA_THROW_IF(str == NULL, MARISA_NULL_ERROR);
14
+ if (state_.get() != NULL) {
15
+ state_->reset();
16
+ }
17
+ query_.set_str(str);
18
+ }
19
+
20
+ void Agent::set_query(const char *ptr, std::size_t length) {
21
+ MARISA_THROW_IF((ptr == NULL) && (length != 0), MARISA_NULL_ERROR);
22
+ if (state_.get() != NULL) {
23
+ state_->reset();
24
+ }
25
+ query_.set_str(ptr, length);
26
+ }
27
+
28
+ void Agent::set_query(std::size_t key_id) {
29
+ if (state_.get() != NULL) {
30
+ state_->reset();
31
+ }
32
+ query_.set_id(key_id);
33
+ }
34
+
35
+ void Agent::init_state() {
36
+ MARISA_THROW_IF(state_.get() != NULL, MARISA_STATE_ERROR);
37
+ state_.reset(new (std::nothrow) grimoire::State);
38
+ MARISA_THROW_IF(state_.get() == NULL, MARISA_MEMORY_ERROR);
39
+ }
40
+
41
+ void Agent::clear() {
42
+ Agent().swap(*this);
43
+ }
44
+
45
+ void Agent::swap(Agent &rhs) {
46
+ query_.swap(rhs.query_);
47
+ key_.swap(rhs.key_);
48
+ state_.swap(rhs.state_);
49
+ }
50
+
51
+ } // namespace marisa
@@ -0,0 +1,73 @@
1
+ #ifndef MARISA_AGENT_H_
2
+ #define MARISA_AGENT_H_
3
+
4
+ #include "marisa/key.h"
5
+ #include "marisa/query.h"
6
+
7
+ namespace marisa {
8
+ namespace grimoire {
9
+ namespace trie {
10
+
11
+ class State;
12
+
13
+ } // namespace trie
14
+ } // namespace grimoire
15
+
16
+ class Agent {
17
+ public:
18
+ Agent();
19
+ ~Agent();
20
+
21
+ const Query &query() const {
22
+ return query_;
23
+ }
24
+ const Key &key() const {
25
+ return key_;
26
+ }
27
+
28
+ void set_query(const char *str);
29
+ void set_query(const char *ptr, std::size_t length);
30
+ void set_query(std::size_t key_id);
31
+
32
+ const grimoire::trie::State &state() const {
33
+ return *state_;
34
+ }
35
+ grimoire::trie::State &state() {
36
+ return *state_;
37
+ }
38
+
39
+ void set_key(const char *str) {
40
+ MARISA_DEBUG_IF(str == NULL, MARISA_NULL_ERROR);
41
+ key_.set_str(str);
42
+ }
43
+ void set_key(const char *ptr, std::size_t length) {
44
+ MARISA_DEBUG_IF((ptr == NULL) && (length != 0), MARISA_NULL_ERROR);
45
+ MARISA_DEBUG_IF(length > MARISA_UINT32_MAX, MARISA_SIZE_ERROR);
46
+ key_.set_str(ptr, length);
47
+ }
48
+ void set_key(std::size_t id) {
49
+ MARISA_DEBUG_IF(id > MARISA_UINT32_MAX, MARISA_SIZE_ERROR);
50
+ key_.set_id(id);
51
+ }
52
+
53
+ bool has_state() const {
54
+ return state_.get() != NULL;
55
+ }
56
+ void init_state();
57
+
58
+ void clear();
59
+ void swap(Agent &rhs);
60
+
61
+ private:
62
+ Query query_;
63
+ Key key_;
64
+ scoped_ptr<grimoire::trie::State> state_;
65
+
66
+ // Disallows copy and assignment.
67
+ Agent(const Agent &);
68
+ Agent &operator=(const Agent &);
69
+ };
70
+
71
+ } // namespace marisa
72
+
73
+ #endif // MARISA_AGENT_H_
@@ -0,0 +1,193 @@
1
+ #ifndef MARISA_BASE_H_
2
+ #define MARISA_BASE_H_
3
+
4
+ // Old Visual C++ does not provide stdint.h.
5
+ #ifndef _MSC_VER
6
+ #include <stdint.h>
7
+ #endif // _MSC_VER
8
+
9
+ #ifdef __cplusplus
10
+ #include <cstddef>
11
+ #else // __cplusplus
12
+ #include <stddef.h>
13
+ #endif // __cplusplus
14
+
15
+ #ifdef __cplusplus
16
+ extern "C" {
17
+ #endif // __cplusplus
18
+
19
// Fixed-width unsigned integer types used by the library. Compilers other
// than Visual C++ take them from <stdint.h>; Visual C++ uses its built-in
// sized integer keywords because <stdint.h> is not included for it.
#ifndef _MSC_VER
typedef uint8_t  marisa_uint8;
typedef uint16_t marisa_uint16;
typedef uint32_t marisa_uint32;
typedef uint64_t marisa_uint64;
#else  // _MSC_VER
typedef unsigned __int8  marisa_uint8;
typedef unsigned __int16 marisa_uint16;
typedef unsigned __int32 marisa_uint32;
typedef unsigned __int64 marisa_uint64;
#endif  // _MSC_VER
30
+
31
// MARISA_WORD_SIZE is 64 when at least one of the 64-bit target macros below
// is defined by the compiler, and 32 otherwise.
#if defined(_WIN64) || \
    defined(__amd64__) || defined(__x86_64__) || \
    defined(__ia64__) || \
    defined(__ppc64__) || defined(__powerpc64__) || \
    defined(__sparc64__) || \
    defined(__mips64__) || \
    defined(__aarch64__)
 #define MARISA_WORD_SIZE 64
#else  // no known 64-bit target macro
 #define MARISA_WORD_SIZE 32
#endif  // defined(_WIN64), etc.

// Disabled alternative that derives the word size from the pointer size:
//#define MARISA_WORD_SIZE (sizeof(void *) * 8)

// Largest value of each unsigned type, obtained by inverting every bit of 0
// and converting the result back to the type.
#define MARISA_UINT8_MAX  ((marisa_uint8)~(marisa_uint8)0)
#define MARISA_UINT16_MAX ((marisa_uint16)~(marisa_uint16)0)
#define MARISA_UINT32_MAX ((marisa_uint32)~(marisa_uint32)0)
#define MARISA_UINT64_MAX ((marisa_uint64)~(marisa_uint64)0)
#define MARISA_SIZE_MAX   ((size_t)~(size_t)0)

// Reserved "invalid" markers. Link IDs and key IDs use the full 32-bit
// maximum; the extra marker is that maximum shifted right by 8 bits, i.e. the
// largest 24-bit value.
#define MARISA_INVALID_LINK_ID MARISA_UINT32_MAX
#define MARISA_INVALID_KEY_ID  MARISA_UINT32_MAX
#define MARISA_INVALID_EXTRA   (MARISA_UINT32_MAX >> 8)
50
+
51
+ // Error codes are defined as members of marisa_error_code. This library throws
52
+ // an exception with one of the error codes when an error occurs.
53
// Error codes attached to the exceptions thrown by this library.
//
// The last enumerator is deliberately written without a trailing comma: a
// trailing comma in an enumerator list is not accepted by C89 and C++03
// compilers in pedantic mode.
typedef enum marisa_error_code_ {
  // MARISA_OK means that a requested operation has succeeded. In practice, an
  // exception never has MARISA_OK because it is not an error.
  MARISA_OK = 0,

  // MARISA_STATE_ERROR means that an object was not ready for a requested
  // operation. For example, an operation to modify a fixed vector throws an
  // exception with MARISA_STATE_ERROR.
  MARISA_STATE_ERROR = 1,

  // MARISA_NULL_ERROR means that an invalid NULL pointer has been given.
  MARISA_NULL_ERROR = 2,

  // MARISA_BOUND_ERROR means that an operation has tried to access an
  // out-of-range address.
  MARISA_BOUND_ERROR = 3,

  // MARISA_RANGE_ERROR means that an out-of-range value has appeared in an
  // operation.
  MARISA_RANGE_ERROR = 4,

  // MARISA_CODE_ERROR means that an undefined code has appeared in an
  // operation.
  MARISA_CODE_ERROR = 5,

  // MARISA_RESET_ERROR means that a smart pointer has tried to reset itself.
  MARISA_RESET_ERROR = 6,

  // MARISA_SIZE_ERROR means that a size has exceeded a library limitation.
  MARISA_SIZE_ERROR = 7,

  // MARISA_MEMORY_ERROR means that a memory allocation has failed.
  MARISA_MEMORY_ERROR = 8,

  // MARISA_IO_ERROR means that an I/O operation has failed.
  MARISA_IO_ERROR = 9,

  // MARISA_FORMAT_ERROR means that input was in an invalid format.
  MARISA_FORMAT_ERROR = 10
} marisa_error_code;
92
+
93
+ // Min/max values, flags and masks for dictionary settings are defined below.
94
+ // Please note that unspecified settings will be replaced with the default
95
+ // settings. For example, 0 is equivalent to (MARISA_DEFAULT_NUM_TRIES |
96
+ // MARISA_DEFAULT_TRIE | MARISA_DEFAULT_TAIL | MARISA_DEFAULT_ORDER).
97
+
98
+ // A dictionary consists of 3 tries by default. Usually more tries make a
99
+ // dictionary space-efficient but time-inefficient.
100
// Bounds and default for the number of tries in a dictionary. The values
// occupy the low 7 bits of a configuration word (0x00001 .. 0x0007F).
//
// The last enumerator has no trailing comma because C89 and C++03 compilers
// reject one in pedantic mode.
typedef enum marisa_num_tries_ {
  MARISA_MIN_NUM_TRIES     = 0x00001,
  MARISA_MAX_NUM_TRIES     = 0x0007F,
  MARISA_DEFAULT_NUM_TRIES = 0x00003
} marisa_num_tries;
105
+
106
+ // This library uses a cache technique to accelerate search functions. The
107
+ // following enumerated type marisa_cache_level gives a list of available cache
108
+ // size options. A larger cache enables faster search but takes more space.
109
// Cache size options. Every level is a single, distinct bit in the range
// 0x00080 .. 0x00800, written here as a shift so the bit position is explicit.
typedef enum marisa_cache_level_ {
  MARISA_HUGE_CACHE    = 1 << 7,   // 0x00080
  MARISA_LARGE_CACHE   = 1 << 8,   // 0x00100
  MARISA_NORMAL_CACHE  = 1 << 9,   // 0x00200
  MARISA_SMALL_CACHE   = 1 << 10,  // 0x00400
  MARISA_TINY_CACHE    = 1 << 11,  // 0x00800

  // Level used when no cache level is specified.
  MARISA_DEFAULT_CACHE = MARISA_NORMAL_CACHE
} marisa_cache_level;
117
+
118
+ // This library provides 2 kinds of TAIL implementations.
119
// Selects how the TAIL (the storage for last labels) is represented.
//
// The last enumerator has no trailing comma because C89 and C++03 compilers
// reject one in pedantic mode.
typedef enum marisa_tail_mode_ {
  // MARISA_TEXT_TAIL merges last labels as zero-terminated strings. So, it is
  // available if and only if the last labels do not contain a NULL character.
  // If MARISA_TEXT_TAIL is specified and a NULL character exists in the last
  // labels, the setting is automatically switched to MARISA_BINARY_TAIL.
  MARISA_TEXT_TAIL = 0x01000,

  // MARISA_BINARY_TAIL also merges last labels but as byte sequences. It uses
  // a bit vector to detect the end of a sequence, instead of NULL characters.
  // So, MARISA_BINARY_TAIL requires a larger space if the average length of
  // labels is greater than 8.
  MARISA_BINARY_TAIL = 0x02000,

  // Mode used when no tail mode is specified.
  MARISA_DEFAULT_TAIL = MARISA_TEXT_TAIL
} marisa_tail_mode;
134
+
135
+ // The arrangement of nodes affects the time cost of matching and the order of
136
+ // predictive search.
137
// Selects how nodes are arranged.
//
// The last enumerator has no trailing comma because C89 and C++03 compilers
// reject one in pedantic mode.
typedef enum marisa_node_order_ {
  // MARISA_LABEL_ORDER arranges nodes in ascending label order.
  // MARISA_LABEL_ORDER is useful if an application needs to predict keys in
  // label order.
  MARISA_LABEL_ORDER = 0x10000,

  // MARISA_WEIGHT_ORDER arranges nodes in descending weight order.
  // MARISA_WEIGHT_ORDER is generally a better choice because it enables faster
  // matching.
  MARISA_WEIGHT_ORDER = 0x20000,

  // Order used when no node order is specified.
  MARISA_DEFAULT_ORDER = MARISA_WEIGHT_ORDER
} marisa_node_order;
150
+
151
// Bit masks that isolate each group of settings inside a configuration word.
// The four groups occupy disjoint bit ranges, and MARISA_CONFIG_MASK is their
// union (0xFFFFF).
typedef enum marisa_config_mask_ {
  MARISA_NUM_TRIES_MASK   = 0x0007F,  // bits 0-6
  MARISA_CACHE_LEVEL_MASK = 0x00F80,  // bits 7-11
  MARISA_TAIL_MODE_MASK   = 0x0F000,  // bits 12-15
  MARISA_NODE_ORDER_MASK  = 0xF0000,  // bits 16-19
  MARISA_CONFIG_MASK      = MARISA_NUM_TRIES_MASK | MARISA_CACHE_LEVEL_MASK |
                            MARISA_TAIL_MODE_MASK | MARISA_NODE_ORDER_MASK
} marisa_config_mask;
158
+
159
+ #ifdef __cplusplus
160
+ } // extern "C"
161
+ #endif // __cplusplus
162
+
163
+ #ifdef __cplusplus
164
+ namespace marisa {
165
+
166
+ typedef ::marisa_uint8 UInt8;
167
+ typedef ::marisa_uint16 UInt16;
168
+ typedef ::marisa_uint32 UInt32;
169
+ typedef ::marisa_uint64 UInt64;
170
+
171
+ typedef ::marisa_error_code ErrorCode;
172
+
173
+ typedef ::marisa_cache_level CacheLevel;
174
+ typedef ::marisa_tail_mode TailMode;
175
+ typedef ::marisa_node_order NodeOrder;
176
+
177
// Exchanges the values of `lhs' and `rhs' through one copy construction and
// two copy assignments. T therefore has to be copy-constructible and
// copy-assignable.
template <typename T>
inline void swap(T &lhs, T &rhs) {
  T saved_lhs = lhs;  // keep the old left value while it is overwritten
  lhs = rhs;
  rhs = saved_lhs;
}
183
+
184
+ } // namespace marisa
185
+ #endif // __cplusplus
186
+
187
+ #ifdef __cplusplus
188
+ #include "marisa/exception.h"
189
+ #include "marisa/scoped-ptr.h"
190
+ #include "marisa/scoped-array.h"
191
+ #endif // __cplusplus
192
+
193
+ #endif // MARISA_BASE_H_
@@ -0,0 +1,82 @@
1
+ #ifndef MARISA_EXCEPTION_H_
2
+ #define MARISA_EXCEPTION_H_
3
+
4
+ #include <exception>
5
+
6
+ #include "marisa/base.h"
7
+
8
+ namespace marisa {
9
+
10
+ // An exception object keeps a filename, a line number, an error code and an
11
+ // error message. The message format is as follows:
12
+ // "__FILE__:__LINE__: error_code: error_message"
13
+ class Exception : public std::exception {
14
+ public:
15
+ Exception(const char *filename, int line,
16
+ ErrorCode error_code, const char *error_message)
17
+ : std::exception(), filename_(filename), line_(line),
18
+ error_code_(error_code), error_message_(error_message) {}
19
+ Exception(const Exception &ex)
20
+ : std::exception(), filename_(ex.filename_), line_(ex.line_),
21
+ error_code_(ex.error_code_), error_message_(ex.error_message_) {}
22
+ virtual ~Exception() throw() {}
23
+
24
+ Exception &operator=(const Exception &rhs) {
25
+ filename_ = rhs.filename_;
26
+ line_ = rhs.line_;
27
+ error_code_ = rhs.error_code_;
28
+ error_message_ = rhs.error_message_;
29
+ return *this;
30
+ }
31
+
32
+ const char *filename() const {
33
+ return filename_;
34
+ }
35
+ int line() const {
36
+ return line_;
37
+ }
38
+ ErrorCode error_code() const {
39
+ return error_code_;
40
+ }
41
+ const char *error_message() const {
42
+ return error_message_;
43
+ }
44
+
45
+ virtual const char *what() const throw() {
46
+ return error_message_;
47
+ }
48
+
49
+ private:
50
+ const char *filename_;
51
+ int line_;
52
+ ErrorCode error_code_;
53
+ const char *error_message_;
54
+ };
55
+
56
+ // These macros are used to convert a line number to a string constant.
57
// Helpers that turn __LINE__ into a string literal. The extra level of macro
// expansion is needed so that __LINE__ is replaced by its number before the
// `#' operator stringizes it.
#define MARISA_INT_TO_STR(value) #value
#define MARISA_LINE_TO_STR(line) MARISA_INT_TO_STR(line)
#define MARISA_LINE_STR MARISA_LINE_TO_STR(__LINE__)

// MARISA_THROW throws a marisa::Exception that carries the current file name,
// the current line number, `error_code' and a message of the form
//   "__FILE__:__LINE__: error_code: error_message"
// `error_message' must be a string literal because it is concatenated with
// the other literals at compile time.
#define MARISA_THROW(error_code, error_message) \
  (throw marisa::Exception(__FILE__, __LINE__, error_code, \
       __FILE__ ":" MARISA_LINE_STR ": " #error_code ": " error_message))

// MARISA_THROW_IF evaluates `condition' once and throws with `error_code' if
// it is true. The text of the condition becomes the error message. The whole
// macro is an expression of type void.
#define MARISA_THROW_IF(condition, error_code) \
  ((condition) ? MARISA_THROW(error_code, #condition) : (void)0)

// MARISA_DEBUG_IF behaves like MARISA_THROW_IF when _DEBUG is defined and
// expands to nothing otherwise, which makes it suitable for checks in
// time-critical code.
#ifndef _DEBUG
 #define MARISA_DEBUG_IF(cond, error_code)
#else
 #define MARISA_DEBUG_IF(cond, error_code) MARISA_THROW_IF(cond, error_code)
#endif
79
+
80
+ } // namespace marisa
81
+
82
+ #endif // MARISA_EXCEPTION_H_
@@ -0,0 +1,26 @@
1
+ #ifndef MARISA_GRIMOIRE_ALGORITHM_H_
2
+ #define MARISA_GRIMOIRE_ALGORITHM_H_
3
+
4
+ #include "marisa/grimoire/algorithm/sort.h"
5
+
6
+ namespace marisa {
7
+ namespace grimoire {
8
+
9
+ class Algorithm {
10
+ public:
11
+ Algorithm() {}
12
+
13
+ template <typename Iterator>
14
+ std::size_t sort(Iterator begin, Iterator end) const {
15
+ return algorithm::sort(begin, end);
16
+ }
17
+
18
+ private:
19
+ Algorithm(const Algorithm &);
20
+ Algorithm &operator=(const Algorithm &);
21
+ };
22
+
23
+ } // namespace grimoire
24
+ } // namespace marisa
25
+
26
+ #endif // MARISA_GRIMOIRE_ALGORITHM_H_
@@ -0,0 +1,196 @@
1
+ #ifndef MARISA_GRIMOIRE_ALGORITHM_SORT_H_
2
+ #define MARISA_GRIMOIRE_ALGORITHM_SORT_H_
3
+
4
+ #include "marisa/base.h"
5
+
6
+ namespace marisa {
7
+ namespace grimoire {
8
+ namespace algorithm {
9
+ namespace details {
10
+
11
// sort() keeps partitioning only while a range holds more than this many
// elements; the remainder is finished by insertion_sort().
enum { MARISA_INSERTION_SORT_THRESHOLD = 10 };
14
+
15
+ template <typename T>
16
+ int get_label(const T &unit, std::size_t depth) {
17
+ MARISA_DEBUG_IF(depth > unit.length(), MARISA_BOUND_ERROR);
18
+
19
+ return (depth < unit.length()) ? (int)(UInt8)unit[depth] : -1;
20
+ }
21
+
22
// Returns the median of the labels of `a', `b' and `c' at position `depth',
// where a label is the value produced by get_label().
template <typename T>
int median(const T &a, const T &b, const T &c, std::size_t depth) {
  const int x = get_label(a, depth);
  const int y = get_label(b, depth);
  const int z = get_label(c, depth);

  // Order the first two labels, then place the third relative to them.
  const int low = (x < y) ? x : y;
  const int high = (x < y) ? y : x;
  if (z < low) {
    return low;
  }
  if (high < z) {
    return high;
  }
  return z;
}
41
+
42
+ template <typename T>
43
+ int compare(const T &lhs, const T &rhs, std::size_t depth) {
44
+ for (std::size_t i = depth; i < lhs.length(); ++i) {
45
+ if (i == rhs.length()) {
46
+ return 1;
47
+ }
48
+ if (lhs[i] != rhs[i]) {
49
+ return (UInt8)lhs[i] - (UInt8)rhs[i];
50
+ }
51
+ }
52
+ if (lhs.length() == rhs.length()) {
53
+ return 0;
54
+ }
55
+ return (lhs.length() < rhs.length()) ? -1 : 1;
56
+ }
57
+
58
+ template <typename Iterator>
59
+ std::size_t insertion_sort(Iterator l, Iterator r, std::size_t depth) {
60
+ MARISA_DEBUG_IF(l > r, MARISA_BOUND_ERROR);
61
+
62
+ std::size_t count = 1;
63
+ for (Iterator i = l + 1; i < r; ++i) {
64
+ int result = 0;
65
+ for (Iterator j = i; j > l; --j) {
66
+ result = compare(*(j - 1), *j, depth);
67
+ if (result <= 0) {
68
+ break;
69
+ }
70
+ marisa::swap(*(j - 1), *j);
71
+ }
72
+ if (result != 0) {
73
+ ++count;
74
+ }
75
+ }
76
+ return count;
77
+ }
78
+
79
+ template <typename Iterator>
80
+ std::size_t sort(Iterator l, Iterator r, std::size_t depth) {
81
+ MARISA_DEBUG_IF(l > r, MARISA_BOUND_ERROR);
82
+
83
+ std::size_t count = 0;
84
+ while ((r - l) > MARISA_INSERTION_SORT_THRESHOLD) {
85
+ Iterator pl = l;
86
+ Iterator pr = r;
87
+ Iterator pivot_l = l;
88
+ Iterator pivot_r = r;
89
+
90
+ const int pivot = median(*l, *(l + (r - l) / 2), *(r - 1), depth);
91
+ for ( ; ; ) {
92
+ while (pl < pr) {
93
+ const int label = get_label(*pl, depth);
94
+ if (label > pivot) {
95
+ break;
96
+ } else if (label == pivot) {
97
+ marisa::swap(*pl, *pivot_l);
98
+ ++pivot_l;
99
+ }
100
+ ++pl;
101
+ }
102
+ while (pl < pr) {
103
+ const int label = get_label(*--pr, depth);
104
+ if (label < pivot) {
105
+ break;
106
+ } else if (label == pivot) {
107
+ marisa::swap(*pr, *--pivot_r);
108
+ }
109
+ }
110
+ if (pl >= pr) {
111
+ break;
112
+ }
113
+ marisa::swap(*pl, *pr);
114
+ ++pl;
115
+ }
116
+ while (pivot_l > l) {
117
+ marisa::swap(*--pivot_l, *--pl);
118
+ }
119
+ while (pivot_r < r) {
120
+ marisa::swap(*pivot_r, *pr);
121
+ ++pivot_r;
122
+ ++pr;
123
+ }
124
+
125
+ if (((pl - l) > (pr - pl)) || ((r - pr) > (pr - pl))) {
126
+ if ((pr - pl) == 1) {
127
+ ++count;
128
+ } else if ((pr - pl) > 1) {
129
+ if (pivot == -1) {
130
+ ++count;
131
+ } else {
132
+ count += sort(pl, pr, depth + 1);
133
+ }
134
+ }
135
+
136
+ if ((pl - l) < (r - pr)) {
137
+ if ((pl - l) == 1) {
138
+ ++count;
139
+ } else if ((pl - l) > 1) {
140
+ count += sort(l, pl, depth);
141
+ }
142
+ l = pr;
143
+ } else {
144
+ if ((r - pr) == 1) {
145
+ ++count;
146
+ } else if ((r - pr) > 1) {
147
+ count += sort(pr, r, depth);
148
+ }
149
+ r = pl;
150
+ }
151
+ } else {
152
+ if ((pl - l) == 1) {
153
+ ++count;
154
+ } else if ((pl - l) > 1) {
155
+ count += sort(l, pl, depth);
156
+ }
157
+
158
+ if ((r - pr) == 1) {
159
+ ++count;
160
+ } else if ((r - pr) > 1) {
161
+ count += sort(pr, r, depth);
162
+ }
163
+
164
+ l = pl, r = pr;
165
+ if ((pr - pl) == 1) {
166
+ ++count;
167
+ } else if ((pr - pl) > 1) {
168
+ if (pivot == -1) {
169
+ l = r;
170
+ ++count;
171
+ } else {
172
+ ++depth;
173
+ }
174
+ }
175
+ }
176
+ }
177
+
178
+ if ((r - l) > 1) {
179
+ count += insertion_sort(l, r, depth);
180
+ }
181
+ return count;
182
+ }
183
+
184
+ } // namespace details
185
+
186
+ template <typename Iterator>
187
+ std::size_t sort(Iterator begin, Iterator end) {
188
+ MARISA_DEBUG_IF(begin > end, MARISA_BOUND_ERROR);
189
+ return details::sort(begin, end, 0);
190
+ };
191
+
192
+ } // namespace algorithm
193
+ } // namespace grimoire
194
+ } // namespace marisa
195
+
196
+ #endif // MARISA_GRIMOIRE_ALGORITHM_SORT_H_