melisa 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +11 -0
- data/ext/marisa/bindings/marisa-swig.cxx +253 -0
- data/ext/marisa/bindings/marisa-swig.h +183 -0
- data/ext/marisa/bindings/perl/marisa-swig.cxx +253 -0
- data/ext/marisa/bindings/perl/marisa-swig.h +183 -0
- data/ext/marisa/bindings/perl/marisa-swig_wrap.cxx +5160 -0
- data/ext/marisa/bindings/python/marisa-swig.cxx +253 -0
- data/ext/marisa/bindings/python/marisa-swig.h +183 -0
- data/ext/marisa/bindings/python/marisa-swig_wrap.cxx +6090 -0
- data/ext/marisa/bindings/ruby/extconf.rb +5 -0
- data/ext/marisa/bindings/ruby/marisa-swig.cxx +253 -0
- data/ext/marisa/bindings/ruby/marisa-swig.h +183 -0
- data/ext/marisa/bindings/ruby/marisa-swig_wrap.cxx +4708 -0
- data/ext/marisa/lib/marisa.h +14 -0
- data/ext/marisa/lib/marisa/agent.cc +51 -0
- data/ext/marisa/lib/marisa/agent.h +73 -0
- data/ext/marisa/lib/marisa/base.h +193 -0
- data/ext/marisa/lib/marisa/exception.h +82 -0
- data/ext/marisa/lib/marisa/grimoire/algorithm.h +26 -0
- data/ext/marisa/lib/marisa/grimoire/algorithm/sort.h +196 -0
- data/ext/marisa/lib/marisa/grimoire/intrin.h +115 -0
- data/ext/marisa/lib/marisa/grimoire/io.h +18 -0
- data/ext/marisa/lib/marisa/grimoire/io/mapper.cc +163 -0
- data/ext/marisa/lib/marisa/grimoire/io/mapper.h +67 -0
- data/ext/marisa/lib/marisa/grimoire/io/reader.cc +147 -0
- data/ext/marisa/lib/marisa/grimoire/io/reader.h +66 -0
- data/ext/marisa/lib/marisa/grimoire/io/writer.cc +148 -0
- data/ext/marisa/lib/marisa/grimoire/io/writer.h +65 -0
- data/ext/marisa/lib/marisa/grimoire/trie.h +16 -0
- data/ext/marisa/lib/marisa/grimoire/trie/cache.h +81 -0
- data/ext/marisa/lib/marisa/grimoire/trie/config.h +155 -0
- data/ext/marisa/lib/marisa/grimoire/trie/entry.h +82 -0
- data/ext/marisa/lib/marisa/grimoire/trie/header.h +61 -0
- data/ext/marisa/lib/marisa/grimoire/trie/history.h +65 -0
- data/ext/marisa/lib/marisa/grimoire/trie/key.h +228 -0
- data/ext/marisa/lib/marisa/grimoire/trie/louds-trie.cc +876 -0
- data/ext/marisa/lib/marisa/grimoire/trie/louds-trie.h +134 -0
- data/ext/marisa/lib/marisa/grimoire/trie/range.h +115 -0
- data/ext/marisa/lib/marisa/grimoire/trie/state.h +117 -0
- data/ext/marisa/lib/marisa/grimoire/trie/tail.cc +218 -0
- data/ext/marisa/lib/marisa/grimoire/trie/tail.h +72 -0
- data/ext/marisa/lib/marisa/grimoire/vector.h +18 -0
- data/ext/marisa/lib/marisa/grimoire/vector/bit-vector.cc +826 -0
- data/ext/marisa/lib/marisa/grimoire/vector/bit-vector.h +179 -0
- data/ext/marisa/lib/marisa/grimoire/vector/flat-vector.h +205 -0
- data/ext/marisa/lib/marisa/grimoire/vector/pop-count.h +110 -0
- data/ext/marisa/lib/marisa/grimoire/vector/rank-index.h +82 -0
- data/ext/marisa/lib/marisa/grimoire/vector/vector.h +256 -0
- data/ext/marisa/lib/marisa/iostream.h +18 -0
- data/ext/marisa/lib/marisa/key.h +85 -0
- data/ext/marisa/lib/marisa/keyset.cc +181 -0
- data/ext/marisa/lib/marisa/keyset.h +80 -0
- data/ext/marisa/lib/marisa/query.h +71 -0
- data/ext/marisa/lib/marisa/scoped-array.h +48 -0
- data/ext/marisa/lib/marisa/scoped-ptr.h +52 -0
- data/ext/marisa/lib/marisa/stdio.h +15 -0
- data/ext/marisa/lib/marisa/trie.cc +249 -0
- data/ext/marisa/lib/marisa/trie.h +64 -0
- data/ext/marisa/tests/base-test.cc +309 -0
- data/ext/marisa/tests/io-test.cc +252 -0
- data/ext/marisa/tests/marisa-assert.h +26 -0
- data/ext/marisa/tests/marisa-test.cc +388 -0
- data/ext/marisa/tests/trie-test.cc +507 -0
- data/ext/marisa/tests/vector-test.cc +466 -0
- data/ext/marisa/tools/cmdopt.cc +298 -0
- data/ext/marisa/tools/cmdopt.h +58 -0
- data/ext/marisa/tools/marisa-benchmark.cc +418 -0
- data/ext/marisa/tools/marisa-build.cc +206 -0
- data/ext/marisa/tools/marisa-common-prefix-search.cc +143 -0
- data/ext/marisa/tools/marisa-dump.cc +151 -0
- data/ext/marisa/tools/marisa-lookup.cc +110 -0
- data/ext/marisa/tools/marisa-predictive-search.cc +143 -0
- data/ext/marisa/tools/marisa-reverse-lookup.cc +110 -0
- data/lib/melisa.rb +7 -0
- data/lib/melisa/base_config_flags.rb +76 -0
- data/lib/melisa/bytes_trie.rb +55 -0
- data/lib/melisa/int_trie.rb +14 -0
- data/lib/melisa/search.rb +55 -0
- data/lib/melisa/trie.rb +96 -0
- data/lib/melisa/version.rb +3 -0
- data/melisa.gemspec +36 -0
- data/spec/base_config_flags_spec.rb +73 -0
- data/spec/bytes_trie_spec.rb +16 -0
- data/spec/int_trie_spec.rb +16 -0
- data/spec/search_spec.rb +29 -0
- data/spec/spec_helper.rb +1 -0
- data/spec/trie_spec.rb +30 -0
- metadata +207 -0
@@ -0,0 +1,14 @@
|
|
1
|
+
#ifndef MARISA_H_
|
2
|
+
#define MARISA_H_
|
3
|
+
|
4
|
+
// "marisa/stdio.h" includes <cstdio> for I/O using std::FILE.
|
5
|
+
#include "marisa/stdio.h"
|
6
|
+
|
7
|
+
// "marisa/iostream.h" includes <iosfwd> for I/O using std::iostream.
|
8
|
+
#include "marisa/iostream.h"
|
9
|
+
|
10
|
+
// You can use <marisa/trie.h> instead of <marisa.h> if you don't need the
|
11
|
+
// above I/O interfaces and don't want to include the above I/O headers.
|
12
|
+
#include "marisa/trie.h"
|
13
|
+
|
14
|
+
#endif // MARISA_H_
|
@@ -0,0 +1,51 @@
|
|
1
|
+
#include <new>
|
2
|
+
|
3
|
+
#include "marisa/agent.h"
|
4
|
+
#include "marisa/grimoire/trie.h"
|
5
|
+
|
6
|
+
namespace marisa {
|
7
|
+
|
8
|
+
// Default-constructs the query, the key and the state holder.
Agent::Agent()
    : query_(),
      key_(),
      state_() {}

// The members release whatever they own; nothing is done by hand here.
Agent::~Agent() {}
|
11
|
+
|
12
|
+
void Agent::set_query(const char *str) {
|
13
|
+
MARISA_THROW_IF(str == NULL, MARISA_NULL_ERROR);
|
14
|
+
if (state_.get() != NULL) {
|
15
|
+
state_->reset();
|
16
|
+
}
|
17
|
+
query_.set_str(str);
|
18
|
+
}
|
19
|
+
|
20
|
+
void Agent::set_query(const char *ptr, std::size_t length) {
|
21
|
+
MARISA_THROW_IF((ptr == NULL) && (length != 0), MARISA_NULL_ERROR);
|
22
|
+
if (state_.get() != NULL) {
|
23
|
+
state_->reset();
|
24
|
+
}
|
25
|
+
query_.set_str(ptr, length);
|
26
|
+
}
|
27
|
+
|
28
|
+
void Agent::set_query(std::size_t key_id) {
|
29
|
+
if (state_.get() != NULL) {
|
30
|
+
state_->reset();
|
31
|
+
}
|
32
|
+
query_.set_id(key_id);
|
33
|
+
}
|
34
|
+
|
35
|
+
void Agent::init_state() {
|
36
|
+
MARISA_THROW_IF(state_.get() != NULL, MARISA_STATE_ERROR);
|
37
|
+
state_.reset(new (std::nothrow) grimoire::State);
|
38
|
+
MARISA_THROW_IF(state_.get() == NULL, MARISA_MEMORY_ERROR);
|
39
|
+
}
|
40
|
+
|
41
|
+
void Agent::clear() {
|
42
|
+
Agent().swap(*this);
|
43
|
+
}
|
44
|
+
|
45
|
+
// Exchanges the complete contents of this agent and `rhs' by swapping the
// query, the key and the state holder member by member.
void Agent::swap(Agent &rhs) {
  query_.swap(rhs.query_);  // input side
  key_.swap(rhs.key_);      // output side
  state_.swap(rhs.state_);  // state object, if any
}
|
50
|
+
|
51
|
+
} // namespace marisa
|
@@ -0,0 +1,73 @@
|
|
1
|
+
#ifndef MARISA_AGENT_H_
|
2
|
+
#define MARISA_AGENT_H_
|
3
|
+
|
4
|
+
#include "marisa/key.h"
|
5
|
+
#include "marisa/query.h"
|
6
|
+
|
7
|
+
namespace marisa {
|
8
|
+
namespace grimoire {
|
9
|
+
namespace trie {
|
10
|
+
|
11
|
+
class State;
|
12
|
+
|
13
|
+
} // namespace trie
|
14
|
+
} // namespace grimoire
|
15
|
+
|
16
|
+
class Agent {
|
17
|
+
public:
|
18
|
+
Agent();
|
19
|
+
~Agent();
|
20
|
+
|
21
|
+
const Query &query() const {
|
22
|
+
return query_;
|
23
|
+
}
|
24
|
+
const Key &key() const {
|
25
|
+
return key_;
|
26
|
+
}
|
27
|
+
|
28
|
+
void set_query(const char *str);
|
29
|
+
void set_query(const char *ptr, std::size_t length);
|
30
|
+
void set_query(std::size_t key_id);
|
31
|
+
|
32
|
+
const grimoire::trie::State &state() const {
|
33
|
+
return *state_;
|
34
|
+
}
|
35
|
+
grimoire::trie::State &state() {
|
36
|
+
return *state_;
|
37
|
+
}
|
38
|
+
|
39
|
+
void set_key(const char *str) {
|
40
|
+
MARISA_DEBUG_IF(str == NULL, MARISA_NULL_ERROR);
|
41
|
+
key_.set_str(str);
|
42
|
+
}
|
43
|
+
void set_key(const char *ptr, std::size_t length) {
|
44
|
+
MARISA_DEBUG_IF((ptr == NULL) && (length != 0), MARISA_NULL_ERROR);
|
45
|
+
MARISA_DEBUG_IF(length > MARISA_UINT32_MAX, MARISA_SIZE_ERROR);
|
46
|
+
key_.set_str(ptr, length);
|
47
|
+
}
|
48
|
+
void set_key(std::size_t id) {
|
49
|
+
MARISA_DEBUG_IF(id > MARISA_UINT32_MAX, MARISA_SIZE_ERROR);
|
50
|
+
key_.set_id(id);
|
51
|
+
}
|
52
|
+
|
53
|
+
bool has_state() const {
|
54
|
+
return state_.get() != NULL;
|
55
|
+
}
|
56
|
+
void init_state();
|
57
|
+
|
58
|
+
void clear();
|
59
|
+
void swap(Agent &rhs);
|
60
|
+
|
61
|
+
private:
|
62
|
+
Query query_;
|
63
|
+
Key key_;
|
64
|
+
scoped_ptr<grimoire::trie::State> state_;
|
65
|
+
|
66
|
+
// Disallows copy and assignment.
|
67
|
+
Agent(const Agent &);
|
68
|
+
Agent &operator=(const Agent &);
|
69
|
+
};
|
70
|
+
|
71
|
+
} // namespace marisa
|
72
|
+
|
73
|
+
#endif // MARISA_AGENT_H_
|
@@ -0,0 +1,193 @@
|
|
1
|
+
#ifndef MARISA_BASE_H_
|
2
|
+
#define MARISA_BASE_H_
|
3
|
+
|
4
|
+
// Old Visual C++ does not provide stdint.h.
|
5
|
+
#ifndef _MSC_VER
|
6
|
+
#include <stdint.h>
|
7
|
+
#endif // _MSC_VER
|
8
|
+
|
9
|
+
#ifdef __cplusplus
|
10
|
+
#include <cstddef>
|
11
|
+
#else // __cplusplus
|
12
|
+
#include <stddef.h>
|
13
|
+
#endif // __cplusplus
|
14
|
+
|
15
|
+
#ifdef __cplusplus
|
16
|
+
extern "C" {
|
17
|
+
#endif // __cplusplus
|
18
|
+
|
19
|
+
#ifdef _MSC_VER
|
20
|
+
typedef unsigned __int8 marisa_uint8;
|
21
|
+
typedef unsigned __int16 marisa_uint16;
|
22
|
+
typedef unsigned __int32 marisa_uint32;
|
23
|
+
typedef unsigned __int64 marisa_uint64;
|
24
|
+
#else // _MSC_VER
|
25
|
+
typedef uint8_t marisa_uint8;
|
26
|
+
typedef uint16_t marisa_uint16;
|
27
|
+
typedef uint32_t marisa_uint32;
|
28
|
+
typedef uint64_t marisa_uint64;
|
29
|
+
#endif // _MSC_VER
|
30
|
+
|
31
|
+
#if defined(_WIN64) || defined(__amd64__) || defined(__x86_64__) || \
|
32
|
+
defined(__ia64__) || defined(__ppc64__) || defined(__powerpc64__) || \
|
33
|
+
defined(__sparc64__) || defined(__mips64__) || defined(__aarch64__)
|
34
|
+
#define MARISA_WORD_SIZE 64
|
35
|
+
#else // defined(_WIN64), etc.
|
36
|
+
#define MARISA_WORD_SIZE 32
|
37
|
+
#endif // defined(_WIN64), etc.
|
38
|
+
|
39
|
+
//#define MARISA_WORD_SIZE (sizeof(void *) * 8)
|
40
|
+
|
41
|
+
#define MARISA_UINT8_MAX ((marisa_uint8)~(marisa_uint8)0)
|
42
|
+
#define MARISA_UINT16_MAX ((marisa_uint16)~(marisa_uint16)0)
|
43
|
+
#define MARISA_UINT32_MAX ((marisa_uint32)~(marisa_uint32)0)
|
44
|
+
#define MARISA_UINT64_MAX ((marisa_uint64)~(marisa_uint64)0)
|
45
|
+
#define MARISA_SIZE_MAX ((size_t)~(size_t)0)
|
46
|
+
|
47
|
+
#define MARISA_INVALID_LINK_ID MARISA_UINT32_MAX
|
48
|
+
#define MARISA_INVALID_KEY_ID MARISA_UINT32_MAX
|
49
|
+
#define MARISA_INVALID_EXTRA (MARISA_UINT32_MAX >> 8)
|
50
|
+
|
51
|
+
// Error codes are defined as members of marisa_error_code. This library throws
|
52
|
+
// an exception with one of the error codes when an error occurs.
|
53
|
+
typedef enum marisa_error_code_ {
  // MARISA_OK means that a requested operation has succeeded. In practice, an
  // exception never has MARISA_OK because it is not an error.
  MARISA_OK           = 0,

  // MARISA_STATE_ERROR means that an object was not ready for a requested
  // operation. For example, an operation to modify a fixed vector throws an
  // exception with MARISA_STATE_ERROR.
  MARISA_STATE_ERROR  = 1,

  // MARISA_NULL_ERROR means that an invalid NULL pointer has been given.
  MARISA_NULL_ERROR   = 2,

  // MARISA_BOUND_ERROR means that an operation has tried to access an out of
  // range address.
  MARISA_BOUND_ERROR  = 3,

  // MARISA_RANGE_ERROR means that an out of range value has appeared in
  // operation.
  MARISA_RANGE_ERROR  = 4,

  // MARISA_CODE_ERROR means that an undefined code has appeared in operation.
  MARISA_CODE_ERROR   = 5,

  // MARISA_RESET_ERROR means that a smart pointer has tried to reset itself.
  MARISA_RESET_ERROR  = 6,

  // MARISA_SIZE_ERROR means that a size has exceeded a library limitation.
  MARISA_SIZE_ERROR   = 7,

  // MARISA_MEMORY_ERROR means that a memory allocation has failed.
  MARISA_MEMORY_ERROR = 8,

  // MARISA_IO_ERROR means that an I/O operation has failed.
  MARISA_IO_ERROR     = 9,

  // MARISA_FORMAT_ERROR means that input was in invalid format.
  // No comma after the last enumerator: strict C89/C++03 compilers reject it.
  MARISA_FORMAT_ERROR = 10
} marisa_error_code;
|
92
|
+
|
93
|
+
// Min/max values, flags and masks for dictionary settings are defined below.
|
94
|
+
// Please note that unspecified settings will be replaced with the default
|
95
|
+
// settings. For example, 0 is equivalent to (MARISA_DEFAULT_NUM_TRIES |
|
96
|
+
// MARISA_DEFAULT_TRIE | MARISA_DEFAULT_TAIL | MARISA_DEFAULT_ORDER).
|
97
|
+
|
98
|
+
// A dictionary consists of 3 tries by default. Usually more tries make a
|
99
|
+
// dictionary space-efficient but time-inefficient.
|
100
|
+
// Bounds and default for the number of tries; the values occupy the low
// 7 bits of a configuration value (0x00001 .. 0x0007F).
typedef enum marisa_num_tries_ {
  MARISA_MIN_NUM_TRIES     = 0x00001,
  MARISA_MAX_NUM_TRIES     = 0x0007F,
  // No comma after the last enumerator: strict C89/C++03 compilers reject it.
  MARISA_DEFAULT_NUM_TRIES = 0x00003
} marisa_num_tries;
|
105
|
+
|
106
|
+
// This library uses a cache technique to accelerate search functions. The
|
107
|
+
// following enumerated type marisa_cache_level gives a list of available cache
|
108
|
+
// size options. A larger cache enables faster search but takes more space.
|
109
|
+
// Each cache level is a distinct single-bit flag (bits 7 through 11).
typedef enum marisa_cache_level_ {
  MARISA_HUGE_CACHE    = 1 << 7,   // 0x00080
  MARISA_LARGE_CACHE   = 1 << 8,   // 0x00100
  MARISA_NORMAL_CACHE  = 1 << 9,   // 0x00200
  MARISA_SMALL_CACHE   = 1 << 10,  // 0x00400
  MARISA_TINY_CACHE    = 1 << 11,  // 0x00800
  MARISA_DEFAULT_CACHE = MARISA_NORMAL_CACHE
} marisa_cache_level;
|
117
|
+
|
118
|
+
// This library provides 2 kinds of TAIL implementations.
|
119
|
+
typedef enum marisa_tail_mode_ {
  // MARISA_TEXT_TAIL merges last labels as zero-terminated strings. So, it is
  // available if and only if the last labels do not contain a NULL character.
  // If MARISA_TEXT_TAIL is specified and a NULL character exists in the last
  // labels, the setting is automatically switched to MARISA_BINARY_TAIL.
  MARISA_TEXT_TAIL    = 0x01000,

  // MARISA_BINARY_TAIL also merges last labels but as byte sequences. It uses
  // a bit vector to detect the end of a sequence, instead of NULL characters.
  // So, MARISA_BINARY_TAIL requires a larger space if the average length of
  // labels is greater than 8.
  MARISA_BINARY_TAIL  = 0x02000,

  // No comma after the last enumerator: strict C89/C++03 compilers reject it.
  MARISA_DEFAULT_TAIL = MARISA_TEXT_TAIL
} marisa_tail_mode;
|
134
|
+
|
135
|
+
// The arrangement of nodes affects the time cost of matching and the order of
|
136
|
+
// predictive search.
|
137
|
+
typedef enum marisa_node_order_ {
  // MARISA_LABEL_ORDER arranges nodes in ascending label order.
  // MARISA_LABEL_ORDER is useful if an application needs to predict keys in
  // label order.
  MARISA_LABEL_ORDER   = 0x10000,

  // MARISA_WEIGHT_ORDER arranges nodes in descending weight order.
  // MARISA_WEIGHT_ORDER is generally a better choice because it enables faster
  // matching.
  MARISA_WEIGHT_ORDER  = 0x20000,

  // No comma after the last enumerator: strict C89/C++03 compilers reject it.
  MARISA_DEFAULT_ORDER = MARISA_WEIGHT_ORDER
} marisa_node_order;
|
150
|
+
|
151
|
+
// Bit masks that select one group of settings from a configuration value.
typedef enum marisa_config_mask_ {
  MARISA_NUM_TRIES_MASK   = 0x0007F,
  MARISA_CACHE_LEVEL_MASK = 0x00F80,
  MARISA_TAIL_MODE_MASK   = 0x0F000,
  MARISA_NODE_ORDER_MASK  = 0xF0000,
  // Union of the four group masks above (== 0xFFFFF).
  MARISA_CONFIG_MASK      = MARISA_NUM_TRIES_MASK | MARISA_CACHE_LEVEL_MASK |
                            MARISA_TAIL_MODE_MASK | MARISA_NODE_ORDER_MASK
} marisa_config_mask;
|
158
|
+
|
159
|
+
#ifdef __cplusplus
|
160
|
+
} // extern "C"
|
161
|
+
#endif // __cplusplus
|
162
|
+
|
163
|
+
#ifdef __cplusplus
|
164
|
+
namespace marisa {
|
165
|
+
|
166
|
+
typedef ::marisa_uint8 UInt8;
|
167
|
+
typedef ::marisa_uint16 UInt16;
|
168
|
+
typedef ::marisa_uint32 UInt32;
|
169
|
+
typedef ::marisa_uint64 UInt64;
|
170
|
+
|
171
|
+
typedef ::marisa_error_code ErrorCode;
|
172
|
+
|
173
|
+
typedef ::marisa_cache_level CacheLevel;
|
174
|
+
typedef ::marisa_tail_mode TailMode;
|
175
|
+
typedef ::marisa_node_order NodeOrder;
|
176
|
+
|
177
|
+
// Exchanges the values of `lhs' and `rhs' through one temporary copy.
// T must be copy-constructible and copy-assignable.
template <typename T>
inline void swap(T &lhs, T &rhs) {
  T saved_lhs = lhs;  // keep the old left value alive while it is overwritten
  lhs = rhs;
  rhs = saved_lhs;
}
|
183
|
+
|
184
|
+
} // namespace marisa
|
185
|
+
#endif // __cplusplus
|
186
|
+
|
187
|
+
#ifdef __cplusplus
|
188
|
+
#include "marisa/exception.h"
|
189
|
+
#include "marisa/scoped-ptr.h"
|
190
|
+
#include "marisa/scoped-array.h"
|
191
|
+
#endif // __cplusplus
|
192
|
+
|
193
|
+
#endif // MARISA_BASE_H_
|
@@ -0,0 +1,82 @@
|
|
1
|
+
#ifndef MARISA_EXCEPTION_H_
|
2
|
+
#define MARISA_EXCEPTION_H_
|
3
|
+
|
4
|
+
#include <exception>
|
5
|
+
|
6
|
+
#include "marisa/base.h"
|
7
|
+
|
8
|
+
namespace marisa {
|
9
|
+
|
10
|
+
// An exception object keeps a filename, a line number, an error code and an
|
11
|
+
// error message. The message format is as follows:
|
12
|
+
// "__FILE__:__LINE__: error_code: error_message"
|
13
|
+
class Exception : public std::exception {
|
14
|
+
public:
|
15
|
+
Exception(const char *filename, int line,
|
16
|
+
ErrorCode error_code, const char *error_message)
|
17
|
+
: std::exception(), filename_(filename), line_(line),
|
18
|
+
error_code_(error_code), error_message_(error_message) {}
|
19
|
+
Exception(const Exception &ex)
|
20
|
+
: std::exception(), filename_(ex.filename_), line_(ex.line_),
|
21
|
+
error_code_(ex.error_code_), error_message_(ex.error_message_) {}
|
22
|
+
virtual ~Exception() throw() {}
|
23
|
+
|
24
|
+
Exception &operator=(const Exception &rhs) {
|
25
|
+
filename_ = rhs.filename_;
|
26
|
+
line_ = rhs.line_;
|
27
|
+
error_code_ = rhs.error_code_;
|
28
|
+
error_message_ = rhs.error_message_;
|
29
|
+
return *this;
|
30
|
+
}
|
31
|
+
|
32
|
+
const char *filename() const {
|
33
|
+
return filename_;
|
34
|
+
}
|
35
|
+
int line() const {
|
36
|
+
return line_;
|
37
|
+
}
|
38
|
+
ErrorCode error_code() const {
|
39
|
+
return error_code_;
|
40
|
+
}
|
41
|
+
const char *error_message() const {
|
42
|
+
return error_message_;
|
43
|
+
}
|
44
|
+
|
45
|
+
virtual const char *what() const throw() {
|
46
|
+
return error_message_;
|
47
|
+
}
|
48
|
+
|
49
|
+
private:
|
50
|
+
const char *filename_;
|
51
|
+
int line_;
|
52
|
+
ErrorCode error_code_;
|
53
|
+
const char *error_message_;
|
54
|
+
};
|
55
|
+
|
56
|
+
// These macros are used to convert a line number to a string constant.
|
57
|
+
#define MARISA_INT_TO_STR(value) #value
|
58
|
+
#define MARISA_LINE_TO_STR(line) MARISA_INT_TO_STR(line)
|
59
|
+
#define MARISA_LINE_STR MARISA_LINE_TO_STR(__LINE__)
|
60
|
+
|
61
|
+
// MARISA_THROW throws an exception with a filename, a line number, an error
|
62
|
+
// code and an error message. The message format is as follows:
|
63
|
+
// "__FILE__:__LINE__: error_code: error_message"
|
64
|
+
#define MARISA_THROW(error_code, error_message) \
|
65
|
+
(throw marisa::Exception(__FILE__, __LINE__, error_code, \
|
66
|
+
__FILE__ ":" MARISA_LINE_STR ": " #error_code ": " error_message))
|
67
|
+
|
68
|
+
// MARISA_THROW_IF throws an exception if `condition' is true.
|
69
|
+
#define MARISA_THROW_IF(condition, error_code) \
|
70
|
+
(void)((!(condition)) || (MARISA_THROW(error_code, #condition), 0))
|
71
|
+
|
72
|
+
// MARISA_DEBUG_IF is ignored if _DEBUG is undefined. So, it is useful for
|
73
|
+
// debugging time-critical code.
|
74
|
+
#ifdef _DEBUG
|
75
|
+
#define MARISA_DEBUG_IF(cond, error_code) MARISA_THROW_IF(cond, error_code)
|
76
|
+
#else
|
77
|
+
#define MARISA_DEBUG_IF(cond, error_code)
|
78
|
+
#endif
|
79
|
+
|
80
|
+
} // namespace marisa
|
81
|
+
|
82
|
+
#endif // MARISA_EXCEPTION_H_
|
@@ -0,0 +1,26 @@
|
|
1
|
+
#ifndef MARISA_GRIMOIRE_ALGORITHM_H_
|
2
|
+
#define MARISA_GRIMOIRE_ALGORITHM_H_
|
3
|
+
|
4
|
+
#include "marisa/grimoire/algorithm/sort.h"
|
5
|
+
|
6
|
+
namespace marisa {
|
7
|
+
namespace grimoire {
|
8
|
+
|
9
|
+
class Algorithm {
|
10
|
+
public:
|
11
|
+
Algorithm() {}
|
12
|
+
|
13
|
+
template <typename Iterator>
|
14
|
+
std::size_t sort(Iterator begin, Iterator end) const {
|
15
|
+
return algorithm::sort(begin, end);
|
16
|
+
}
|
17
|
+
|
18
|
+
private:
|
19
|
+
Algorithm(const Algorithm &);
|
20
|
+
Algorithm &operator=(const Algorithm &);
|
21
|
+
};
|
22
|
+
|
23
|
+
} // namespace grimoire
|
24
|
+
} // namespace marisa
|
25
|
+
|
26
|
+
#endif // MARISA_GRIMOIRE_ALGORITHM_H_
|
@@ -0,0 +1,196 @@
|
|
1
|
+
#ifndef MARISA_GRIMOIRE_ALGORITHM_SORT_H_
|
2
|
+
#define MARISA_GRIMOIRE_ALGORITHM_SORT_H_
|
3
|
+
|
4
|
+
#include "marisa/base.h"
|
5
|
+
|
6
|
+
namespace marisa {
|
7
|
+
namespace grimoire {
|
8
|
+
namespace algorithm {
|
9
|
+
namespace details {
|
10
|
+
|
11
|
+
// Ranges with at most this many elements are left to insertion_sort().
enum { MARISA_INSERTION_SORT_THRESHOLD = 10 };
|
14
|
+
|
15
|
+
template <typename T>
|
16
|
+
int get_label(const T &unit, std::size_t depth) {
|
17
|
+
MARISA_DEBUG_IF(depth > unit.length(), MARISA_BOUND_ERROR);
|
18
|
+
|
19
|
+
return (depth < unit.length()) ? (int)(UInt8)unit[depth] : -1;
|
20
|
+
}
|
21
|
+
|
22
|
+
// Returns the median of the labels that `a', `b' and `c' have at `depth'.
template <typename T>
int median(const T &a, const T &b, const T &c, std::size_t depth) {
  const int first = get_label(a, depth);
  const int second = get_label(b, depth);
  const int third = get_label(c, depth);

  if (first < second) {
    // first < second: the median is `second' unless `third' is not above it.
    if (second < third) {
      return second;
    }
    return (first < third) ? third : first;
  }

  // second <= first: the median is `first' unless `third' is not above it.
  if (first < third) {
    return first;
  }
  return (second < third) ? third : second;
}
|
41
|
+
|
42
|
+
template <typename T>
|
43
|
+
int compare(const T &lhs, const T &rhs, std::size_t depth) {
|
44
|
+
for (std::size_t i = depth; i < lhs.length(); ++i) {
|
45
|
+
if (i == rhs.length()) {
|
46
|
+
return 1;
|
47
|
+
}
|
48
|
+
if (lhs[i] != rhs[i]) {
|
49
|
+
return (UInt8)lhs[i] - (UInt8)rhs[i];
|
50
|
+
}
|
51
|
+
}
|
52
|
+
if (lhs.length() == rhs.length()) {
|
53
|
+
return 0;
|
54
|
+
}
|
55
|
+
return (lhs.length() < rhs.length()) ? -1 : 1;
|
56
|
+
}
|
57
|
+
|
58
|
+
template <typename Iterator>
|
59
|
+
std::size_t insertion_sort(Iterator l, Iterator r, std::size_t depth) {
|
60
|
+
MARISA_DEBUG_IF(l > r, MARISA_BOUND_ERROR);
|
61
|
+
|
62
|
+
std::size_t count = 1;
|
63
|
+
for (Iterator i = l + 1; i < r; ++i) {
|
64
|
+
int result = 0;
|
65
|
+
for (Iterator j = i; j > l; --j) {
|
66
|
+
result = compare(*(j - 1), *j, depth);
|
67
|
+
if (result <= 0) {
|
68
|
+
break;
|
69
|
+
}
|
70
|
+
marisa::swap(*(j - 1), *j);
|
71
|
+
}
|
72
|
+
if (result != 0) {
|
73
|
+
++count;
|
74
|
+
}
|
75
|
+
}
|
76
|
+
return count;
|
77
|
+
}
|
78
|
+
|
79
|
+
template <typename Iterator>
|
80
|
+
std::size_t sort(Iterator l, Iterator r, std::size_t depth) {
|
81
|
+
MARISA_DEBUG_IF(l > r, MARISA_BOUND_ERROR);
|
82
|
+
|
83
|
+
std::size_t count = 0;
|
84
|
+
while ((r - l) > MARISA_INSERTION_SORT_THRESHOLD) {
|
85
|
+
Iterator pl = l;
|
86
|
+
Iterator pr = r;
|
87
|
+
Iterator pivot_l = l;
|
88
|
+
Iterator pivot_r = r;
|
89
|
+
|
90
|
+
const int pivot = median(*l, *(l + (r - l) / 2), *(r - 1), depth);
|
91
|
+
for ( ; ; ) {
|
92
|
+
while (pl < pr) {
|
93
|
+
const int label = get_label(*pl, depth);
|
94
|
+
if (label > pivot) {
|
95
|
+
break;
|
96
|
+
} else if (label == pivot) {
|
97
|
+
marisa::swap(*pl, *pivot_l);
|
98
|
+
++pivot_l;
|
99
|
+
}
|
100
|
+
++pl;
|
101
|
+
}
|
102
|
+
while (pl < pr) {
|
103
|
+
const int label = get_label(*--pr, depth);
|
104
|
+
if (label < pivot) {
|
105
|
+
break;
|
106
|
+
} else if (label == pivot) {
|
107
|
+
marisa::swap(*pr, *--pivot_r);
|
108
|
+
}
|
109
|
+
}
|
110
|
+
if (pl >= pr) {
|
111
|
+
break;
|
112
|
+
}
|
113
|
+
marisa::swap(*pl, *pr);
|
114
|
+
++pl;
|
115
|
+
}
|
116
|
+
while (pivot_l > l) {
|
117
|
+
marisa::swap(*--pivot_l, *--pl);
|
118
|
+
}
|
119
|
+
while (pivot_r < r) {
|
120
|
+
marisa::swap(*pivot_r, *pr);
|
121
|
+
++pivot_r;
|
122
|
+
++pr;
|
123
|
+
}
|
124
|
+
|
125
|
+
if (((pl - l) > (pr - pl)) || ((r - pr) > (pr - pl))) {
|
126
|
+
if ((pr - pl) == 1) {
|
127
|
+
++count;
|
128
|
+
} else if ((pr - pl) > 1) {
|
129
|
+
if (pivot == -1) {
|
130
|
+
++count;
|
131
|
+
} else {
|
132
|
+
count += sort(pl, pr, depth + 1);
|
133
|
+
}
|
134
|
+
}
|
135
|
+
|
136
|
+
if ((pl - l) < (r - pr)) {
|
137
|
+
if ((pl - l) == 1) {
|
138
|
+
++count;
|
139
|
+
} else if ((pl - l) > 1) {
|
140
|
+
count += sort(l, pl, depth);
|
141
|
+
}
|
142
|
+
l = pr;
|
143
|
+
} else {
|
144
|
+
if ((r - pr) == 1) {
|
145
|
+
++count;
|
146
|
+
} else if ((r - pr) > 1) {
|
147
|
+
count += sort(pr, r, depth);
|
148
|
+
}
|
149
|
+
r = pl;
|
150
|
+
}
|
151
|
+
} else {
|
152
|
+
if ((pl - l) == 1) {
|
153
|
+
++count;
|
154
|
+
} else if ((pl - l) > 1) {
|
155
|
+
count += sort(l, pl, depth);
|
156
|
+
}
|
157
|
+
|
158
|
+
if ((r - pr) == 1) {
|
159
|
+
++count;
|
160
|
+
} else if ((r - pr) > 1) {
|
161
|
+
count += sort(pr, r, depth);
|
162
|
+
}
|
163
|
+
|
164
|
+
l = pl, r = pr;
|
165
|
+
if ((pr - pl) == 1) {
|
166
|
+
++count;
|
167
|
+
} else if ((pr - pl) > 1) {
|
168
|
+
if (pivot == -1) {
|
169
|
+
l = r;
|
170
|
+
++count;
|
171
|
+
} else {
|
172
|
+
++depth;
|
173
|
+
}
|
174
|
+
}
|
175
|
+
}
|
176
|
+
}
|
177
|
+
|
178
|
+
if ((r - l) > 1) {
|
179
|
+
count += insertion_sort(l, r, depth);
|
180
|
+
}
|
181
|
+
return count;
|
182
|
+
}
|
183
|
+
|
184
|
+
} // namespace details
|
185
|
+
|
186
|
+
template <typename Iterator>
|
187
|
+
std::size_t sort(Iterator begin, Iterator end) {
|
188
|
+
MARISA_DEBUG_IF(begin > end, MARISA_BOUND_ERROR);
|
189
|
+
return details::sort(begin, end, 0);
|
190
|
+
};
|
191
|
+
|
192
|
+
} // namespace algorithm
|
193
|
+
} // namespace grimoire
|
194
|
+
} // namespace marisa
|
195
|
+
|
196
|
+
#endif // MARISA_GRIMOIRE_ALGORITHM_SORT_H_
|