melisa 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +11 -0
- data/ext/marisa/bindings/marisa-swig.cxx +253 -0
- data/ext/marisa/bindings/marisa-swig.h +183 -0
- data/ext/marisa/bindings/perl/marisa-swig.cxx +253 -0
- data/ext/marisa/bindings/perl/marisa-swig.h +183 -0
- data/ext/marisa/bindings/perl/marisa-swig_wrap.cxx +5160 -0
- data/ext/marisa/bindings/python/marisa-swig.cxx +253 -0
- data/ext/marisa/bindings/python/marisa-swig.h +183 -0
- data/ext/marisa/bindings/python/marisa-swig_wrap.cxx +6090 -0
- data/ext/marisa/bindings/ruby/extconf.rb +5 -0
- data/ext/marisa/bindings/ruby/marisa-swig.cxx +253 -0
- data/ext/marisa/bindings/ruby/marisa-swig.h +183 -0
- data/ext/marisa/bindings/ruby/marisa-swig_wrap.cxx +4708 -0
- data/ext/marisa/lib/marisa.h +14 -0
- data/ext/marisa/lib/marisa/agent.cc +51 -0
- data/ext/marisa/lib/marisa/agent.h +73 -0
- data/ext/marisa/lib/marisa/base.h +193 -0
- data/ext/marisa/lib/marisa/exception.h +82 -0
- data/ext/marisa/lib/marisa/grimoire/algorithm.h +26 -0
- data/ext/marisa/lib/marisa/grimoire/algorithm/sort.h +196 -0
- data/ext/marisa/lib/marisa/grimoire/intrin.h +115 -0
- data/ext/marisa/lib/marisa/grimoire/io.h +18 -0
- data/ext/marisa/lib/marisa/grimoire/io/mapper.cc +163 -0
- data/ext/marisa/lib/marisa/grimoire/io/mapper.h +67 -0
- data/ext/marisa/lib/marisa/grimoire/io/reader.cc +147 -0
- data/ext/marisa/lib/marisa/grimoire/io/reader.h +66 -0
- data/ext/marisa/lib/marisa/grimoire/io/writer.cc +148 -0
- data/ext/marisa/lib/marisa/grimoire/io/writer.h +65 -0
- data/ext/marisa/lib/marisa/grimoire/trie.h +16 -0
- data/ext/marisa/lib/marisa/grimoire/trie/cache.h +81 -0
- data/ext/marisa/lib/marisa/grimoire/trie/config.h +155 -0
- data/ext/marisa/lib/marisa/grimoire/trie/entry.h +82 -0
- data/ext/marisa/lib/marisa/grimoire/trie/header.h +61 -0
- data/ext/marisa/lib/marisa/grimoire/trie/history.h +65 -0
- data/ext/marisa/lib/marisa/grimoire/trie/key.h +228 -0
- data/ext/marisa/lib/marisa/grimoire/trie/louds-trie.cc +876 -0
- data/ext/marisa/lib/marisa/grimoire/trie/louds-trie.h +134 -0
- data/ext/marisa/lib/marisa/grimoire/trie/range.h +115 -0
- data/ext/marisa/lib/marisa/grimoire/trie/state.h +117 -0
- data/ext/marisa/lib/marisa/grimoire/trie/tail.cc +218 -0
- data/ext/marisa/lib/marisa/grimoire/trie/tail.h +72 -0
- data/ext/marisa/lib/marisa/grimoire/vector.h +18 -0
- data/ext/marisa/lib/marisa/grimoire/vector/bit-vector.cc +826 -0
- data/ext/marisa/lib/marisa/grimoire/vector/bit-vector.h +179 -0
- data/ext/marisa/lib/marisa/grimoire/vector/flat-vector.h +205 -0
- data/ext/marisa/lib/marisa/grimoire/vector/pop-count.h +110 -0
- data/ext/marisa/lib/marisa/grimoire/vector/rank-index.h +82 -0
- data/ext/marisa/lib/marisa/grimoire/vector/vector.h +256 -0
- data/ext/marisa/lib/marisa/iostream.h +18 -0
- data/ext/marisa/lib/marisa/key.h +85 -0
- data/ext/marisa/lib/marisa/keyset.cc +181 -0
- data/ext/marisa/lib/marisa/keyset.h +80 -0
- data/ext/marisa/lib/marisa/query.h +71 -0
- data/ext/marisa/lib/marisa/scoped-array.h +48 -0
- data/ext/marisa/lib/marisa/scoped-ptr.h +52 -0
- data/ext/marisa/lib/marisa/stdio.h +15 -0
- data/ext/marisa/lib/marisa/trie.cc +249 -0
- data/ext/marisa/lib/marisa/trie.h +64 -0
- data/ext/marisa/tests/base-test.cc +309 -0
- data/ext/marisa/tests/io-test.cc +252 -0
- data/ext/marisa/tests/marisa-assert.h +26 -0
- data/ext/marisa/tests/marisa-test.cc +388 -0
- data/ext/marisa/tests/trie-test.cc +507 -0
- data/ext/marisa/tests/vector-test.cc +466 -0
- data/ext/marisa/tools/cmdopt.cc +298 -0
- data/ext/marisa/tools/cmdopt.h +58 -0
- data/ext/marisa/tools/marisa-benchmark.cc +418 -0
- data/ext/marisa/tools/marisa-build.cc +206 -0
- data/ext/marisa/tools/marisa-common-prefix-search.cc +143 -0
- data/ext/marisa/tools/marisa-dump.cc +151 -0
- data/ext/marisa/tools/marisa-lookup.cc +110 -0
- data/ext/marisa/tools/marisa-predictive-search.cc +143 -0
- data/ext/marisa/tools/marisa-reverse-lookup.cc +110 -0
- data/lib/melisa.rb +7 -0
- data/lib/melisa/base_config_flags.rb +76 -0
- data/lib/melisa/bytes_trie.rb +55 -0
- data/lib/melisa/int_trie.rb +14 -0
- data/lib/melisa/search.rb +55 -0
- data/lib/melisa/trie.rb +96 -0
- data/lib/melisa/version.rb +3 -0
- data/melisa.gemspec +36 -0
- data/spec/base_config_flags_spec.rb +73 -0
- data/spec/bytes_trie_spec.rb +16 -0
- data/spec/int_trie_spec.rb +16 -0
- data/spec/search_spec.rb +29 -0
- data/spec/spec_helper.rb +1 -0
- data/spec/trie_spec.rb +30 -0
- metadata +207 -0
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
#ifndef MARISA_H_
|
|
2
|
+
#define MARISA_H_
|
|
3
|
+
|
|
4
|
+
// "marisa/stdio.h" includes <cstdio> for I/O using std::FILE.
|
|
5
|
+
#include "marisa/stdio.h"
|
|
6
|
+
|
|
7
|
+
// "marisa/iostream.h" includes <iosfwd> for I/O using std::iostream.
|
|
8
|
+
#include "marisa/iostream.h"
|
|
9
|
+
|
|
10
|
+
// You can use <marisa/trie.h> instead of <marisa.h> if you don't need the
|
|
11
|
+
// above I/O interfaces and don't want to include the above I/O headers.
|
|
12
|
+
#include "marisa/trie.h"
|
|
13
|
+
|
|
14
|
+
#endif // MARISA_H_
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
#include <new>
|
|
2
|
+
|
|
3
|
+
#include "marisa/agent.h"
|
|
4
|
+
#include "marisa/grimoire/trie.h"
|
|
5
|
+
|
|
6
|
+
namespace marisa {
|
|
7
|
+
|
|
8
|
+
// Default-constructs the query, the key and the state holder.
Agent::Agent() : query_(), key_(), state_() {}
|
|
9
|
+
|
|
10
|
+
// Nothing to release explicitly; the members clean up in their own
// destructors.
Agent::~Agent() {}
|
|
11
|
+
|
|
12
|
+
void Agent::set_query(const char *str) {
|
|
13
|
+
MARISA_THROW_IF(str == NULL, MARISA_NULL_ERROR);
|
|
14
|
+
if (state_.get() != NULL) {
|
|
15
|
+
state_->reset();
|
|
16
|
+
}
|
|
17
|
+
query_.set_str(str);
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
void Agent::set_query(const char *ptr, std::size_t length) {
|
|
21
|
+
MARISA_THROW_IF((ptr == NULL) && (length != 0), MARISA_NULL_ERROR);
|
|
22
|
+
if (state_.get() != NULL) {
|
|
23
|
+
state_->reset();
|
|
24
|
+
}
|
|
25
|
+
query_.set_str(ptr, length);
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
void Agent::set_query(std::size_t key_id) {
|
|
29
|
+
if (state_.get() != NULL) {
|
|
30
|
+
state_->reset();
|
|
31
|
+
}
|
|
32
|
+
query_.set_id(key_id);
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
// Allocates the state object for this agent.
// Throws an exception carrying MARISA_STATE_ERROR if a state already exists,
// and one carrying MARISA_MEMORY_ERROR if the allocation fails.
void Agent::init_state() {
  MARISA_THROW_IF(state_.get() != NULL, MARISA_STATE_ERROR);
  // nothrow new: an allocation failure shows up as a NULL pointer, which is
  // turned into a marisa exception below.
  state_.reset(new (std::nothrow) grimoire::State);
  MARISA_THROW_IF(state_.get() == NULL, MARISA_MEMORY_ERROR);
}
|
|
40
|
+
|
|
41
|
+
void Agent::clear() {
|
|
42
|
+
Agent().swap(*this);
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
// Exchanges the query, the key and the state of this agent with those of
// `rhs', member by member.
void Agent::swap(Agent &rhs) {
  query_.swap(rhs.query_);
  key_.swap(rhs.key_);
  state_.swap(rhs.state_);
}
|
|
50
|
+
|
|
51
|
+
} // namespace marisa
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
#ifndef MARISA_AGENT_H_
|
|
2
|
+
#define MARISA_AGENT_H_
|
|
3
|
+
|
|
4
|
+
#include "marisa/key.h"
|
|
5
|
+
#include "marisa/query.h"
|
|
6
|
+
|
|
7
|
+
namespace marisa {
|
|
8
|
+
namespace grimoire {
|
|
9
|
+
namespace trie {
|
|
10
|
+
|
|
11
|
+
class State;
|
|
12
|
+
|
|
13
|
+
} // namespace trie
|
|
14
|
+
} // namespace grimoire
|
|
15
|
+
|
|
16
|
+
// Agent holds a Query, a Key and an optionally allocated
// grimoire::trie::State.
// NOTE(review): presumably the query is the input of a search and the key
// its result -- confirm against the Trie implementation.
class Agent {
 public:
  Agent();
  ~Agent();

  // Read-only access to the stored query and key.
  const Query &query() const {
    return query_;
  }
  const Key &key() const {
    return key_;
  }

  // Replaces the query; defined in agent.cc.
  void set_query(const char *str);
  void set_query(const char *ptr, std::size_t length);
  void set_query(std::size_t key_id);

  // Access to the state object. These dereference the state pointer
  // directly, so has_state() must be true before they are called.
  const grimoire::trie::State &state() const {
    return *state_;
  }
  grimoire::trie::State &state() {
    return *state_;
  }

  // Replaces the key. The argument checks below are MARISA_DEBUG_IF checks,
  // which expand to nothing unless _DEBUG is defined.
  void set_key(const char *str) {
    MARISA_DEBUG_IF(str == NULL, MARISA_NULL_ERROR);
    key_.set_str(str);
  }
  void set_key(const char *ptr, std::size_t length) {
    MARISA_DEBUG_IF((ptr == NULL) && (length != 0), MARISA_NULL_ERROR);
    MARISA_DEBUG_IF(length > MARISA_UINT32_MAX, MARISA_SIZE_ERROR);
    key_.set_str(ptr, length);
  }
  void set_key(std::size_t id) {
    MARISA_DEBUG_IF(id > MARISA_UINT32_MAX, MARISA_SIZE_ERROR);
    key_.set_id(id);
  }

  // True once a state object has been allocated (see init_state()).
  bool has_state() const {
    return state_.get() != NULL;
  }
  void init_state();

  void clear();
  void swap(Agent &rhs);

 private:
  Query query_;
  Key key_;
  scoped_ptr<grimoire::trie::State> state_;

  // Disallows copy and assignment.
  Agent(const Agent &);
  Agent &operator=(const Agent &);
};
|
|
70
|
+
|
|
71
|
+
} // namespace marisa
|
|
72
|
+
|
|
73
|
+
#endif // MARISA_AGENT_H_
|
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
#ifndef MARISA_BASE_H_
|
|
2
|
+
#define MARISA_BASE_H_
|
|
3
|
+
|
|
4
|
+
// Old Visual C++ does not provide stdint.h.
|
|
5
|
+
#ifndef _MSC_VER
|
|
6
|
+
#include <stdint.h>
|
|
7
|
+
#endif // _MSC_VER
|
|
8
|
+
|
|
9
|
+
#ifdef __cplusplus
|
|
10
|
+
#include <cstddef>
|
|
11
|
+
#else // __cplusplus
|
|
12
|
+
#include <stddef.h>
|
|
13
|
+
#endif // __cplusplus
|
|
14
|
+
|
|
15
|
+
#ifdef __cplusplus
|
|
16
|
+
extern "C" {
|
|
17
|
+
#endif // __cplusplus
|
|
18
|
+
|
|
19
|
+
#ifdef _MSC_VER
|
|
20
|
+
typedef unsigned __int8 marisa_uint8;
|
|
21
|
+
typedef unsigned __int16 marisa_uint16;
|
|
22
|
+
typedef unsigned __int32 marisa_uint32;
|
|
23
|
+
typedef unsigned __int64 marisa_uint64;
|
|
24
|
+
#else // _MSC_VER
|
|
25
|
+
typedef uint8_t marisa_uint8;
|
|
26
|
+
typedef uint16_t marisa_uint16;
|
|
27
|
+
typedef uint32_t marisa_uint32;
|
|
28
|
+
typedef uint64_t marisa_uint64;
|
|
29
|
+
#endif // _MSC_VER
|
|
30
|
+
|
|
31
|
+
#if defined(_WIN64) || defined(__amd64__) || defined(__x86_64__) || \
|
|
32
|
+
defined(__ia64__) || defined(__ppc64__) || defined(__powerpc64__) || \
|
|
33
|
+
defined(__sparc64__) || defined(__mips64__) || defined(__aarch64__)
|
|
34
|
+
#define MARISA_WORD_SIZE 64
|
|
35
|
+
#else // defined(_WIN64), etc.
|
|
36
|
+
#define MARISA_WORD_SIZE 32
|
|
37
|
+
#endif // defined(_WIN64), etc.
|
|
38
|
+
|
|
39
|
+
//#define MARISA_WORD_SIZE (sizeof(void *) * 8)
|
|
40
|
+
|
|
41
|
+
#define MARISA_UINT8_MAX ((marisa_uint8)~(marisa_uint8)0)
|
|
42
|
+
#define MARISA_UINT16_MAX ((marisa_uint16)~(marisa_uint16)0)
|
|
43
|
+
#define MARISA_UINT32_MAX ((marisa_uint32)~(marisa_uint32)0)
|
|
44
|
+
#define MARISA_UINT64_MAX ((marisa_uint64)~(marisa_uint64)0)
|
|
45
|
+
#define MARISA_SIZE_MAX ((size_t)~(size_t)0)
|
|
46
|
+
|
|
47
|
+
#define MARISA_INVALID_LINK_ID MARISA_UINT32_MAX
|
|
48
|
+
#define MARISA_INVALID_KEY_ID MARISA_UINT32_MAX
|
|
49
|
+
#define MARISA_INVALID_EXTRA (MARISA_UINT32_MAX >> 8)
|
|
50
|
+
|
|
51
|
+
// Error codes are defined as members of marisa_error_code. This library throws
|
|
52
|
+
// an exception with one of the error codes when an error occurs.
|
|
53
|
+
typedef enum marisa_error_code_ {
  // MARISA_OK means that a requested operation has succeeded. In practice, an
  // exception never has MARISA_OK because it is not an error.
  MARISA_OK           = 0,

  // MARISA_STATE_ERROR means that an object was not ready for a requested
  // operation. For example, an operation to modify a fixed vector throws an
  // exception with MARISA_STATE_ERROR.
  MARISA_STATE_ERROR  = 1,

  // MARISA_NULL_ERROR means that an invalid NULL pointer has been given.
  MARISA_NULL_ERROR   = 2,

  // MARISA_BOUND_ERROR means that an operation has tried to access an out of
  // range address.
  MARISA_BOUND_ERROR  = 3,

  // MARISA_RANGE_ERROR means that an out of range value has appeared in
  // operation.
  MARISA_RANGE_ERROR  = 4,

  // MARISA_CODE_ERROR means that an undefined code has appeared in operation.
  MARISA_CODE_ERROR   = 5,

  // MARISA_RESET_ERROR means that a smart pointer has tried to reset itself.
  MARISA_RESET_ERROR  = 6,

  // MARISA_SIZE_ERROR means that a size has exceeded a library limitation.
  MARISA_SIZE_ERROR   = 7,

  // MARISA_MEMORY_ERROR means that a memory allocation has failed.
  MARISA_MEMORY_ERROR = 8,

  // MARISA_IO_ERROR means that an I/O operation has failed.
  MARISA_IO_ERROR     = 9,

  // MARISA_FORMAT_ERROR means that input was in invalid format.
  // (No trailing comma: this header is also meant for C89/C++03 compilers.)
  MARISA_FORMAT_ERROR = 10
} marisa_error_code;
|
|
92
|
+
|
|
93
|
+
// Min/max values, flags and masks for dictionary settings are defined below.
|
|
94
|
+
// Please note that unspecified settings will be replaced with the default
|
|
95
|
+
// settings. For example, 0 is equivalent to (MARISA_DEFAULT_NUM_TRIES |
|
|
96
|
+
// MARISA_DEFAULT_TRIE | MARISA_DEFAULT_TAIL | MARISA_DEFAULT_ORDER).
|
|
97
|
+
|
|
98
|
+
// A dictionary consists of 3 tries by default. Usually more tries make a
|
|
99
|
+
// dictionary space-efficient but time-inefficient.
|
|
100
|
+
// Allowed range and default for the number of tries in a dictionary. The
// values occupy the bits covered by MARISA_NUM_TRIES_MASK (0x0007F).
typedef enum marisa_num_tries_ {
  MARISA_MIN_NUM_TRIES     = 0x00001,
  MARISA_MAX_NUM_TRIES     = 0x0007F,
  // (No trailing comma: this header is also meant for C89/C++03 compilers.)
  MARISA_DEFAULT_NUM_TRIES = 0x00003
} marisa_num_tries;
|
|
105
|
+
|
|
106
|
+
// This library uses a cache technique to accelerate search functions. The
|
|
107
|
+
// following enumerated type marisa_cache_level gives a list of available cache
|
|
108
|
+
// size options. A larger cache enables faster search but takes more space.
|
|
109
|
+
// Cache size options. Each option is a single bit inside the range covered
// by MARISA_CACHE_LEVEL_MASK (0x00F80), written here as its bit position.
typedef enum marisa_cache_level_ {
  MARISA_HUGE_CACHE    = 1 << 7,   /* 0x00080 */
  MARISA_LARGE_CACHE   = 1 << 8,   /* 0x00100 */
  MARISA_NORMAL_CACHE  = 1 << 9,   /* 0x00200 */
  MARISA_SMALL_CACHE   = 1 << 10,  /* 0x00400 */
  MARISA_TINY_CACHE    = 1 << 11,  /* 0x00800 */
  MARISA_DEFAULT_CACHE = MARISA_NORMAL_CACHE
} marisa_cache_level;
|
|
117
|
+
|
|
118
|
+
// This library provides 2 kinds of TAIL implementations.
|
|
119
|
+
typedef enum marisa_tail_mode_ {
  // MARISA_TEXT_TAIL merges last labels as zero-terminated strings. So, it is
  // available if and only if the last labels do not contain a NUL character.
  // If MARISA_TEXT_TAIL is specified and a NUL character exists in the last
  // labels, the setting is automatically switched to MARISA_BINARY_TAIL.
  MARISA_TEXT_TAIL    = 0x01000,

  // MARISA_BINARY_TAIL also merges last labels but as byte sequences. It uses
  // a bit vector to detect the end of a sequence, instead of NUL characters.
  // So, MARISA_BINARY_TAIL requires a larger space if the average length of
  // labels is greater than 8.
  MARISA_BINARY_TAIL  = 0x02000,

  // (No trailing comma: this header is also meant for C89/C++03 compilers.)
  MARISA_DEFAULT_TAIL = MARISA_TEXT_TAIL
} marisa_tail_mode;
|
|
134
|
+
|
|
135
|
+
// The arrangement of nodes affects the time cost of matching and the order of
|
|
136
|
+
// predictive search.
|
|
137
|
+
typedef enum marisa_node_order_ {
  // MARISA_LABEL_ORDER arranges nodes in ascending label order.
  // MARISA_LABEL_ORDER is useful if an application needs to predict keys in
  // label order.
  MARISA_LABEL_ORDER   = 0x10000,

  // MARISA_WEIGHT_ORDER arranges nodes in descending weight order.
  // MARISA_WEIGHT_ORDER is generally a better choice because it enables faster
  // matching.
  MARISA_WEIGHT_ORDER  = 0x20000,

  // (No trailing comma: this header is also meant for C89/C++03 compilers.)
  MARISA_DEFAULT_ORDER = MARISA_WEIGHT_ORDER
} marisa_node_order;
|
|
150
|
+
|
|
151
|
+
// Bit masks selecting each settings field inside a configuration value.
// MARISA_CONFIG_MASK is the union of the four field masks.
typedef enum marisa_config_mask_ {
  MARISA_NUM_TRIES_MASK   = 0x7F,
  MARISA_CACHE_LEVEL_MASK = 0xF80,
  MARISA_TAIL_MODE_MASK   = 0xF000,
  MARISA_NODE_ORDER_MASK  = 0xF0000,
  MARISA_CONFIG_MASK      = MARISA_NUM_TRIES_MASK | MARISA_CACHE_LEVEL_MASK |
                            MARISA_TAIL_MODE_MASK | MARISA_NODE_ORDER_MASK
} marisa_config_mask;
|
|
158
|
+
|
|
159
|
+
#ifdef __cplusplus
|
|
160
|
+
} // extern "C"
|
|
161
|
+
#endif // __cplusplus
|
|
162
|
+
|
|
163
|
+
#ifdef __cplusplus
|
|
164
|
+
namespace marisa {
|
|
165
|
+
|
|
166
|
+
typedef ::marisa_uint8 UInt8;
|
|
167
|
+
typedef ::marisa_uint16 UInt16;
|
|
168
|
+
typedef ::marisa_uint32 UInt32;
|
|
169
|
+
typedef ::marisa_uint64 UInt64;
|
|
170
|
+
|
|
171
|
+
typedef ::marisa_error_code ErrorCode;
|
|
172
|
+
|
|
173
|
+
typedef ::marisa_cache_level CacheLevel;
|
|
174
|
+
typedef ::marisa_tail_mode TailMode;
|
|
175
|
+
typedef ::marisa_node_order NodeOrder;
|
|
176
|
+
|
|
177
|
+
// Exchanges the values of `lhs' and `rhs' through one copy construction and
// two copy assignments of T.
template <typename T>
inline void swap(T &lhs, T &rhs) {
  const T saved(lhs);
  lhs = rhs;
  rhs = saved;
}
|
|
183
|
+
|
|
184
|
+
} // namespace marisa
|
|
185
|
+
#endif // __cplusplus
|
|
186
|
+
|
|
187
|
+
#ifdef __cplusplus
|
|
188
|
+
#include "marisa/exception.h"
|
|
189
|
+
#include "marisa/scoped-ptr.h"
|
|
190
|
+
#include "marisa/scoped-array.h"
|
|
191
|
+
#endif // __cplusplus
|
|
192
|
+
|
|
193
|
+
#endif // MARISA_BASE_H_
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
#ifndef MARISA_EXCEPTION_H_
|
|
2
|
+
#define MARISA_EXCEPTION_H_
|
|
3
|
+
|
|
4
|
+
#include <exception>
|
|
5
|
+
|
|
6
|
+
#include "marisa/base.h"
|
|
7
|
+
|
|
8
|
+
namespace marisa {
|
|
9
|
+
|
|
10
|
+
// An exception object keeps a filename, a line number, an error code and an
|
|
11
|
+
// error message. The message format is as follows:
|
|
12
|
+
// "__FILE__:__LINE__: error_code: error_message"
|
|
13
|
+
class Exception : public std::exception {
|
|
14
|
+
public:
|
|
15
|
+
Exception(const char *filename, int line,
|
|
16
|
+
ErrorCode error_code, const char *error_message)
|
|
17
|
+
: std::exception(), filename_(filename), line_(line),
|
|
18
|
+
error_code_(error_code), error_message_(error_message) {}
|
|
19
|
+
Exception(const Exception &ex)
|
|
20
|
+
: std::exception(), filename_(ex.filename_), line_(ex.line_),
|
|
21
|
+
error_code_(ex.error_code_), error_message_(ex.error_message_) {}
|
|
22
|
+
virtual ~Exception() throw() {}
|
|
23
|
+
|
|
24
|
+
Exception &operator=(const Exception &rhs) {
|
|
25
|
+
filename_ = rhs.filename_;
|
|
26
|
+
line_ = rhs.line_;
|
|
27
|
+
error_code_ = rhs.error_code_;
|
|
28
|
+
error_message_ = rhs.error_message_;
|
|
29
|
+
return *this;
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
const char *filename() const {
|
|
33
|
+
return filename_;
|
|
34
|
+
}
|
|
35
|
+
int line() const {
|
|
36
|
+
return line_;
|
|
37
|
+
}
|
|
38
|
+
ErrorCode error_code() const {
|
|
39
|
+
return error_code_;
|
|
40
|
+
}
|
|
41
|
+
const char *error_message() const {
|
|
42
|
+
return error_message_;
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
virtual const char *what() const throw() {
|
|
46
|
+
return error_message_;
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
private:
|
|
50
|
+
const char *filename_;
|
|
51
|
+
int line_;
|
|
52
|
+
ErrorCode error_code_;
|
|
53
|
+
const char *error_message_;
|
|
54
|
+
};
|
|
55
|
+
|
|
56
|
+
// These macros are used to convert a line number to a string constant.
|
|
57
|
+
#define MARISA_INT_TO_STR(value) #value
|
|
58
|
+
#define MARISA_LINE_TO_STR(line) MARISA_INT_TO_STR(line)
|
|
59
|
+
#define MARISA_LINE_STR MARISA_LINE_TO_STR(__LINE__)
|
|
60
|
+
|
|
61
|
+
// MARISA_THROW throws an exception with a filename, a line number, an error
|
|
62
|
+
// code and an error message. The message format is as follows:
|
|
63
|
+
// "__FILE__:__LINE__: error_code: error_message"
|
|
64
|
+
#define MARISA_THROW(error_code, error_message) \
|
|
65
|
+
(throw marisa::Exception(__FILE__, __LINE__, error_code, \
|
|
66
|
+
__FILE__ ":" MARISA_LINE_STR ": " #error_code ": " error_message))
|
|
67
|
+
|
|
68
|
+
// MARISA_THROW_IF throws an exception if `condition' is true.
|
|
69
|
+
#define MARISA_THROW_IF(condition, error_code) \
|
|
70
|
+
(void)((!(condition)) || (MARISA_THROW(error_code, #condition), 0))
|
|
71
|
+
|
|
72
|
+
// MARISA_DEBUG_IF is ignored if _DEBUG is undefined. So, it is useful for
|
|
73
|
+
// debugging time-critical codes.
|
|
74
|
+
#ifdef _DEBUG
|
|
75
|
+
#define MARISA_DEBUG_IF(cond, error_code) MARISA_THROW_IF(cond, error_code)
|
|
76
|
+
#else
|
|
77
|
+
#define MARISA_DEBUG_IF(cond, error_code)
|
|
78
|
+
#endif
|
|
79
|
+
|
|
80
|
+
} // namespace marisa
|
|
81
|
+
|
|
82
|
+
#endif // MARISA_EXCEPTION_H_
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
#ifndef MARISA_GRIMOIRE_ALGORITHM_H_
|
|
2
|
+
#define MARISA_GRIMOIRE_ALGORITHM_H_
|
|
3
|
+
|
|
4
|
+
#include "marisa/grimoire/algorithm/sort.h"
|
|
5
|
+
|
|
6
|
+
namespace marisa {
|
|
7
|
+
namespace grimoire {
|
|
8
|
+
|
|
9
|
+
// Algorithm is a stateless front end (it has no data members) for the
// sorting routine in marisa::grimoire::algorithm.
class Algorithm {
 public:
  Algorithm() {}

  // Forwards to algorithm::sort(begin, end) and returns its result.
  template <typename Iterator>
  std::size_t sort(Iterator begin, Iterator end) const {
    return algorithm::sort(begin, end);
  }

 private:
  // Disallows copy and assignment.
  Algorithm(const Algorithm &);
  Algorithm &operator=(const Algorithm &);
};
|
|
22
|
+
|
|
23
|
+
} // namespace grimoire
|
|
24
|
+
} // namespace marisa
|
|
25
|
+
|
|
26
|
+
#endif // MARISA_GRIMOIRE_ALGORITHM_H_
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
#ifndef MARISA_GRIMOIRE_ALGORITHM_SORT_H_
|
|
2
|
+
#define MARISA_GRIMOIRE_ALGORITHM_SORT_H_
|
|
3
|
+
|
|
4
|
+
#include "marisa/base.h"
|
|
5
|
+
|
|
6
|
+
namespace marisa {
|
|
7
|
+
namespace grimoire {
|
|
8
|
+
namespace algorithm {
|
|
9
|
+
namespace details {
|
|
10
|
+
|
|
11
|
+
// Tuning constant for details::sort(): a range is partitioned further only
// while it holds more than this many elements; what remains is finished by
// insertion_sort().
enum {
  MARISA_INSERTION_SORT_THRESHOLD = 10
};
|
|
14
|
+
|
|
15
|
+
template <typename T>
|
|
16
|
+
int get_label(const T &unit, std::size_t depth) {
|
|
17
|
+
MARISA_DEBUG_IF(depth > unit.length(), MARISA_BOUND_ERROR);
|
|
18
|
+
|
|
19
|
+
return (depth < unit.length()) ? (int)(UInt8)unit[depth] : -1;
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
// Returns the median of the three labels that `a', `b' and `c' have at
// position `depth' (see get_label(); -1 stands for "no label left").
template <typename T>
int median(const T &a, const T &b, const T &c, std::size_t depth) {
  const int x = get_label(a, depth);
  const int y = get_label(b, depth);
  const int z = get_label(c, depth);

  if (x < y) {
    if (y < z) {
      return y;                 // x < y < z
    }
    return (x < z) ? z : x;     // z <= y, so the median is max(x, z)
  }
  // Here y <= x.
  if (x < z) {
    return x;                   // y <= x < z
  }
  return (y < z) ? z : y;       // z <= x, so the median is max(y, z)
}
|
|
41
|
+
|
|
42
|
+
template <typename T>
|
|
43
|
+
int compare(const T &lhs, const T &rhs, std::size_t depth) {
|
|
44
|
+
for (std::size_t i = depth; i < lhs.length(); ++i) {
|
|
45
|
+
if (i == rhs.length()) {
|
|
46
|
+
return 1;
|
|
47
|
+
}
|
|
48
|
+
if (lhs[i] != rhs[i]) {
|
|
49
|
+
return (UInt8)lhs[i] - (UInt8)rhs[i];
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
if (lhs.length() == rhs.length()) {
|
|
53
|
+
return 0;
|
|
54
|
+
}
|
|
55
|
+
return (lhs.length() < rhs.length()) ? -1 : 1;
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
template <typename Iterator>
|
|
59
|
+
std::size_t insertion_sort(Iterator l, Iterator r, std::size_t depth) {
|
|
60
|
+
MARISA_DEBUG_IF(l > r, MARISA_BOUND_ERROR);
|
|
61
|
+
|
|
62
|
+
std::size_t count = 1;
|
|
63
|
+
for (Iterator i = l + 1; i < r; ++i) {
|
|
64
|
+
int result = 0;
|
|
65
|
+
for (Iterator j = i; j > l; --j) {
|
|
66
|
+
result = compare(*(j - 1), *j, depth);
|
|
67
|
+
if (result <= 0) {
|
|
68
|
+
break;
|
|
69
|
+
}
|
|
70
|
+
marisa::swap(*(j - 1), *j);
|
|
71
|
+
}
|
|
72
|
+
if (result != 0) {
|
|
73
|
+
++count;
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
return count;
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
// Sorts [l, r) in place, looking at the elements from position `depth' on,
// and returns a count that is increased once for every group of size one,
// once for every group whose elements all have no label left, and by the
// result of insertion_sort() for small ranges.
//
// Each round partitions the range three ways by the label at `depth'
// (smaller than / equal to / greater than a median-of-three pivot), in the
// manner of a multikey quicksort. Two of the three groups are handled by
// recursive calls and the third by the next iteration of the outer loop.
// Ranges of at most MARISA_INSERTION_SORT_THRESHOLD elements are finished
// with insertion_sort(). A range of fewer than two elements is left alone
// and contributes 0.
template <typename Iterator>
std::size_t sort(Iterator l, Iterator r, std::size_t depth) {
  MARISA_DEBUG_IF(l > r, MARISA_BOUND_ERROR);

  std::size_t count = 0;
  while ((r - l) > MARISA_INSERTION_SORT_THRESHOLD) {
    // pl/pr scan inwards from both ends. Elements equal to the pivot are
    // parked at the outer ends: [l, pivot_l) on the left and [pivot_r, r)
    // on the right.
    Iterator pl = l;
    Iterator pr = r;
    Iterator pivot_l = l;
    Iterator pivot_r = r;

    const int pivot = median(*l, *(l + (r - l) / 2), *(r - 1), depth);
    for ( ; ; ) {
      // Advance pl over labels <= pivot, parking the equal ones on the left.
      while (pl < pr) {
        const int label = get_label(*pl, depth);
        if (label > pivot) {
          break;
        } else if (label == pivot) {
          marisa::swap(*pl, *pivot_l);
          ++pivot_l;
        }
        ++pl;
      }
      // Retreat pr over labels >= pivot, parking the equal ones on the right.
      while (pl < pr) {
        const int label = get_label(*--pr, depth);
        if (label < pivot) {
          break;
        } else if (label == pivot) {
          marisa::swap(*pr, *--pivot_r);
        }
      }
      if (pl >= pr) {
        break;
      }
      // *pl is greater and *pr is smaller than the pivot: exchange them.
      marisa::swap(*pl, *pr);
      ++pl;
    }
    // Move the parked elements into the middle. Afterwards [l, pl) holds the
    // smaller labels, [pl, pr) the labels equal to the pivot and [pr, r) the
    // greater ones.
    while (pivot_l > l) {
      marisa::swap(*--pivot_l, *--pl);
    }
    while (pivot_r < r) {
      marisa::swap(*pivot_r, *pr);
      ++pivot_r;
      ++pr;
    }

    if (((pl - l) > (pr - pl)) || ((r - pr) > (pr - pl))) {
      // The equal group is not the largest one: finish it now.
      if ((pr - pl) == 1) {
        ++count;
      } else if ((pr - pl) > 1) {
        if (pivot == -1) {
          // No element of the group has a label at this depth, so the group
          // is counted once and not refined further.
          ++count;
        } else {
          count += sort(pl, pr, depth + 1);
        }
      }

      // Recurse into the smaller outer group and keep looping on the other.
      if ((pl - l) < (r - pr)) {
        if ((pl - l) == 1) {
          ++count;
        } else if ((pl - l) > 1) {
          count += sort(l, pl, depth);
        }
        l = pr;
      } else {
        if ((r - pr) == 1) {
          ++count;
        } else if ((r - pr) > 1) {
          count += sort(pr, r, depth);
        }
        r = pl;
      }
    } else {
      // The equal group is at least as large as both outer groups: recurse
      // into the outer groups and keep looping on the equal group.
      if ((pl - l) == 1) {
        ++count;
      } else if ((pl - l) > 1) {
        count += sort(l, pl, depth);
      }

      if ((r - pr) == 1) {
        ++count;
      } else if ((r - pr) > 1) {
        count += sort(pr, r, depth);
      }

      l = pl, r = pr;
      if ((pr - pl) == 1) {
        ++count;
      } else if ((pr - pl) > 1) {
        if (pivot == -1) {
          // Nothing left to compare: empty the range to end the loop.
          l = r;
          ++count;
        } else {
          ++depth;
        }
      }
    }
  }

  if ((r - l) > 1) {
    count += insertion_sort(l, r, depth);
  }
  return count;
}
|
|
183
|
+
|
|
184
|
+
} // namespace details
|
|
185
|
+
|
|
186
|
+
template <typename Iterator>
|
|
187
|
+
std::size_t sort(Iterator begin, Iterator end) {
|
|
188
|
+
MARISA_DEBUG_IF(begin > end, MARISA_BOUND_ERROR);
|
|
189
|
+
return details::sort(begin, end, 0);
|
|
190
|
+
};
|
|
191
|
+
|
|
192
|
+
} // namespace algorithm
|
|
193
|
+
} // namespace grimoire
|
|
194
|
+
} // namespace marisa
|
|
195
|
+
|
|
196
|
+
#endif // MARISA_GRIMOIRE_ALGORITHM_SORT_H_
|