melisa 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +11 -0
- data/ext/marisa/bindings/marisa-swig.cxx +253 -0
- data/ext/marisa/bindings/marisa-swig.h +183 -0
- data/ext/marisa/bindings/perl/marisa-swig.cxx +253 -0
- data/ext/marisa/bindings/perl/marisa-swig.h +183 -0
- data/ext/marisa/bindings/perl/marisa-swig_wrap.cxx +5160 -0
- data/ext/marisa/bindings/python/marisa-swig.cxx +253 -0
- data/ext/marisa/bindings/python/marisa-swig.h +183 -0
- data/ext/marisa/bindings/python/marisa-swig_wrap.cxx +6090 -0
- data/ext/marisa/bindings/ruby/extconf.rb +5 -0
- data/ext/marisa/bindings/ruby/marisa-swig.cxx +253 -0
- data/ext/marisa/bindings/ruby/marisa-swig.h +183 -0
- data/ext/marisa/bindings/ruby/marisa-swig_wrap.cxx +4708 -0
- data/ext/marisa/lib/marisa.h +14 -0
- data/ext/marisa/lib/marisa/agent.cc +51 -0
- data/ext/marisa/lib/marisa/agent.h +73 -0
- data/ext/marisa/lib/marisa/base.h +193 -0
- data/ext/marisa/lib/marisa/exception.h +82 -0
- data/ext/marisa/lib/marisa/grimoire/algorithm.h +26 -0
- data/ext/marisa/lib/marisa/grimoire/algorithm/sort.h +196 -0
- data/ext/marisa/lib/marisa/grimoire/intrin.h +115 -0
- data/ext/marisa/lib/marisa/grimoire/io.h +18 -0
- data/ext/marisa/lib/marisa/grimoire/io/mapper.cc +163 -0
- data/ext/marisa/lib/marisa/grimoire/io/mapper.h +67 -0
- data/ext/marisa/lib/marisa/grimoire/io/reader.cc +147 -0
- data/ext/marisa/lib/marisa/grimoire/io/reader.h +66 -0
- data/ext/marisa/lib/marisa/grimoire/io/writer.cc +148 -0
- data/ext/marisa/lib/marisa/grimoire/io/writer.h +65 -0
- data/ext/marisa/lib/marisa/grimoire/trie.h +16 -0
- data/ext/marisa/lib/marisa/grimoire/trie/cache.h +81 -0
- data/ext/marisa/lib/marisa/grimoire/trie/config.h +155 -0
- data/ext/marisa/lib/marisa/grimoire/trie/entry.h +82 -0
- data/ext/marisa/lib/marisa/grimoire/trie/header.h +61 -0
- data/ext/marisa/lib/marisa/grimoire/trie/history.h +65 -0
- data/ext/marisa/lib/marisa/grimoire/trie/key.h +228 -0
- data/ext/marisa/lib/marisa/grimoire/trie/louds-trie.cc +876 -0
- data/ext/marisa/lib/marisa/grimoire/trie/louds-trie.h +134 -0
- data/ext/marisa/lib/marisa/grimoire/trie/range.h +115 -0
- data/ext/marisa/lib/marisa/grimoire/trie/state.h +117 -0
- data/ext/marisa/lib/marisa/grimoire/trie/tail.cc +218 -0
- data/ext/marisa/lib/marisa/grimoire/trie/tail.h +72 -0
- data/ext/marisa/lib/marisa/grimoire/vector.h +18 -0
- data/ext/marisa/lib/marisa/grimoire/vector/bit-vector.cc +826 -0
- data/ext/marisa/lib/marisa/grimoire/vector/bit-vector.h +179 -0
- data/ext/marisa/lib/marisa/grimoire/vector/flat-vector.h +205 -0
- data/ext/marisa/lib/marisa/grimoire/vector/pop-count.h +110 -0
- data/ext/marisa/lib/marisa/grimoire/vector/rank-index.h +82 -0
- data/ext/marisa/lib/marisa/grimoire/vector/vector.h +256 -0
- data/ext/marisa/lib/marisa/iostream.h +18 -0
- data/ext/marisa/lib/marisa/key.h +85 -0
- data/ext/marisa/lib/marisa/keyset.cc +181 -0
- data/ext/marisa/lib/marisa/keyset.h +80 -0
- data/ext/marisa/lib/marisa/query.h +71 -0
- data/ext/marisa/lib/marisa/scoped-array.h +48 -0
- data/ext/marisa/lib/marisa/scoped-ptr.h +52 -0
- data/ext/marisa/lib/marisa/stdio.h +15 -0
- data/ext/marisa/lib/marisa/trie.cc +249 -0
- data/ext/marisa/lib/marisa/trie.h +64 -0
- data/ext/marisa/tests/base-test.cc +309 -0
- data/ext/marisa/tests/io-test.cc +252 -0
- data/ext/marisa/tests/marisa-assert.h +26 -0
- data/ext/marisa/tests/marisa-test.cc +388 -0
- data/ext/marisa/tests/trie-test.cc +507 -0
- data/ext/marisa/tests/vector-test.cc +466 -0
- data/ext/marisa/tools/cmdopt.cc +298 -0
- data/ext/marisa/tools/cmdopt.h +58 -0
- data/ext/marisa/tools/marisa-benchmark.cc +418 -0
- data/ext/marisa/tools/marisa-build.cc +206 -0
- data/ext/marisa/tools/marisa-common-prefix-search.cc +143 -0
- data/ext/marisa/tools/marisa-dump.cc +151 -0
- data/ext/marisa/tools/marisa-lookup.cc +110 -0
- data/ext/marisa/tools/marisa-predictive-search.cc +143 -0
- data/ext/marisa/tools/marisa-reverse-lookup.cc +110 -0
- data/lib/melisa.rb +7 -0
- data/lib/melisa/base_config_flags.rb +76 -0
- data/lib/melisa/bytes_trie.rb +55 -0
- data/lib/melisa/int_trie.rb +14 -0
- data/lib/melisa/search.rb +55 -0
- data/lib/melisa/trie.rb +96 -0
- data/lib/melisa/version.rb +3 -0
- data/melisa.gemspec +36 -0
- data/spec/base_config_flags_spec.rb +73 -0
- data/spec/bytes_trie_spec.rb +16 -0
- data/spec/int_trie_spec.rb +16 -0
- data/spec/search_spec.rb +29 -0
- data/spec/spec_helper.rb +1 -0
- data/spec/trie_spec.rb +30 -0
- metadata +207 -0
data/README.md
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
Features
|
2
|
+
--------
|
3
|
+
- fast search for exact strings and prefixes
|
4
|
+
- has a BytesTrie that can be used to store binary data
|
5
|
+
- has an IntTrie that can be used to store integer values easily
|
6
|
+
|
7
|
+
TODO
|
8
|
+
----
|
9
|
+
- Ruby bindings aren't yet built into the gem
|
10
|
+
|
11
|
+
License: MIT
|
@@ -0,0 +1,253 @@
|
|
1
|
+
#include <cstring>
|
2
|
+
#include <new>
|
3
|
+
|
4
|
+
#include "marisa-swig.h"
|
5
|
+
|
6
|
+
namespace marisa_swig {
|
7
|
+
|
8
|
+
void Key::str(const char **ptr_out, size_t *length_out) const {
|
9
|
+
*ptr_out = key_.ptr();
|
10
|
+
*length_out = key_.length();
|
11
|
+
}
|
12
|
+
|
13
|
+
size_t Key::id() const {
|
14
|
+
return key_.id();
|
15
|
+
}
|
16
|
+
|
17
|
+
float Key::weight() const {
|
18
|
+
return key_.weight();
|
19
|
+
}
|
20
|
+
|
21
|
+
void Query::str(const char **ptr_out, size_t *length_out) const {
|
22
|
+
*ptr_out = query_.ptr();
|
23
|
+
*length_out = query_.length();
|
24
|
+
}
|
25
|
+
|
26
|
+
size_t Query::id() const {
|
27
|
+
return query_.id();
|
28
|
+
}
|
29
|
+
|
30
|
+
Keyset::Keyset() : keyset_(new (std::nothrow) marisa::Keyset) {
|
31
|
+
MARISA_THROW_IF(keyset_ == NULL, ::MARISA_MEMORY_ERROR);
|
32
|
+
}
|
33
|
+
|
34
|
+
Keyset::~Keyset() {
|
35
|
+
delete keyset_;
|
36
|
+
}
|
37
|
+
|
38
|
+
void Keyset::push_back(const marisa::Key &key) {
|
39
|
+
keyset_->push_back(key);
|
40
|
+
}
|
41
|
+
|
42
|
+
void Keyset::push_back(const char *ptr, size_t length, float weight) {
|
43
|
+
keyset_->push_back(ptr, length, weight);
|
44
|
+
}
|
45
|
+
|
46
|
+
const Key &Keyset::key(size_t i) const {
|
47
|
+
return reinterpret_cast<const Key &>((*keyset_)[i]);
|
48
|
+
}
|
49
|
+
|
50
|
+
void Keyset::key_str(size_t i,
|
51
|
+
const char **ptr_out, size_t *length_out) const {
|
52
|
+
*ptr_out = (*keyset_)[i].ptr();
|
53
|
+
*length_out = (*keyset_)[i].length();
|
54
|
+
}
|
55
|
+
|
56
|
+
size_t Keyset::key_id(size_t i) const {
|
57
|
+
return (*keyset_)[i].id();
|
58
|
+
}
|
59
|
+
|
60
|
+
size_t Keyset::num_keys() const {
|
61
|
+
return keyset_->num_keys();
|
62
|
+
}
|
63
|
+
|
64
|
+
bool Keyset::empty() const {
|
65
|
+
return keyset_->empty();
|
66
|
+
}
|
67
|
+
|
68
|
+
size_t Keyset::size() const {
|
69
|
+
return keyset_->size();
|
70
|
+
}
|
71
|
+
|
72
|
+
size_t Keyset::total_length() const {
|
73
|
+
return keyset_->total_length();
|
74
|
+
}
|
75
|
+
|
76
|
+
void Keyset::reset() {
|
77
|
+
keyset_->reset();
|
78
|
+
}
|
79
|
+
|
80
|
+
void Keyset::clear() {
|
81
|
+
keyset_->clear();
|
82
|
+
}
|
83
|
+
|
84
|
+
Agent::Agent()
|
85
|
+
: agent_(new (std::nothrow) marisa::Agent), buf_(NULL), buf_size_(0) {
|
86
|
+
MARISA_THROW_IF(agent_ == NULL, ::MARISA_MEMORY_ERROR);
|
87
|
+
}
|
88
|
+
|
89
|
+
Agent::~Agent() {
|
90
|
+
delete agent_;
|
91
|
+
delete [] buf_;
|
92
|
+
}
|
93
|
+
|
94
|
+
void Agent::set_query(const char *ptr, size_t length) {
|
95
|
+
if (length > buf_size_) {
|
96
|
+
size_t new_buf_size = (buf_size_ != 0) ? buf_size_ : 1;
|
97
|
+
if (length >= (MARISA_SIZE_MAX / 2)) {
|
98
|
+
new_buf_size = MARISA_SIZE_MAX;
|
99
|
+
} else {
|
100
|
+
while (new_buf_size < length) {
|
101
|
+
new_buf_size *= 2;
|
102
|
+
}
|
103
|
+
}
|
104
|
+
char *new_buf = new (std::nothrow) char[new_buf_size];
|
105
|
+
MARISA_THROW_IF(new_buf == NULL, MARISA_MEMORY_ERROR);
|
106
|
+
delete [] buf_;
|
107
|
+
buf_ = new_buf;
|
108
|
+
buf_size_ = new_buf_size;
|
109
|
+
}
|
110
|
+
std::memcpy(buf_, ptr, length);
|
111
|
+
agent_->set_query(buf_, length);
|
112
|
+
}
|
113
|
+
|
114
|
+
void Agent::set_query(size_t id) {
|
115
|
+
agent_->set_query(id);
|
116
|
+
}
|
117
|
+
|
118
|
+
const Key &Agent::key() const {
|
119
|
+
return reinterpret_cast<const Key &>(agent_->key());
|
120
|
+
}
|
121
|
+
|
122
|
+
const Query &Agent::query() const {
|
123
|
+
return reinterpret_cast<const Query &>(agent_->query());
|
124
|
+
}
|
125
|
+
|
126
|
+
void Agent::key_str(const char **ptr_out, size_t *length_out) const {
|
127
|
+
*ptr_out = agent_->key().ptr();
|
128
|
+
*length_out = agent_->key().length();
|
129
|
+
}
|
130
|
+
|
131
|
+
size_t Agent::key_id() const {
|
132
|
+
return agent_->key().id();
|
133
|
+
}
|
134
|
+
|
135
|
+
void Agent::query_str(const char **ptr_out, size_t *length_out) const {
|
136
|
+
*ptr_out = agent_->query().ptr();
|
137
|
+
*length_out = agent_->query().length();
|
138
|
+
}
|
139
|
+
|
140
|
+
size_t Agent::query_id() const {
|
141
|
+
return agent_->query().id();
|
142
|
+
}
|
143
|
+
|
144
|
+
Trie::Trie() : trie_(new (std::nothrow) marisa::Trie) {
|
145
|
+
MARISA_THROW_IF(trie_ == NULL, ::MARISA_MEMORY_ERROR);
|
146
|
+
}
|
147
|
+
|
148
|
+
Trie::~Trie() {
|
149
|
+
delete trie_;
|
150
|
+
}
|
151
|
+
|
152
|
+
void Trie::build(Keyset &keyset, int config_flags) {
|
153
|
+
trie_->build(*keyset.keyset_, config_flags);
|
154
|
+
}
|
155
|
+
|
156
|
+
void Trie::mmap(const char *filename) {
|
157
|
+
trie_->mmap(filename);
|
158
|
+
}
|
159
|
+
|
160
|
+
void Trie::load(const char *filename) {
|
161
|
+
trie_->load(filename);
|
162
|
+
}
|
163
|
+
|
164
|
+
void Trie::save(const char *filename) const {
|
165
|
+
trie_->save(filename);
|
166
|
+
}
|
167
|
+
|
168
|
+
bool Trie::lookup(Agent &agent) const {
|
169
|
+
return trie_->lookup(*agent.agent_);
|
170
|
+
}
|
171
|
+
|
172
|
+
void Trie::reverse_lookup(Agent &agent) const {
|
173
|
+
trie_->reverse_lookup(*agent.agent_);
|
174
|
+
}
|
175
|
+
|
176
|
+
bool Trie::common_prefix_search(Agent &agent) const {
|
177
|
+
return trie_->common_prefix_search(*agent.agent_);
|
178
|
+
}
|
179
|
+
|
180
|
+
bool Trie::predictive_search(Agent &agent) const {
|
181
|
+
return trie_->predictive_search(*agent.agent_);
|
182
|
+
}
|
183
|
+
|
184
|
+
size_t Trie::lookup(const char *ptr, size_t length) const {
|
185
|
+
marisa::Agent agent;
|
186
|
+
agent.set_query(ptr, length);
|
187
|
+
if (!trie_->lookup(agent)) {
|
188
|
+
return MARISA_INVALID_KEY_ID;
|
189
|
+
}
|
190
|
+
return agent.key().id();
|
191
|
+
}
|
192
|
+
|
193
|
+
void Trie::reverse_lookup(size_t id,
|
194
|
+
const char **ptr_out_to_be_deleted, size_t *length_out) const {
|
195
|
+
marisa::Agent agent;
|
196
|
+
agent.set_query(id);
|
197
|
+
trie_->reverse_lookup(agent);
|
198
|
+
char * const buf = new (std::nothrow) char[agent.key().length()];
|
199
|
+
MARISA_THROW_IF(buf == NULL, MARISA_MEMORY_ERROR);
|
200
|
+
std::memcpy(buf, agent.key().ptr(), agent.key().length());
|
201
|
+
*ptr_out_to_be_deleted = buf;
|
202
|
+
*length_out = agent.key().length();
|
203
|
+
}
|
204
|
+
|
205
|
+
size_t Trie::num_tries() const {
|
206
|
+
return trie_->num_tries();
|
207
|
+
}
|
208
|
+
|
209
|
+
size_t Trie::num_keys() const {
|
210
|
+
return trie_->num_keys();
|
211
|
+
}
|
212
|
+
|
213
|
+
size_t Trie::num_nodes() const {
|
214
|
+
return trie_->num_nodes();
|
215
|
+
}
|
216
|
+
|
217
|
+
TailMode Trie::tail_mode() const {
|
218
|
+
if (trie_->tail_mode() == ::MARISA_TEXT_TAIL) {
|
219
|
+
return TEXT_TAIL;
|
220
|
+
} else {
|
221
|
+
return BINARY_TAIL;
|
222
|
+
}
|
223
|
+
}
|
224
|
+
|
225
|
+
NodeOrder Trie::node_order() const {
|
226
|
+
if (trie_->node_order() == ::MARISA_LABEL_ORDER) {
|
227
|
+
return LABEL_ORDER;
|
228
|
+
} else {
|
229
|
+
return WEIGHT_ORDER;
|
230
|
+
}
|
231
|
+
}
|
232
|
+
|
233
|
+
bool Trie::empty() const {
|
234
|
+
return trie_->empty();
|
235
|
+
}
|
236
|
+
|
237
|
+
size_t Trie::size() const {
|
238
|
+
return trie_->size();
|
239
|
+
}
|
240
|
+
|
241
|
+
size_t Trie::total_size() const {
|
242
|
+
return trie_->total_size();
|
243
|
+
}
|
244
|
+
|
245
|
+
size_t Trie::io_size() const {
|
246
|
+
return trie_->io_size();
|
247
|
+
}
|
248
|
+
|
249
|
+
void Trie::clear() {
|
250
|
+
trie_->clear();
|
251
|
+
}
|
252
|
+
|
253
|
+
} // namespace marisa_swig
|
@@ -0,0 +1,183 @@
|
|
1
|
+
#ifndef MARISA_SWIG_H_
|
2
|
+
#define MARISA_SWIG_H_
|
3
|
+
|
4
|
+
#include <marisa.h>
|
5
|
+
|
6
|
+
namespace marisa_swig {
|
7
|
+
|
8
|
+
#define MARISA_SWIG_ENUM_COPY(name) name = MARISA_ ## name
|
9
|
+
|
10
|
+
enum ErrorCode {
|
11
|
+
MARISA_SWIG_ENUM_COPY(OK),
|
12
|
+
MARISA_SWIG_ENUM_COPY(STATE_ERROR),
|
13
|
+
MARISA_SWIG_ENUM_COPY(NULL_ERROR),
|
14
|
+
MARISA_SWIG_ENUM_COPY(BOUND_ERROR),
|
15
|
+
MARISA_SWIG_ENUM_COPY(RANGE_ERROR),
|
16
|
+
MARISA_SWIG_ENUM_COPY(CODE_ERROR),
|
17
|
+
MARISA_SWIG_ENUM_COPY(RESET_ERROR),
|
18
|
+
MARISA_SWIG_ENUM_COPY(SIZE_ERROR),
|
19
|
+
MARISA_SWIG_ENUM_COPY(MEMORY_ERROR),
|
20
|
+
MARISA_SWIG_ENUM_COPY(IO_ERROR),
|
21
|
+
MARISA_SWIG_ENUM_COPY(FORMAT_ERROR)
|
22
|
+
};
|
23
|
+
|
24
|
+
enum NumTries {
|
25
|
+
MARISA_SWIG_ENUM_COPY(MIN_NUM_TRIES),
|
26
|
+
MARISA_SWIG_ENUM_COPY(MAX_NUM_TRIES),
|
27
|
+
MARISA_SWIG_ENUM_COPY(DEFAULT_NUM_TRIES)
|
28
|
+
};
|
29
|
+
|
30
|
+
enum CacheLevel {
|
31
|
+
MARISA_SWIG_ENUM_COPY(HUGE_CACHE),
|
32
|
+
MARISA_SWIG_ENUM_COPY(LARGE_CACHE),
|
33
|
+
MARISA_SWIG_ENUM_COPY(NORMAL_CACHE),
|
34
|
+
MARISA_SWIG_ENUM_COPY(SMALL_CACHE),
|
35
|
+
MARISA_SWIG_ENUM_COPY(TINY_CACHE),
|
36
|
+
MARISA_SWIG_ENUM_COPY(DEFAULT_CACHE)
|
37
|
+
};
|
38
|
+
|
39
|
+
enum TailMode {
|
40
|
+
MARISA_SWIG_ENUM_COPY(TEXT_TAIL),
|
41
|
+
MARISA_SWIG_ENUM_COPY(BINARY_TAIL),
|
42
|
+
MARISA_SWIG_ENUM_COPY(DEFAULT_TAIL)
|
43
|
+
};
|
44
|
+
|
45
|
+
enum NodeOrder {
|
46
|
+
MARISA_SWIG_ENUM_COPY(LABEL_ORDER),
|
47
|
+
MARISA_SWIG_ENUM_COPY(WEIGHT_ORDER),
|
48
|
+
MARISA_SWIG_ENUM_COPY(DEFAULT_ORDER)
|
49
|
+
};
|
50
|
+
|
51
|
+
#undef MARISA_SWIG_ENUM_COPY
|
52
|
+
|
53
|
+
class Key {
|
54
|
+
public:
|
55
|
+
void str(const char **ptr_out, std::size_t *length_out) const;
|
56
|
+
std::size_t id() const;
|
57
|
+
float weight() const;
|
58
|
+
|
59
|
+
private:
|
60
|
+
const marisa::Key key_;
|
61
|
+
|
62
|
+
Key();
|
63
|
+
Key(const Key &key);
|
64
|
+
Key &operator=(const Key &);
|
65
|
+
};
|
66
|
+
|
67
|
+
class Query {
|
68
|
+
public:
|
69
|
+
void str(const char **ptr_out, std::size_t *length_out) const;
|
70
|
+
std::size_t id() const;
|
71
|
+
|
72
|
+
private:
|
73
|
+
const marisa::Query query_;
|
74
|
+
|
75
|
+
Query();
|
76
|
+
Query(const Query &query);
|
77
|
+
Query &operator=(const Query &);
|
78
|
+
};
|
79
|
+
|
80
|
+
class Keyset {
|
81
|
+
friend class Trie;
|
82
|
+
|
83
|
+
public:
|
84
|
+
Keyset();
|
85
|
+
~Keyset();
|
86
|
+
|
87
|
+
void push_back(const marisa::Key &key);
|
88
|
+
void push_back(const char *ptr, std::size_t length, float weight = 1.0);
|
89
|
+
|
90
|
+
const Key &key(std::size_t i) const;
|
91
|
+
|
92
|
+
void key_str(std::size_t i,
|
93
|
+
const char **ptr_out, std::size_t *length_out) const;
|
94
|
+
std::size_t key_id(std::size_t i) const;
|
95
|
+
|
96
|
+
std::size_t num_keys() const;
|
97
|
+
|
98
|
+
bool empty() const;
|
99
|
+
std::size_t size() const;
|
100
|
+
std::size_t total_length() const;
|
101
|
+
|
102
|
+
void reset();
|
103
|
+
void clear();
|
104
|
+
|
105
|
+
private:
|
106
|
+
marisa::Keyset *keyset_;
|
107
|
+
|
108
|
+
Keyset(const Keyset &);
|
109
|
+
Keyset &operator=(const Keyset &);
|
110
|
+
};
|
111
|
+
|
112
|
+
class Agent {
|
113
|
+
friend class Trie;
|
114
|
+
|
115
|
+
public:
|
116
|
+
Agent();
|
117
|
+
~Agent();
|
118
|
+
|
119
|
+
void set_query(const char *ptr, std::size_t length);
|
120
|
+
void set_query(std::size_t id);
|
121
|
+
|
122
|
+
const Key &key() const;
|
123
|
+
const Query &query() const;
|
124
|
+
|
125
|
+
void key_str(const char **ptr_out, std::size_t *length_out) const;
|
126
|
+
std::size_t key_id() const;
|
127
|
+
|
128
|
+
void query_str(const char **ptr_out, std::size_t *length_out) const;
|
129
|
+
std::size_t query_id() const;
|
130
|
+
|
131
|
+
private:
|
132
|
+
marisa::Agent *agent_;
|
133
|
+
char *buf_;
|
134
|
+
std::size_t buf_size_;
|
135
|
+
|
136
|
+
Agent(const Agent &);
|
137
|
+
Agent &operator=(const Agent &);
|
138
|
+
};
|
139
|
+
|
140
|
+
class Trie {
|
141
|
+
public:
|
142
|
+
Trie();
|
143
|
+
~Trie();
|
144
|
+
|
145
|
+
void build(Keyset &keyset, int config_flags = 0);
|
146
|
+
|
147
|
+
void mmap(const char *filename);
|
148
|
+
void load(const char *filename);
|
149
|
+
void save(const char *filename) const;
|
150
|
+
|
151
|
+
bool lookup(Agent &agent) const;
|
152
|
+
void reverse_lookup(Agent &agent) const;
|
153
|
+
bool common_prefix_search(Agent &agent) const;
|
154
|
+
bool predictive_search(Agent &agent) const;
|
155
|
+
|
156
|
+
std::size_t lookup(const char *ptr, std::size_t length) const;
|
157
|
+
void reverse_lookup(std::size_t id,
|
158
|
+
const char **ptr_out_to_be_deleted, std::size_t *length_out) const;
|
159
|
+
|
160
|
+
std::size_t num_tries() const;
|
161
|
+
std::size_t num_keys() const;
|
162
|
+
std::size_t num_nodes() const;
|
163
|
+
|
164
|
+
TailMode tail_mode() const;
|
165
|
+
NodeOrder node_order() const;
|
166
|
+
|
167
|
+
bool empty() const;
|
168
|
+
std::size_t size() const;
|
169
|
+
std::size_t total_size() const;
|
170
|
+
std::size_t io_size() const;
|
171
|
+
|
172
|
+
void clear();
|
173
|
+
|
174
|
+
private:
|
175
|
+
marisa::Trie *trie_;
|
176
|
+
|
177
|
+
Trie(const Trie &);
|
178
|
+
Trie &operator=(const Trie &);
|
179
|
+
};
|
180
|
+
|
181
|
+
} // namespace marisa_swig
|
182
|
+
|
183
|
+
#endif // MARISA_SWIG_H_
|
@@ -0,0 +1,253 @@
|
|
1
|
+
#include <cstring>
|
2
|
+
#include <new>
|
3
|
+
|
4
|
+
#include "marisa-swig.h"
|
5
|
+
|
6
|
+
namespace marisa_swig {
|
7
|
+
|
8
|
+
void Key::str(const char **ptr_out, size_t *length_out) const {
|
9
|
+
*ptr_out = key_.ptr();
|
10
|
+
*length_out = key_.length();
|
11
|
+
}
|
12
|
+
|
13
|
+
size_t Key::id() const {
|
14
|
+
return key_.id();
|
15
|
+
}
|
16
|
+
|
17
|
+
float Key::weight() const {
|
18
|
+
return key_.weight();
|
19
|
+
}
|
20
|
+
|
21
|
+
void Query::str(const char **ptr_out, size_t *length_out) const {
|
22
|
+
*ptr_out = query_.ptr();
|
23
|
+
*length_out = query_.length();
|
24
|
+
}
|
25
|
+
|
26
|
+
size_t Query::id() const {
|
27
|
+
return query_.id();
|
28
|
+
}
|
29
|
+
|
30
|
+
Keyset::Keyset() : keyset_(new (std::nothrow) marisa::Keyset) {
|
31
|
+
MARISA_THROW_IF(keyset_ == NULL, ::MARISA_MEMORY_ERROR);
|
32
|
+
}
|
33
|
+
|
34
|
+
Keyset::~Keyset() {
|
35
|
+
delete keyset_;
|
36
|
+
}
|
37
|
+
|
38
|
+
void Keyset::push_back(const marisa::Key &key) {
|
39
|
+
keyset_->push_back(key);
|
40
|
+
}
|
41
|
+
|
42
|
+
void Keyset::push_back(const char *ptr, size_t length, float weight) {
|
43
|
+
keyset_->push_back(ptr, length, weight);
|
44
|
+
}
|
45
|
+
|
46
|
+
const Key &Keyset::key(size_t i) const {
|
47
|
+
return reinterpret_cast<const Key &>((*keyset_)[i]);
|
48
|
+
}
|
49
|
+
|
50
|
+
void Keyset::key_str(size_t i,
|
51
|
+
const char **ptr_out, size_t *length_out) const {
|
52
|
+
*ptr_out = (*keyset_)[i].ptr();
|
53
|
+
*length_out = (*keyset_)[i].length();
|
54
|
+
}
|
55
|
+
|
56
|
+
size_t Keyset::key_id(size_t i) const {
|
57
|
+
return (*keyset_)[i].id();
|
58
|
+
}
|
59
|
+
|
60
|
+
size_t Keyset::num_keys() const {
|
61
|
+
return keyset_->num_keys();
|
62
|
+
}
|
63
|
+
|
64
|
+
bool Keyset::empty() const {
|
65
|
+
return keyset_->empty();
|
66
|
+
}
|
67
|
+
|
68
|
+
size_t Keyset::size() const {
|
69
|
+
return keyset_->size();
|
70
|
+
}
|
71
|
+
|
72
|
+
size_t Keyset::total_length() const {
|
73
|
+
return keyset_->total_length();
|
74
|
+
}
|
75
|
+
|
76
|
+
void Keyset::reset() {
|
77
|
+
keyset_->reset();
|
78
|
+
}
|
79
|
+
|
80
|
+
void Keyset::clear() {
|
81
|
+
keyset_->clear();
|
82
|
+
}
|
83
|
+
|
84
|
+
Agent::Agent()
|
85
|
+
: agent_(new (std::nothrow) marisa::Agent), buf_(NULL), buf_size_(0) {
|
86
|
+
MARISA_THROW_IF(agent_ == NULL, ::MARISA_MEMORY_ERROR);
|
87
|
+
}
|
88
|
+
|
89
|
+
Agent::~Agent() {
|
90
|
+
delete agent_;
|
91
|
+
delete [] buf_;
|
92
|
+
}
|
93
|
+
|
94
|
+
void Agent::set_query(const char *ptr, size_t length) {
|
95
|
+
if (length > buf_size_) {
|
96
|
+
size_t new_buf_size = (buf_size_ != 0) ? buf_size_ : 1;
|
97
|
+
if (length >= (MARISA_SIZE_MAX / 2)) {
|
98
|
+
new_buf_size = MARISA_SIZE_MAX;
|
99
|
+
} else {
|
100
|
+
while (new_buf_size < length) {
|
101
|
+
new_buf_size *= 2;
|
102
|
+
}
|
103
|
+
}
|
104
|
+
char *new_buf = new (std::nothrow) char[new_buf_size];
|
105
|
+
MARISA_THROW_IF(new_buf == NULL, MARISA_MEMORY_ERROR);
|
106
|
+
delete [] buf_;
|
107
|
+
buf_ = new_buf;
|
108
|
+
buf_size_ = new_buf_size;
|
109
|
+
}
|
110
|
+
std::memcpy(buf_, ptr, length);
|
111
|
+
agent_->set_query(buf_, length);
|
112
|
+
}
|
113
|
+
|
114
|
+
void Agent::set_query(size_t id) {
|
115
|
+
agent_->set_query(id);
|
116
|
+
}
|
117
|
+
|
118
|
+
const Key &Agent::key() const {
|
119
|
+
return reinterpret_cast<const Key &>(agent_->key());
|
120
|
+
}
|
121
|
+
|
122
|
+
const Query &Agent::query() const {
|
123
|
+
return reinterpret_cast<const Query &>(agent_->query());
|
124
|
+
}
|
125
|
+
|
126
|
+
void Agent::key_str(const char **ptr_out, size_t *length_out) const {
|
127
|
+
*ptr_out = agent_->key().ptr();
|
128
|
+
*length_out = agent_->key().length();
|
129
|
+
}
|
130
|
+
|
131
|
+
size_t Agent::key_id() const {
|
132
|
+
return agent_->key().id();
|
133
|
+
}
|
134
|
+
|
135
|
+
void Agent::query_str(const char **ptr_out, size_t *length_out) const {
|
136
|
+
*ptr_out = agent_->query().ptr();
|
137
|
+
*length_out = agent_->query().length();
|
138
|
+
}
|
139
|
+
|
140
|
+
size_t Agent::query_id() const {
|
141
|
+
return agent_->query().id();
|
142
|
+
}
|
143
|
+
|
144
|
+
Trie::Trie() : trie_(new (std::nothrow) marisa::Trie) {
|
145
|
+
MARISA_THROW_IF(trie_ == NULL, ::MARISA_MEMORY_ERROR);
|
146
|
+
}
|
147
|
+
|
148
|
+
Trie::~Trie() {
|
149
|
+
delete trie_;
|
150
|
+
}
|
151
|
+
|
152
|
+
void Trie::build(Keyset &keyset, int config_flags) {
|
153
|
+
trie_->build(*keyset.keyset_, config_flags);
|
154
|
+
}
|
155
|
+
|
156
|
+
void Trie::mmap(const char *filename) {
|
157
|
+
trie_->mmap(filename);
|
158
|
+
}
|
159
|
+
|
160
|
+
void Trie::load(const char *filename) {
|
161
|
+
trie_->load(filename);
|
162
|
+
}
|
163
|
+
|
164
|
+
void Trie::save(const char *filename) const {
|
165
|
+
trie_->save(filename);
|
166
|
+
}
|
167
|
+
|
168
|
+
bool Trie::lookup(Agent &agent) const {
|
169
|
+
return trie_->lookup(*agent.agent_);
|
170
|
+
}
|
171
|
+
|
172
|
+
void Trie::reverse_lookup(Agent &agent) const {
|
173
|
+
trie_->reverse_lookup(*agent.agent_);
|
174
|
+
}
|
175
|
+
|
176
|
+
bool Trie::common_prefix_search(Agent &agent) const {
|
177
|
+
return trie_->common_prefix_search(*agent.agent_);
|
178
|
+
}
|
179
|
+
|
180
|
+
bool Trie::predictive_search(Agent &agent) const {
|
181
|
+
return trie_->predictive_search(*agent.agent_);
|
182
|
+
}
|
183
|
+
|
184
|
+
size_t Trie::lookup(const char *ptr, size_t length) const {
|
185
|
+
marisa::Agent agent;
|
186
|
+
agent.set_query(ptr, length);
|
187
|
+
if (!trie_->lookup(agent)) {
|
188
|
+
return MARISA_INVALID_KEY_ID;
|
189
|
+
}
|
190
|
+
return agent.key().id();
|
191
|
+
}
|
192
|
+
|
193
|
+
void Trie::reverse_lookup(size_t id,
|
194
|
+
const char **ptr_out_to_be_deleted, size_t *length_out) const {
|
195
|
+
marisa::Agent agent;
|
196
|
+
agent.set_query(id);
|
197
|
+
trie_->reverse_lookup(agent);
|
198
|
+
char * const buf = new (std::nothrow) char[agent.key().length()];
|
199
|
+
MARISA_THROW_IF(buf == NULL, MARISA_MEMORY_ERROR);
|
200
|
+
std::memcpy(buf, agent.key().ptr(), agent.key().length());
|
201
|
+
*ptr_out_to_be_deleted = buf;
|
202
|
+
*length_out = agent.key().length();
|
203
|
+
}
|
204
|
+
|
205
|
+
size_t Trie::num_tries() const {
|
206
|
+
return trie_->num_tries();
|
207
|
+
}
|
208
|
+
|
209
|
+
size_t Trie::num_keys() const {
|
210
|
+
return trie_->num_keys();
|
211
|
+
}
|
212
|
+
|
213
|
+
size_t Trie::num_nodes() const {
|
214
|
+
return trie_->num_nodes();
|
215
|
+
}
|
216
|
+
|
217
|
+
TailMode Trie::tail_mode() const {
|
218
|
+
if (trie_->tail_mode() == ::MARISA_TEXT_TAIL) {
|
219
|
+
return TEXT_TAIL;
|
220
|
+
} else {
|
221
|
+
return BINARY_TAIL;
|
222
|
+
}
|
223
|
+
}
|
224
|
+
|
225
|
+
NodeOrder Trie::node_order() const {
|
226
|
+
if (trie_->node_order() == ::MARISA_LABEL_ORDER) {
|
227
|
+
return LABEL_ORDER;
|
228
|
+
} else {
|
229
|
+
return WEIGHT_ORDER;
|
230
|
+
}
|
231
|
+
}
|
232
|
+
|
233
|
+
bool Trie::empty() const {
|
234
|
+
return trie_->empty();
|
235
|
+
}
|
236
|
+
|
237
|
+
size_t Trie::size() const {
|
238
|
+
return trie_->size();
|
239
|
+
}
|
240
|
+
|
241
|
+
size_t Trie::total_size() const {
|
242
|
+
return trie_->total_size();
|
243
|
+
}
|
244
|
+
|
245
|
+
size_t Trie::io_size() const {
|
246
|
+
return trie_->io_size();
|
247
|
+
}
|
248
|
+
|
249
|
+
void Trie::clear() {
|
250
|
+
trie_->clear();
|
251
|
+
}
|
252
|
+
|
253
|
+
} // namespace marisa_swig
|