melisa 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +11 -0
- data/ext/marisa/bindings/marisa-swig.cxx +253 -0
- data/ext/marisa/bindings/marisa-swig.h +183 -0
- data/ext/marisa/bindings/perl/marisa-swig.cxx +253 -0
- data/ext/marisa/bindings/perl/marisa-swig.h +183 -0
- data/ext/marisa/bindings/perl/marisa-swig_wrap.cxx +5160 -0
- data/ext/marisa/bindings/python/marisa-swig.cxx +253 -0
- data/ext/marisa/bindings/python/marisa-swig.h +183 -0
- data/ext/marisa/bindings/python/marisa-swig_wrap.cxx +6090 -0
- data/ext/marisa/bindings/ruby/extconf.rb +5 -0
- data/ext/marisa/bindings/ruby/marisa-swig.cxx +253 -0
- data/ext/marisa/bindings/ruby/marisa-swig.h +183 -0
- data/ext/marisa/bindings/ruby/marisa-swig_wrap.cxx +4708 -0
- data/ext/marisa/lib/marisa.h +14 -0
- data/ext/marisa/lib/marisa/agent.cc +51 -0
- data/ext/marisa/lib/marisa/agent.h +73 -0
- data/ext/marisa/lib/marisa/base.h +193 -0
- data/ext/marisa/lib/marisa/exception.h +82 -0
- data/ext/marisa/lib/marisa/grimoire/algorithm.h +26 -0
- data/ext/marisa/lib/marisa/grimoire/algorithm/sort.h +196 -0
- data/ext/marisa/lib/marisa/grimoire/intrin.h +115 -0
- data/ext/marisa/lib/marisa/grimoire/io.h +18 -0
- data/ext/marisa/lib/marisa/grimoire/io/mapper.cc +163 -0
- data/ext/marisa/lib/marisa/grimoire/io/mapper.h +67 -0
- data/ext/marisa/lib/marisa/grimoire/io/reader.cc +147 -0
- data/ext/marisa/lib/marisa/grimoire/io/reader.h +66 -0
- data/ext/marisa/lib/marisa/grimoire/io/writer.cc +148 -0
- data/ext/marisa/lib/marisa/grimoire/io/writer.h +65 -0
- data/ext/marisa/lib/marisa/grimoire/trie.h +16 -0
- data/ext/marisa/lib/marisa/grimoire/trie/cache.h +81 -0
- data/ext/marisa/lib/marisa/grimoire/trie/config.h +155 -0
- data/ext/marisa/lib/marisa/grimoire/trie/entry.h +82 -0
- data/ext/marisa/lib/marisa/grimoire/trie/header.h +61 -0
- data/ext/marisa/lib/marisa/grimoire/trie/history.h +65 -0
- data/ext/marisa/lib/marisa/grimoire/trie/key.h +228 -0
- data/ext/marisa/lib/marisa/grimoire/trie/louds-trie.cc +876 -0
- data/ext/marisa/lib/marisa/grimoire/trie/louds-trie.h +134 -0
- data/ext/marisa/lib/marisa/grimoire/trie/range.h +115 -0
- data/ext/marisa/lib/marisa/grimoire/trie/state.h +117 -0
- data/ext/marisa/lib/marisa/grimoire/trie/tail.cc +218 -0
- data/ext/marisa/lib/marisa/grimoire/trie/tail.h +72 -0
- data/ext/marisa/lib/marisa/grimoire/vector.h +18 -0
- data/ext/marisa/lib/marisa/grimoire/vector/bit-vector.cc +826 -0
- data/ext/marisa/lib/marisa/grimoire/vector/bit-vector.h +179 -0
- data/ext/marisa/lib/marisa/grimoire/vector/flat-vector.h +205 -0
- data/ext/marisa/lib/marisa/grimoire/vector/pop-count.h +110 -0
- data/ext/marisa/lib/marisa/grimoire/vector/rank-index.h +82 -0
- data/ext/marisa/lib/marisa/grimoire/vector/vector.h +256 -0
- data/ext/marisa/lib/marisa/iostream.h +18 -0
- data/ext/marisa/lib/marisa/key.h +85 -0
- data/ext/marisa/lib/marisa/keyset.cc +181 -0
- data/ext/marisa/lib/marisa/keyset.h +80 -0
- data/ext/marisa/lib/marisa/query.h +71 -0
- data/ext/marisa/lib/marisa/scoped-array.h +48 -0
- data/ext/marisa/lib/marisa/scoped-ptr.h +52 -0
- data/ext/marisa/lib/marisa/stdio.h +15 -0
- data/ext/marisa/lib/marisa/trie.cc +249 -0
- data/ext/marisa/lib/marisa/trie.h +64 -0
- data/ext/marisa/tests/base-test.cc +309 -0
- data/ext/marisa/tests/io-test.cc +252 -0
- data/ext/marisa/tests/marisa-assert.h +26 -0
- data/ext/marisa/tests/marisa-test.cc +388 -0
- data/ext/marisa/tests/trie-test.cc +507 -0
- data/ext/marisa/tests/vector-test.cc +466 -0
- data/ext/marisa/tools/cmdopt.cc +298 -0
- data/ext/marisa/tools/cmdopt.h +58 -0
- data/ext/marisa/tools/marisa-benchmark.cc +418 -0
- data/ext/marisa/tools/marisa-build.cc +206 -0
- data/ext/marisa/tools/marisa-common-prefix-search.cc +143 -0
- data/ext/marisa/tools/marisa-dump.cc +151 -0
- data/ext/marisa/tools/marisa-lookup.cc +110 -0
- data/ext/marisa/tools/marisa-predictive-search.cc +143 -0
- data/ext/marisa/tools/marisa-reverse-lookup.cc +110 -0
- data/lib/melisa.rb +7 -0
- data/lib/melisa/base_config_flags.rb +76 -0
- data/lib/melisa/bytes_trie.rb +55 -0
- data/lib/melisa/int_trie.rb +14 -0
- data/lib/melisa/search.rb +55 -0
- data/lib/melisa/trie.rb +96 -0
- data/lib/melisa/version.rb +3 -0
- data/melisa.gemspec +36 -0
- data/spec/base_config_flags_spec.rb +73 -0
- data/spec/bytes_trie_spec.rb +16 -0
- data/spec/int_trie_spec.rb +16 -0
- data/spec/search_spec.rb +29 -0
- data/spec/spec_helper.rb +1 -0
- data/spec/trie_spec.rb +30 -0
- metadata +207 -0
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
#ifndef MARISA_GRIMOIRE_TRIE_ENTRY_H_
|
|
2
|
+
#define MARISA_GRIMOIRE_TRIE_ENTRY_H_
|
|
3
|
+
|
|
4
|
+
#include "marisa/base.h"
|
|
5
|
+
|
|
6
|
+
namespace marisa {
|
|
7
|
+
namespace grimoire {
|
|
8
|
+
namespace trie {
|
|
9
|
+
|
|
10
|
+
class Entry {
|
|
11
|
+
public:
|
|
12
|
+
Entry()
|
|
13
|
+
: ptr_(static_cast<const char *>(NULL) - 1), length_(0), id_(0) {}
|
|
14
|
+
Entry(const Entry &entry)
|
|
15
|
+
: ptr_(entry.ptr_), length_(entry.length_), id_(entry.id_) {}
|
|
16
|
+
|
|
17
|
+
Entry &operator=(const Entry &entry) {
|
|
18
|
+
ptr_ = entry.ptr_;
|
|
19
|
+
length_ = entry.length_;
|
|
20
|
+
id_ = entry.id_;
|
|
21
|
+
return *this;
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
char operator[](std::size_t i) const {
|
|
25
|
+
MARISA_DEBUG_IF(i >= length_, MARISA_BOUND_ERROR);
|
|
26
|
+
return *(ptr_ - i);
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
void set_str(const char *ptr, std::size_t length) {
|
|
30
|
+
MARISA_DEBUG_IF((ptr == NULL) && (length != 0), MARISA_NULL_ERROR);
|
|
31
|
+
MARISA_DEBUG_IF(length > MARISA_UINT32_MAX, MARISA_SIZE_ERROR);
|
|
32
|
+
ptr_ = ptr + length - 1;
|
|
33
|
+
length_ = (UInt32)length;
|
|
34
|
+
}
|
|
35
|
+
void set_id(std::size_t id) {
|
|
36
|
+
MARISA_DEBUG_IF(id > MARISA_UINT32_MAX, MARISA_SIZE_ERROR);
|
|
37
|
+
id_ = (UInt32)id;
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
const char *ptr() const {
|
|
41
|
+
return ptr_ - length_ + 1;
|
|
42
|
+
}
|
|
43
|
+
std::size_t length() const {
|
|
44
|
+
return length_;
|
|
45
|
+
}
|
|
46
|
+
std::size_t id() const {
|
|
47
|
+
return id_;
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
class StringComparer {
|
|
51
|
+
public:
|
|
52
|
+
bool operator()(const Entry &lhs, const Entry &rhs) const {
|
|
53
|
+
for (std::size_t i = 0; i < lhs.length(); ++i) {
|
|
54
|
+
if (i == rhs.length()) {
|
|
55
|
+
return true;
|
|
56
|
+
}
|
|
57
|
+
if (lhs[i] != rhs[i]) {
|
|
58
|
+
return (UInt8)lhs[i] > (UInt8)rhs[i];
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
return lhs.length() > rhs.length();
|
|
62
|
+
}
|
|
63
|
+
};
|
|
64
|
+
|
|
65
|
+
class IDComparer {
|
|
66
|
+
public:
|
|
67
|
+
bool operator()(const Entry &lhs, const Entry &rhs) const {
|
|
68
|
+
return lhs.id_ < rhs.id_;
|
|
69
|
+
}
|
|
70
|
+
};
|
|
71
|
+
|
|
72
|
+
private:
|
|
73
|
+
const char *ptr_;
|
|
74
|
+
UInt32 length_;
|
|
75
|
+
UInt32 id_;
|
|
76
|
+
};
|
|
77
|
+
|
|
78
|
+
} // namespace trie
|
|
79
|
+
} // namespace grimoire
|
|
80
|
+
} // namespace marisa
|
|
81
|
+
|
|
82
|
+
#endif // MARISA_GRIMOIRE_TRIE_ENTRY_H_
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
#ifndef MARISA_GRIMOIRE_TRIE_HEADER_H_
|
|
2
|
+
#define MARISA_GRIMOIRE_TRIE_HEADER_H_
|
|
3
|
+
|
|
4
|
+
#include "marisa/grimoire/io.h"
|
|
5
|
+
|
|
6
|
+
namespace marisa {
|
|
7
|
+
namespace grimoire {
|
|
8
|
+
namespace trie {
|
|
9
|
+
|
|
10
|
+
class Header {
|
|
11
|
+
public:
|
|
12
|
+
enum {
|
|
13
|
+
HEADER_SIZE = 16
|
|
14
|
+
};
|
|
15
|
+
|
|
16
|
+
Header() {}
|
|
17
|
+
|
|
18
|
+
void map(Mapper &mapper) {
|
|
19
|
+
const char *ptr;
|
|
20
|
+
mapper.map(&ptr, HEADER_SIZE);
|
|
21
|
+
MARISA_THROW_IF(!test_header(ptr), MARISA_FORMAT_ERROR);
|
|
22
|
+
}
|
|
23
|
+
void read(Reader &reader) {
|
|
24
|
+
char buf[HEADER_SIZE];
|
|
25
|
+
reader.read(buf, HEADER_SIZE);
|
|
26
|
+
MARISA_THROW_IF(!test_header(buf), MARISA_FORMAT_ERROR);
|
|
27
|
+
}
|
|
28
|
+
void write(Writer &writer) const {
|
|
29
|
+
writer.write(get_header(), HEADER_SIZE);
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
std::size_t io_size() const {
|
|
33
|
+
return HEADER_SIZE;
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
private:
|
|
37
|
+
|
|
38
|
+
static const char *get_header() {
|
|
39
|
+
static const char buf[HEADER_SIZE] = "We love Marisa.";
|
|
40
|
+
return buf;
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
static bool test_header(const char *ptr) {
|
|
44
|
+
for (std::size_t i = 0; i < HEADER_SIZE; ++i) {
|
|
45
|
+
if (ptr[i] != get_header()[i]) {
|
|
46
|
+
return false;
|
|
47
|
+
}
|
|
48
|
+
}
|
|
49
|
+
return true;
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
// Disallows copy and assignment.
|
|
53
|
+
Header(const Header &);
|
|
54
|
+
Header &operator=(const Header &);
|
|
55
|
+
};
|
|
56
|
+
|
|
57
|
+
} // namespace trie
|
|
58
|
+
} // namespace grimoire
|
|
59
|
+
} // namespace marisa
|
|
60
|
+
|
|
61
|
+
#endif // MARISA_GRIMOIRE_TRIE_HEADER_H_
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
#ifndef MARISA_GRIMOIRE_TRIE_STATE_HISTORY_H_
|
|
2
|
+
#define MARISA_GRIMOIRE_TRIE_STATE_HISTORY_H_
|
|
3
|
+
|
|
4
|
+
#include "marisa/base.h"
|
|
5
|
+
|
|
6
|
+
namespace marisa {
|
|
7
|
+
namespace grimoire {
|
|
8
|
+
namespace trie {
|
|
9
|
+
|
|
10
|
+
class History {
|
|
11
|
+
public:
|
|
12
|
+
History()
|
|
13
|
+
: node_id_(0), louds_pos_(0), key_pos_(0),
|
|
14
|
+
link_id_(MARISA_INVALID_LINK_ID), key_id_(MARISA_INVALID_KEY_ID) {}
|
|
15
|
+
|
|
16
|
+
void set_node_id(std::size_t node_id) {
|
|
17
|
+
MARISA_DEBUG_IF(node_id > MARISA_UINT32_MAX, MARISA_SIZE_ERROR);
|
|
18
|
+
node_id_ = (UInt32)node_id;
|
|
19
|
+
}
|
|
20
|
+
void set_louds_pos(std::size_t louds_pos) {
|
|
21
|
+
MARISA_DEBUG_IF(louds_pos > MARISA_UINT32_MAX, MARISA_SIZE_ERROR);
|
|
22
|
+
louds_pos_ = (UInt32)louds_pos;
|
|
23
|
+
}
|
|
24
|
+
void set_key_pos(std::size_t key_pos) {
|
|
25
|
+
MARISA_DEBUG_IF(key_pos > MARISA_UINT32_MAX, MARISA_SIZE_ERROR);
|
|
26
|
+
key_pos_ = (UInt32)key_pos;
|
|
27
|
+
}
|
|
28
|
+
void set_link_id(std::size_t link_id) {
|
|
29
|
+
MARISA_DEBUG_IF(link_id > MARISA_UINT32_MAX, MARISA_SIZE_ERROR);
|
|
30
|
+
link_id_ = (UInt32)link_id;
|
|
31
|
+
}
|
|
32
|
+
void set_key_id(std::size_t key_id) {
|
|
33
|
+
MARISA_DEBUG_IF(key_id > MARISA_UINT32_MAX, MARISA_SIZE_ERROR);
|
|
34
|
+
key_id_ = (UInt32)key_id;
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
std::size_t node_id() const {
|
|
38
|
+
return node_id_;
|
|
39
|
+
}
|
|
40
|
+
std::size_t louds_pos() const {
|
|
41
|
+
return louds_pos_;
|
|
42
|
+
}
|
|
43
|
+
std::size_t key_pos() const {
|
|
44
|
+
return key_pos_;
|
|
45
|
+
}
|
|
46
|
+
std::size_t link_id() const {
|
|
47
|
+
return link_id_;
|
|
48
|
+
}
|
|
49
|
+
std::size_t key_id() const {
|
|
50
|
+
return key_id_;
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
private:
|
|
54
|
+
UInt32 node_id_;
|
|
55
|
+
UInt32 louds_pos_;
|
|
56
|
+
UInt32 key_pos_;
|
|
57
|
+
UInt32 link_id_;
|
|
58
|
+
UInt32 key_id_;
|
|
59
|
+
};
|
|
60
|
+
|
|
61
|
+
} // namespace trie
|
|
62
|
+
} // namespace grimoire
|
|
63
|
+
} // namespace marisa
|
|
64
|
+
|
|
65
|
+
#endif // MARISA_GRIMOIRE_TRIE_STATE_HISTORY_H_
|
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
#ifndef MARISA_GRIMOIRE_TRIE_KEY_H_
|
|
2
|
+
#define MARISA_GRIMOIRE_TRIE_KEY_H_
|
|
3
|
+
|
|
4
|
+
#include "marisa/base.h"
|
|
5
|
+
|
|
6
|
+
namespace marisa {
|
|
7
|
+
namespace grimoire {
|
|
8
|
+
namespace trie {
|
|
9
|
+
|
|
10
|
+
class Key {
|
|
11
|
+
public:
|
|
12
|
+
Key() : ptr_(NULL), length_(0), union_(), id_(0) {
|
|
13
|
+
union_.terminal = 0;
|
|
14
|
+
}
|
|
15
|
+
Key(const Key &entry)
|
|
16
|
+
: ptr_(entry.ptr_), length_(entry.length_),
|
|
17
|
+
union_(entry.union_), id_(entry.id_) {}
|
|
18
|
+
|
|
19
|
+
Key &operator=(const Key &entry) {
|
|
20
|
+
ptr_ = entry.ptr_;
|
|
21
|
+
length_ = entry.length_;
|
|
22
|
+
union_ = entry.union_;
|
|
23
|
+
id_ = entry.id_;
|
|
24
|
+
return *this;
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
char operator[](std::size_t i) const {
|
|
28
|
+
MARISA_DEBUG_IF(i >= length_, MARISA_BOUND_ERROR);
|
|
29
|
+
return ptr_[i];
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
void substr(std::size_t pos, std::size_t length) {
|
|
33
|
+
MARISA_DEBUG_IF(pos > length_, MARISA_BOUND_ERROR);
|
|
34
|
+
MARISA_DEBUG_IF(length > length_, MARISA_BOUND_ERROR);
|
|
35
|
+
MARISA_DEBUG_IF(pos > (length_ - length), MARISA_BOUND_ERROR);
|
|
36
|
+
ptr_ += pos;
|
|
37
|
+
length_ = (UInt32)length;
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
void set_str(const char *ptr, std::size_t length) {
|
|
41
|
+
MARISA_DEBUG_IF((ptr == NULL) && (length != 0), MARISA_NULL_ERROR);
|
|
42
|
+
MARISA_DEBUG_IF(length > MARISA_UINT32_MAX, MARISA_SIZE_ERROR);
|
|
43
|
+
ptr_ = ptr;
|
|
44
|
+
length_ = (UInt32)length;
|
|
45
|
+
}
|
|
46
|
+
void set_weight(float weight) {
|
|
47
|
+
union_.weight = weight;
|
|
48
|
+
}
|
|
49
|
+
void set_terminal(std::size_t terminal) {
|
|
50
|
+
MARISA_DEBUG_IF(terminal > MARISA_UINT32_MAX, MARISA_SIZE_ERROR);
|
|
51
|
+
union_.terminal = (UInt32)terminal;
|
|
52
|
+
}
|
|
53
|
+
void set_id(std::size_t id) {
|
|
54
|
+
MARISA_DEBUG_IF(id > MARISA_UINT32_MAX, MARISA_SIZE_ERROR);
|
|
55
|
+
id_ = (UInt32)id;
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
const char *ptr() const {
|
|
59
|
+
return ptr_;
|
|
60
|
+
}
|
|
61
|
+
std::size_t length() const {
|
|
62
|
+
return length_;
|
|
63
|
+
}
|
|
64
|
+
float weight() const {
|
|
65
|
+
return union_.weight;
|
|
66
|
+
}
|
|
67
|
+
std::size_t terminal() const {
|
|
68
|
+
return union_.terminal;
|
|
69
|
+
}
|
|
70
|
+
std::size_t id() const {
|
|
71
|
+
return id_;
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
private:
|
|
75
|
+
const char *ptr_;
|
|
76
|
+
UInt32 length_;
|
|
77
|
+
union Union {
|
|
78
|
+
float weight;
|
|
79
|
+
UInt32 terminal;
|
|
80
|
+
} union_;
|
|
81
|
+
UInt32 id_;
|
|
82
|
+
};
|
|
83
|
+
|
|
84
|
+
// Two keys are equal when they have the same length and the same characters
// at every index. Weight/terminal and id are not compared.
inline bool operator==(const Key &lhs, const Key &rhs) {
  const std::size_t count = lhs.length();
  if (count != rhs.length()) {
    return false;
  }
  std::size_t pos = 0;
  while (pos < count) {
    if (lhs[pos] != rhs[pos]) {
      return false;
    }
    ++pos;
  }
  return true;
}
|
|
95
|
+
|
|
96
|
+
// Negation of operator== for Key.
inline bool operator!=(const Key &lhs, const Key &rhs) {
  const bool same = (lhs == rhs);
  return !same;
}
|
|
99
|
+
|
|
100
|
+
// Lexicographic "less than" over the key characters, compared as unsigned
// bytes. When one key is a prefix of the other, the shorter key is smaller.
inline bool operator<(const Key &lhs, const Key &rhs) {
  const std::size_t lhs_len = lhs.length();
  const std::size_t rhs_len = rhs.length();
  const std::size_t shared = (lhs_len < rhs_len) ? lhs_len : rhs_len;
  for (std::size_t pos = 0; pos < shared; ++pos) {
    const UInt8 lhs_byte = static_cast<UInt8>(lhs[pos]);
    const UInt8 rhs_byte = static_cast<UInt8>(rhs[pos]);
    if (lhs_byte != rhs_byte) {
      return lhs_byte < rhs_byte;
    }
  }
  return lhs_len < rhs_len;
}
|
|
111
|
+
|
|
112
|
+
// "Greater than" for Key, defined as operator< with the operands swapped.
inline bool operator>(const Key &lhs, const Key &rhs) {
  const bool rhs_is_smaller = (rhs < lhs);
  return rhs_is_smaller;
}
|
|
115
|
+
|
|
116
|
+
class ReverseKey {
|
|
117
|
+
public:
|
|
118
|
+
ReverseKey()
|
|
119
|
+
: ptr_(static_cast<const char *>(NULL) - 1),
|
|
120
|
+
length_(0), union_(), id_(0) {
|
|
121
|
+
union_.terminal = 0;
|
|
122
|
+
}
|
|
123
|
+
ReverseKey(const ReverseKey &entry)
|
|
124
|
+
: ptr_(entry.ptr_), length_(entry.length_),
|
|
125
|
+
union_(entry.union_), id_(entry.id_) {}
|
|
126
|
+
|
|
127
|
+
ReverseKey &operator=(const ReverseKey &entry) {
|
|
128
|
+
ptr_ = entry.ptr_;
|
|
129
|
+
length_ = entry.length_;
|
|
130
|
+
union_ = entry.union_;
|
|
131
|
+
id_ = entry.id_;
|
|
132
|
+
return *this;
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
char operator[](std::size_t i) const {
|
|
136
|
+
MARISA_DEBUG_IF(i >= length_, MARISA_BOUND_ERROR);
|
|
137
|
+
return *(ptr_ - i);
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
void substr(std::size_t pos, std::size_t length) {
|
|
141
|
+
MARISA_DEBUG_IF(pos > length_, MARISA_BOUND_ERROR);
|
|
142
|
+
MARISA_DEBUG_IF(length > length_, MARISA_BOUND_ERROR);
|
|
143
|
+
MARISA_DEBUG_IF(pos > (length_ - length), MARISA_BOUND_ERROR);
|
|
144
|
+
ptr_ -= pos;
|
|
145
|
+
length_ = (UInt32)length;
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
void set_str(const char *ptr, std::size_t length) {
|
|
149
|
+
MARISA_DEBUG_IF((ptr == NULL) && (length != 0), MARISA_NULL_ERROR);
|
|
150
|
+
MARISA_DEBUG_IF(length > MARISA_UINT32_MAX, MARISA_SIZE_ERROR);
|
|
151
|
+
ptr_ = ptr + length - 1;
|
|
152
|
+
length_ = (UInt32)length;
|
|
153
|
+
}
|
|
154
|
+
void set_weight(float weight) {
|
|
155
|
+
union_.weight = weight;
|
|
156
|
+
}
|
|
157
|
+
void set_terminal(std::size_t terminal) {
|
|
158
|
+
MARISA_DEBUG_IF(terminal > MARISA_UINT32_MAX, MARISA_SIZE_ERROR);
|
|
159
|
+
union_.terminal = (UInt32)terminal;
|
|
160
|
+
}
|
|
161
|
+
void set_id(std::size_t id) {
|
|
162
|
+
MARISA_DEBUG_IF(id > MARISA_UINT32_MAX, MARISA_SIZE_ERROR);
|
|
163
|
+
id_ = (UInt32)id;
|
|
164
|
+
}
|
|
165
|
+
|
|
166
|
+
const char *ptr() const {
|
|
167
|
+
return ptr_ - length_ + 1;
|
|
168
|
+
}
|
|
169
|
+
std::size_t length() const {
|
|
170
|
+
return length_;
|
|
171
|
+
}
|
|
172
|
+
float weight() const {
|
|
173
|
+
return union_.weight;
|
|
174
|
+
}
|
|
175
|
+
std::size_t terminal() const {
|
|
176
|
+
return union_.terminal;
|
|
177
|
+
}
|
|
178
|
+
std::size_t id() const {
|
|
179
|
+
return id_;
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
private:
|
|
183
|
+
const char *ptr_;
|
|
184
|
+
UInt32 length_;
|
|
185
|
+
union Union {
|
|
186
|
+
float weight;
|
|
187
|
+
UInt32 terminal;
|
|
188
|
+
} union_;
|
|
189
|
+
UInt32 id_;
|
|
190
|
+
};
|
|
191
|
+
|
|
192
|
+
// Two reverse keys are equal when they have the same length and the same
// characters at every (reverse) index. Weight/terminal and id are ignored.
inline bool operator==(const ReverseKey &lhs, const ReverseKey &rhs) {
  const std::size_t count = lhs.length();
  if (count != rhs.length()) {
    return false;
  }
  std::size_t pos = 0;
  while (pos < count) {
    if (lhs[pos] != rhs[pos]) {
      return false;
    }
    ++pos;
  }
  return true;
}
|
|
203
|
+
|
|
204
|
+
// Negation of operator== for ReverseKey.
inline bool operator!=(const ReverseKey &lhs, const ReverseKey &rhs) {
  const bool same = (lhs == rhs);
  return !same;
}
|
|
207
|
+
|
|
208
|
+
// Lexicographic "less than" over the characters in reverse-key index order
// (last character first), compared as unsigned bytes. When one key is
// exhausted first, the shorter key is smaller.
inline bool operator<(const ReverseKey &lhs, const ReverseKey &rhs) {
  const std::size_t lhs_len = lhs.length();
  const std::size_t rhs_len = rhs.length();
  const std::size_t shared = (lhs_len < rhs_len) ? lhs_len : rhs_len;
  for (std::size_t pos = 0; pos < shared; ++pos) {
    const UInt8 lhs_byte = static_cast<UInt8>(lhs[pos]);
    const UInt8 rhs_byte = static_cast<UInt8>(rhs[pos]);
    if (lhs_byte != rhs_byte) {
      return lhs_byte < rhs_byte;
    }
  }
  return lhs_len < rhs_len;
}
|
|
219
|
+
|
|
220
|
+
// "Greater than" for ReverseKey, defined as operator< with operands swapped.
inline bool operator>(const ReverseKey &lhs, const ReverseKey &rhs) {
  const bool rhs_is_smaller = (rhs < lhs);
  return rhs_is_smaller;
}
|
|
223
|
+
|
|
224
|
+
} // namespace trie
|
|
225
|
+
} // namespace grimoire
|
|
226
|
+
} // namespace marisa
|
|
227
|
+
|
|
228
|
+
#endif // MARISA_GRIMOIRE_TRIE_KEY_H_
|
|
@@ -0,0 +1,876 @@
|
|
|
1
|
+
#include <algorithm>
|
|
2
|
+
#include <queue>
|
|
3
|
+
|
|
4
|
+
#include "marisa/grimoire/algorithm.h"
|
|
5
|
+
#include "marisa/grimoire/trie/header.h"
|
|
6
|
+
#include "marisa/grimoire/trie/range.h"
|
|
7
|
+
#include "marisa/grimoire/trie/state.h"
|
|
8
|
+
#include "marisa/grimoire/trie/louds-trie.h"
|
|
9
|
+
|
|
10
|
+
namespace marisa {
|
|
11
|
+
namespace grimoire {
|
|
12
|
+
namespace trie {
|
|
13
|
+
|
|
14
|
+
// Creates an empty trie: every member is default-constructed and the two
// scalar counters (cache_mask_, num_l1_nodes_) start at 0.
LoudsTrie::LoudsTrie()
    : louds_(),
      terminal_flags_(),
      link_flags_(),
      bases_(),
      extras_(),
      tail_(),
      next_trie_(),
      cache_(),
      cache_mask_(0),
      num_l1_nodes_(0),
      config_(),
      mapper_() {}
|
|
18
|
+
|
|
19
|
+
// Nothing to release explicitly; members clean up through their own
// destructors.
LoudsTrie::~LoudsTrie() {
}
|
|
20
|
+
|
|
21
|
+
// Builds a trie from `keyset` using the options encoded in `flags`.
// The work is done on a scratch trie that is swapped into *this only after
// build_() has returned, so *this is not modified if parsing or building
// throws.
void LoudsTrie::build(Keyset &keyset, int flags) {
  Config parsed_config;
  parsed_config.parse(flags);

  LoudsTrie scratch;
  scratch.build_(keyset, parsed_config);
  this->swap(scratch);
}
|
|
29
|
+
|
|
30
|
+
// Loads a trie through `mapper`. The header is validated first; the body is
// then mapped into a scratch trie, which takes over the mapper (by swap)
// before being swapped into *this.
void LoudsTrie::map(Mapper &mapper) {
  Header header;
  header.map(mapper);

  LoudsTrie scratch;
  scratch.map_(mapper);
  // Hand the mapper's state over to the trie that now refers to it.
  scratch.mapper_.swap(mapper);
  this->swap(scratch);
}
|
|
38
|
+
|
|
39
|
+
// Loads a trie from `reader`. The header is validated first; the body is
// read into a scratch trie that is swapped into *this once read_() returns.
void LoudsTrie::read(Reader &reader) {
  Header header;
  header.read(reader);

  LoudsTrie scratch;
  scratch.read_(reader);
  this->swap(scratch);
}
|
|
46
|
+
|
|
47
|
+
// Serializes the trie to `writer`: the fixed header followed by the body
// emitted by write_().
void LoudsTrie::write(Writer &writer) const {
  Header header;
  header.write(writer);

  this->write_(writer);
}
|
|
52
|
+
|
|
53
|
+
// Exact-match search for agent.query().
// Repeatedly calls find_child() until the agent's state reports that the
// whole query has been consumed; gives up as soon as find_child() fails.
// On success (the reached node has its terminal flag set) the agent's key is
// set to the query string and to the rank1 of the node in terminal_flags_,
// and true is returned. Otherwise returns false.
bool LoudsTrie::lookup(Agent &agent) const {
  MARISA_DEBUG_IF(!agent.has_state(), MARISA_STATE_ERROR);

  State &cursor = agent.state();
  cursor.lookup_init();
  for ( ; cursor.query_pos() < agent.query().length(); ) {
    if (!find_child(agent)) {
      return false;
    }
  }

  if (terminal_flags_[cursor.node_id()]) {
    agent.set_key(agent.query().ptr(), agent.query().length());
    agent.set_key(terminal_flags_.rank1(cursor.node_id()));
    return true;
  }
  return false;
}
|
|
70
|
+
|
|
71
|
+
// Restores the key whose id is agent.query().id().
// Throws MARISA_BOUND_ERROR (via MARISA_THROW_IF) when the id is >= size().
// Starting from the node selected by the id in terminal_flags_, the loop
// walks toward the root, appending each node's label to the state's key
// buffer; the buffer is therefore assembled back-to-front and reversed once
// at the end. The agent's key is then set to the buffer and to the id.
void LoudsTrie::reverse_lookup(Agent &agent) const {
  MARISA_DEBUG_IF(!agent.has_state(), MARISA_STATE_ERROR);
  MARISA_THROW_IF(agent.query().id() >= size(), MARISA_BOUND_ERROR);

  State &cursor = agent.state();
  cursor.reverse_lookup_init();

  cursor.set_node_id(terminal_flags_.select1(agent.query().id()));
  if (cursor.node_id() == 0) {
    // Node 0: report the key buffer exactly as it stands.
    agent.set_key(cursor.key_buf().begin(), cursor.key_buf().size());
    agent.set_key(agent.query().id());
    return;
  }

  while (true) {
    if (!link_flags_[cursor.node_id()]) {
      // Plain node: its label is the single byte stored in bases_.
      cursor.key_buf().push_back((char)bases_[cursor.node_id()]);
    } else {
      // Linked node: restore() appends the label, which is then reversed in
      // place so that the final whole-buffer reversal puts it back in order.
      const std::size_t label_begin = cursor.key_buf().size();
      restore(agent, get_link(cursor.node_id()));
      std::reverse(cursor.key_buf().begin() + label_begin,
          cursor.key_buf().end());
    }

    if (cursor.node_id() <= num_l1_nodes_) {
      break;
    }
    // Move to the parent node.
    cursor.set_node_id(
        louds_.select1(cursor.node_id()) - cursor.node_id() - 1);
  }

  std::reverse(cursor.key_buf().begin(), cursor.key_buf().end());
  agent.set_key(cursor.key_buf().begin(), cursor.key_buf().size());
  agent.set_key(agent.query().id());
}
|
|
103
|
+
|
|
104
|
+
// Resumable search: each call that returns true reports one terminal node
// reached while consuming agent.query() (the agent's key is set to the
// consumed part of the query and to the node's rank1 in terminal_flags_).
// Progress is kept in the agent's state. Once the query is exhausted or
// find_child() fails, the status becomes MARISA_END_OF_COMMON_PREFIX_SEARCH
// and this and all later calls return false.
bool LoudsTrie::common_prefix_search(Agent &agent) const {
  MARISA_DEBUG_IF(!agent.has_state(), MARISA_STATE_ERROR);

  State &cursor = agent.state();
  if (cursor.status_code() == MARISA_END_OF_COMMON_PREFIX_SEARCH) {
    return false;
  }

  if (cursor.status_code() != MARISA_READY_TO_COMMON_PREFIX_SEARCH) {
    // First call for this query: initialise and test the starting node.
    cursor.common_prefix_search_init();
    if (terminal_flags_[cursor.node_id()]) {
      agent.set_key(agent.query().ptr(), cursor.query_pos());
      agent.set_key(terminal_flags_.rank1(cursor.node_id()));
      return true;
    }
  }

  while (cursor.query_pos() < agent.query().length()) {
    if (!find_child(agent)) {
      break;
    }
    if (terminal_flags_[cursor.node_id()]) {
      agent.set_key(agent.query().ptr(), cursor.query_pos());
      agent.set_key(terminal_flags_.rank1(cursor.node_id()));
      return true;
    }
  }

  cursor.set_status_code(MARISA_END_OF_COMMON_PREFIX_SEARCH);
  return false;
}
|
|
134
|
+
|
|
135
|
+
// Resumable search: each call that returns true reports one key (the agent's
// key is set to the state's key buffer and to a key id). Progress is kept in
// the agent's state as a stack of History records (state.history()) plus a
// cursor into it (state.history_pos()).
//
// First call (status is not MARISA_READY_TO_PREDICTIVE_SEARCH): the state is
// initialised, the whole query is consumed with predictive_find_child(), and
// the node reached is pushed as the first History record. If that node has
// its terminal flag set it is reported immediately.
// NOTE(review): predictive_search_init() presumably moves the status to
// READY so that later calls skip this branch; confirm in state.h.
//
// Later calls continue the traversal in the for-loop below. When nothing is
// left to visit, the status becomes MARISA_END_OF_PREDICTIVE_SEARCH and this
// and all later calls return false.
bool LoudsTrie::predictive_search(Agent &agent) const {
  MARISA_DEBUG_IF(!agent.has_state(), MARISA_STATE_ERROR);

  State &state = agent.state();
  if (state.status_code() == MARISA_END_OF_PREDICTIVE_SEARCH) {
    return false;
  }

  if (state.status_code() != MARISA_READY_TO_PREDICTIVE_SEARCH) {
    state.predictive_search_init();
    while (state.query_pos() < agent.query().length()) {
      if (!predictive_find_child(agent)) {
        state.set_status_code(MARISA_END_OF_PREDICTIVE_SEARCH);
        return false;
      }
    }

    // Record the node reached by the query as the bottom of the stack.
    History history;
    history.set_node_id(state.node_id());
    history.set_key_pos(state.key_buf().size());
    state.history().push_back(history);
    state.set_history_pos(1);

    if (terminal_flags_[state.node_id()]) {
      agent.set_key(state.key_buf().begin(), state.key_buf().size());
      agent.set_key(terminal_flags_.rank1(state.node_id()));
      return true;
    }
  }

  for ( ; ; ) {
    if (state.history_pos() == state.history().size()) {
      // No record exists yet for this depth: create one that points at the
      // first child position of the deepest recorded node.
      const History &current = state.history().back();
      History next;
      next.set_louds_pos(louds_.select0(current.node_id()) + 1);
      next.set_node_id(next.louds_pos() - current.node_id() - 1);
      state.history().push_back(next);
    }

    History &next = state.history()[state.history_pos()];
    // Bit read from louds_ at the record's current position; the position is
    // advanced regardless of the bit's value.
    const bool link_flag = louds_[next.louds_pos()];
    next.set_louds_pos(next.louds_pos() + 1);
    if (link_flag) {
      // Descend into next.node_id() and append its label to the key buffer.
      state.set_history_pos(state.history_pos() + 1);
      if (link_flags_[next.node_id()]) {
        next.set_link_id(update_link_id(next.link_id(), next.node_id()));
        restore(agent, get_link(next.node_id(), next.link_id()));
      } else {
        state.key_buf().push_back((char)bases_[next.node_id()]);
      }
      next.set_key_pos(state.key_buf().size());

      if (terminal_flags_[next.node_id()]) {
        // The first terminal seen at this depth needs a rank1 query; later
        // ones at the same depth just increment the stored key id.
        if (next.key_id() == MARISA_INVALID_KEY_ID) {
          next.set_key_id(terminal_flags_.rank1(next.node_id()));
        } else {
          next.set_key_id(next.key_id() + 1);
        }
        agent.set_key(state.key_buf().begin(), state.key_buf().size());
        agent.set_key(next.key_id());
        return true;
      }
    } else if (state.history_pos() != 1) {
      // This depth is exhausted: advance the record one level up to its next
      // node, truncate the key buffer to the length saved two levels up, and
      // step the cursor back by one.
      History &current = state.history()[state.history_pos() - 1];
      current.set_node_id(current.node_id() + 1);
      const History &prev =
          state.history()[state.history_pos() - 2];
      state.key_buf().resize(prev.key_pos());
      state.set_history_pos(state.history_pos() - 1);
    } else {
      // Back at the starting depth with nothing left to visit.
      state.set_status_code(MARISA_END_OF_PREDICTIVE_SEARCH);
      return false;
    }
  }
}
|
|
210
|
+
|
|
211
|
+
std::size_t LoudsTrie::total_size() const {
|
|
212
|
+
return louds_.total_size() + terminal_flags_.total_size()
|
|
213
|
+
+ link_flags_.total_size() + bases_.total_size()
|
|
214
|
+
+ extras_.total_size() + tail_.total_size()
|
|
215
|
+
+ ((next_trie_.get() != NULL) ? next_trie_->total_size() : 0)
|
|
216
|
+
+ cache_.total_size();
|
|
217
|
+
}
|
|
218
|
+
|
|
219
|
+
std::size_t LoudsTrie::io_size() const {
|
|
220
|
+
return Header().io_size() + louds_.io_size()
|
|
221
|
+
+ terminal_flags_.io_size() + link_flags_.io_size()
|
|
222
|
+
+ bases_.io_size() + extras_.io_size() + tail_.io_size()
|
|
223
|
+
+ ((next_trie_.get() != NULL) ?
|
|
224
|
+
(next_trie_->io_size() - Header().io_size()) : 0)
|
|
225
|
+
+ cache_.io_size() + (sizeof(UInt32) * 2);
|
|
226
|
+
}
|
|
227
|
+
|
|
228
|
+
void LoudsTrie::clear() {
|
|
229
|
+
LoudsTrie().swap(*this);
|
|
230
|
+
}
|
|
231
|
+
|
|
232
|
+
// Exchanges every member of *this with the corresponding member of rhs.
void LoudsTrie::swap(LoudsTrie &rhs) {
  // Members that provide their own swap().
  this->louds_.swap(rhs.louds_);
  this->terminal_flags_.swap(rhs.terminal_flags_);
  this->link_flags_.swap(rhs.link_flags_);
  this->bases_.swap(rhs.bases_);
  this->extras_.swap(rhs.extras_);
  this->tail_.swap(rhs.tail_);
  this->next_trie_.swap(rhs.next_trie_);
  this->cache_.swap(rhs.cache_);

  // Scalar members go through marisa::swap.
  marisa::swap(this->cache_mask_, rhs.cache_mask_);
  marisa::swap(this->num_l1_nodes_, rhs.num_l1_nodes_);

  this->config_.swap(rhs.config_);
  this->mapper_.swap(rhs.mapper_);
}
|
|
246
|
+
|
|
247
|
+
// Builds this trie from `keyset` and writes the resulting key ids back into
// the keyset.
//
// Steps:
//  1. Copy every keyset entry (string + weight) into a Vector<Key>.
//  2. build_trie() constructs the trie and fills `terminals`.
//     NOTE(review): terminals[i] is presumably the terminal node id of input
//     key i; confirm in build_current_trie().
//  3. Sort (terminal, original index) pairs and emit terminal_flags_: `true`
//     at each distinct terminal value, `false` everywhere else up to
//     bases_.size(), plus one trailing `false`.
//  4. Assign each keyset entry the rank1 of its terminal in terminal_flags_.
void LoudsTrie::build_(Keyset &keyset, const Config &config) {
  Vector<Key> keys;
  keys.resize(keyset.size());
  for (std::size_t k = 0; k < keyset.size(); ++k) {
    keys[k].set_str(keyset[k].ptr(), keyset[k].length());
    keys[k].set_weight(keyset[k].weight());
  }

  Vector<UInt32> terminals;
  build_trie(keys, &terminals, config, 1);

  typedef std::pair<UInt32, UInt32> TerminalIdPair;

  Vector<TerminalIdPair> pairs;
  pairs.resize(terminals.size());
  for (std::size_t k = 0; k < pairs.size(); ++k) {
    pairs[k].first = terminals[k];
    pairs[k].second = static_cast<UInt32>(k);
  }
  terminals.clear();
  std::sort(pairs.begin(), pairs.end());

  std::size_t node_id = 0;
  for (std::size_t k = 0; k < pairs.size(); ++k) {
    const UInt32 terminal = pairs[k].first;
    // Pad with cleared flags up to the terminal node.
    for ( ; node_id < terminal; ++node_id) {
      terminal_flags_.push_back(false);
    }
    // A terminal value equal to the previous one leaves node_id already past
    // it, so the flag is set only once per distinct terminal.
    if (node_id == terminal) {
      terminal_flags_.push_back(true);
      ++node_id;
    }
  }
  for ( ; node_id < bases_.size(); ++node_id) {
    terminal_flags_.push_back(false);
  }
  terminal_flags_.push_back(false);
  terminal_flags_.build(false, true);

  for (std::size_t k = 0; k < keyset.size(); ++k) {
    const TerminalIdPair &entry = pairs[k];
    keyset[entry.second].set_id(terminal_flags_.rank1(entry.first));
  }
}
|
|
291
|
+
|
|
292
|
+
template <typename T>
|
|
293
|
+
void LoudsTrie::build_trie(Vector<T> &keys,
|
|
294
|
+
Vector<UInt32> *terminals, const Config &config, std::size_t trie_id) {
|
|
295
|
+
build_current_trie(keys, terminals, config, trie_id);
|
|
296
|
+
|
|
297
|
+
Vector<UInt32> next_terminals;
|
|
298
|
+
if (!keys.empty()) {
|
|
299
|
+
build_next_trie(keys, &next_terminals, config, trie_id);
|
|
300
|
+
}
|
|
301
|
+
|
|
302
|
+
if (next_trie_.get() != NULL) {
|
|
303
|
+
config_.parse((next_trie_->num_tries() + 1) |
|
|
304
|
+
next_trie_->tail_mode() | next_trie_->node_order());
|
|
305
|
+
} else {
|
|
306
|
+
config_.parse(1 | tail_.mode() | config.node_order() |
|
|
307
|
+
config.cache_level());
|
|
308
|
+
}
|
|
309
|
+
|
|
310
|
+
link_flags_.build(false, false);
|
|
311
|
+
std::size_t node_id = 0;
|
|
312
|
+
for (std::size_t i = 0; i < next_terminals.size(); ++i) {
|
|
313
|
+
while (!link_flags_[node_id]) {
|
|
314
|
+
++node_id;
|
|
315
|
+
}
|
|
316
|
+
bases_[node_id] = (UInt8)(next_terminals[i] % 256);
|
|
317
|
+
next_terminals[i] /= 256;
|
|
318
|
+
++node_id;
|
|
319
|
+
}
|
|
320
|
+
extras_.build(next_terminals);
|
|
321
|
+
fill_cache();
|
|
322
|
+
}
|
|
323
|
+
|
|
324
|
+
template <typename T>
|
|
325
|
+
void LoudsTrie::build_current_trie(Vector<T> &keys,
|
|
326
|
+
Vector<UInt32> *terminals, const Config &config,
|
|
327
|
+
std::size_t trie_id) try {
|
|
328
|
+
for (std::size_t i = 0; i < keys.size(); ++i) {
|
|
329
|
+
keys[i].set_id(i);
|
|
330
|
+
}
|
|
331
|
+
const std::size_t num_keys = Algorithm().sort(keys.begin(), keys.end());
|
|
332
|
+
reserve_cache(config, trie_id, num_keys);
|
|
333
|
+
|
|
334
|
+
louds_.push_back(true);
|
|
335
|
+
louds_.push_back(false);
|
|
336
|
+
bases_.push_back('\0');
|
|
337
|
+
link_flags_.push_back(false);
|
|
338
|
+
|
|
339
|
+
Vector<T> next_keys;
|
|
340
|
+
std::queue<Range> queue;
|
|
341
|
+
Vector<WeightedRange> w_ranges;
|
|
342
|
+
|
|
343
|
+
queue.push(make_range(0, keys.size(), 0));
|
|
344
|
+
while (!queue.empty()) {
|
|
345
|
+
const std::size_t node_id = link_flags_.size() - queue.size();
|
|
346
|
+
|
|
347
|
+
Range range = queue.front();
|
|
348
|
+
queue.pop();
|
|
349
|
+
|
|
350
|
+
while ((range.begin() < range.end()) &&
|
|
351
|
+
(keys[range.begin()].length() == range.key_pos())) {
|
|
352
|
+
keys[range.begin()].set_terminal(node_id);
|
|
353
|
+
range.set_begin(range.begin() + 1);
|
|
354
|
+
}
|
|
355
|
+
|
|
356
|
+
if (range.begin() == range.end()) {
|
|
357
|
+
louds_.push_back(false);
|
|
358
|
+
continue;
|
|
359
|
+
}
|
|
360
|
+
|
|
361
|
+
w_ranges.clear();
|
|
362
|
+
double weight = keys[range.begin()].weight();
|
|
363
|
+
for (std::size_t i = range.begin() + 1; i < range.end(); ++i) {
|
|
364
|
+
if (keys[i - 1][range.key_pos()] != keys[i][range.key_pos()]) {
|
|
365
|
+
w_ranges.push_back(make_weighted_range(
|
|
366
|
+
range.begin(), i, range.key_pos(), (float)weight));
|
|
367
|
+
range.set_begin(i);
|
|
368
|
+
weight = 0.0;
|
|
369
|
+
}
|
|
370
|
+
weight += keys[i].weight();
|
|
371
|
+
}
|
|
372
|
+
w_ranges.push_back(make_weighted_range(
|
|
373
|
+
range.begin(), range.end(), range.key_pos(), (float)weight));
|
|
374
|
+
if (config.node_order() == MARISA_WEIGHT_ORDER) {
|
|
375
|
+
std::stable_sort(w_ranges.begin(), w_ranges.end(),
|
|
376
|
+
std::greater<WeightedRange>());
|
|
377
|
+
}
|
|
378
|
+
|
|
379
|
+
if (node_id == 0) {
|
|
380
|
+
num_l1_nodes_ = w_ranges.size();
|
|
381
|
+
}
|
|
382
|
+
|
|
383
|
+
for (std::size_t i = 0; i < w_ranges.size(); ++i) {
|
|
384
|
+
WeightedRange &w_range = w_ranges[i];
|
|
385
|
+
std::size_t key_pos = w_range.key_pos() + 1;
|
|
386
|
+
while (key_pos < keys[w_range.begin()].length()) {
|
|
387
|
+
std::size_t j;
|
|
388
|
+
for (j = w_range.begin() + 1; j < w_range.end(); ++j) {
|
|
389
|
+
if (keys[j - 1][key_pos] != keys[j][key_pos]) {
|
|
390
|
+
break;
|
|
391
|
+
}
|
|
392
|
+
}
|
|
393
|
+
if (j < w_range.end()) {
|
|
394
|
+
break;
|
|
395
|
+
}
|
|
396
|
+
++key_pos;
|
|
397
|
+
}
|
|
398
|
+
cache<T>(node_id, bases_.size(), w_range.weight(),
|
|
399
|
+
keys[w_range.begin()][w_range.key_pos()]);
|
|
400
|
+
|
|
401
|
+
if (key_pos == w_range.key_pos() + 1) {
|
|
402
|
+
bases_.push_back(keys[w_range.begin()][w_range.key_pos()]);
|
|
403
|
+
link_flags_.push_back(false);
|
|
404
|
+
} else {
|
|
405
|
+
bases_.push_back('\0');
|
|
406
|
+
link_flags_.push_back(true);
|
|
407
|
+
T next_key;
|
|
408
|
+
next_key.set_str(keys[w_range.begin()].ptr(),
|
|
409
|
+
keys[w_range.begin()].length());
|
|
410
|
+
next_key.substr(w_range.key_pos(), key_pos - w_range.key_pos());
|
|
411
|
+
next_key.set_weight(w_range.weight());
|
|
412
|
+
next_keys.push_back(next_key);
|
|
413
|
+
}
|
|
414
|
+
w_range.set_key_pos(key_pos);
|
|
415
|
+
queue.push(w_range.range());
|
|
416
|
+
louds_.push_back(true);
|
|
417
|
+
}
|
|
418
|
+
louds_.push_back(false);
|
|
419
|
+
}
|
|
420
|
+
|
|
421
|
+
louds_.push_back(false);
|
|
422
|
+
louds_.build(trie_id == 1, true);
|
|
423
|
+
bases_.shrink();
|
|
424
|
+
|
|
425
|
+
build_terminals(keys, terminals);
|
|
426
|
+
keys.swap(next_keys);
|
|
427
|
+
} catch (const std::bad_alloc &) {
|
|
428
|
+
MARISA_THROW(MARISA_MEMORY_ERROR, "std::bad_alloc");
|
|
429
|
+
}
|
|
430
|
+
|
|
431
|
+
template <>
|
|
432
|
+
void LoudsTrie::build_next_trie(Vector<Key> &keys,
|
|
433
|
+
Vector<UInt32> *terminals, const Config &config, std::size_t trie_id) {
|
|
434
|
+
if (trie_id == config.num_tries()) {
|
|
435
|
+
Vector<Entry> entries;
|
|
436
|
+
entries.resize(keys.size());
|
|
437
|
+
for (std::size_t i = 0; i < keys.size(); ++i) {
|
|
438
|
+
entries[i].set_str(keys[i].ptr(), keys[i].length());
|
|
439
|
+
}
|
|
440
|
+
tail_.build(entries, terminals, config.tail_mode());
|
|
441
|
+
return;
|
|
442
|
+
}
|
|
443
|
+
Vector<ReverseKey> reverse_keys;
|
|
444
|
+
reverse_keys.resize(keys.size());
|
|
445
|
+
for (std::size_t i = 0; i < keys.size(); ++i) {
|
|
446
|
+
reverse_keys[i].set_str(keys[i].ptr(), keys[i].length());
|
|
447
|
+
reverse_keys[i].set_weight(keys[i].weight());
|
|
448
|
+
}
|
|
449
|
+
keys.clear();
|
|
450
|
+
next_trie_.reset(new (std::nothrow) LoudsTrie);
|
|
451
|
+
MARISA_THROW_IF(next_trie_.get() == NULL, MARISA_MEMORY_ERROR);
|
|
452
|
+
next_trie_->build_trie(reverse_keys, terminals, config, trie_id + 1);
|
|
453
|
+
}
|
|
454
|
+
|
|
455
|
+
template <>
|
|
456
|
+
void LoudsTrie::build_next_trie(Vector<ReverseKey> &keys,
|
|
457
|
+
Vector<UInt32> *terminals, const Config &config, std::size_t trie_id) {
|
|
458
|
+
if (trie_id == config.num_tries()) {
|
|
459
|
+
Vector<Entry> entries;
|
|
460
|
+
entries.resize(keys.size());
|
|
461
|
+
for (std::size_t i = 0; i < keys.size(); ++i) {
|
|
462
|
+
entries[i].set_str(keys[i].ptr(), keys[i].length());
|
|
463
|
+
}
|
|
464
|
+
tail_.build(entries, terminals, config.tail_mode());
|
|
465
|
+
return;
|
|
466
|
+
}
|
|
467
|
+
next_trie_.reset(new (std::nothrow) LoudsTrie);
|
|
468
|
+
MARISA_THROW_IF(next_trie_.get() == NULL, MARISA_MEMORY_ERROR);
|
|
469
|
+
next_trie_->build_trie(keys, terminals, config, trie_id + 1);
|
|
470
|
+
}
|
|
471
|
+
|
|
472
|
+
template <typename T>
|
|
473
|
+
void LoudsTrie::build_terminals(const Vector<T> &keys,
|
|
474
|
+
Vector<UInt32> *terminals) const {
|
|
475
|
+
Vector<UInt32> temp;
|
|
476
|
+
temp.resize(keys.size());
|
|
477
|
+
for (std::size_t i = 0; i < keys.size(); ++i) {
|
|
478
|
+
temp[keys[i].id()] = (UInt32)keys[i].terminal();
|
|
479
|
+
}
|
|
480
|
+
terminals->swap(temp);
|
|
481
|
+
}
|
|
482
|
+
|
|
483
|
+
template <>
|
|
484
|
+
void LoudsTrie::cache<Key>(std::size_t parent, std::size_t child,
|
|
485
|
+
float weight, char label) {
|
|
486
|
+
MARISA_DEBUG_IF(parent >= child, MARISA_RANGE_ERROR);
|
|
487
|
+
|
|
488
|
+
const std::size_t cache_id = get_cache_id(parent, label);
|
|
489
|
+
if (weight > cache_[cache_id].weight()) {
|
|
490
|
+
cache_[cache_id].set_parent(parent);
|
|
491
|
+
cache_[cache_id].set_child(child);
|
|
492
|
+
cache_[cache_id].set_weight(weight);
|
|
493
|
+
}
|
|
494
|
+
}
|
|
495
|
+
|
|
496
|
+
void LoudsTrie::reserve_cache(const Config &config, std::size_t trie_id,
|
|
497
|
+
std::size_t num_keys) {
|
|
498
|
+
std::size_t cache_size = (trie_id == 1) ? 256 : 1;
|
|
499
|
+
while (cache_size < (num_keys / config.cache_level())) {
|
|
500
|
+
cache_size *= 2;
|
|
501
|
+
}
|
|
502
|
+
cache_.resize(cache_size);
|
|
503
|
+
cache_mask_ = cache_size - 1;
|
|
504
|
+
}
|
|
505
|
+
|
|
506
|
+
template <>
|
|
507
|
+
void LoudsTrie::cache<ReverseKey>(std::size_t parent, std::size_t child,
|
|
508
|
+
float weight, char) {
|
|
509
|
+
MARISA_DEBUG_IF(parent >= child, MARISA_RANGE_ERROR);
|
|
510
|
+
|
|
511
|
+
const std::size_t cache_id = get_cache_id(child);
|
|
512
|
+
if (weight > cache_[cache_id].weight()) {
|
|
513
|
+
cache_[cache_id].set_parent(parent);
|
|
514
|
+
cache_[cache_id].set_child(child);
|
|
515
|
+
cache_[cache_id].set_weight(weight);
|
|
516
|
+
}
|
|
517
|
+
}
|
|
518
|
+
|
|
519
|
+
void LoudsTrie::fill_cache() {
|
|
520
|
+
for (std::size_t i = 0; i < cache_.size(); ++i) {
|
|
521
|
+
const std::size_t node_id = cache_[i].child();
|
|
522
|
+
if (node_id != 0) {
|
|
523
|
+
cache_[i].set_base(bases_[node_id]);
|
|
524
|
+
cache_[i].set_extra(!link_flags_[node_id] ?
|
|
525
|
+
MARISA_INVALID_EXTRA : extras_[link_flags_.rank1(node_id)]);
|
|
526
|
+
} else {
|
|
527
|
+
cache_[i].set_parent(MARISA_UINT32_MAX);
|
|
528
|
+
cache_[i].set_child(MARISA_UINT32_MAX);
|
|
529
|
+
}
|
|
530
|
+
}
|
|
531
|
+
}
|
|
532
|
+
|
|
533
|
+
// Maps every member of this trie from `mapper`, in a fixed order: the bit
// and value vectors, the tail, an optional next trie, the cache, and finally
// two UInt32 values holding num_l1_nodes_ and the config flags.
//
// A next trie is mapped only when link_flags_ contains at least one set bit
// and the tail turned out to be empty.
void LoudsTrie::map_(Mapper &mapper) {
  louds_.map(mapper);
  terminal_flags_.map(mapper);
  link_flags_.map(mapper);
  bases_.map(mapper);
  extras_.map(mapper);
  tail_.map(mapper);

  const bool has_next_trie = (link_flags_.num_1s() != 0) && tail_.empty();
  if (has_next_trie) {
    next_trie_.reset(new (std::nothrow) LoudsTrie);
    MARISA_THROW_IF(next_trie_.get() == NULL, MARISA_MEMORY_ERROR);
    next_trie_->map_(mapper);
  }

  cache_.map(mapper);
  cache_mask_ = cache_.size() - 1;

  UInt32 stored_num_l1_nodes;
  mapper.map(&stored_num_l1_nodes);
  num_l1_nodes_ = stored_num_l1_nodes;

  UInt32 stored_config_flags;
  mapper.map(&stored_config_flags);
  config_.parse(static_cast<int>(stored_config_flags));
}
|
|
558
|
+
|
|
559
|
+
// Reads every member of this trie from `reader`, in a fixed order: the bit
// and value vectors, the tail, an optional next trie, the cache, and finally
// two UInt32 values holding num_l1_nodes_ and the config flags.
//
// A next trie is read only when link_flags_ contains at least one set bit
// and the tail turned out to be empty.
void LoudsTrie::read_(Reader &reader) {
  louds_.read(reader);
  terminal_flags_.read(reader);
  link_flags_.read(reader);
  bases_.read(reader);
  extras_.read(reader);
  tail_.read(reader);

  const bool has_next_trie = (link_flags_.num_1s() != 0) && tail_.empty();
  if (has_next_trie) {
    next_trie_.reset(new (std::nothrow) LoudsTrie);
    MARISA_THROW_IF(next_trie_.get() == NULL, MARISA_MEMORY_ERROR);
    next_trie_->read_(reader);
  }

  cache_.read(reader);
  cache_mask_ = cache_.size() - 1;

  UInt32 stored_num_l1_nodes;
  reader.read(&stored_num_l1_nodes);
  num_l1_nodes_ = stored_num_l1_nodes;

  UInt32 stored_config_flags;
  reader.read(&stored_config_flags);
  config_.parse(static_cast<int>(stored_config_flags));
}
|
|
584
|
+
|
|
585
|
+
// Writes every member of this trie to `writer`, in a fixed order: the bit
// and value vectors, the tail, the next trie if one exists, the cache, and
// finally num_l1_nodes_ and the config flags, each as a UInt32.
void LoudsTrie::write_(Writer &writer) const {
  louds_.write(writer);
  terminal_flags_.write(writer);
  link_flags_.write(writer);
  bases_.write(writer);
  extras_.write(writer);
  tail_.write(writer);

  const bool has_next_trie = (next_trie_.get() != NULL);
  if (has_next_trie) {
    next_trie_->write_(writer);
  }

  cache_.write(writer);
  writer.write(static_cast<UInt32>(num_l1_nodes_));
  writer.write(static_cast<UInt32>(config_.flags()));
}
|
|
599
|
+
|
|
600
|
+
// Tries to step from the agent's current node to a child that matches the
// query at the current query position.
//
// Returns true after updating the state's node id (and query position) on
// success, false when no child matches. The cache is consulted first; on a
// miss the children are scanned one by one through louds_.
bool LoudsTrie::find_child(Agent &agent) const {
  MARISA_DEBUG_IF(agent.state().query_pos() >= agent.query().length(),
      MARISA_BOUND_ERROR);

  State &state = agent.state();

  // Cache lookup keyed by (current node, next query character).
  const std::size_t slot = get_cache_id(state.node_id(),
      agent.query()[state.query_pos()]);
  if (state.node_id() == cache_[slot].parent()) {
    if (cache_[slot].extra() == MARISA_INVALID_EXTRA) {
      // No link on this entry: consume a single query character.
      state.set_query_pos(state.query_pos() + 1);
    } else if (!match(agent, cache_[slot].link())) {
      return false;
    }
    state.set_node_id(cache_[slot].child());
    return true;
  }

  std::size_t louds_pos = louds_.select0(state.node_id()) + 1;
  if (!louds_[louds_pos]) {
    return false;  // The bit after the node's 0 is clear: nothing to scan.
  }
  state.set_node_id(louds_pos - state.node_id() - 1);

  std::size_t link_id = MARISA_INVALID_LINK_ID;
  for ( ; ; ) {
    if (!link_flags_[state.node_id()]) {
      if (bases_[state.node_id()] ==
          static_cast<UInt8>(agent.query()[state.query_pos()])) {
        state.set_query_pos(state.query_pos() + 1);
        return true;
      }
    } else {
      link_id = update_link_id(link_id, state.node_id());
      const std::size_t saved_query_pos = state.query_pos();
      if (match(agent, get_link(state.node_id(), link_id))) {
        return true;
      }
      if (state.query_pos() != saved_query_pos) {
        // The failed match moved the query position; stop searching.
        return false;
      }
    }

    // Advance to the next sibling, stopping at the first clear bit.
    state.set_node_id(state.node_id() + 1);
    ++louds_pos;
    if (!louds_[louds_pos]) {
      return false;
    }
  }
}
|
|
644
|
+
|
|
645
|
+
// Tries to step from the agent's current node to a child that matches the
// query at the current query position, appending consumed single-character
// labels to the state's key buffer and using prefix_match() for links.
//
// Returns true after updating the state's node id on success, false when no
// child matches. The cache is consulted first; on a miss the children are
// scanned one by one through louds_.
bool LoudsTrie::predictive_find_child(Agent &agent) const {
  MARISA_DEBUG_IF(agent.state().query_pos() >= agent.query().length(),
      MARISA_BOUND_ERROR);

  State &state = agent.state();

  // Cache lookup keyed by (current node, next query character).
  const std::size_t slot = get_cache_id(state.node_id(),
      agent.query()[state.query_pos()]);
  if (state.node_id() == cache_[slot].parent()) {
    if (cache_[slot].extra() == MARISA_INVALID_EXTRA) {
      // No link on this entry: record its label and consume one character.
      state.key_buf().push_back(cache_[slot].label());
      state.set_query_pos(state.query_pos() + 1);
    } else if (!prefix_match(agent, cache_[slot].link())) {
      return false;
    }
    state.set_node_id(cache_[slot].child());
    return true;
  }

  std::size_t louds_pos = louds_.select0(state.node_id()) + 1;
  if (!louds_[louds_pos]) {
    return false;  // The bit after the node's 0 is clear: nothing to scan.
  }
  state.set_node_id(louds_pos - state.node_id() - 1);

  std::size_t link_id = MARISA_INVALID_LINK_ID;
  for ( ; ; ) {
    if (!link_flags_[state.node_id()]) {
      if (bases_[state.node_id()] ==
          static_cast<UInt8>(agent.query()[state.query_pos()])) {
        state.key_buf().push_back(static_cast<char>(bases_[state.node_id()]));
        state.set_query_pos(state.query_pos() + 1);
        return true;
      }
    } else {
      link_id = update_link_id(link_id, state.node_id());
      const std::size_t saved_query_pos = state.query_pos();
      if (prefix_match(agent, get_link(state.node_id(), link_id))) {
        return true;
      }
      if (state.query_pos() != saved_query_pos) {
        // The failed match moved the query position; stop searching.
        return false;
      }
    }

    // Advance to the next sibling, stopping at the first clear bit.
    state.set_node_id(state.node_id() + 1);
    ++louds_pos;
    if (!louds_[louds_pos]) {
      return false;
    }
  }
}
|
|
691
|
+
|
|
692
|
+
// Restores the label identified by `link`, delegating to the next trie when
// one exists and to the tail otherwise.
void LoudsTrie::restore(Agent &agent, std::size_t link) const {
  if (next_trie_.get() == NULL) {
    tail_.restore(agent, link);
    return;
  }
  next_trie_->restore_(agent, link);
}
|
|
699
|
+
|
|
700
|
+
// Matches the label identified by `link` against the agent's query,
// delegating to the next trie when one exists and to the tail otherwise.
// Returns the delegate's result.
bool LoudsTrie::match(Agent &agent, std::size_t link) const {
  if (next_trie_.get() == NULL) {
    return tail_.match(agent, link);
  }
  return next_trie_->match_(agent, link);
}
|
|
707
|
+
|
|
708
|
+
// Prefix-matches the label identified by `link` against the agent's query,
// delegating to the next trie when one exists and to the tail otherwise.
// Returns the delegate's result.
bool LoudsTrie::prefix_match(Agent &agent, std::size_t link) const {
  if (next_trie_.get() == NULL) {
    return tail_.prefix_match(agent, link);
  }
  return next_trie_->prefix_match_(agent, link);
}
|
|
715
|
+
|
|
716
|
+
// Appends the label stored along the path from `node_id` towards node 0 to
// the state's key buffer, visiting `node_id` first and then each parent.
//
// A cached node takes its label (or link) and parent from the cache; an
// uncached node takes them from bases_/link_flags_ and louds_. The walk ends
// when a cached parent is 0 or when an uncached node id is not greater than
// num_l1_nodes_.
void LoudsTrie::restore_(Agent &agent, std::size_t node_id) const {
  MARISA_DEBUG_IF(node_id == 0, MARISA_RANGE_ERROR);

  State &state = agent.state();
  while (true) {
    const std::size_t slot = get_cache_id(node_id);

    if (node_id != cache_[slot].child()) {
      // Cache miss: read the label from the trie itself.
      if (!link_flags_[node_id]) {
        state.key_buf().push_back(static_cast<char>(bases_[node_id]));
      } else {
        restore(agent, get_link(node_id));
      }

      if (node_id <= num_l1_nodes_) {
        return;
      }
      node_id = louds_.select1(node_id) - node_id - 1;
    } else {
      // Cache hit: label/link and parent come straight from the slot.
      if (cache_[slot].extra() == MARISA_INVALID_EXTRA) {
        state.key_buf().push_back(cache_[slot].label());
      } else {
        restore(agent, cache_[slot].link());
      }

      node_id = cache_[slot].parent();
      if (node_id == 0) {
        return;
      }
    }
  }
}
|
|
748
|
+
|
|
749
|
+
// Matches the label stored along the path from `node_id` towards node 0
// against the agent's query, starting at the state's query position.
//
// Returns true when the walk reaches its end (cached parent 0, or an
// uncached node id not greater than num_l1_nodes_) with every label matched.
// Returns false on the first mismatch, or when the query is exhausted while
// the walk still has nodes to visit.
bool LoudsTrie::match_(Agent &agent, std::size_t node_id) const {
  MARISA_DEBUG_IF(agent.state().query_pos() >= agent.query().length(),
      MARISA_BOUND_ERROR);
  MARISA_DEBUG_IF(node_id == 0, MARISA_RANGE_ERROR);

  State &state = agent.state();
  while (true) {
    const std::size_t slot = get_cache_id(node_id);

    if (node_id != cache_[slot].child()) {
      // Cache miss: compare against the trie itself. match() picks the next
      // trie when one exists and the tail otherwise.
      if (link_flags_[node_id]) {
        if (!match(agent, get_link(node_id))) {
          return false;
        }
      } else {
        if (bases_[node_id] !=
            static_cast<UInt8>(agent.query()[state.query_pos()])) {
          return false;
        }
        state.set_query_pos(state.query_pos() + 1);
      }

      if (node_id <= num_l1_nodes_) {
        return true;
      }
      if (state.query_pos() >= agent.query().length()) {
        return false;
      }
      node_id = louds_.select1(node_id) - node_id - 1;
    } else {
      // Cache hit: label/link and parent come straight from the slot.
      if (cache_[slot].extra() != MARISA_INVALID_EXTRA) {
        if (!match(agent, cache_[slot].link())) {
          return false;
        }
      } else {
        if (cache_[slot].label() != agent.query()[state.query_pos()]) {
          return false;
        }
        state.set_query_pos(state.query_pos() + 1);
      }

      node_id = cache_[slot].parent();
      if (node_id == 0) {
        return true;
      }
      if (state.query_pos() >= agent.query().length()) {
        return false;
      }
    }
  }
}
|
|
800
|
+
|
|
801
|
+
// Matches the query, from the state's query position, against the label
// stored along the path from `node_id` towards node 0, appending matched
// characters to the state's key buffer.
//
// Returns false on the first mismatch. Returns true when the walk reaches
// its end with every label matched, or when the query is exhausted first;
// in the latter case the rest of the label is appended via restore_().
bool LoudsTrie::prefix_match_(Agent &agent, std::size_t node_id) const {
  MARISA_DEBUG_IF(agent.state().query_pos() >= agent.query().length(),
      MARISA_BOUND_ERROR);
  MARISA_DEBUG_IF(node_id == 0, MARISA_RANGE_ERROR);

  State &state = agent.state();
  while (true) {
    const std::size_t slot = get_cache_id(node_id);

    if (node_id != cache_[slot].child()) {
      // Cache miss: compare against the trie itself.
      if (link_flags_[node_id]) {
        if (!prefix_match(agent, get_link(node_id))) {
          return false;
        }
      } else {
        if (bases_[node_id] !=
            static_cast<UInt8>(agent.query()[state.query_pos()])) {
          return false;
        }
        state.key_buf().push_back(static_cast<char>(bases_[node_id]));
        state.set_query_pos(state.query_pos() + 1);
      }

      if (node_id <= num_l1_nodes_) {
        return true;
      }
      node_id = louds_.select1(node_id) - node_id - 1;
    } else {
      // Cache hit: label/link and parent come straight from the slot.
      if (cache_[slot].extra() != MARISA_INVALID_EXTRA) {
        if (!prefix_match(agent, cache_[slot].link())) {
          return false;
        }
      } else {
        if (cache_[slot].label() != agent.query()[state.query_pos()]) {
          return false;
        }
        state.key_buf().push_back(cache_[slot].label());
        state.set_query_pos(state.query_pos() + 1);
      }

      node_id = cache_[slot].parent();
      if (node_id == 0) {
        return true;
      }
    }

    // Query consumed but nodes remain: the remaining label is appended
    // without further comparison.
    if (state.query_pos() >= agent.query().length()) {
      restore_(agent, node_id);
      return true;
    }
  }
}
|
|
850
|
+
|
|
851
|
+
std::size_t LoudsTrie::get_cache_id(std::size_t node_id, char label) const {
|
|
852
|
+
return (node_id ^ (node_id << 5) ^ (UInt8)label) & cache_mask_;
|
|
853
|
+
}
|
|
854
|
+
|
|
855
|
+
std::size_t LoudsTrie::get_cache_id(std::size_t node_id) const {
|
|
856
|
+
return node_id & cache_mask_;
|
|
857
|
+
}
|
|
858
|
+
|
|
859
|
+
std::size_t LoudsTrie::get_link(std::size_t node_id) const {
|
|
860
|
+
return bases_[node_id] | (extras_[link_flags_.rank1(node_id)] * 256);
|
|
861
|
+
}
|
|
862
|
+
|
|
863
|
+
std::size_t LoudsTrie::get_link(std::size_t node_id,
|
|
864
|
+
std::size_t link_id) const {
|
|
865
|
+
return bases_[node_id] | (extras_[link_id] * 256);
|
|
866
|
+
}
|
|
867
|
+
|
|
868
|
+
std::size_t LoudsTrie::update_link_id(std::size_t link_id,
|
|
869
|
+
std::size_t node_id) const {
|
|
870
|
+
return (link_id == MARISA_INVALID_LINK_ID) ?
|
|
871
|
+
link_flags_.rank1(node_id) : (link_id + 1);
|
|
872
|
+
}
|
|
873
|
+
|
|
874
|
+
} // namespace trie
|
|
875
|
+
} // namespace grimoire
|
|
876
|
+
} // namespace marisa
|