melisa 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +11 -0
- data/ext/marisa/bindings/marisa-swig.cxx +253 -0
- data/ext/marisa/bindings/marisa-swig.h +183 -0
- data/ext/marisa/bindings/perl/marisa-swig.cxx +253 -0
- data/ext/marisa/bindings/perl/marisa-swig.h +183 -0
- data/ext/marisa/bindings/perl/marisa-swig_wrap.cxx +5160 -0
- data/ext/marisa/bindings/python/marisa-swig.cxx +253 -0
- data/ext/marisa/bindings/python/marisa-swig.h +183 -0
- data/ext/marisa/bindings/python/marisa-swig_wrap.cxx +6090 -0
- data/ext/marisa/bindings/ruby/extconf.rb +5 -0
- data/ext/marisa/bindings/ruby/marisa-swig.cxx +253 -0
- data/ext/marisa/bindings/ruby/marisa-swig.h +183 -0
- data/ext/marisa/bindings/ruby/marisa-swig_wrap.cxx +4708 -0
- data/ext/marisa/lib/marisa.h +14 -0
- data/ext/marisa/lib/marisa/agent.cc +51 -0
- data/ext/marisa/lib/marisa/agent.h +73 -0
- data/ext/marisa/lib/marisa/base.h +193 -0
- data/ext/marisa/lib/marisa/exception.h +82 -0
- data/ext/marisa/lib/marisa/grimoire/algorithm.h +26 -0
- data/ext/marisa/lib/marisa/grimoire/algorithm/sort.h +196 -0
- data/ext/marisa/lib/marisa/grimoire/intrin.h +115 -0
- data/ext/marisa/lib/marisa/grimoire/io.h +18 -0
- data/ext/marisa/lib/marisa/grimoire/io/mapper.cc +163 -0
- data/ext/marisa/lib/marisa/grimoire/io/mapper.h +67 -0
- data/ext/marisa/lib/marisa/grimoire/io/reader.cc +147 -0
- data/ext/marisa/lib/marisa/grimoire/io/reader.h +66 -0
- data/ext/marisa/lib/marisa/grimoire/io/writer.cc +148 -0
- data/ext/marisa/lib/marisa/grimoire/io/writer.h +65 -0
- data/ext/marisa/lib/marisa/grimoire/trie.h +16 -0
- data/ext/marisa/lib/marisa/grimoire/trie/cache.h +81 -0
- data/ext/marisa/lib/marisa/grimoire/trie/config.h +155 -0
- data/ext/marisa/lib/marisa/grimoire/trie/entry.h +82 -0
- data/ext/marisa/lib/marisa/grimoire/trie/header.h +61 -0
- data/ext/marisa/lib/marisa/grimoire/trie/history.h +65 -0
- data/ext/marisa/lib/marisa/grimoire/trie/key.h +228 -0
- data/ext/marisa/lib/marisa/grimoire/trie/louds-trie.cc +876 -0
- data/ext/marisa/lib/marisa/grimoire/trie/louds-trie.h +134 -0
- data/ext/marisa/lib/marisa/grimoire/trie/range.h +115 -0
- data/ext/marisa/lib/marisa/grimoire/trie/state.h +117 -0
- data/ext/marisa/lib/marisa/grimoire/trie/tail.cc +218 -0
- data/ext/marisa/lib/marisa/grimoire/trie/tail.h +72 -0
- data/ext/marisa/lib/marisa/grimoire/vector.h +18 -0
- data/ext/marisa/lib/marisa/grimoire/vector/bit-vector.cc +826 -0
- data/ext/marisa/lib/marisa/grimoire/vector/bit-vector.h +179 -0
- data/ext/marisa/lib/marisa/grimoire/vector/flat-vector.h +205 -0
- data/ext/marisa/lib/marisa/grimoire/vector/pop-count.h +110 -0
- data/ext/marisa/lib/marisa/grimoire/vector/rank-index.h +82 -0
- data/ext/marisa/lib/marisa/grimoire/vector/vector.h +256 -0
- data/ext/marisa/lib/marisa/iostream.h +18 -0
- data/ext/marisa/lib/marisa/key.h +85 -0
- data/ext/marisa/lib/marisa/keyset.cc +181 -0
- data/ext/marisa/lib/marisa/keyset.h +80 -0
- data/ext/marisa/lib/marisa/query.h +71 -0
- data/ext/marisa/lib/marisa/scoped-array.h +48 -0
- data/ext/marisa/lib/marisa/scoped-ptr.h +52 -0
- data/ext/marisa/lib/marisa/stdio.h +15 -0
- data/ext/marisa/lib/marisa/trie.cc +249 -0
- data/ext/marisa/lib/marisa/trie.h +64 -0
- data/ext/marisa/tests/base-test.cc +309 -0
- data/ext/marisa/tests/io-test.cc +252 -0
- data/ext/marisa/tests/marisa-assert.h +26 -0
- data/ext/marisa/tests/marisa-test.cc +388 -0
- data/ext/marisa/tests/trie-test.cc +507 -0
- data/ext/marisa/tests/vector-test.cc +466 -0
- data/ext/marisa/tools/cmdopt.cc +298 -0
- data/ext/marisa/tools/cmdopt.h +58 -0
- data/ext/marisa/tools/marisa-benchmark.cc +418 -0
- data/ext/marisa/tools/marisa-build.cc +206 -0
- data/ext/marisa/tools/marisa-common-prefix-search.cc +143 -0
- data/ext/marisa/tools/marisa-dump.cc +151 -0
- data/ext/marisa/tools/marisa-lookup.cc +110 -0
- data/ext/marisa/tools/marisa-predictive-search.cc +143 -0
- data/ext/marisa/tools/marisa-reverse-lookup.cc +110 -0
- data/lib/melisa.rb +7 -0
- data/lib/melisa/base_config_flags.rb +76 -0
- data/lib/melisa/bytes_trie.rb +55 -0
- data/lib/melisa/int_trie.rb +14 -0
- data/lib/melisa/search.rb +55 -0
- data/lib/melisa/trie.rb +96 -0
- data/lib/melisa/version.rb +3 -0
- data/melisa.gemspec +36 -0
- data/spec/base_config_flags_spec.rb +73 -0
- data/spec/bytes_trie_spec.rb +16 -0
- data/spec/int_trie_spec.rb +16 -0
- data/spec/search_spec.rb +29 -0
- data/spec/spec_helper.rb +1 -0
- data/spec/trie_spec.rb +30 -0
- metadata +207 -0
@@ -0,0 +1,134 @@
|
|
1
|
+
#ifndef MARISA_GRIMOIRE_TRIE_LOUDS_TRIE_H_
|
2
|
+
#define MARISA_GRIMOIRE_TRIE_LOUDS_TRIE_H_
|
3
|
+
|
4
|
+
#include "marisa/keyset.h"
|
5
|
+
#include "marisa/agent.h"
|
6
|
+
#include "marisa/grimoire/vector.h"
|
7
|
+
#include "marisa/grimoire/trie/config.h"
|
8
|
+
#include "marisa/grimoire/trie/key.h"
|
9
|
+
#include "marisa/grimoire/trie/tail.h"
|
10
|
+
#include "marisa/grimoire/trie/cache.h"
|
11
|
+
|
12
|
+
namespace marisa {
|
13
|
+
namespace grimoire {
|
14
|
+
namespace trie {
|
15
|
+
|
16
|
+
class LoudsTrie {
|
17
|
+
public:
|
18
|
+
LoudsTrie();
|
19
|
+
~LoudsTrie();
|
20
|
+
|
21
|
+
void build(Keyset &keyset, int flags);
|
22
|
+
|
23
|
+
void map(Mapper &mapper);
|
24
|
+
void read(Reader &reader);
|
25
|
+
void write(Writer &writer) const;
|
26
|
+
|
27
|
+
bool lookup(Agent &agent) const;
|
28
|
+
void reverse_lookup(Agent &agent) const;
|
29
|
+
bool common_prefix_search(Agent &agent) const;
|
30
|
+
bool predictive_search(Agent &agent) const;
|
31
|
+
|
32
|
+
std::size_t num_tries() const {
|
33
|
+
return config_.num_tries();
|
34
|
+
}
|
35
|
+
std::size_t num_keys() const {
|
36
|
+
return size();
|
37
|
+
}
|
38
|
+
std::size_t num_nodes() const {
|
39
|
+
return (louds_.size() / 2) - 1;
|
40
|
+
}
|
41
|
+
|
42
|
+
CacheLevel cache_level() const {
|
43
|
+
return config_.cache_level();
|
44
|
+
}
|
45
|
+
TailMode tail_mode() const {
|
46
|
+
return config_.tail_mode();
|
47
|
+
}
|
48
|
+
NodeOrder node_order() const {
|
49
|
+
return config_.node_order();
|
50
|
+
}
|
51
|
+
|
52
|
+
bool empty() const {
|
53
|
+
return size() == 0;
|
54
|
+
}
|
55
|
+
std::size_t size() const {
|
56
|
+
return terminal_flags_.num_1s();
|
57
|
+
}
|
58
|
+
std::size_t total_size() const;
|
59
|
+
std::size_t io_size() const;
|
60
|
+
|
61
|
+
void clear();
|
62
|
+
void swap(LoudsTrie &rhs);
|
63
|
+
|
64
|
+
private:
|
65
|
+
BitVector louds_;
|
66
|
+
BitVector terminal_flags_;
|
67
|
+
BitVector link_flags_;
|
68
|
+
Vector<UInt8> bases_;
|
69
|
+
FlatVector extras_;
|
70
|
+
Tail tail_;
|
71
|
+
scoped_ptr<LoudsTrie> next_trie_;
|
72
|
+
Vector<Cache> cache_;
|
73
|
+
std::size_t cache_mask_;
|
74
|
+
std::size_t num_l1_nodes_;
|
75
|
+
Config config_;
|
76
|
+
Mapper mapper_;
|
77
|
+
|
78
|
+
void build_(Keyset &keyset, const Config &config);
|
79
|
+
|
80
|
+
template <typename T>
|
81
|
+
void build_trie(Vector<T> &keys,
|
82
|
+
Vector<UInt32> *terminals, const Config &config, std::size_t trie_id);
|
83
|
+
template <typename T>
|
84
|
+
void build_current_trie(Vector<T> &keys,
|
85
|
+
Vector<UInt32> *terminals, const Config &config, std::size_t trie_id);
|
86
|
+
template <typename T>
|
87
|
+
void build_next_trie(Vector<T> &keys,
|
88
|
+
Vector<UInt32> *terminals, const Config &config, std::size_t trie_id);
|
89
|
+
template <typename T>
|
90
|
+
void build_terminals(const Vector<T> &keys,
|
91
|
+
Vector<UInt32> *terminals) const;
|
92
|
+
|
93
|
+
void reserve_cache(const Config &config, std::size_t trie_id,
|
94
|
+
std::size_t num_keys);
|
95
|
+
template <typename T>
|
96
|
+
void cache(std::size_t parent, std::size_t child,
|
97
|
+
float weight, char label);
|
98
|
+
void fill_cache();
|
99
|
+
|
100
|
+
void map_(Mapper &mapper);
|
101
|
+
void read_(Reader &reader);
|
102
|
+
void write_(Writer &writer) const;
|
103
|
+
|
104
|
+
inline bool find_child(Agent &agent) const;
|
105
|
+
inline bool predictive_find_child(Agent &agent) const;
|
106
|
+
|
107
|
+
inline void restore(Agent &agent, std::size_t node_id) const;
|
108
|
+
inline bool match(Agent &agent, std::size_t node_id) const;
|
109
|
+
inline bool prefix_match(Agent &agent, std::size_t node_id) const;
|
110
|
+
|
111
|
+
void restore_(Agent &agent, std::size_t node_id) const;
|
112
|
+
bool match_(Agent &agent, std::size_t node_id) const;
|
113
|
+
bool prefix_match_(Agent &agent, std::size_t node_id) const;
|
114
|
+
|
115
|
+
inline std::size_t get_cache_id(std::size_t node_id, char label) const;
|
116
|
+
inline std::size_t get_cache_id(std::size_t node_id) const;
|
117
|
+
|
118
|
+
inline std::size_t get_link(std::size_t node_id) const;
|
119
|
+
inline std::size_t get_link(std::size_t node_id,
|
120
|
+
std::size_t link_id) const;
|
121
|
+
|
122
|
+
inline std::size_t update_link_id(std::size_t link_id,
|
123
|
+
std::size_t node_id) const;
|
124
|
+
|
125
|
+
// Disallows copy and assignment.
|
126
|
+
LoudsTrie(const LoudsTrie &);
|
127
|
+
LoudsTrie &operator=(const LoudsTrie &);
|
128
|
+
};
|
129
|
+
|
130
|
+
} // namespace trie
|
131
|
+
} // namespace grimoire
|
132
|
+
} // namespace marisa
|
133
|
+
|
134
|
+
#endif // MARISA_GRIMOIRE_TRIE_LOUDS_TRIE_H_
|
@@ -0,0 +1,115 @@
|
|
1
|
+
#ifndef MARISA_GRIMOIRE_TRIE_RANGE_H_
|
2
|
+
#define MARISA_GRIMOIRE_TRIE_RANGE_H_
|
3
|
+
|
4
|
+
#include "marisa/base.h"
|
5
|
+
|
6
|
+
namespace marisa {
|
7
|
+
namespace grimoire {
|
8
|
+
namespace trie {
|
9
|
+
|
10
|
+
class Range {
|
11
|
+
public:
|
12
|
+
Range() : begin_(0), end_(0), key_pos_(0) {}
|
13
|
+
|
14
|
+
void set_begin(std::size_t begin) {
|
15
|
+
MARISA_DEBUG_IF(begin > MARISA_UINT32_MAX, MARISA_SIZE_ERROR);
|
16
|
+
begin_ = begin;
|
17
|
+
}
|
18
|
+
void set_end(std::size_t end) {
|
19
|
+
MARISA_DEBUG_IF(end > MARISA_UINT32_MAX, MARISA_SIZE_ERROR);
|
20
|
+
end_ = end;
|
21
|
+
}
|
22
|
+
void set_key_pos(std::size_t key_pos) {
|
23
|
+
MARISA_DEBUG_IF(key_pos > MARISA_UINT32_MAX, MARISA_SIZE_ERROR);
|
24
|
+
key_pos_ = key_pos;
|
25
|
+
}
|
26
|
+
|
27
|
+
std::size_t begin() const {
|
28
|
+
return begin_;
|
29
|
+
}
|
30
|
+
std::size_t end() const {
|
31
|
+
return end_;
|
32
|
+
}
|
33
|
+
std::size_t key_pos() const {
|
34
|
+
return key_pos_;
|
35
|
+
}
|
36
|
+
|
37
|
+
private:
|
38
|
+
UInt32 begin_;
|
39
|
+
UInt32 end_;
|
40
|
+
UInt32 key_pos_;
|
41
|
+
};
|
42
|
+
|
43
|
+
// Convenience factory: returns a Range whose begin, end and key position are
// set (in that order) through the checked setters.
inline Range make_range(std::size_t begin, std::size_t end,
    std::size_t key_pos) {
  Range result;
  result.set_begin(begin);
  result.set_end(end);
  result.set_key_pos(key_pos);
  return result;
}
|
51
|
+
|
52
|
+
class WeightedRange {
|
53
|
+
public:
|
54
|
+
WeightedRange() : range_(), weight_(0.0F) {}
|
55
|
+
|
56
|
+
void set_range(const Range &range) {
|
57
|
+
range_ = range;
|
58
|
+
}
|
59
|
+
void set_begin(std::size_t begin) {
|
60
|
+
range_.set_begin(begin);
|
61
|
+
}
|
62
|
+
void set_end(std::size_t end) {
|
63
|
+
range_.set_end(end);
|
64
|
+
}
|
65
|
+
void set_key_pos(std::size_t key_pos) {
|
66
|
+
range_.set_key_pos(key_pos);
|
67
|
+
}
|
68
|
+
void set_weight(float weight) {
|
69
|
+
weight_ = weight;
|
70
|
+
}
|
71
|
+
|
72
|
+
const Range &range() const {
|
73
|
+
return range_;
|
74
|
+
}
|
75
|
+
std::size_t begin() const {
|
76
|
+
return range_.begin();
|
77
|
+
}
|
78
|
+
std::size_t end() const {
|
79
|
+
return range_.end();
|
80
|
+
}
|
81
|
+
std::size_t key_pos() const {
|
82
|
+
return range_.key_pos();
|
83
|
+
}
|
84
|
+
float weight() const {
|
85
|
+
return weight_;
|
86
|
+
}
|
87
|
+
|
88
|
+
private:
|
89
|
+
Range range_;
|
90
|
+
float weight_;
|
91
|
+
};
|
92
|
+
|
93
|
+
// Compares two weighted ranges by weight only; the range fields do not take
// part in the comparison.
inline bool operator<(const WeightedRange &lhs, const WeightedRange &rhs) {
  return rhs.weight() > lhs.weight();
}
|
96
|
+
|
97
|
+
// Counterpart of operator<: true when lhs has the strictly larger weight.
// The range fields do not take part in the comparison.
inline bool operator>(const WeightedRange &lhs, const WeightedRange &rhs) {
  return rhs.weight() < lhs.weight();
}
|
100
|
+
|
101
|
+
// Convenience factory: returns a WeightedRange with the given range fields
// and weight, assigned through the setters in the order begin, end,
// key position, weight.
inline WeightedRange make_weighted_range(std::size_t begin, std::size_t end,
    std::size_t key_pos, float weight) {
  WeightedRange result;
  result.set_begin(begin);
  result.set_end(end);
  result.set_key_pos(key_pos);
  result.set_weight(weight);
  return result;
}
|
110
|
+
|
111
|
+
} // namespace trie
|
112
|
+
} // namespace grimoire
|
113
|
+
} // namespace marisa
|
114
|
+
|
115
|
+
#endif // MARISA_GRIMOIRE_TRIE_RANGE_H_
|
@@ -0,0 +1,117 @@
|
|
1
|
+
#ifndef MARISA_GRIMOIRE_TRIE_STATE_H_
|
2
|
+
#define MARISA_GRIMOIRE_TRIE_STATE_H_
|
3
|
+
|
4
|
+
#include "marisa/grimoire/vector.h"
|
5
|
+
#include "marisa/grimoire/trie/history.h"
|
6
|
+
|
7
|
+
namespace marisa {
|
8
|
+
namespace grimoire {
|
9
|
+
namespace trie {
|
10
|
+
|
11
|
+
// A search agent has its internal state and the status codes are defined
// below. The enumerators take the default values 0 through 4 in declaration
// order.
//
// There is intentionally no comma after the last enumerator: a trailing
// comma in an enumerator list is only valid from C++11 onwards and is
// diagnosed by pedantic C++03 builds.
typedef enum StatusCode {
  MARISA_READY_TO_ALL,
  MARISA_READY_TO_COMMON_PREFIX_SEARCH,
  MARISA_READY_TO_PREDICTIVE_SEARCH,
  MARISA_END_OF_COMMON_PREFIX_SEARCH,
  MARISA_END_OF_PREDICTIVE_SEARCH
} StatusCode;
|
20
|
+
|
21
|
+
class State {
|
22
|
+
public:
|
23
|
+
State()
|
24
|
+
: key_buf_(), history_(), node_id_(0), query_pos_(0),
|
25
|
+
history_pos_(0), status_code_(MARISA_READY_TO_ALL) {}
|
26
|
+
|
27
|
+
void set_node_id(std::size_t node_id) {
|
28
|
+
MARISA_DEBUG_IF(node_id > MARISA_UINT32_MAX, MARISA_SIZE_ERROR);
|
29
|
+
node_id_ = (UInt32)node_id;
|
30
|
+
}
|
31
|
+
void set_query_pos(std::size_t query_pos) {
|
32
|
+
MARISA_DEBUG_IF(query_pos > MARISA_UINT32_MAX, MARISA_SIZE_ERROR);
|
33
|
+
query_pos_ = (UInt32)query_pos;
|
34
|
+
}
|
35
|
+
void set_history_pos(std::size_t history_pos) {
|
36
|
+
MARISA_DEBUG_IF(history_pos > MARISA_UINT32_MAX, MARISA_SIZE_ERROR);
|
37
|
+
history_pos_ = (UInt32)history_pos;
|
38
|
+
}
|
39
|
+
void set_status_code(StatusCode status_code) {
|
40
|
+
status_code_ = status_code;
|
41
|
+
}
|
42
|
+
|
43
|
+
std::size_t node_id() const {
|
44
|
+
return node_id_;
|
45
|
+
}
|
46
|
+
std::size_t query_pos() const {
|
47
|
+
return query_pos_;
|
48
|
+
}
|
49
|
+
std::size_t history_pos() const {
|
50
|
+
return history_pos_;
|
51
|
+
}
|
52
|
+
StatusCode status_code() const {
|
53
|
+
return status_code_;
|
54
|
+
}
|
55
|
+
|
56
|
+
const Vector<char> &key_buf() const {
|
57
|
+
return key_buf_;
|
58
|
+
}
|
59
|
+
const Vector<History> &history() const {
|
60
|
+
return history_;
|
61
|
+
}
|
62
|
+
|
63
|
+
Vector<char> &key_buf() {
|
64
|
+
return key_buf_;
|
65
|
+
}
|
66
|
+
Vector<History> &history() {
|
67
|
+
return history_;
|
68
|
+
}
|
69
|
+
|
70
|
+
void reset() {
|
71
|
+
status_code_ = MARISA_READY_TO_ALL;
|
72
|
+
}
|
73
|
+
|
74
|
+
void lookup_init() {
|
75
|
+
node_id_ = 0;
|
76
|
+
query_pos_ = 0;
|
77
|
+
status_code_ = MARISA_READY_TO_ALL;
|
78
|
+
}
|
79
|
+
void reverse_lookup_init() {
|
80
|
+
key_buf_.resize(0);
|
81
|
+
key_buf_.reserve(32);
|
82
|
+
status_code_ = MARISA_READY_TO_ALL;
|
83
|
+
}
|
84
|
+
void common_prefix_search_init() {
|
85
|
+
node_id_ = 0;
|
86
|
+
query_pos_ = 0;
|
87
|
+
status_code_ = MARISA_READY_TO_COMMON_PREFIX_SEARCH;
|
88
|
+
}
|
89
|
+
void predictive_search_init() {
|
90
|
+
key_buf_.resize(0);
|
91
|
+
key_buf_.reserve(64);
|
92
|
+
history_.resize(0);
|
93
|
+
history_.reserve(4);
|
94
|
+
node_id_ = 0;
|
95
|
+
query_pos_ = 0;
|
96
|
+
history_pos_ = 0;
|
97
|
+
status_code_ = MARISA_READY_TO_PREDICTIVE_SEARCH;
|
98
|
+
}
|
99
|
+
|
100
|
+
private:
|
101
|
+
Vector<char> key_buf_;
|
102
|
+
Vector<History> history_;
|
103
|
+
UInt32 node_id_;
|
104
|
+
UInt32 query_pos_;
|
105
|
+
UInt32 history_pos_;
|
106
|
+
StatusCode status_code_;
|
107
|
+
|
108
|
+
// Disallows copy and assignment.
|
109
|
+
State(const State &);
|
110
|
+
State &operator=(const State &);
|
111
|
+
};
|
112
|
+
|
113
|
+
} // namespace trie
|
114
|
+
} // namespace grimoire
|
115
|
+
} // namespace marisa
|
116
|
+
|
117
|
+
#endif // MARISA_GRIMOIRE_TRIE_STATE_H_
|
@@ -0,0 +1,218 @@
|
|
1
|
+
#include "marisa/grimoire/algorithm.h"
|
2
|
+
#include "marisa/grimoire/trie/state.h"
|
3
|
+
#include "marisa/grimoire/trie/tail.h"
|
4
|
+
|
5
|
+
namespace marisa {
|
6
|
+
namespace grimoire {
|
7
|
+
namespace trie {
|
8
|
+
|
9
|
+
// Creates an empty tail: both the byte buffer and the end flags are
// default-constructed.
Tail::Tail()
    : buf_(),
      end_flags_() {}
|
10
|
+
|
11
|
+
// Builds the tail from `entries` and stores the resulting offsets in
// `*offsets`.
//
// Throws MARISA_NULL_ERROR if `offsets` is NULL and MARISA_CODE_ERROR if
// `mode` is neither MARISA_TEXT_TAIL nor MARISA_BINARY_TAIL. A text-mode
// request is downgraded to binary mode as soon as any entry contains a NUL
// byte.
void Tail::build(Vector<Entry> &entries, Vector<UInt32> *offsets,
    TailMode mode) {
  MARISA_THROW_IF(offsets == NULL, MARISA_NULL_ERROR);

  if (mode == MARISA_TEXT_TAIL) {
    // Scan until the first NUL byte is found; the loop condition stops the
    // outer scan once the mode has been switched.
    for (std::size_t entry_id = 0;
         (mode == MARISA_TEXT_TAIL) && (entry_id < entries.size());
         ++entry_id) {
      const char * const bytes = entries[entry_id].ptr();
      const std::size_t num_bytes = entries[entry_id].length();
      for (std::size_t pos = 0; pos < num_bytes; ++pos) {
        if (bytes[pos] == '\0') {
          mode = MARISA_BINARY_TAIL;
          break;
        }
      }
    }
  } else if (mode != MARISA_BINARY_TAIL) {
    MARISA_THROW(MARISA_CODE_ERROR, "undefined tail mode");
  }

  // Build into a temporary so that *this is only modified once build_() has
  // returned.
  Tail built;
  built.build_(entries, offsets, mode);
  swap(built);
}
|
44
|
+
|
45
|
+
// Maps the tail from `mapper`. The work is done on a temporary which is then
// swapped in, so *this is only modified after map_() has returned.
void Tail::map(Mapper &mapper) {
  Tail mapped;
  mapped.map_(mapper);
  this->swap(mapped);
}
|
50
|
+
|
51
|
+
// Reads the tail from `reader`. The work is done on a temporary which is
// then swapped in, so *this is only modified after read_() has returned.
void Tail::read(Reader &reader) {
  Tail loaded;
  loaded.read_(reader);
  this->swap(loaded);
}
|
56
|
+
|
57
|
+
// Writes the tail to `writer`; a thin public wrapper around write_().
void Tail::write(Writer &writer) const {
  this->write_(writer);
}
|
60
|
+
|
61
|
+
// Appends the tail string that starts at `offset` to the agent's key buffer.
//
// When end_flags_ is empty the string is read up to (not including) the next
// NUL byte; otherwise bytes are copied up to and including the first
// position whose end flag is set.
void Tail::restore(Agent &agent, std::size_t offset) const {
  MARISA_DEBUG_IF(buf_.empty(), MARISA_STATE_ERROR);

  State &state = agent.state();
  if (end_flags_.empty()) {
    const char *cursor = &buf_[offset];
    while (*cursor != '\0') {
      state.key_buf().push_back(*cursor);
      ++cursor;
    }
    return;
  }

  for ( ; ; ) {
    state.key_buf().push_back(buf_[offset]);
    // The flag of the byte just copied decides whether it was the last one.
    if (end_flags_[offset++]) {
      break;
    }
  }
}
|
75
|
+
|
76
|
+
// Compares the tail string starting at `offset` with the agent's query,
// starting at the current query position and advancing that position for
// every matching byte.
//
// Returns true when the whole tail string has been matched, false on the
// first mismatching byte or when the query ends before the tail string does.
// The query position keeps whatever value it reached.
bool Tail::match(Agent &agent, std::size_t offset) const {
  MARISA_DEBUG_IF(buf_.empty(), MARISA_STATE_ERROR);
  MARISA_DEBUG_IF(agent.state().query_pos() >= agent.query().length(),
      MARISA_BOUND_ERROR);

  State &state = agent.state();
  if (end_flags_.empty()) {
    // NUL-terminated strings. The tail pointer is advanced in step with the
    // query position instead of being biased by -query_pos() up front: the
    // biased pointer could point before the start of buf_, which is
    // undefined behaviour even if it is never dereferenced there.
    const char *ptr = &buf_[offset];
    do {
      if (*ptr != agent.query()[state.query_pos()]) {
        return false;
      }
      state.set_query_pos(state.query_pos() + 1);
      if (*++ptr == '\0') {
        return true;
      }
    } while (state.query_pos() < agent.query().length());
    return false;
  } else {
    // Strings delimited by end flags: the flag of the byte just compared
    // tells whether it was the last byte of the tail string.
    do {
      if (buf_[offset] != agent.query()[state.query_pos()]) {
        return false;
      }
      state.set_query_pos(state.query_pos() + 1);
      if (end_flags_[offset++]) {
        return true;
      }
    } while (state.query_pos() < agent.query().length());
    return false;
  }
}
|
107
|
+
|
108
|
+
// Checks that the rest of the agent's query is a prefix of the tail string
// starting at `offset`, appending the whole tail string to the agent's key
// buffer on success.
//
// Matching bytes are appended to the key buffer and advance the query
// position as they are compared. Returns false on the first mismatching
// byte (bytes appended so far stay in the key buffer). Returns true either
// when the tail string ends first, or when the query ends first, in which
// case the remaining tail bytes are appended without being compared.
bool Tail::prefix_match(Agent &agent, std::size_t offset) const {
  MARISA_DEBUG_IF(buf_.empty(), MARISA_STATE_ERROR);

  State &state = agent.state();
  if (end_flags_.empty()) {
    // NUL-terminated strings. The tail pointer is advanced in step with the
    // query position instead of being biased by -query_pos() up front: the
    // biased pointer could point before the start of buf_, which is
    // undefined behaviour even if it is never dereferenced there.
    const char *ptr = &buf_[offset];
    do {
      if (*ptr != agent.query()[state.query_pos()]) {
        return false;
      }
      state.key_buf().push_back(*ptr);
      state.set_query_pos(state.query_pos() + 1);
      if (*++ptr == '\0') {
        return true;
      }
    } while (state.query_pos() < agent.query().length());
    // The query is exhausted: copy the rest of the tail string.
    do {
      state.key_buf().push_back(*ptr);
    } while (*++ptr != '\0');
    return true;
  } else {
    // Strings delimited by end flags.
    do {
      if (buf_[offset] != agent.query()[state.query_pos()]) {
        return false;
      }
      state.key_buf().push_back(buf_[offset]);
      state.set_query_pos(state.query_pos() + 1);
      if (end_flags_[offset++]) {
        return true;
      }
    } while (state.query_pos() < agent.query().length());
    // The query is exhausted: copy up to and including the flagged byte.
    do {
      state.key_buf().push_back(buf_[offset]);
    } while (!end_flags_[offset++]);
    return true;
  }
}
|
146
|
+
|
147
|
+
void Tail::clear() {
|
148
|
+
Tail().swap(*this);
|
149
|
+
}
|
150
|
+
|
151
|
+
// Exchanges the contents of *this and `rhs` member by member.
void Tail::swap(Tail &rhs) {
  end_flags_.swap(rhs.end_flags_);
  buf_.swap(rhs.buf_);
}
|
155
|
+
|
156
|
+
// Fills buf_ (and, in binary mode, end_flags_) from `entries` and stores one
// offset per entry, indexed by the entry's original position, in `*offsets`.
//
// `entries` is reordered by the sort. Throws MARISA_RANGE_ERROR for an empty
// entry and MARISA_SIZE_ERROR if buf_ grows beyond MARISA_UINT32_MAX bytes.
void Tail::build_(Vector<Entry> &entries, Vector<UInt32> *offsets,
    TailMode mode) {
  // Remember each entry's original position before sorting shuffles them.
  for (std::size_t i = 0; i < entries.size(); ++i) {
    entries[i].set_id(i);
  }
  Algorithm().sort(entries.begin(), entries.end());

  Vector<UInt32> temp_offsets;
  temp_offsets.resize(entries.size(), 0);

  // `last` is the entry handled in the previous iteration; the dummy makes
  // the first iteration compare against an empty entry.
  const Entry dummy;
  const Entry *last = &dummy;
  for (std::size_t i = entries.size(); i > 0; --i) {
    const Entry &current = entries[i - 1];
    MARISA_THROW_IF(current.length() == 0, MARISA_RANGE_ERROR);
    // Number of leading positions (in Entry's own indexing) on which
    // `current` and `last` agree.
    std::size_t match = 0;
    while ((match < current.length()) && (match < last->length()) &&
        ((*last)[match] == current[match])) {
      ++match;
    }
    if ((match == current.length()) && (last->length() != 0)) {
      // `current` is entirely covered by `last`: reuse the bytes already
      // stored for `last` instead of storing `current` again.
      temp_offsets[current.id()] = (UInt32)(
          temp_offsets[last->id()] + (last->length() - match));
    } else {
      // Store `current` at the end of buf_, from index length() - 1 down
      // to index 0.
      temp_offsets[current.id()] = (UInt32)buf_.size();
      for (std::size_t j = 1; j <= current.length(); ++j) {
        buf_.push_back(current[current.length() - j]);
      }
      if (mode == MARISA_TEXT_TAIL) {
        // Text mode: strings are delimited by a NUL terminator.
        buf_.push_back('\0');
      } else {
        // Binary mode: one flag per stored byte, set only on the last one.
        for (std::size_t j = 1; j < current.length(); ++j) {
          end_flags_.push_back(false);
        }
        end_flags_.push_back(true);
      }
      MARISA_THROW_IF(buf_.size() > MARISA_UINT32_MAX, MARISA_SIZE_ERROR);
    }
    last = &current;
  }
  buf_.shrink();

  offsets->swap(temp_offsets);
}
|
200
|
+
|
201
|
+
// Maps the members from `mapper`: the byte buffer first, then the end flags.
// The order must stay in sync with read_() and write_().
void Tail::map_(Mapper &mapper) {
  this->buf_.map(mapper);
  this->end_flags_.map(mapper);
}
|
205
|
+
|
206
|
+
// Reads the members from `reader`: the byte buffer first, then the end
// flags. The order must stay in sync with map_() and write_().
void Tail::read_(Reader &reader) {
  this->buf_.read(reader);
  this->end_flags_.read(reader);
}
|
210
|
+
|
211
|
+
// Writes the members to `writer`: the byte buffer first, then the end flags.
// The order must stay in sync with map_() and read_().
void Tail::write_(Writer &writer) const {
  this->buf_.write(writer);
  this->end_flags_.write(writer);
}
|
215
|
+
|
216
|
+
} // namespace trie
|
217
|
+
} // namespace grimoire
|
218
|
+
} // namespace marisa
|