melisa 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. data/README.md +11 -0
  2. data/ext/marisa/bindings/marisa-swig.cxx +253 -0
  3. data/ext/marisa/bindings/marisa-swig.h +183 -0
  4. data/ext/marisa/bindings/perl/marisa-swig.cxx +253 -0
  5. data/ext/marisa/bindings/perl/marisa-swig.h +183 -0
  6. data/ext/marisa/bindings/perl/marisa-swig_wrap.cxx +5160 -0
  7. data/ext/marisa/bindings/python/marisa-swig.cxx +253 -0
  8. data/ext/marisa/bindings/python/marisa-swig.h +183 -0
  9. data/ext/marisa/bindings/python/marisa-swig_wrap.cxx +6090 -0
  10. data/ext/marisa/bindings/ruby/extconf.rb +5 -0
  11. data/ext/marisa/bindings/ruby/marisa-swig.cxx +253 -0
  12. data/ext/marisa/bindings/ruby/marisa-swig.h +183 -0
  13. data/ext/marisa/bindings/ruby/marisa-swig_wrap.cxx +4708 -0
  14. data/ext/marisa/lib/marisa.h +14 -0
  15. data/ext/marisa/lib/marisa/agent.cc +51 -0
  16. data/ext/marisa/lib/marisa/agent.h +73 -0
  17. data/ext/marisa/lib/marisa/base.h +193 -0
  18. data/ext/marisa/lib/marisa/exception.h +82 -0
  19. data/ext/marisa/lib/marisa/grimoire/algorithm.h +26 -0
  20. data/ext/marisa/lib/marisa/grimoire/algorithm/sort.h +196 -0
  21. data/ext/marisa/lib/marisa/grimoire/intrin.h +115 -0
  22. data/ext/marisa/lib/marisa/grimoire/io.h +18 -0
  23. data/ext/marisa/lib/marisa/grimoire/io/mapper.cc +163 -0
  24. data/ext/marisa/lib/marisa/grimoire/io/mapper.h +67 -0
  25. data/ext/marisa/lib/marisa/grimoire/io/reader.cc +147 -0
  26. data/ext/marisa/lib/marisa/grimoire/io/reader.h +66 -0
  27. data/ext/marisa/lib/marisa/grimoire/io/writer.cc +148 -0
  28. data/ext/marisa/lib/marisa/grimoire/io/writer.h +65 -0
  29. data/ext/marisa/lib/marisa/grimoire/trie.h +16 -0
  30. data/ext/marisa/lib/marisa/grimoire/trie/cache.h +81 -0
  31. data/ext/marisa/lib/marisa/grimoire/trie/config.h +155 -0
  32. data/ext/marisa/lib/marisa/grimoire/trie/entry.h +82 -0
  33. data/ext/marisa/lib/marisa/grimoire/trie/header.h +61 -0
  34. data/ext/marisa/lib/marisa/grimoire/trie/history.h +65 -0
  35. data/ext/marisa/lib/marisa/grimoire/trie/key.h +228 -0
  36. data/ext/marisa/lib/marisa/grimoire/trie/louds-trie.cc +876 -0
  37. data/ext/marisa/lib/marisa/grimoire/trie/louds-trie.h +134 -0
  38. data/ext/marisa/lib/marisa/grimoire/trie/range.h +115 -0
  39. data/ext/marisa/lib/marisa/grimoire/trie/state.h +117 -0
  40. data/ext/marisa/lib/marisa/grimoire/trie/tail.cc +218 -0
  41. data/ext/marisa/lib/marisa/grimoire/trie/tail.h +72 -0
  42. data/ext/marisa/lib/marisa/grimoire/vector.h +18 -0
  43. data/ext/marisa/lib/marisa/grimoire/vector/bit-vector.cc +826 -0
  44. data/ext/marisa/lib/marisa/grimoire/vector/bit-vector.h +179 -0
  45. data/ext/marisa/lib/marisa/grimoire/vector/flat-vector.h +205 -0
  46. data/ext/marisa/lib/marisa/grimoire/vector/pop-count.h +110 -0
  47. data/ext/marisa/lib/marisa/grimoire/vector/rank-index.h +82 -0
  48. data/ext/marisa/lib/marisa/grimoire/vector/vector.h +256 -0
  49. data/ext/marisa/lib/marisa/iostream.h +18 -0
  50. data/ext/marisa/lib/marisa/key.h +85 -0
  51. data/ext/marisa/lib/marisa/keyset.cc +181 -0
  52. data/ext/marisa/lib/marisa/keyset.h +80 -0
  53. data/ext/marisa/lib/marisa/query.h +71 -0
  54. data/ext/marisa/lib/marisa/scoped-array.h +48 -0
  55. data/ext/marisa/lib/marisa/scoped-ptr.h +52 -0
  56. data/ext/marisa/lib/marisa/stdio.h +15 -0
  57. data/ext/marisa/lib/marisa/trie.cc +249 -0
  58. data/ext/marisa/lib/marisa/trie.h +64 -0
  59. data/ext/marisa/tests/base-test.cc +309 -0
  60. data/ext/marisa/tests/io-test.cc +252 -0
  61. data/ext/marisa/tests/marisa-assert.h +26 -0
  62. data/ext/marisa/tests/marisa-test.cc +388 -0
  63. data/ext/marisa/tests/trie-test.cc +507 -0
  64. data/ext/marisa/tests/vector-test.cc +466 -0
  65. data/ext/marisa/tools/cmdopt.cc +298 -0
  66. data/ext/marisa/tools/cmdopt.h +58 -0
  67. data/ext/marisa/tools/marisa-benchmark.cc +418 -0
  68. data/ext/marisa/tools/marisa-build.cc +206 -0
  69. data/ext/marisa/tools/marisa-common-prefix-search.cc +143 -0
  70. data/ext/marisa/tools/marisa-dump.cc +151 -0
  71. data/ext/marisa/tools/marisa-lookup.cc +110 -0
  72. data/ext/marisa/tools/marisa-predictive-search.cc +143 -0
  73. data/ext/marisa/tools/marisa-reverse-lookup.cc +110 -0
  74. data/lib/melisa.rb +7 -0
  75. data/lib/melisa/base_config_flags.rb +76 -0
  76. data/lib/melisa/bytes_trie.rb +55 -0
  77. data/lib/melisa/int_trie.rb +14 -0
  78. data/lib/melisa/search.rb +55 -0
  79. data/lib/melisa/trie.rb +96 -0
  80. data/lib/melisa/version.rb +3 -0
  81. data/melisa.gemspec +36 -0
  82. data/spec/base_config_flags_spec.rb +73 -0
  83. data/spec/bytes_trie_spec.rb +16 -0
  84. data/spec/int_trie_spec.rb +16 -0
  85. data/spec/search_spec.rb +29 -0
  86. data/spec/spec_helper.rb +1 -0
  87. data/spec/trie_spec.rb +30 -0
  88. metadata +207 -0
@@ -0,0 +1,82 @@
1
+ #ifndef MARISA_GRIMOIRE_VECTOR_RANK_INDEX_H_
2
+ #define MARISA_GRIMOIRE_VECTOR_RANK_INDEX_H_
3
+
4
+ #include "marisa/base.h"
5
+
6
+ namespace marisa {
7
+ namespace grimoire {
8
+ namespace vector {
9
+
10
+ class RankIndex {
11
+ public:
12
+ RankIndex() : abs_(0), rel_lo_(0), rel_hi_(0) {}
13
+
14
+ void set_abs(std::size_t value) {
15
+ MARISA_DEBUG_IF(value > MARISA_UINT32_MAX, MARISA_SIZE_ERROR);
16
+ abs_ = (UInt32)value;
17
+ }
18
+ void set_rel1(std::size_t value) {
19
+ MARISA_DEBUG_IF(value > 64, MARISA_RANGE_ERROR);
20
+ rel_lo_ = (UInt32)((rel_lo_ & ~0x7FU) | (value & 0x7FU));
21
+ }
22
+ void set_rel2(std::size_t value) {
23
+ MARISA_DEBUG_IF(value > 128, MARISA_RANGE_ERROR);
24
+ rel_lo_ = (UInt32)((rel_lo_ & ~(0xFFU << 7)) | ((value & 0xFFU) << 7));
25
+ }
26
+ void set_rel3(std::size_t value) {
27
+ MARISA_DEBUG_IF(value > 192, MARISA_RANGE_ERROR);
28
+ rel_lo_ = (UInt32)((rel_lo_ & ~(0xFFU << 15)) | ((value & 0xFFU) << 15));
29
+ }
30
+ void set_rel4(std::size_t value) {
31
+ MARISA_DEBUG_IF(value > 256, MARISA_RANGE_ERROR);
32
+ rel_lo_ = (UInt32)((rel_lo_ & ~(0x1FFU << 23)) | ((value & 0x1FFU) << 23));
33
+ }
34
+ void set_rel5(std::size_t value) {
35
+ MARISA_DEBUG_IF(value > 320, MARISA_RANGE_ERROR);
36
+ rel_hi_ = (UInt32)((rel_hi_ & ~0x1FFU) | (value & 0x1FFU));
37
+ }
38
+ void set_rel6(std::size_t value) {
39
+ MARISA_DEBUG_IF(value > 384, MARISA_RANGE_ERROR);
40
+ rel_hi_ = (UInt32)((rel_hi_ & ~(0x1FFU << 9)) | ((value & 0x1FFU) << 9));
41
+ }
42
+ void set_rel7(std::size_t value) {
43
+ MARISA_DEBUG_IF(value > 448, MARISA_RANGE_ERROR);
44
+ rel_hi_ = (UInt32)((rel_hi_ & ~(0x1FFU << 18)) | ((value & 0x1FFU) << 18));
45
+ }
46
+
47
+ std::size_t abs() const {
48
+ return abs_;
49
+ }
50
+ std::size_t rel1() const {
51
+ return rel_lo_ & 0x7FU;
52
+ }
53
+ std::size_t rel2() const {
54
+ return (rel_lo_ >> 7) & 0xFFU;
55
+ }
56
+ std::size_t rel3() const {
57
+ return (rel_lo_ >> 15) & 0xFFU;
58
+ }
59
+ std::size_t rel4() const {
60
+ return (rel_lo_ >> 23) & 0x1FFU;
61
+ }
62
+ std::size_t rel5() const {
63
+ return rel_hi_ & 0x1FFU;
64
+ }
65
+ std::size_t rel6() const {
66
+ return (rel_hi_ >> 9) & 0x1FFU;
67
+ }
68
+ std::size_t rel7() const {
69
+ return (rel_hi_ >> 18) & 0x1FFU;
70
+ }
71
+
72
+ private:
73
+ UInt32 abs_;
74
+ UInt32 rel_lo_;
75
+ UInt32 rel_hi_;
76
+ };
77
+
78
+ } // namespace vector
79
+ } // namespace grimoire
80
+ } // namespace marisa
81
+
82
+ #endif // MARISA_GRIMOIRE_VECTOR_RANK_INDEX_H_
@@ -0,0 +1,256 @@
1
+ #ifndef MARISA_GRIMOIRE_VECTOR_VECTOR_H_
2
+ #define MARISA_GRIMOIRE_VECTOR_VECTOR_H_
3
+
4
+ #include <new>
5
+
6
+ #include "marisa/grimoire/io.h"
7
+
8
+ namespace marisa {
9
+ namespace grimoire {
10
+ namespace vector {
11
+
12
+ template <typename T>
13
+ class Vector {
14
+ public:
15
+ Vector()
16
+ : buf_(), objs_(NULL), const_objs_(NULL),
17
+ size_(0), capacity_(0), fixed_(false) {}
18
+ ~Vector() {
19
+ if (objs_ != NULL) {
20
+ for (std::size_t i = 0; i < size_; ++i) {
21
+ objs_[i].~T();
22
+ }
23
+ }
24
+ }
25
+
26
+ void map(Mapper &mapper) {
27
+ Vector temp;
28
+ temp.map_(mapper);
29
+ swap(temp);
30
+ }
31
+
32
+ void read(Reader &reader) {
33
+ Vector temp;
34
+ temp.read_(reader);
35
+ swap(temp);
36
+ }
37
+
38
+ void write(Writer &writer) const {
39
+ write_(writer);
40
+ }
41
+
42
+ void push_back(const T &x) {
43
+ MARISA_DEBUG_IF(fixed_, MARISA_STATE_ERROR);
44
+ MARISA_DEBUG_IF(size_ == max_size(), MARISA_SIZE_ERROR);
45
+ reserve(size_ + 1);
46
+ new (&objs_[size_]) T(x);
47
+ ++size_;
48
+ }
49
+
50
+ void pop_back() {
51
+ MARISA_DEBUG_IF(fixed_, MARISA_STATE_ERROR);
52
+ MARISA_DEBUG_IF(size_ == 0, MARISA_STATE_ERROR);
53
+ objs_[--size_].~T();
54
+ }
55
+
56
+ // resize() assumes that T's placement new does not throw an exception.
57
+ void resize(std::size_t size) {
58
+ MARISA_DEBUG_IF(fixed_, MARISA_STATE_ERROR);
59
+ reserve(size);
60
+ for (std::size_t i = size_; i < size; ++i) {
61
+ new (&objs_[i]) T;
62
+ }
63
+ for (std::size_t i = size; i < size_; ++i) {
64
+ objs_[i].~T();
65
+ }
66
+ size_ = size;
67
+ }
68
+
69
+ // resize() assumes that T's placement new does not throw an exception.
70
+ void resize(std::size_t size, const T &x) {
71
+ MARISA_DEBUG_IF(fixed_, MARISA_STATE_ERROR);
72
+ reserve(size);
73
+ for (std::size_t i = size_; i < size; ++i) {
74
+ new (&objs_[i]) T(x);
75
+ }
76
+ for (std::size_t i = size; i < size_; ++i) {
77
+ objs_[i].~T();
78
+ }
79
+ size_ = size;
80
+ }
81
+
82
+ void reserve(std::size_t capacity) {
83
+ MARISA_DEBUG_IF(fixed_, MARISA_STATE_ERROR);
84
+ if (capacity <= capacity_) {
85
+ return;
86
+ }
87
+ MARISA_DEBUG_IF(capacity > max_size(), MARISA_SIZE_ERROR);
88
+ std::size_t new_capacity = capacity;
89
+ if (capacity_ > (capacity / 2)) {
90
+ if (capacity_ > (max_size() / 2)) {
91
+ new_capacity = max_size();
92
+ } else {
93
+ new_capacity = capacity_ * 2;
94
+ }
95
+ }
96
+ realloc(new_capacity);
97
+ }
98
+
99
+ void shrink() {
100
+ MARISA_THROW_IF(fixed_, MARISA_STATE_ERROR);
101
+ if (size_ != capacity_) {
102
+ realloc(size_);
103
+ }
104
+ }
105
+
106
+ void fix() {
107
+ MARISA_THROW_IF(fixed_, MARISA_STATE_ERROR);
108
+ fixed_ = true;
109
+ }
110
+
111
+ const T *begin() const {
112
+ return const_objs_;
113
+ }
114
+ const T *end() const {
115
+ return const_objs_ + size_;
116
+ }
117
+ const T &operator[](std::size_t i) const {
118
+ MARISA_DEBUG_IF(i >= size_, MARISA_BOUND_ERROR);
119
+ return const_objs_[i];
120
+ }
121
+ const T &front() const {
122
+ MARISA_DEBUG_IF(size_ == 0, MARISA_STATE_ERROR);
123
+ return const_objs_[0];
124
+ }
125
+ const T &back() const {
126
+ MARISA_DEBUG_IF(size_ == 0, MARISA_STATE_ERROR);
127
+ return const_objs_[size_ - 1];
128
+ }
129
+
130
+ T *begin() {
131
+ MARISA_DEBUG_IF(fixed_, MARISA_STATE_ERROR);
132
+ return objs_;
133
+ }
134
+ T *end() {
135
+ MARISA_DEBUG_IF(fixed_, MARISA_STATE_ERROR);
136
+ return objs_ + size_;
137
+ }
138
+ T &operator[](std::size_t i) {
139
+ MARISA_DEBUG_IF(fixed_, MARISA_STATE_ERROR);
140
+ MARISA_DEBUG_IF(i >= size_, MARISA_BOUND_ERROR);
141
+ return objs_[i];
142
+ }
143
+ T &front() {
144
+ MARISA_DEBUG_IF(fixed_, MARISA_STATE_ERROR);
145
+ MARISA_DEBUG_IF(size_ == 0, MARISA_STATE_ERROR);
146
+ return objs_[0];
147
+ }
148
+ T &back() {
149
+ MARISA_DEBUG_IF(fixed_, MARISA_STATE_ERROR);
150
+ MARISA_DEBUG_IF(size_ == 0, MARISA_STATE_ERROR);
151
+ return objs_[size_ - 1];
152
+ }
153
+
154
+ std::size_t size() const {
155
+ return size_;
156
+ }
157
+ std::size_t capacity() const {
158
+ return capacity_;
159
+ }
160
+ bool fixed() const {
161
+ return fixed_;
162
+ }
163
+
164
+ bool empty() const {
165
+ return size_ == 0;
166
+ }
167
+ std::size_t total_size() const {
168
+ return sizeof(T) * size_;
169
+ }
170
+ std::size_t io_size() const {
171
+ return sizeof(UInt64) + ((total_size() + 7) & ~(std::size_t)0x07);
172
+ }
173
+
174
+ void clear() {
175
+ Vector().swap(*this);
176
+ }
177
+ void swap(Vector &rhs) {
178
+ buf_.swap(rhs.buf_);
179
+ marisa::swap(objs_, rhs.objs_);
180
+ marisa::swap(const_objs_, rhs.const_objs_);
181
+ marisa::swap(size_, rhs.size_);
182
+ marisa::swap(capacity_, rhs.capacity_);
183
+ marisa::swap(fixed_, rhs.fixed_);
184
+ }
185
+
186
+ static std::size_t max_size() {
187
+ return MARISA_SIZE_MAX / sizeof(T);
188
+ }
189
+
190
+ private:
191
+ scoped_array<char> buf_;
192
+ T *objs_;
193
+ const T *const_objs_;
194
+ std::size_t size_;
195
+ std::size_t capacity_;
196
+ bool fixed_;
197
+
198
+ void map_(Mapper &mapper) {
199
+ UInt64 total_size;
200
+ mapper.map(&total_size);
201
+ MARISA_THROW_IF(total_size > MARISA_SIZE_MAX, MARISA_SIZE_ERROR);
202
+ MARISA_THROW_IF((total_size % sizeof(T)) != 0, MARISA_FORMAT_ERROR);
203
+ const std::size_t size = (std::size_t)(total_size / sizeof(T));
204
+ mapper.map(&const_objs_, size);
205
+ mapper.seek((std::size_t)((8 - (total_size % 8)) % 8));
206
+ size_ = size;
207
+ fix();
208
+ }
209
+ void read_(Reader &reader) {
210
+ UInt64 total_size;
211
+ reader.read(&total_size);
212
+ MARISA_THROW_IF(total_size > MARISA_SIZE_MAX, MARISA_SIZE_ERROR);
213
+ MARISA_THROW_IF((total_size % sizeof(T)) != 0, MARISA_FORMAT_ERROR);
214
+ const std::size_t size = (std::size_t)(total_size / sizeof(T));
215
+ resize(size);
216
+ reader.read(objs_, size);
217
+ reader.seek((std::size_t)((8 - (total_size % 8)) % 8));
218
+ }
219
+ void write_(Writer &writer) const {
220
+ writer.write((UInt64)total_size());
221
+ writer.write(const_objs_, size_);
222
+ writer.seek((8 - (total_size() % 8)) % 8);
223
+ }
224
+
225
+ // realloc() assumes that T's placement new does not throw an exception.
226
+ void realloc(std::size_t new_capacity) {
227
+ MARISA_DEBUG_IF(new_capacity > max_size(), MARISA_SIZE_ERROR);
228
+
229
+ scoped_array<char> new_buf(
230
+ new (std::nothrow) char[sizeof(T) * new_capacity]);
231
+ MARISA_DEBUG_IF(new_buf.get() == NULL, MARISA_MEMORY_ERROR);
232
+ T *new_objs = reinterpret_cast<T *>(new_buf.get());
233
+
234
+ for (std::size_t i = 0; i < size_; ++i) {
235
+ new (&new_objs[i]) T(objs_[i]);
236
+ }
237
+ for (std::size_t i = 0; i < size_; ++i) {
238
+ objs_[i].~T();
239
+ }
240
+
241
+ buf_.swap(new_buf);
242
+ objs_ = new_objs;
243
+ const_objs_ = new_objs;
244
+ capacity_ = new_capacity;
245
+ }
246
+
247
+ // Disallows copy and assignment.
248
+ Vector(const Vector &);
249
+ Vector &operator=(const Vector &);
250
+ };
251
+
252
+ } // namespace vector
253
+ } // namespace grimoire
254
+ } // namespace marisa
255
+
256
+ #endif // MARISA_GRIMOIRE_VECTOR_VECTOR_H_
@@ -0,0 +1,18 @@
1
+ #ifndef MARISA_IOSTREAM_H_
2
+ #define MARISA_IOSTREAM_H_
3
+
4
+ #include <iosfwd>
5
+
6
+ namespace marisa {
7
+
8
// Forward declaration: this header only names Trie through pointers and
// references, so the full class definition is not required here.
+ class Trie;
9
+
10
// Stream I/O entry points for Trie; the definitions are not in this header.
// NOTE(review): presumably read() loads *trie from `stream` and write()
// serializes `trie` to `stream`, each returning the stream that was passed
// in -- confirm against the implementation.
+ std::istream &read(std::istream &stream, Trie *trie);
11
+ std::ostream &write(std::ostream &stream, const Trie &trie);
12
+
13
// Operator forms of the functions above (presumably thin wrappers -- confirm).
// Note that operator>> takes the Trie by reference while read() takes a
// pointer.
+ std::istream &operator>>(std::istream &stream, Trie &trie);
14
+ std::ostream &operator<<(std::ostream &stream, const Trie &trie);
15
+
16
+ } // namespace marisa
17
+
18
+ #endif // MARISA_IOSTREAM_H_
@@ -0,0 +1,85 @@
1
+ #ifndef MARISA_KEY_H_
2
+ #define MARISA_KEY_H_
3
+
4
+ #include "marisa/base.h"
5
+
6
+ namespace marisa {
7
+
8
+ class Key {
9
+ public:
10
+ Key() : ptr_(NULL), length_(0), union_() {
11
+ union_.id = 0;
12
+ }
13
+ Key(const Key &key)
14
+ : ptr_(key.ptr_), length_(key.length_), union_(key.union_) {}
15
+
16
+ Key &operator=(const Key &key) {
17
+ ptr_ = key.ptr_;
18
+ length_ = key.length_;
19
+ union_ = key.union_;
20
+ return *this;
21
+ }
22
+
23
+ char operator[](std::size_t i) const {
24
+ MARISA_DEBUG_IF(i >= length_, MARISA_BOUND_ERROR);
25
+ return ptr_[i];
26
+ }
27
+
28
+ void set_str(const char *str) {
29
+ MARISA_DEBUG_IF(str == NULL, MARISA_NULL_ERROR);
30
+ std::size_t length = 0;
31
+ while (str[length] != '\0') {
32
+ ++length;
33
+ }
34
+ MARISA_DEBUG_IF(length > MARISA_UINT32_MAX, MARISA_SIZE_ERROR);
35
+ ptr_ = str;
36
+ length_ = (UInt32)length;
37
+ }
38
+ void set_str(const char *ptr, std::size_t length) {
39
+ MARISA_DEBUG_IF((ptr == NULL) && (length != 0), MARISA_NULL_ERROR);
40
+ MARISA_DEBUG_IF(length > MARISA_UINT32_MAX, MARISA_SIZE_ERROR);
41
+ ptr_ = ptr;
42
+ length_ = (UInt32)length;
43
+ }
44
+ void set_id(std::size_t id) {
45
+ MARISA_DEBUG_IF(id > MARISA_UINT32_MAX, MARISA_SIZE_ERROR);
46
+ union_.id = (UInt32)id;
47
+ }
48
+ void set_weight(float weight) {
49
+ union_.weight = weight;
50
+ }
51
+
52
+ const char *ptr() const {
53
+ return ptr_;
54
+ }
55
+ std::size_t length() const {
56
+ return length_;
57
+ }
58
+ std::size_t id() const {
59
+ return union_.id;
60
+ }
61
+ float weight() const {
62
+ return union_.weight;
63
+ }
64
+
65
+ void clear() {
66
+ Key().swap(*this);
67
+ }
68
+ void swap(Key &rhs) {
69
+ marisa::swap(ptr_, rhs.ptr_);
70
+ marisa::swap(length_, rhs.length_);
71
+ marisa::swap(union_.id, rhs.union_.id);
72
+ }
73
+
74
+ private:
75
+ const char *ptr_;
76
+ UInt32 length_;
77
+ union Union {
78
+ UInt32 id;
79
+ float weight;
80
+ } union_;
81
+ };
82
+
83
+ } // namespace marisa
84
+
85
+ #endif // MARISA_KEY_H_
@@ -0,0 +1,181 @@
1
+ #include <new>
2
+
3
+ #include "marisa/keyset.h"
4
+
5
+ namespace marisa {
6
+
7
+ Keyset::Keyset()
8
+ : base_blocks_(), base_blocks_size_(0), base_blocks_capacity_(0),
9
+ extra_blocks_(), extra_blocks_size_(0), extra_blocks_capacity_(0),
10
+ key_blocks_(), key_blocks_size_(0), key_blocks_capacity_(0),
11
+ ptr_(NULL), avail_(0), size_(0), total_length_(0) {}
12
+
13
+ void Keyset::push_back(const Key &key) {
14
+ MARISA_DEBUG_IF(size_ == MARISA_SIZE_MAX, MARISA_SIZE_ERROR);
15
+
16
+ char * const key_ptr = reserve(key.length());
17
+ for (std::size_t i = 0; i < key.length(); ++i) {
18
+ key_ptr[i] = key[i];
19
+ }
20
+
21
+ Key &new_key = key_blocks_[size_ / KEY_BLOCK_SIZE][size_ % KEY_BLOCK_SIZE];
22
+ new_key.set_str(key_ptr, key.length());
23
+ new_key.set_id(key.id());
24
+ ++size_;
25
+ total_length_ += new_key.length();
26
+ }
27
+
28
+ void Keyset::push_back(const Key &key, char end_marker) {
29
+ MARISA_DEBUG_IF(size_ == MARISA_SIZE_MAX, MARISA_SIZE_ERROR);
30
+
31
+ if ((size_ / KEY_BLOCK_SIZE) == key_blocks_size_) {
32
+ append_key_block();
33
+ }
34
+
35
+ char * const key_ptr = reserve(key.length() + 1);
36
+ for (std::size_t i = 0; i < key.length(); ++i) {
37
+ key_ptr[i] = key[i];
38
+ }
39
+ key_ptr[key.length()] = end_marker;
40
+
41
+ Key &new_key = key_blocks_[size_ / KEY_BLOCK_SIZE][size_ % KEY_BLOCK_SIZE];
42
+ new_key.set_str(key_ptr, key.length());
43
+ new_key.set_id(key.id());
44
+ ++size_;
45
+ total_length_ += new_key.length();
46
+ }
47
+
48
+ void Keyset::push_back(const char *str) {
49
+ MARISA_DEBUG_IF(size_ == MARISA_SIZE_MAX, MARISA_SIZE_ERROR);
50
+ MARISA_THROW_IF(str == NULL, MARISA_NULL_ERROR);
51
+
52
+ std::size_t length = 0;
53
+ while (str[length] != '\0') {
54
+ ++length;
55
+ }
56
+ push_back(str, length);
57
+ }
58
+
59
+ void Keyset::push_back(const char *ptr, std::size_t length, float weight) {
60
+ MARISA_DEBUG_IF(size_ == MARISA_SIZE_MAX, MARISA_SIZE_ERROR);
61
+ MARISA_THROW_IF((ptr == NULL) && (length != 0), MARISA_NULL_ERROR);
62
+ MARISA_THROW_IF(length > MARISA_UINT32_MAX, MARISA_SIZE_ERROR);
63
+
64
+ char * const key_ptr = reserve(length);
65
+ for (std::size_t i = 0; i < length; ++i) {
66
+ key_ptr[i] = ptr[i];
67
+ }
68
+
69
+ Key &key = key_blocks_[size_ / KEY_BLOCK_SIZE][size_ % KEY_BLOCK_SIZE];
70
+ key.set_str(key_ptr, length);
71
+ key.set_weight(weight);
72
+ ++size_;
73
+ total_length_ += length;
74
+ }
75
+
76
+ void Keyset::reset() {
77
+ base_blocks_size_ = 0;
78
+ extra_blocks_size_ = 0;
79
+ ptr_ = NULL;
80
+ avail_ = 0;
81
+ size_ = 0;
82
+ total_length_ = 0;
83
+ }
84
+
85
+ void Keyset::clear() {
86
+ Keyset().swap(*this);
87
+ }
88
+
89
+ void Keyset::swap(Keyset &rhs) {
90
+ base_blocks_.swap(rhs.base_blocks_);
91
+ marisa::swap(base_blocks_size_, rhs.base_blocks_size_);
92
+ marisa::swap(base_blocks_capacity_, rhs.base_blocks_capacity_);
93
+ extra_blocks_.swap(rhs.extra_blocks_);
94
+ marisa::swap(extra_blocks_size_, rhs.extra_blocks_size_);
95
+ marisa::swap(extra_blocks_capacity_, rhs.extra_blocks_capacity_);
96
+ key_blocks_.swap(rhs.key_blocks_);
97
+ marisa::swap(key_blocks_size_, rhs.key_blocks_size_);
98
+ marisa::swap(key_blocks_capacity_, rhs.key_blocks_capacity_);
99
+ marisa::swap(ptr_, rhs.ptr_);
100
+ marisa::swap(avail_, rhs.avail_);
101
+ marisa::swap(size_, rhs.size_);
102
+ marisa::swap(total_length_, rhs.total_length_);
103
+ }
104
+
105
+ char *Keyset::reserve(std::size_t size) {
106
+ if ((size_ / KEY_BLOCK_SIZE) == key_blocks_size_) {
107
+ append_key_block();
108
+ }
109
+
110
+ if (size > EXTRA_BLOCK_SIZE) {
111
+ append_extra_block(size);
112
+ return extra_blocks_[extra_blocks_size_ - 1].get();
113
+ } else {
114
+ if (size > avail_) {
115
+ append_base_block();
116
+ }
117
+ ptr_ += size;
118
+ avail_ -= size;
119
+ return ptr_ - size;
120
+ }
121
+ }
122
+
123
+ void Keyset::append_base_block() {
124
+ if (base_blocks_size_ == base_blocks_capacity_) {
125
+ const std::size_t new_capacity =
126
+ (base_blocks_size_ != 0) ? (base_blocks_size_ * 2) : 1;
127
+ scoped_array<scoped_array<char> > new_blocks(
128
+ new (std::nothrow) scoped_array<char>[new_capacity]);
129
+ MARISA_THROW_IF(new_blocks.get() == NULL, MARISA_MEMORY_ERROR);
130
+ for (std::size_t i = 0; i < base_blocks_size_; ++i) {
131
+ base_blocks_[i].swap(new_blocks[i]);
132
+ }
133
+ base_blocks_.swap(new_blocks);
134
+ base_blocks_capacity_ = new_capacity;
135
+ }
136
+ if (base_blocks_[base_blocks_size_].get() == NULL) {
137
+ scoped_array<char> new_block(new (std::nothrow) char[BASE_BLOCK_SIZE]);
138
+ MARISA_THROW_IF(new_block.get() == NULL, MARISA_MEMORY_ERROR);
139
+ base_blocks_[base_blocks_size_].swap(new_block);
140
+ }
141
+ ptr_ = base_blocks_[base_blocks_size_++].get();
142
+ avail_ = BASE_BLOCK_SIZE;
143
+ }
144
+
145
+ void Keyset::append_extra_block(std::size_t size) {
146
+ if (extra_blocks_size_ == extra_blocks_capacity_) {
147
+ const std::size_t new_capacity =
148
+ (extra_blocks_size_ != 0) ? (extra_blocks_size_ * 2) : 1;
149
+ scoped_array<scoped_array<char> > new_blocks(
150
+ new (std::nothrow) scoped_array<char>[new_capacity]);
151
+ MARISA_THROW_IF(new_blocks.get() == NULL, MARISA_MEMORY_ERROR);
152
+ for (std::size_t i = 0; i < extra_blocks_size_; ++i) {
153
+ extra_blocks_[i].swap(new_blocks[i]);
154
+ }
155
+ extra_blocks_.swap(new_blocks);
156
+ extra_blocks_capacity_ = new_capacity;
157
+ }
158
+ scoped_array<char> new_block(new (std::nothrow) char[size]);
159
+ MARISA_THROW_IF(new_block.get() == NULL, MARISA_MEMORY_ERROR);
160
+ extra_blocks_[extra_blocks_size_++].swap(new_block);
161
+ }
162
+
163
+ void Keyset::append_key_block() {
164
+ if (key_blocks_size_ == key_blocks_capacity_) {
165
+ const std::size_t new_capacity =
166
+ (key_blocks_size_ != 0) ? (key_blocks_size_ * 2) : 1;
167
+ scoped_array<scoped_array<Key> > new_blocks(
168
+ new (std::nothrow) scoped_array<Key>[new_capacity]);
169
+ MARISA_THROW_IF(new_blocks.get() == NULL, MARISA_MEMORY_ERROR);
170
+ for (std::size_t i = 0; i < key_blocks_size_; ++i) {
171
+ key_blocks_[i].swap(new_blocks[i]);
172
+ }
173
+ key_blocks_.swap(new_blocks);
174
+ key_blocks_capacity_ = new_capacity;
175
+ }
176
+ scoped_array<Key> new_block(new (std::nothrow) Key[KEY_BLOCK_SIZE]);
177
+ MARISA_THROW_IF(new_block.get() == NULL, MARISA_MEMORY_ERROR);
178
+ key_blocks_[key_blocks_size_++].swap(new_block);
179
+ }
180
+
181
+ } // namespace marisa