isotree 0.2.2 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (151) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -1
  3. data/LICENSE.txt +2 -2
  4. data/README.md +32 -14
  5. data/ext/isotree/ext.cpp +144 -31
  6. data/ext/isotree/extconf.rb +7 -7
  7. data/lib/isotree/isolation_forest.rb +110 -30
  8. data/lib/isotree/version.rb +1 -1
  9. data/vendor/isotree/LICENSE +1 -1
  10. data/vendor/isotree/README.md +165 -27
  11. data/vendor/isotree/include/isotree.hpp +2111 -0
  12. data/vendor/isotree/include/isotree_oop.hpp +394 -0
  13. data/vendor/isotree/inst/COPYRIGHTS +62 -0
  14. data/vendor/isotree/src/RcppExports.cpp +525 -52
  15. data/vendor/isotree/src/Rwrapper.cpp +1931 -268
  16. data/vendor/isotree/src/c_interface.cpp +953 -0
  17. data/vendor/isotree/src/crit.hpp +4232 -0
  18. data/vendor/isotree/src/dist.hpp +1886 -0
  19. data/vendor/isotree/src/exp_depth_table.hpp +134 -0
  20. data/vendor/isotree/src/extended.hpp +1444 -0
  21. data/vendor/isotree/src/external_facing_generic.hpp +399 -0
  22. data/vendor/isotree/src/fit_model.hpp +2401 -0
  23. data/vendor/isotree/src/{dealloc.cpp → headers_joined.hpp} +38 -22
  24. data/vendor/isotree/src/helpers_iforest.hpp +813 -0
  25. data/vendor/isotree/src/{impute.cpp → impute.hpp} +353 -122
  26. data/vendor/isotree/src/indexer.cpp +515 -0
  27. data/vendor/isotree/src/instantiate_template_headers.cpp +118 -0
  28. data/vendor/isotree/src/instantiate_template_headers.hpp +240 -0
  29. data/vendor/isotree/src/isoforest.hpp +1659 -0
  30. data/vendor/isotree/src/isotree.hpp +1804 -392
  31. data/vendor/isotree/src/isotree_exportable.hpp +99 -0
  32. data/vendor/isotree/src/merge_models.cpp +159 -16
  33. data/vendor/isotree/src/mult.hpp +1321 -0
  34. data/vendor/isotree/src/oop_interface.cpp +842 -0
  35. data/vendor/isotree/src/oop_interface.hpp +278 -0
  36. data/vendor/isotree/src/other_helpers.hpp +219 -0
  37. data/vendor/isotree/src/predict.hpp +1932 -0
  38. data/vendor/isotree/src/python_helpers.hpp +134 -0
  39. data/vendor/isotree/src/ref_indexer.hpp +154 -0
  40. data/vendor/isotree/src/robinmap/LICENSE +21 -0
  41. data/vendor/isotree/src/robinmap/README.md +483 -0
  42. data/vendor/isotree/src/robinmap/include/tsl/robin_growth_policy.h +406 -0
  43. data/vendor/isotree/src/robinmap/include/tsl/robin_hash.h +1620 -0
  44. data/vendor/isotree/src/robinmap/include/tsl/robin_map.h +807 -0
  45. data/vendor/isotree/src/robinmap/include/tsl/robin_set.h +660 -0
  46. data/vendor/isotree/src/serialize.cpp +4300 -139
  47. data/vendor/isotree/src/sql.cpp +141 -59
  48. data/vendor/isotree/src/subset_models.cpp +174 -0
  49. data/vendor/isotree/src/utils.hpp +3808 -0
  50. data/vendor/isotree/src/xoshiro.hpp +467 -0
  51. data/vendor/isotree/src/ziggurat.hpp +405 -0
  52. metadata +38 -104
  53. data/vendor/cereal/LICENSE +0 -24
  54. data/vendor/cereal/README.md +0 -85
  55. data/vendor/cereal/include/cereal/access.hpp +0 -351
  56. data/vendor/cereal/include/cereal/archives/adapters.hpp +0 -163
  57. data/vendor/cereal/include/cereal/archives/binary.hpp +0 -169
  58. data/vendor/cereal/include/cereal/archives/json.hpp +0 -1019
  59. data/vendor/cereal/include/cereal/archives/portable_binary.hpp +0 -334
  60. data/vendor/cereal/include/cereal/archives/xml.hpp +0 -956
  61. data/vendor/cereal/include/cereal/cereal.hpp +0 -1089
  62. data/vendor/cereal/include/cereal/details/helpers.hpp +0 -422
  63. data/vendor/cereal/include/cereal/details/polymorphic_impl.hpp +0 -796
  64. data/vendor/cereal/include/cereal/details/polymorphic_impl_fwd.hpp +0 -65
  65. data/vendor/cereal/include/cereal/details/static_object.hpp +0 -127
  66. data/vendor/cereal/include/cereal/details/traits.hpp +0 -1411
  67. data/vendor/cereal/include/cereal/details/util.hpp +0 -84
  68. data/vendor/cereal/include/cereal/external/base64.hpp +0 -134
  69. data/vendor/cereal/include/cereal/external/rapidjson/allocators.h +0 -284
  70. data/vendor/cereal/include/cereal/external/rapidjson/cursorstreamwrapper.h +0 -78
  71. data/vendor/cereal/include/cereal/external/rapidjson/document.h +0 -2652
  72. data/vendor/cereal/include/cereal/external/rapidjson/encodedstream.h +0 -299
  73. data/vendor/cereal/include/cereal/external/rapidjson/encodings.h +0 -716
  74. data/vendor/cereal/include/cereal/external/rapidjson/error/en.h +0 -74
  75. data/vendor/cereal/include/cereal/external/rapidjson/error/error.h +0 -161
  76. data/vendor/cereal/include/cereal/external/rapidjson/filereadstream.h +0 -99
  77. data/vendor/cereal/include/cereal/external/rapidjson/filewritestream.h +0 -104
  78. data/vendor/cereal/include/cereal/external/rapidjson/fwd.h +0 -151
  79. data/vendor/cereal/include/cereal/external/rapidjson/internal/biginteger.h +0 -290
  80. data/vendor/cereal/include/cereal/external/rapidjson/internal/diyfp.h +0 -271
  81. data/vendor/cereal/include/cereal/external/rapidjson/internal/dtoa.h +0 -245
  82. data/vendor/cereal/include/cereal/external/rapidjson/internal/ieee754.h +0 -78
  83. data/vendor/cereal/include/cereal/external/rapidjson/internal/itoa.h +0 -308
  84. data/vendor/cereal/include/cereal/external/rapidjson/internal/meta.h +0 -186
  85. data/vendor/cereal/include/cereal/external/rapidjson/internal/pow10.h +0 -55
  86. data/vendor/cereal/include/cereal/external/rapidjson/internal/regex.h +0 -740
  87. data/vendor/cereal/include/cereal/external/rapidjson/internal/stack.h +0 -232
  88. data/vendor/cereal/include/cereal/external/rapidjson/internal/strfunc.h +0 -69
  89. data/vendor/cereal/include/cereal/external/rapidjson/internal/strtod.h +0 -290
  90. data/vendor/cereal/include/cereal/external/rapidjson/internal/swap.h +0 -46
  91. data/vendor/cereal/include/cereal/external/rapidjson/istreamwrapper.h +0 -128
  92. data/vendor/cereal/include/cereal/external/rapidjson/memorybuffer.h +0 -70
  93. data/vendor/cereal/include/cereal/external/rapidjson/memorystream.h +0 -71
  94. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/inttypes.h +0 -316
  95. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/stdint.h +0 -300
  96. data/vendor/cereal/include/cereal/external/rapidjson/ostreamwrapper.h +0 -81
  97. data/vendor/cereal/include/cereal/external/rapidjson/pointer.h +0 -1414
  98. data/vendor/cereal/include/cereal/external/rapidjson/prettywriter.h +0 -277
  99. data/vendor/cereal/include/cereal/external/rapidjson/rapidjson.h +0 -656
  100. data/vendor/cereal/include/cereal/external/rapidjson/reader.h +0 -2230
  101. data/vendor/cereal/include/cereal/external/rapidjson/schema.h +0 -2497
  102. data/vendor/cereal/include/cereal/external/rapidjson/stream.h +0 -223
  103. data/vendor/cereal/include/cereal/external/rapidjson/stringbuffer.h +0 -121
  104. data/vendor/cereal/include/cereal/external/rapidjson/writer.h +0 -709
  105. data/vendor/cereal/include/cereal/external/rapidxml/license.txt +0 -52
  106. data/vendor/cereal/include/cereal/external/rapidxml/manual.html +0 -406
  107. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml.hpp +0 -2624
  108. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_iterators.hpp +0 -175
  109. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_print.hpp +0 -428
  110. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_utils.hpp +0 -123
  111. data/vendor/cereal/include/cereal/macros.hpp +0 -154
  112. data/vendor/cereal/include/cereal/specialize.hpp +0 -139
  113. data/vendor/cereal/include/cereal/types/array.hpp +0 -79
  114. data/vendor/cereal/include/cereal/types/atomic.hpp +0 -55
  115. data/vendor/cereal/include/cereal/types/base_class.hpp +0 -203
  116. data/vendor/cereal/include/cereal/types/bitset.hpp +0 -176
  117. data/vendor/cereal/include/cereal/types/boost_variant.hpp +0 -164
  118. data/vendor/cereal/include/cereal/types/chrono.hpp +0 -72
  119. data/vendor/cereal/include/cereal/types/common.hpp +0 -129
  120. data/vendor/cereal/include/cereal/types/complex.hpp +0 -56
  121. data/vendor/cereal/include/cereal/types/concepts/pair_associative_container.hpp +0 -73
  122. data/vendor/cereal/include/cereal/types/deque.hpp +0 -62
  123. data/vendor/cereal/include/cereal/types/forward_list.hpp +0 -68
  124. data/vendor/cereal/include/cereal/types/functional.hpp +0 -43
  125. data/vendor/cereal/include/cereal/types/list.hpp +0 -62
  126. data/vendor/cereal/include/cereal/types/map.hpp +0 -36
  127. data/vendor/cereal/include/cereal/types/memory.hpp +0 -425
  128. data/vendor/cereal/include/cereal/types/optional.hpp +0 -66
  129. data/vendor/cereal/include/cereal/types/polymorphic.hpp +0 -483
  130. data/vendor/cereal/include/cereal/types/queue.hpp +0 -132
  131. data/vendor/cereal/include/cereal/types/set.hpp +0 -103
  132. data/vendor/cereal/include/cereal/types/stack.hpp +0 -76
  133. data/vendor/cereal/include/cereal/types/string.hpp +0 -61
  134. data/vendor/cereal/include/cereal/types/tuple.hpp +0 -123
  135. data/vendor/cereal/include/cereal/types/unordered_map.hpp +0 -36
  136. data/vendor/cereal/include/cereal/types/unordered_set.hpp +0 -99
  137. data/vendor/cereal/include/cereal/types/utility.hpp +0 -47
  138. data/vendor/cereal/include/cereal/types/valarray.hpp +0 -89
  139. data/vendor/cereal/include/cereal/types/variant.hpp +0 -109
  140. data/vendor/cereal/include/cereal/types/vector.hpp +0 -112
  141. data/vendor/cereal/include/cereal/version.hpp +0 -52
  142. data/vendor/isotree/src/Makevars +0 -4
  143. data/vendor/isotree/src/crit.cpp +0 -912
  144. data/vendor/isotree/src/dist.cpp +0 -749
  145. data/vendor/isotree/src/extended.cpp +0 -790
  146. data/vendor/isotree/src/fit_model.cpp +0 -1090
  147. data/vendor/isotree/src/helpers_iforest.cpp +0 -324
  148. data/vendor/isotree/src/isoforest.cpp +0 -771
  149. data/vendor/isotree/src/mult.cpp +0 -607
  150. data/vendor/isotree/src/predict.cpp +0 -853
  151. data/vendor/isotree/src/utils.cpp +0 -1566
@@ -1,740 +0,0 @@
1
- // Tencent is pleased to support the open source community by making RapidJSON available.
2
- //
3
- // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
4
- //
5
- // Licensed under the MIT License (the "License"); you may not use this file except
6
- // in compliance with the License. You may obtain a copy of the License at
7
- //
8
- // http://opensource.org/licenses/MIT
9
- //
10
- // Unless required by applicable law or agreed to in writing, software distributed
11
- // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12
- // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13
- // specific language governing permissions and limitations under the License.
14
-
15
- #ifndef CEREAL_RAPIDJSON_INTERNAL_REGEX_H_
16
- #define CEREAL_RAPIDJSON_INTERNAL_REGEX_H_
17
-
18
- #include "../allocators.h"
19
- #include "../stream.h"
20
- #include "stack.h"
21
-
22
- #ifdef __clang__
23
- CEREAL_RAPIDJSON_DIAG_PUSH
24
- CEREAL_RAPIDJSON_DIAG_OFF(padded)
25
- CEREAL_RAPIDJSON_DIAG_OFF(switch-enum)
26
- CEREAL_RAPIDJSON_DIAG_OFF(implicit-fallthrough)
27
- #elif defined(_MSC_VER)
28
- CEREAL_RAPIDJSON_DIAG_PUSH
29
- CEREAL_RAPIDJSON_DIAG_OFF(4512) // assignment operator could not be generated
30
- #endif
31
-
32
- #ifdef __GNUC__
33
- CEREAL_RAPIDJSON_DIAG_PUSH
34
- CEREAL_RAPIDJSON_DIAG_OFF(effc++)
35
- #if __GNUC__ >= 7
36
- CEREAL_RAPIDJSON_DIAG_OFF(implicit-fallthrough)
37
- #endif
38
- #endif
39
-
40
- #ifndef CEREAL_RAPIDJSON_REGEX_VERBOSE
41
- #define CEREAL_RAPIDJSON_REGEX_VERBOSE 0
42
- #endif
43
-
44
- CEREAL_RAPIDJSON_NAMESPACE_BEGIN
45
- namespace internal {
46
-
47
- ///////////////////////////////////////////////////////////////////////////////
48
- // DecodedStream
49
-
50
- template <typename SourceStream, typename Encoding>
51
- class DecodedStream {
52
- public:
53
- DecodedStream(SourceStream& ss) : ss_(ss), codepoint_() { Decode(); }
54
- unsigned Peek() { return codepoint_; }
55
- unsigned Take() {
56
- unsigned c = codepoint_;
57
- if (c) // No further decoding when '\0'
58
- Decode();
59
- return c;
60
- }
61
-
62
- private:
63
- void Decode() {
64
- if (!Encoding::Decode(ss_, &codepoint_))
65
- codepoint_ = 0;
66
- }
67
-
68
- SourceStream& ss_;
69
- unsigned codepoint_;
70
- };
71
-
72
- ///////////////////////////////////////////////////////////////////////////////
73
- // GenericRegex
74
-
75
- static const SizeType kRegexInvalidState = ~SizeType(0); //!< Represents an invalid index in GenericRegex::State::out, out1
76
- static const SizeType kRegexInvalidRange = ~SizeType(0);
77
-
78
- template <typename Encoding, typename Allocator>
79
- class GenericRegexSearch;
80
-
81
- //! Regular expression engine supporting a subset of the ECMAScript grammar.
82
- /*!
83
- Supported regular expression syntax:
84
- - \c ab Concatenation
85
- - \c a|b Alternation
86
- - \c a? Zero or one
87
- - \c a* Zero or more
88
- - \c a+ One or more
89
- - \c a{3} Exactly 3 times
90
- - \c a{3,} At least 3 times
91
- - \c a{3,5} 3 to 5 times
92
- - \c (ab) Grouping
93
- - \c ^a At the beginning
94
- - \c a$ At the end
95
- - \c . Any character
96
- - \c [abc] Character classes
97
- - \c [a-c] Character class range
98
- - \c [a-z0-9_] Character class combination
99
- - \c [^abc] Negated character classes
100
- - \c [^a-c] Negated character class range
101
- - \c [\b] Backspace (U+0008)
102
- - \c \\| \\\\ ... Escape characters
103
- - \c \\f Form feed (U+000C)
104
- - \c \\n Line feed (U+000A)
105
- - \c \\r Carriage return (U+000D)
106
- - \c \\t Tab (U+0009)
107
- - \c \\v Vertical tab (U+000B)
108
-
109
- \note This is a Thompson NFA engine, implemented with reference to
110
- Cox, Russ. "Regular Expression Matching Can Be Simple And Fast (but is slow in Java, Perl, PHP, Python, Ruby,...).",
111
- https://swtch.com/~rsc/regexp/regexp1.html
112
- */
113
- template <typename Encoding, typename Allocator = CrtAllocator>
114
- class GenericRegex {
115
- public:
116
- typedef Encoding EncodingType;
117
- typedef typename Encoding::Ch Ch;
118
- template <typename, typename> friend class GenericRegexSearch;
119
-
120
- GenericRegex(const Ch* source, Allocator* allocator = 0) :
121
- ownAllocator_(allocator ? 0 : CEREAL_RAPIDJSON_NEW(Allocator)()), allocator_(allocator ? allocator : ownAllocator_),
122
- states_(allocator_, 256), ranges_(allocator_, 256), root_(kRegexInvalidState), stateCount_(), rangeCount_(),
123
- anchorBegin_(), anchorEnd_()
124
- {
125
- GenericStringStream<Encoding> ss(source);
126
- DecodedStream<GenericStringStream<Encoding>, Encoding> ds(ss);
127
- Parse(ds);
128
- }
129
-
130
- ~GenericRegex()
131
- {
132
- CEREAL_RAPIDJSON_DELETE(ownAllocator_);
133
- }
134
-
135
- bool IsValid() const {
136
- return root_ != kRegexInvalidState;
137
- }
138
-
139
- private:
140
- enum Operator {
141
- kZeroOrOne,
142
- kZeroOrMore,
143
- kOneOrMore,
144
- kConcatenation,
145
- kAlternation,
146
- kLeftParenthesis
147
- };
148
-
149
- static const unsigned kAnyCharacterClass = 0xFFFFFFFF; //!< For '.'
150
- static const unsigned kRangeCharacterClass = 0xFFFFFFFE;
151
- static const unsigned kRangeNegationFlag = 0x80000000;
152
-
153
- struct Range {
154
- unsigned start; //
155
- unsigned end;
156
- SizeType next;
157
- };
158
-
159
- struct State {
160
- SizeType out; //!< Equals to kInvalid for matching state
161
- SizeType out1; //!< Equals to non-kInvalid for split
162
- SizeType rangeStart;
163
- unsigned codepoint;
164
- };
165
-
166
- struct Frag {
167
- Frag(SizeType s, SizeType o, SizeType m) : start(s), out(o), minIndex(m) {}
168
- SizeType start;
169
- SizeType out; //!< link-list of all output states
170
- SizeType minIndex;
171
- };
172
-
173
- State& GetState(SizeType index) {
174
- CEREAL_RAPIDJSON_ASSERT(index < stateCount_);
175
- return states_.template Bottom<State>()[index];
176
- }
177
-
178
- const State& GetState(SizeType index) const {
179
- CEREAL_RAPIDJSON_ASSERT(index < stateCount_);
180
- return states_.template Bottom<State>()[index];
181
- }
182
-
183
- Range& GetRange(SizeType index) {
184
- CEREAL_RAPIDJSON_ASSERT(index < rangeCount_);
185
- return ranges_.template Bottom<Range>()[index];
186
- }
187
-
188
- const Range& GetRange(SizeType index) const {
189
- CEREAL_RAPIDJSON_ASSERT(index < rangeCount_);
190
- return ranges_.template Bottom<Range>()[index];
191
- }
192
-
193
- template <typename InputStream>
194
- void Parse(DecodedStream<InputStream, Encoding>& ds) {
195
- Stack<Allocator> operandStack(allocator_, 256); // Frag
196
- Stack<Allocator> operatorStack(allocator_, 256); // Operator
197
- Stack<Allocator> atomCountStack(allocator_, 256); // unsigned (Atom per parenthesis)
198
-
199
- *atomCountStack.template Push<unsigned>() = 0;
200
-
201
- unsigned codepoint;
202
- while (ds.Peek() != 0) {
203
- switch (codepoint = ds.Take()) {
204
- case '^':
205
- anchorBegin_ = true;
206
- break;
207
-
208
- case '$':
209
- anchorEnd_ = true;
210
- break;
211
-
212
- case '|':
213
- while (!operatorStack.Empty() && *operatorStack.template Top<Operator>() < kAlternation)
214
- if (!Eval(operandStack, *operatorStack.template Pop<Operator>(1)))
215
- return;
216
- *operatorStack.template Push<Operator>() = kAlternation;
217
- *atomCountStack.template Top<unsigned>() = 0;
218
- break;
219
-
220
- case '(':
221
- *operatorStack.template Push<Operator>() = kLeftParenthesis;
222
- *atomCountStack.template Push<unsigned>() = 0;
223
- break;
224
-
225
- case ')':
226
- while (!operatorStack.Empty() && *operatorStack.template Top<Operator>() != kLeftParenthesis)
227
- if (!Eval(operandStack, *operatorStack.template Pop<Operator>(1)))
228
- return;
229
- if (operatorStack.Empty())
230
- return;
231
- operatorStack.template Pop<Operator>(1);
232
- atomCountStack.template Pop<unsigned>(1);
233
- ImplicitConcatenation(atomCountStack, operatorStack);
234
- break;
235
-
236
- case '?':
237
- if (!Eval(operandStack, kZeroOrOne))
238
- return;
239
- break;
240
-
241
- case '*':
242
- if (!Eval(operandStack, kZeroOrMore))
243
- return;
244
- break;
245
-
246
- case '+':
247
- if (!Eval(operandStack, kOneOrMore))
248
- return;
249
- break;
250
-
251
- case '{':
252
- {
253
- unsigned n, m;
254
- if (!ParseUnsigned(ds, &n))
255
- return;
256
-
257
- if (ds.Peek() == ',') {
258
- ds.Take();
259
- if (ds.Peek() == '}')
260
- m = kInfinityQuantifier;
261
- else if (!ParseUnsigned(ds, &m) || m < n)
262
- return;
263
- }
264
- else
265
- m = n;
266
-
267
- if (!EvalQuantifier(operandStack, n, m) || ds.Peek() != '}')
268
- return;
269
- ds.Take();
270
- }
271
- break;
272
-
273
- case '.':
274
- PushOperand(operandStack, kAnyCharacterClass);
275
- ImplicitConcatenation(atomCountStack, operatorStack);
276
- break;
277
-
278
- case '[':
279
- {
280
- SizeType range;
281
- if (!ParseRange(ds, &range))
282
- return;
283
- SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, kRangeCharacterClass);
284
- GetState(s).rangeStart = range;
285
- *operandStack.template Push<Frag>() = Frag(s, s, s);
286
- }
287
- ImplicitConcatenation(atomCountStack, operatorStack);
288
- break;
289
-
290
- case '\\': // Escape character
291
- if (!CharacterEscape(ds, &codepoint))
292
- return; // Unsupported escape character
293
- // fall through to default
294
-
295
- default: // Pattern character
296
- PushOperand(operandStack, codepoint);
297
- ImplicitConcatenation(atomCountStack, operatorStack);
298
- }
299
- }
300
-
301
- while (!operatorStack.Empty())
302
- if (!Eval(operandStack, *operatorStack.template Pop<Operator>(1)))
303
- return;
304
-
305
- // Link the operand to matching state.
306
- if (operandStack.GetSize() == sizeof(Frag)) {
307
- Frag* e = operandStack.template Pop<Frag>(1);
308
- Patch(e->out, NewState(kRegexInvalidState, kRegexInvalidState, 0));
309
- root_ = e->start;
310
-
311
- #if CEREAL_RAPIDJSON_REGEX_VERBOSE
312
- printf("root: %d\n", root_);
313
- for (SizeType i = 0; i < stateCount_ ; i++) {
314
- State& s = GetState(i);
315
- printf("[%2d] out: %2d out1: %2d c: '%c'\n", i, s.out, s.out1, (char)s.codepoint);
316
- }
317
- printf("\n");
318
- #endif
319
- }
320
- }
321
-
322
- SizeType NewState(SizeType out, SizeType out1, unsigned codepoint) {
323
- State* s = states_.template Push<State>();
324
- s->out = out;
325
- s->out1 = out1;
326
- s->codepoint = codepoint;
327
- s->rangeStart = kRegexInvalidRange;
328
- return stateCount_++;
329
- }
330
-
331
- void PushOperand(Stack<Allocator>& operandStack, unsigned codepoint) {
332
- SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, codepoint);
333
- *operandStack.template Push<Frag>() = Frag(s, s, s);
334
- }
335
-
336
- void ImplicitConcatenation(Stack<Allocator>& atomCountStack, Stack<Allocator>& operatorStack) {
337
- if (*atomCountStack.template Top<unsigned>())
338
- *operatorStack.template Push<Operator>() = kConcatenation;
339
- (*atomCountStack.template Top<unsigned>())++;
340
- }
341
-
342
- SizeType Append(SizeType l1, SizeType l2) {
343
- SizeType old = l1;
344
- while (GetState(l1).out != kRegexInvalidState)
345
- l1 = GetState(l1).out;
346
- GetState(l1).out = l2;
347
- return old;
348
- }
349
-
350
- void Patch(SizeType l, SizeType s) {
351
- for (SizeType next; l != kRegexInvalidState; l = next) {
352
- next = GetState(l).out;
353
- GetState(l).out = s;
354
- }
355
- }
356
-
357
- bool Eval(Stack<Allocator>& operandStack, Operator op) {
358
- switch (op) {
359
- case kConcatenation:
360
- CEREAL_RAPIDJSON_ASSERT(operandStack.GetSize() >= sizeof(Frag) * 2);
361
- {
362
- Frag e2 = *operandStack.template Pop<Frag>(1);
363
- Frag e1 = *operandStack.template Pop<Frag>(1);
364
- Patch(e1.out, e2.start);
365
- *operandStack.template Push<Frag>() = Frag(e1.start, e2.out, Min(e1.minIndex, e2.minIndex));
366
- }
367
- return true;
368
-
369
- case kAlternation:
370
- if (operandStack.GetSize() >= sizeof(Frag) * 2) {
371
- Frag e2 = *operandStack.template Pop<Frag>(1);
372
- Frag e1 = *operandStack.template Pop<Frag>(1);
373
- SizeType s = NewState(e1.start, e2.start, 0);
374
- *operandStack.template Push<Frag>() = Frag(s, Append(e1.out, e2.out), Min(e1.minIndex, e2.minIndex));
375
- return true;
376
- }
377
- return false;
378
-
379
- case kZeroOrOne:
380
- if (operandStack.GetSize() >= sizeof(Frag)) {
381
- Frag e = *operandStack.template Pop<Frag>(1);
382
- SizeType s = NewState(kRegexInvalidState, e.start, 0);
383
- *operandStack.template Push<Frag>() = Frag(s, Append(e.out, s), e.minIndex);
384
- return true;
385
- }
386
- return false;
387
-
388
- case kZeroOrMore:
389
- if (operandStack.GetSize() >= sizeof(Frag)) {
390
- Frag e = *operandStack.template Pop<Frag>(1);
391
- SizeType s = NewState(kRegexInvalidState, e.start, 0);
392
- Patch(e.out, s);
393
- *operandStack.template Push<Frag>() = Frag(s, s, e.minIndex);
394
- return true;
395
- }
396
- return false;
397
-
398
- case kOneOrMore:
399
- if (operandStack.GetSize() >= sizeof(Frag)) {
400
- Frag e = *operandStack.template Pop<Frag>(1);
401
- SizeType s = NewState(kRegexInvalidState, e.start, 0);
402
- Patch(e.out, s);
403
- *operandStack.template Push<Frag>() = Frag(e.start, s, e.minIndex);
404
- return true;
405
- }
406
- return false;
407
-
408
- default:
409
- // syntax error (e.g. unclosed kLeftParenthesis)
410
- return false;
411
- }
412
- }
413
-
414
- bool EvalQuantifier(Stack<Allocator>& operandStack, unsigned n, unsigned m) {
415
- CEREAL_RAPIDJSON_ASSERT(n <= m);
416
- CEREAL_RAPIDJSON_ASSERT(operandStack.GetSize() >= sizeof(Frag));
417
-
418
- if (n == 0) {
419
- if (m == 0) // a{0} not support
420
- return false;
421
- else if (m == kInfinityQuantifier)
422
- Eval(operandStack, kZeroOrMore); // a{0,} -> a*
423
- else {
424
- Eval(operandStack, kZeroOrOne); // a{0,5} -> a?
425
- for (unsigned i = 0; i < m - 1; i++)
426
- CloneTopOperand(operandStack); // a{0,5} -> a? a? a? a? a?
427
- for (unsigned i = 0; i < m - 1; i++)
428
- Eval(operandStack, kConcatenation); // a{0,5} -> a?a?a?a?a?
429
- }
430
- return true;
431
- }
432
-
433
- for (unsigned i = 0; i < n - 1; i++) // a{3} -> a a a
434
- CloneTopOperand(operandStack);
435
-
436
- if (m == kInfinityQuantifier)
437
- Eval(operandStack, kOneOrMore); // a{3,} -> a a a+
438
- else if (m > n) {
439
- CloneTopOperand(operandStack); // a{3,5} -> a a a a
440
- Eval(operandStack, kZeroOrOne); // a{3,5} -> a a a a?
441
- for (unsigned i = n; i < m - 1; i++)
442
- CloneTopOperand(operandStack); // a{3,5} -> a a a a? a?
443
- for (unsigned i = n; i < m; i++)
444
- Eval(operandStack, kConcatenation); // a{3,5} -> a a aa?a?
445
- }
446
-
447
- for (unsigned i = 0; i < n - 1; i++)
448
- Eval(operandStack, kConcatenation); // a{3} -> aaa, a{3,} -> aaa+, a{3,5} -> aaaa?a?
449
-
450
- return true;
451
- }
452
-
453
- static SizeType Min(SizeType a, SizeType b) { return a < b ? a : b; }
454
-
455
- void CloneTopOperand(Stack<Allocator>& operandStack) {
456
- const Frag src = *operandStack.template Top<Frag>(); // Copy constructor to prevent invalidation
457
- SizeType count = stateCount_ - src.minIndex; // Assumes top operand contains states in [src->minIndex, stateCount_)
458
- State* s = states_.template Push<State>(count);
459
- memcpy(s, &GetState(src.minIndex), count * sizeof(State));
460
- for (SizeType j = 0; j < count; j++) {
461
- if (s[j].out != kRegexInvalidState)
462
- s[j].out += count;
463
- if (s[j].out1 != kRegexInvalidState)
464
- s[j].out1 += count;
465
- }
466
- *operandStack.template Push<Frag>() = Frag(src.start + count, src.out + count, src.minIndex + count);
467
- stateCount_ += count;
468
- }
469
-
470
- template <typename InputStream>
471
- bool ParseUnsigned(DecodedStream<InputStream, Encoding>& ds, unsigned* u) {
472
- unsigned r = 0;
473
- if (ds.Peek() < '0' || ds.Peek() > '9')
474
- return false;
475
- while (ds.Peek() >= '0' && ds.Peek() <= '9') {
476
- if (r >= 429496729 && ds.Peek() > '5') // 2^32 - 1 = 4294967295
477
- return false; // overflow
478
- r = r * 10 + (ds.Take() - '0');
479
- }
480
- *u = r;
481
- return true;
482
- }
483
-
484
- template <typename InputStream>
485
- bool ParseRange(DecodedStream<InputStream, Encoding>& ds, SizeType* range) {
486
- bool isBegin = true;
487
- bool negate = false;
488
- int step = 0;
489
- SizeType start = kRegexInvalidRange;
490
- SizeType current = kRegexInvalidRange;
491
- unsigned codepoint;
492
- while ((codepoint = ds.Take()) != 0) {
493
- if (isBegin) {
494
- isBegin = false;
495
- if (codepoint == '^') {
496
- negate = true;
497
- continue;
498
- }
499
- }
500
-
501
- switch (codepoint) {
502
- case ']':
503
- if (start == kRegexInvalidRange)
504
- return false; // Error: nothing inside []
505
- if (step == 2) { // Add trailing '-'
506
- SizeType r = NewRange('-');
507
- CEREAL_RAPIDJSON_ASSERT(current != kRegexInvalidRange);
508
- GetRange(current).next = r;
509
- }
510
- if (negate)
511
- GetRange(start).start |= kRangeNegationFlag;
512
- *range = start;
513
- return true;
514
-
515
- case '\\':
516
- if (ds.Peek() == 'b') {
517
- ds.Take();
518
- codepoint = 0x0008; // Escape backspace character
519
- }
520
- else if (!CharacterEscape(ds, &codepoint))
521
- return false;
522
- // fall through to default
523
-
524
- default:
525
- switch (step) {
526
- case 1:
527
- if (codepoint == '-') {
528
- step++;
529
- break;
530
- }
531
- // fall through to step 0 for other characters
532
-
533
- case 0:
534
- {
535
- SizeType r = NewRange(codepoint);
536
- if (current != kRegexInvalidRange)
537
- GetRange(current).next = r;
538
- if (start == kRegexInvalidRange)
539
- start = r;
540
- current = r;
541
- }
542
- step = 1;
543
- break;
544
-
545
- default:
546
- CEREAL_RAPIDJSON_ASSERT(step == 2);
547
- GetRange(current).end = codepoint;
548
- step = 0;
549
- }
550
- }
551
- }
552
- return false;
553
- }
554
-
555
- SizeType NewRange(unsigned codepoint) {
556
- Range* r = ranges_.template Push<Range>();
557
- r->start = r->end = codepoint;
558
- r->next = kRegexInvalidRange;
559
- return rangeCount_++;
560
- }
561
-
562
- template <typename InputStream>
563
- bool CharacterEscape(DecodedStream<InputStream, Encoding>& ds, unsigned* escapedCodepoint) {
564
- unsigned codepoint;
565
- switch (codepoint = ds.Take()) {
566
- case '^':
567
- case '$':
568
- case '|':
569
- case '(':
570
- case ')':
571
- case '?':
572
- case '*':
573
- case '+':
574
- case '.':
575
- case '[':
576
- case ']':
577
- case '{':
578
- case '}':
579
- case '\\':
580
- *escapedCodepoint = codepoint; return true;
581
- case 'f': *escapedCodepoint = 0x000C; return true;
582
- case 'n': *escapedCodepoint = 0x000A; return true;
583
- case 'r': *escapedCodepoint = 0x000D; return true;
584
- case 't': *escapedCodepoint = 0x0009; return true;
585
- case 'v': *escapedCodepoint = 0x000B; return true;
586
- default:
587
- return false; // Unsupported escape character
588
- }
589
- }
590
-
591
- Allocator* ownAllocator_;
592
- Allocator* allocator_;
593
- Stack<Allocator> states_;
594
- Stack<Allocator> ranges_;
595
- SizeType root_;
596
- SizeType stateCount_;
597
- SizeType rangeCount_;
598
-
599
- static const unsigned kInfinityQuantifier = ~0u;
600
-
601
- // For SearchWithAnchoring()
602
- bool anchorBegin_;
603
- bool anchorEnd_;
604
- };
605
-
606
- template <typename RegexType, typename Allocator = CrtAllocator>
607
- class GenericRegexSearch {
608
- public:
609
- typedef typename RegexType::EncodingType Encoding;
610
- typedef typename Encoding::Ch Ch;
611
-
612
- GenericRegexSearch(const RegexType& regex, Allocator* allocator = 0) :
613
- regex_(regex), allocator_(allocator), ownAllocator_(0),
614
- state0_(allocator, 0), state1_(allocator, 0), stateSet_()
615
- {
616
- CEREAL_RAPIDJSON_ASSERT(regex_.IsValid());
617
- if (!allocator_)
618
- ownAllocator_ = allocator_ = CEREAL_RAPIDJSON_NEW(Allocator)();
619
- stateSet_ = static_cast<unsigned*>(allocator_->Malloc(GetStateSetSize()));
620
- state0_.template Reserve<SizeType>(regex_.stateCount_);
621
- state1_.template Reserve<SizeType>(regex_.stateCount_);
622
- }
623
-
624
- ~GenericRegexSearch() {
625
- Allocator::Free(stateSet_);
626
- CEREAL_RAPIDJSON_DELETE(ownAllocator_);
627
- }
628
-
629
- template <typename InputStream>
630
- bool Match(InputStream& is) {
631
- return SearchWithAnchoring(is, true, true);
632
- }
633
-
634
- bool Match(const Ch* s) {
635
- GenericStringStream<Encoding> is(s);
636
- return Match(is);
637
- }
638
-
639
- template <typename InputStream>
640
- bool Search(InputStream& is) {
641
- return SearchWithAnchoring(is, regex_.anchorBegin_, regex_.anchorEnd_);
642
- }
643
-
644
- bool Search(const Ch* s) {
645
- GenericStringStream<Encoding> is(s);
646
- return Search(is);
647
- }
648
-
649
- private:
650
- typedef typename RegexType::State State;
651
- typedef typename RegexType::Range Range;
652
-
653
- template <typename InputStream>
654
- bool SearchWithAnchoring(InputStream& is, bool anchorBegin, bool anchorEnd) {
655
- DecodedStream<InputStream, Encoding> ds(is);
656
-
657
- state0_.Clear();
658
- Stack<Allocator> *current = &state0_, *next = &state1_;
659
- const size_t stateSetSize = GetStateSetSize();
660
- std::memset(stateSet_, 0, stateSetSize);
661
-
662
- bool matched = AddState(*current, regex_.root_);
663
- unsigned codepoint;
664
- while (!current->Empty() && (codepoint = ds.Take()) != 0) {
665
- std::memset(stateSet_, 0, stateSetSize);
666
- next->Clear();
667
- matched = false;
668
- for (const SizeType* s = current->template Bottom<SizeType>(); s != current->template End<SizeType>(); ++s) {
669
- const State& sr = regex_.GetState(*s);
670
- if (sr.codepoint == codepoint ||
671
- sr.codepoint == RegexType::kAnyCharacterClass ||
672
- (sr.codepoint == RegexType::kRangeCharacterClass && MatchRange(sr.rangeStart, codepoint)))
673
- {
674
- matched = AddState(*next, sr.out) || matched;
675
- if (!anchorEnd && matched)
676
- return true;
677
- }
678
- if (!anchorBegin)
679
- AddState(*next, regex_.root_);
680
- }
681
- internal::Swap(current, next);
682
- }
683
-
684
- return matched;
685
- }
686
-
687
- size_t GetStateSetSize() const {
688
- return (regex_.stateCount_ + 31) / 32 * 4;
689
- }
690
-
691
- // Returns whether the added state is a match state
692
- bool AddState(Stack<Allocator>& l, SizeType index) {
693
- CEREAL_RAPIDJSON_ASSERT(index != kRegexInvalidState);
694
-
695
- const State& s = regex_.GetState(index);
696
- if (s.out1 != kRegexInvalidState) { // Split
697
- bool matched = AddState(l, s.out);
698
- return AddState(l, s.out1) || matched;
699
- }
700
- else if (!(stateSet_[index >> 5] & (1u << (index & 31)))) {
701
- stateSet_[index >> 5] |= (1u << (index & 31));
702
- *l.template PushUnsafe<SizeType>() = index;
703
- }
704
- return s.out == kRegexInvalidState; // by using PushUnsafe() above, we can ensure s is not invalidated due to reallocation.
705
- }
706
-
707
- bool MatchRange(SizeType rangeIndex, unsigned codepoint) const {
708
- bool yes = (regex_.GetRange(rangeIndex).start & RegexType::kRangeNegationFlag) == 0;
709
- while (rangeIndex != kRegexInvalidRange) {
710
- const Range& r = regex_.GetRange(rangeIndex);
711
- if (codepoint >= (r.start & ~RegexType::kRangeNegationFlag) && codepoint <= r.end)
712
- return yes;
713
- rangeIndex = r.next;
714
- }
715
- return !yes;
716
- }
717
-
718
- const RegexType& regex_;
719
- Allocator* allocator_;
720
- Allocator* ownAllocator_;
721
- Stack<Allocator> state0_;
722
- Stack<Allocator> state1_;
723
- uint32_t* stateSet_;
724
- };
725
-
726
- typedef GenericRegex<UTF8<> > Regex;
727
- typedef GenericRegexSearch<Regex> RegexSearch;
728
-
729
- } // namespace internal
730
- CEREAL_RAPIDJSON_NAMESPACE_END
731
-
732
- #ifdef __GNUC__
733
- CEREAL_RAPIDJSON_DIAG_POP
734
- #endif
735
-
736
- #if defined(__clang__) || defined(_MSC_VER)
737
- CEREAL_RAPIDJSON_DIAG_POP
738
- #endif
739
-
740
- #endif // CEREAL_RAPIDJSON_INTERNAL_REGEX_H_