isotree 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (151) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -1
  3. data/LICENSE.txt +2 -2
  4. data/README.md +32 -14
  5. data/ext/isotree/ext.cpp +144 -31
  6. data/ext/isotree/extconf.rb +7 -7
  7. data/lib/isotree/isolation_forest.rb +110 -30
  8. data/lib/isotree/version.rb +1 -1
  9. data/vendor/isotree/LICENSE +1 -1
  10. data/vendor/isotree/README.md +165 -27
  11. data/vendor/isotree/include/isotree.hpp +2111 -0
  12. data/vendor/isotree/include/isotree_oop.hpp +394 -0
  13. data/vendor/isotree/inst/COPYRIGHTS +62 -0
  14. data/vendor/isotree/src/RcppExports.cpp +525 -52
  15. data/vendor/isotree/src/Rwrapper.cpp +1931 -268
  16. data/vendor/isotree/src/c_interface.cpp +953 -0
  17. data/vendor/isotree/src/crit.hpp +4232 -0
  18. data/vendor/isotree/src/dist.hpp +1886 -0
  19. data/vendor/isotree/src/exp_depth_table.hpp +134 -0
  20. data/vendor/isotree/src/extended.hpp +1444 -0
  21. data/vendor/isotree/src/external_facing_generic.hpp +399 -0
  22. data/vendor/isotree/src/fit_model.hpp +2401 -0
  23. data/vendor/isotree/src/{dealloc.cpp → headers_joined.hpp} +38 -22
  24. data/vendor/isotree/src/helpers_iforest.hpp +813 -0
  25. data/vendor/isotree/src/{impute.cpp → impute.hpp} +353 -122
  26. data/vendor/isotree/src/indexer.cpp +515 -0
  27. data/vendor/isotree/src/instantiate_template_headers.cpp +118 -0
  28. data/vendor/isotree/src/instantiate_template_headers.hpp +240 -0
  29. data/vendor/isotree/src/isoforest.hpp +1659 -0
  30. data/vendor/isotree/src/isotree.hpp +1804 -392
  31. data/vendor/isotree/src/isotree_exportable.hpp +99 -0
  32. data/vendor/isotree/src/merge_models.cpp +159 -16
  33. data/vendor/isotree/src/mult.hpp +1321 -0
  34. data/vendor/isotree/src/oop_interface.cpp +842 -0
  35. data/vendor/isotree/src/oop_interface.hpp +278 -0
  36. data/vendor/isotree/src/other_helpers.hpp +219 -0
  37. data/vendor/isotree/src/predict.hpp +1932 -0
  38. data/vendor/isotree/src/python_helpers.hpp +134 -0
  39. data/vendor/isotree/src/ref_indexer.hpp +154 -0
  40. data/vendor/isotree/src/robinmap/LICENSE +21 -0
  41. data/vendor/isotree/src/robinmap/README.md +483 -0
  42. data/vendor/isotree/src/robinmap/include/tsl/robin_growth_policy.h +406 -0
  43. data/vendor/isotree/src/robinmap/include/tsl/robin_hash.h +1620 -0
  44. data/vendor/isotree/src/robinmap/include/tsl/robin_map.h +807 -0
  45. data/vendor/isotree/src/robinmap/include/tsl/robin_set.h +660 -0
  46. data/vendor/isotree/src/serialize.cpp +4300 -139
  47. data/vendor/isotree/src/sql.cpp +141 -59
  48. data/vendor/isotree/src/subset_models.cpp +174 -0
  49. data/vendor/isotree/src/utils.hpp +3808 -0
  50. data/vendor/isotree/src/xoshiro.hpp +467 -0
  51. data/vendor/isotree/src/ziggurat.hpp +405 -0
  52. metadata +38 -104
  53. data/vendor/cereal/LICENSE +0 -24
  54. data/vendor/cereal/README.md +0 -85
  55. data/vendor/cereal/include/cereal/access.hpp +0 -351
  56. data/vendor/cereal/include/cereal/archives/adapters.hpp +0 -163
  57. data/vendor/cereal/include/cereal/archives/binary.hpp +0 -169
  58. data/vendor/cereal/include/cereal/archives/json.hpp +0 -1019
  59. data/vendor/cereal/include/cereal/archives/portable_binary.hpp +0 -334
  60. data/vendor/cereal/include/cereal/archives/xml.hpp +0 -956
  61. data/vendor/cereal/include/cereal/cereal.hpp +0 -1089
  62. data/vendor/cereal/include/cereal/details/helpers.hpp +0 -422
  63. data/vendor/cereal/include/cereal/details/polymorphic_impl.hpp +0 -796
  64. data/vendor/cereal/include/cereal/details/polymorphic_impl_fwd.hpp +0 -65
  65. data/vendor/cereal/include/cereal/details/static_object.hpp +0 -127
  66. data/vendor/cereal/include/cereal/details/traits.hpp +0 -1411
  67. data/vendor/cereal/include/cereal/details/util.hpp +0 -84
  68. data/vendor/cereal/include/cereal/external/base64.hpp +0 -134
  69. data/vendor/cereal/include/cereal/external/rapidjson/allocators.h +0 -284
  70. data/vendor/cereal/include/cereal/external/rapidjson/cursorstreamwrapper.h +0 -78
  71. data/vendor/cereal/include/cereal/external/rapidjson/document.h +0 -2652
  72. data/vendor/cereal/include/cereal/external/rapidjson/encodedstream.h +0 -299
  73. data/vendor/cereal/include/cereal/external/rapidjson/encodings.h +0 -716
  74. data/vendor/cereal/include/cereal/external/rapidjson/error/en.h +0 -74
  75. data/vendor/cereal/include/cereal/external/rapidjson/error/error.h +0 -161
  76. data/vendor/cereal/include/cereal/external/rapidjson/filereadstream.h +0 -99
  77. data/vendor/cereal/include/cereal/external/rapidjson/filewritestream.h +0 -104
  78. data/vendor/cereal/include/cereal/external/rapidjson/fwd.h +0 -151
  79. data/vendor/cereal/include/cereal/external/rapidjson/internal/biginteger.h +0 -290
  80. data/vendor/cereal/include/cereal/external/rapidjson/internal/diyfp.h +0 -271
  81. data/vendor/cereal/include/cereal/external/rapidjson/internal/dtoa.h +0 -245
  82. data/vendor/cereal/include/cereal/external/rapidjson/internal/ieee754.h +0 -78
  83. data/vendor/cereal/include/cereal/external/rapidjson/internal/itoa.h +0 -308
  84. data/vendor/cereal/include/cereal/external/rapidjson/internal/meta.h +0 -186
  85. data/vendor/cereal/include/cereal/external/rapidjson/internal/pow10.h +0 -55
  86. data/vendor/cereal/include/cereal/external/rapidjson/internal/regex.h +0 -740
  87. data/vendor/cereal/include/cereal/external/rapidjson/internal/stack.h +0 -232
  88. data/vendor/cereal/include/cereal/external/rapidjson/internal/strfunc.h +0 -69
  89. data/vendor/cereal/include/cereal/external/rapidjson/internal/strtod.h +0 -290
  90. data/vendor/cereal/include/cereal/external/rapidjson/internal/swap.h +0 -46
  91. data/vendor/cereal/include/cereal/external/rapidjson/istreamwrapper.h +0 -128
  92. data/vendor/cereal/include/cereal/external/rapidjson/memorybuffer.h +0 -70
  93. data/vendor/cereal/include/cereal/external/rapidjson/memorystream.h +0 -71
  94. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/inttypes.h +0 -316
  95. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/stdint.h +0 -300
  96. data/vendor/cereal/include/cereal/external/rapidjson/ostreamwrapper.h +0 -81
  97. data/vendor/cereal/include/cereal/external/rapidjson/pointer.h +0 -1414
  98. data/vendor/cereal/include/cereal/external/rapidjson/prettywriter.h +0 -277
  99. data/vendor/cereal/include/cereal/external/rapidjson/rapidjson.h +0 -656
  100. data/vendor/cereal/include/cereal/external/rapidjson/reader.h +0 -2230
  101. data/vendor/cereal/include/cereal/external/rapidjson/schema.h +0 -2497
  102. data/vendor/cereal/include/cereal/external/rapidjson/stream.h +0 -223
  103. data/vendor/cereal/include/cereal/external/rapidjson/stringbuffer.h +0 -121
  104. data/vendor/cereal/include/cereal/external/rapidjson/writer.h +0 -709
  105. data/vendor/cereal/include/cereal/external/rapidxml/license.txt +0 -52
  106. data/vendor/cereal/include/cereal/external/rapidxml/manual.html +0 -406
  107. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml.hpp +0 -2624
  108. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_iterators.hpp +0 -175
  109. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_print.hpp +0 -428
  110. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_utils.hpp +0 -123
  111. data/vendor/cereal/include/cereal/macros.hpp +0 -154
  112. data/vendor/cereal/include/cereal/specialize.hpp +0 -139
  113. data/vendor/cereal/include/cereal/types/array.hpp +0 -79
  114. data/vendor/cereal/include/cereal/types/atomic.hpp +0 -55
  115. data/vendor/cereal/include/cereal/types/base_class.hpp +0 -203
  116. data/vendor/cereal/include/cereal/types/bitset.hpp +0 -176
  117. data/vendor/cereal/include/cereal/types/boost_variant.hpp +0 -164
  118. data/vendor/cereal/include/cereal/types/chrono.hpp +0 -72
  119. data/vendor/cereal/include/cereal/types/common.hpp +0 -129
  120. data/vendor/cereal/include/cereal/types/complex.hpp +0 -56
  121. data/vendor/cereal/include/cereal/types/concepts/pair_associative_container.hpp +0 -73
  122. data/vendor/cereal/include/cereal/types/deque.hpp +0 -62
  123. data/vendor/cereal/include/cereal/types/forward_list.hpp +0 -68
  124. data/vendor/cereal/include/cereal/types/functional.hpp +0 -43
  125. data/vendor/cereal/include/cereal/types/list.hpp +0 -62
  126. data/vendor/cereal/include/cereal/types/map.hpp +0 -36
  127. data/vendor/cereal/include/cereal/types/memory.hpp +0 -425
  128. data/vendor/cereal/include/cereal/types/optional.hpp +0 -66
  129. data/vendor/cereal/include/cereal/types/polymorphic.hpp +0 -483
  130. data/vendor/cereal/include/cereal/types/queue.hpp +0 -132
  131. data/vendor/cereal/include/cereal/types/set.hpp +0 -103
  132. data/vendor/cereal/include/cereal/types/stack.hpp +0 -76
  133. data/vendor/cereal/include/cereal/types/string.hpp +0 -61
  134. data/vendor/cereal/include/cereal/types/tuple.hpp +0 -123
  135. data/vendor/cereal/include/cereal/types/unordered_map.hpp +0 -36
  136. data/vendor/cereal/include/cereal/types/unordered_set.hpp +0 -99
  137. data/vendor/cereal/include/cereal/types/utility.hpp +0 -47
  138. data/vendor/cereal/include/cereal/types/valarray.hpp +0 -89
  139. data/vendor/cereal/include/cereal/types/variant.hpp +0 -109
  140. data/vendor/cereal/include/cereal/types/vector.hpp +0 -112
  141. data/vendor/cereal/include/cereal/version.hpp +0 -52
  142. data/vendor/isotree/src/Makevars +0 -4
  143. data/vendor/isotree/src/crit.cpp +0 -912
  144. data/vendor/isotree/src/dist.cpp +0 -749
  145. data/vendor/isotree/src/extended.cpp +0 -790
  146. data/vendor/isotree/src/fit_model.cpp +0 -1090
  147. data/vendor/isotree/src/helpers_iforest.cpp +0 -324
  148. data/vendor/isotree/src/isoforest.cpp +0 -771
  149. data/vendor/isotree/src/mult.cpp +0 -607
  150. data/vendor/isotree/src/predict.cpp +0 -853
  151. data/vendor/isotree/src/utils.cpp +0 -1566
@@ -1,740 +0,0 @@
1
- // Tencent is pleased to support the open source community by making RapidJSON available.
2
- //
3
- // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
4
- //
5
- // Licensed under the MIT License (the "License"); you may not use this file except
6
- // in compliance with the License. You may obtain a copy of the License at
7
- //
8
- // http://opensource.org/licenses/MIT
9
- //
10
- // Unless required by applicable law or agreed to in writing, software distributed
11
- // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12
- // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13
- // specific language governing permissions and limitations under the License.
14
-
15
- #ifndef CEREAL_RAPIDJSON_INTERNAL_REGEX_H_
16
- #define CEREAL_RAPIDJSON_INTERNAL_REGEX_H_
17
-
18
- #include "../allocators.h"
19
- #include "../stream.h"
20
- #include "stack.h"
21
-
22
- #ifdef __clang__
23
- CEREAL_RAPIDJSON_DIAG_PUSH
24
- CEREAL_RAPIDJSON_DIAG_OFF(padded)
25
- CEREAL_RAPIDJSON_DIAG_OFF(switch-enum)
26
- CEREAL_RAPIDJSON_DIAG_OFF(implicit-fallthrough)
27
- #elif defined(_MSC_VER)
28
- CEREAL_RAPIDJSON_DIAG_PUSH
29
- CEREAL_RAPIDJSON_DIAG_OFF(4512) // assignment operator could not be generated
30
- #endif
31
-
32
- #ifdef __GNUC__
33
- CEREAL_RAPIDJSON_DIAG_PUSH
34
- CEREAL_RAPIDJSON_DIAG_OFF(effc++)
35
- #if __GNUC__ >= 7
36
- CEREAL_RAPIDJSON_DIAG_OFF(implicit-fallthrough)
37
- #endif
38
- #endif
39
-
40
- #ifndef CEREAL_RAPIDJSON_REGEX_VERBOSE
41
- #define CEREAL_RAPIDJSON_REGEX_VERBOSE 0
42
- #endif
43
-
44
- CEREAL_RAPIDJSON_NAMESPACE_BEGIN
45
- namespace internal {
46
-
47
- ///////////////////////////////////////////////////////////////////////////////
48
- // DecodedStream
49
-
50
//! One-codepoint look-ahead adapter over an encoded source stream.
/*!
    Decodes \c SourceStream one code point at a time through
    \c Encoding::Decode(SourceStream&, unsigned*) and keeps the most recently
    decoded code point buffered so it can be inspected without being consumed.

    A buffered value of 0 marks the end: it is stored when the source yields
    code point 0 or when \c Encoding::Decode reports failure. Once 0 has been
    reached, the source stream is never read again.

    \tparam SourceStream Stream type handed to \c Encoding::Decode.
    \tparam Encoding     Type providing static \c Decode(SourceStream&, unsigned*).
*/
template <typename SourceStream, typename Encoding>
class DecodedStream {
public:
    //! Binds to \c ss (held by reference, so it must outlive this object) and decodes the first code point.
    /*! \c explicit: a source stream should never silently convert into a decoder. */
    explicit DecodedStream(SourceStream& ss) : ss_(ss), codepoint_() { Decode(); }

    //! Returns the buffered code point without consuming it (0 at end of input).
    unsigned Peek() const { return codepoint_; }

    //! Returns the buffered code point and advances to the next one.
    /*! At end of input this keeps returning 0 and does not touch the source stream. */
    unsigned Take() {
        const unsigned c = codepoint_;
        if (c) // No further decoding when '\0'
            Decode();
        return c;
    }

private:
    //! Refills the look-ahead buffer; a decoding failure is reported as end of input (0).
    void Decode() {
        if (!Encoding::Decode(ss_, &codepoint_))
            codepoint_ = 0;
    }

    SourceStream& ss_;      //!< Underlying encoded stream (not owned)
    unsigned codepoint_;    //!< Look-ahead code point; 0 means end of input
};
71
-
72
- ///////////////////////////////////////////////////////////////////////////////
73
- // GenericRegex
74
-
75
- static const SizeType kRegexInvalidState = ~SizeType(0); //!< Represents an invalid index in GenericRegex::State::out, out1
76
- static const SizeType kRegexInvalidRange = ~SizeType(0); //!< Represents "no range": initial value of GenericRegex::State::rangeStart and terminator of the GenericRegex::Range::next chain
77
-
78
- template <typename Encoding, typename Allocator>
79
- class GenericRegexSearch; // Forward declaration so GenericRegex can name it as a friend; the class itself is defined after GenericRegex.
80
-
81
- //! Regular expression engine with subset of ECMAscript grammar.
82
- /*!
83
- Supported regular expression syntax:
84
- - \c ab Concatenation
85
- - \c a|b Alternation
86
- - \c a? Zero or one
87
- - \c a* Zero or more
88
- - \c a+ One or more
89
- - \c a{3} Exactly 3 times
90
- - \c a{3,} At least 3 times
91
- - \c a{3,5} 3 to 5 times
92
- - \c (ab) Grouping
93
- - \c ^a At the beginning
94
- - \c a$ At the end
95
- - \c . Any character
96
- - \c [abc] Character classes
97
- - \c [a-c] Character class range
98
- - \c [a-z0-9_] Character class combination
99
- - \c [^abc] Negated character classes
100
- - \c [^a-c] Negated character class range
101
- - \c [\b] Backspace (U+0008)
102
- - \c \\| \\\\ ... Escape characters
103
- - \c \\f Form feed (U+000C)
104
- - \c \\n Line feed (U+000A)
105
- - \c \\r Carriage return (U+000D)
106
- - \c \\t Tab (U+0009)
107
- - \c \\v Vertical tab (U+000B)
108
-
109
- \note This is a Thompson NFA engine, implemented with reference to
110
- Cox, Russ. "Regular Expression Matching Can Be Simple And Fast (but is slow in Java, Perl, PHP, Python, Ruby,...).",
111
- https://swtch.com/~rsc/regexp/regexp1.html
112
- */
113
- template <typename Encoding, typename Allocator = CrtAllocator>
114
- class GenericRegex {
115
- public:
116
- typedef Encoding EncodingType;
117
- typedef typename Encoding::Ch Ch;
118
- template <typename, typename> friend class GenericRegexSearch;
119
-
120
- GenericRegex(const Ch* source, Allocator* allocator = 0) :
121
- ownAllocator_(allocator ? 0 : CEREAL_RAPIDJSON_NEW(Allocator)()), allocator_(allocator ? allocator : ownAllocator_),
122
- states_(allocator_, 256), ranges_(allocator_, 256), root_(kRegexInvalidState), stateCount_(), rangeCount_(),
123
- anchorBegin_(), anchorEnd_()
124
- {
125
- GenericStringStream<Encoding> ss(source);
126
- DecodedStream<GenericStringStream<Encoding>, Encoding> ds(ss);
127
- Parse(ds);
128
- }
129
-
130
- ~GenericRegex()
131
- {
132
- CEREAL_RAPIDJSON_DELETE(ownAllocator_);
133
- }
134
-
135
- bool IsValid() const {
136
- return root_ != kRegexInvalidState;
137
- }
138
-
139
- private:
140
- enum Operator {
141
- kZeroOrOne,
142
- kZeroOrMore,
143
- kOneOrMore,
144
- kConcatenation,
145
- kAlternation,
146
- kLeftParenthesis
147
- };
148
-
149
- static const unsigned kAnyCharacterClass = 0xFFFFFFFF; //!< For '.'
150
- static const unsigned kRangeCharacterClass = 0xFFFFFFFE;
151
- static const unsigned kRangeNegationFlag = 0x80000000;
152
-
153
- struct Range {
154
- unsigned start; //
155
- unsigned end;
156
- SizeType next;
157
- };
158
-
159
- struct State {
160
- SizeType out; //!< Equals to kInvalid for matching state
161
- SizeType out1; //!< Equals to non-kInvalid for split
162
- SizeType rangeStart;
163
- unsigned codepoint;
164
- };
165
-
166
- struct Frag {
167
- Frag(SizeType s, SizeType o, SizeType m) : start(s), out(o), minIndex(m) {}
168
- SizeType start;
169
- SizeType out; //!< link-list of all output states
170
- SizeType minIndex;
171
- };
172
-
173
- State& GetState(SizeType index) {
174
- CEREAL_RAPIDJSON_ASSERT(index < stateCount_);
175
- return states_.template Bottom<State>()[index];
176
- }
177
-
178
- const State& GetState(SizeType index) const {
179
- CEREAL_RAPIDJSON_ASSERT(index < stateCount_);
180
- return states_.template Bottom<State>()[index];
181
- }
182
-
183
- Range& GetRange(SizeType index) {
184
- CEREAL_RAPIDJSON_ASSERT(index < rangeCount_);
185
- return ranges_.template Bottom<Range>()[index];
186
- }
187
-
188
- const Range& GetRange(SizeType index) const {
189
- CEREAL_RAPIDJSON_ASSERT(index < rangeCount_);
190
- return ranges_.template Bottom<Range>()[index];
191
- }
192
-
193
- template <typename InputStream>
194
- void Parse(DecodedStream<InputStream, Encoding>& ds) {
195
- Stack<Allocator> operandStack(allocator_, 256); // Frag
196
- Stack<Allocator> operatorStack(allocator_, 256); // Operator
197
- Stack<Allocator> atomCountStack(allocator_, 256); // unsigned (Atom per parenthesis)
198
-
199
- *atomCountStack.template Push<unsigned>() = 0;
200
-
201
- unsigned codepoint;
202
- while (ds.Peek() != 0) {
203
- switch (codepoint = ds.Take()) {
204
- case '^':
205
- anchorBegin_ = true;
206
- break;
207
-
208
- case '$':
209
- anchorEnd_ = true;
210
- break;
211
-
212
- case '|':
213
- while (!operatorStack.Empty() && *operatorStack.template Top<Operator>() < kAlternation)
214
- if (!Eval(operandStack, *operatorStack.template Pop<Operator>(1)))
215
- return;
216
- *operatorStack.template Push<Operator>() = kAlternation;
217
- *atomCountStack.template Top<unsigned>() = 0;
218
- break;
219
-
220
- case '(':
221
- *operatorStack.template Push<Operator>() = kLeftParenthesis;
222
- *atomCountStack.template Push<unsigned>() = 0;
223
- break;
224
-
225
- case ')':
226
- while (!operatorStack.Empty() && *operatorStack.template Top<Operator>() != kLeftParenthesis)
227
- if (!Eval(operandStack, *operatorStack.template Pop<Operator>(1)))
228
- return;
229
- if (operatorStack.Empty())
230
- return;
231
- operatorStack.template Pop<Operator>(1);
232
- atomCountStack.template Pop<unsigned>(1);
233
- ImplicitConcatenation(atomCountStack, operatorStack);
234
- break;
235
-
236
- case '?':
237
- if (!Eval(operandStack, kZeroOrOne))
238
- return;
239
- break;
240
-
241
- case '*':
242
- if (!Eval(operandStack, kZeroOrMore))
243
- return;
244
- break;
245
-
246
- case '+':
247
- if (!Eval(operandStack, kOneOrMore))
248
- return;
249
- break;
250
-
251
- case '{':
252
- {
253
- unsigned n, m;
254
- if (!ParseUnsigned(ds, &n))
255
- return;
256
-
257
- if (ds.Peek() == ',') {
258
- ds.Take();
259
- if (ds.Peek() == '}')
260
- m = kInfinityQuantifier;
261
- else if (!ParseUnsigned(ds, &m) || m < n)
262
- return;
263
- }
264
- else
265
- m = n;
266
-
267
- if (!EvalQuantifier(operandStack, n, m) || ds.Peek() != '}')
268
- return;
269
- ds.Take();
270
- }
271
- break;
272
-
273
- case '.':
274
- PushOperand(operandStack, kAnyCharacterClass);
275
- ImplicitConcatenation(atomCountStack, operatorStack);
276
- break;
277
-
278
- case '[':
279
- {
280
- SizeType range;
281
- if (!ParseRange(ds, &range))
282
- return;
283
- SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, kRangeCharacterClass);
284
- GetState(s).rangeStart = range;
285
- *operandStack.template Push<Frag>() = Frag(s, s, s);
286
- }
287
- ImplicitConcatenation(atomCountStack, operatorStack);
288
- break;
289
-
290
- case '\\': // Escape character
291
- if (!CharacterEscape(ds, &codepoint))
292
- return; // Unsupported escape character
293
- // fall through to default
294
-
295
- default: // Pattern character
296
- PushOperand(operandStack, codepoint);
297
- ImplicitConcatenation(atomCountStack, operatorStack);
298
- }
299
- }
300
-
301
- while (!operatorStack.Empty())
302
- if (!Eval(operandStack, *operatorStack.template Pop<Operator>(1)))
303
- return;
304
-
305
- // Link the operand to matching state.
306
- if (operandStack.GetSize() == sizeof(Frag)) {
307
- Frag* e = operandStack.template Pop<Frag>(1);
308
- Patch(e->out, NewState(kRegexInvalidState, kRegexInvalidState, 0));
309
- root_ = e->start;
310
-
311
- #if CEREAL_RAPIDJSON_REGEX_VERBOSE
312
- printf("root: %d\n", root_);
313
- for (SizeType i = 0; i < stateCount_ ; i++) {
314
- State& s = GetState(i);
315
- printf("[%2d] out: %2d out1: %2d c: '%c'\n", i, s.out, s.out1, (char)s.codepoint);
316
- }
317
- printf("\n");
318
- #endif
319
- }
320
- }
321
-
322
- SizeType NewState(SizeType out, SizeType out1, unsigned codepoint) {
323
- State* s = states_.template Push<State>();
324
- s->out = out;
325
- s->out1 = out1;
326
- s->codepoint = codepoint;
327
- s->rangeStart = kRegexInvalidRange;
328
- return stateCount_++;
329
- }
330
-
331
- void PushOperand(Stack<Allocator>& operandStack, unsigned codepoint) {
332
- SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, codepoint);
333
- *operandStack.template Push<Frag>() = Frag(s, s, s);
334
- }
335
-
336
- void ImplicitConcatenation(Stack<Allocator>& atomCountStack, Stack<Allocator>& operatorStack) {
337
- if (*atomCountStack.template Top<unsigned>())
338
- *operatorStack.template Push<Operator>() = kConcatenation;
339
- (*atomCountStack.template Top<unsigned>())++;
340
- }
341
-
342
- SizeType Append(SizeType l1, SizeType l2) {
343
- SizeType old = l1;
344
- while (GetState(l1).out != kRegexInvalidState)
345
- l1 = GetState(l1).out;
346
- GetState(l1).out = l2;
347
- return old;
348
- }
349
-
350
- void Patch(SizeType l, SizeType s) {
351
- for (SizeType next; l != kRegexInvalidState; l = next) {
352
- next = GetState(l).out;
353
- GetState(l).out = s;
354
- }
355
- }
356
-
357
- bool Eval(Stack<Allocator>& operandStack, Operator op) {
358
- switch (op) {
359
- case kConcatenation:
360
- CEREAL_RAPIDJSON_ASSERT(operandStack.GetSize() >= sizeof(Frag) * 2);
361
- {
362
- Frag e2 = *operandStack.template Pop<Frag>(1);
363
- Frag e1 = *operandStack.template Pop<Frag>(1);
364
- Patch(e1.out, e2.start);
365
- *operandStack.template Push<Frag>() = Frag(e1.start, e2.out, Min(e1.minIndex, e2.minIndex));
366
- }
367
- return true;
368
-
369
- case kAlternation:
370
- if (operandStack.GetSize() >= sizeof(Frag) * 2) {
371
- Frag e2 = *operandStack.template Pop<Frag>(1);
372
- Frag e1 = *operandStack.template Pop<Frag>(1);
373
- SizeType s = NewState(e1.start, e2.start, 0);
374
- *operandStack.template Push<Frag>() = Frag(s, Append(e1.out, e2.out), Min(e1.minIndex, e2.minIndex));
375
- return true;
376
- }
377
- return false;
378
-
379
- case kZeroOrOne:
380
- if (operandStack.GetSize() >= sizeof(Frag)) {
381
- Frag e = *operandStack.template Pop<Frag>(1);
382
- SizeType s = NewState(kRegexInvalidState, e.start, 0);
383
- *operandStack.template Push<Frag>() = Frag(s, Append(e.out, s), e.minIndex);
384
- return true;
385
- }
386
- return false;
387
-
388
- case kZeroOrMore:
389
- if (operandStack.GetSize() >= sizeof(Frag)) {
390
- Frag e = *operandStack.template Pop<Frag>(1);
391
- SizeType s = NewState(kRegexInvalidState, e.start, 0);
392
- Patch(e.out, s);
393
- *operandStack.template Push<Frag>() = Frag(s, s, e.minIndex);
394
- return true;
395
- }
396
- return false;
397
-
398
- case kOneOrMore:
399
- if (operandStack.GetSize() >= sizeof(Frag)) {
400
- Frag e = *operandStack.template Pop<Frag>(1);
401
- SizeType s = NewState(kRegexInvalidState, e.start, 0);
402
- Patch(e.out, s);
403
- *operandStack.template Push<Frag>() = Frag(e.start, s, e.minIndex);
404
- return true;
405
- }
406
- return false;
407
-
408
- default:
409
- // syntax error (e.g. unclosed kLeftParenthesis)
410
- return false;
411
- }
412
- }
413
-
414
- bool EvalQuantifier(Stack<Allocator>& operandStack, unsigned n, unsigned m) {
415
- CEREAL_RAPIDJSON_ASSERT(n <= m);
416
- CEREAL_RAPIDJSON_ASSERT(operandStack.GetSize() >= sizeof(Frag));
417
-
418
- if (n == 0) {
419
- if (m == 0) // a{0} not support
420
- return false;
421
- else if (m == kInfinityQuantifier)
422
- Eval(operandStack, kZeroOrMore); // a{0,} -> a*
423
- else {
424
- Eval(operandStack, kZeroOrOne); // a{0,5} -> a?
425
- for (unsigned i = 0; i < m - 1; i++)
426
- CloneTopOperand(operandStack); // a{0,5} -> a? a? a? a? a?
427
- for (unsigned i = 0; i < m - 1; i++)
428
- Eval(operandStack, kConcatenation); // a{0,5} -> a?a?a?a?a?
429
- }
430
- return true;
431
- }
432
-
433
- for (unsigned i = 0; i < n - 1; i++) // a{3} -> a a a
434
- CloneTopOperand(operandStack);
435
-
436
- if (m == kInfinityQuantifier)
437
- Eval(operandStack, kOneOrMore); // a{3,} -> a a a+
438
- else if (m > n) {
439
- CloneTopOperand(operandStack); // a{3,5} -> a a a a
440
- Eval(operandStack, kZeroOrOne); // a{3,5} -> a a a a?
441
- for (unsigned i = n; i < m - 1; i++)
442
- CloneTopOperand(operandStack); // a{3,5} -> a a a a? a?
443
- for (unsigned i = n; i < m; i++)
444
- Eval(operandStack, kConcatenation); // a{3,5} -> a a aa?a?
445
- }
446
-
447
- for (unsigned i = 0; i < n - 1; i++)
448
- Eval(operandStack, kConcatenation); // a{3} -> aaa, a{3,} -> aaa+, a{3.5} -> aaaa?a?
449
-
450
- return true;
451
- }
452
-
453
- static SizeType Min(SizeType a, SizeType b) { return a < b ? a : b; }
454
-
455
- void CloneTopOperand(Stack<Allocator>& operandStack) {
456
- const Frag src = *operandStack.template Top<Frag>(); // Copy constructor to prevent invalidation
457
- SizeType count = stateCount_ - src.minIndex; // Assumes top operand contains states in [src->minIndex, stateCount_)
458
- State* s = states_.template Push<State>(count);
459
- memcpy(s, &GetState(src.minIndex), count * sizeof(State));
460
- for (SizeType j = 0; j < count; j++) {
461
- if (s[j].out != kRegexInvalidState)
462
- s[j].out += count;
463
- if (s[j].out1 != kRegexInvalidState)
464
- s[j].out1 += count;
465
- }
466
- *operandStack.template Push<Frag>() = Frag(src.start + count, src.out + count, src.minIndex + count);
467
- stateCount_ += count;
468
- }
469
-
470
- template <typename InputStream>
471
- bool ParseUnsigned(DecodedStream<InputStream, Encoding>& ds, unsigned* u) {
472
- unsigned r = 0;
473
- if (ds.Peek() < '0' || ds.Peek() > '9')
474
- return false;
475
- while (ds.Peek() >= '0' && ds.Peek() <= '9') {
476
- if (r >= 429496729 && ds.Peek() > '5') // 2^32 - 1 = 4294967295
477
- return false; // overflow
478
- r = r * 10 + (ds.Take() - '0');
479
- }
480
- *u = r;
481
- return true;
482
- }
483
-
484
- template <typename InputStream>
485
- bool ParseRange(DecodedStream<InputStream, Encoding>& ds, SizeType* range) {
486
- bool isBegin = true;
487
- bool negate = false;
488
- int step = 0;
489
- SizeType start = kRegexInvalidRange;
490
- SizeType current = kRegexInvalidRange;
491
- unsigned codepoint;
492
- while ((codepoint = ds.Take()) != 0) {
493
- if (isBegin) {
494
- isBegin = false;
495
- if (codepoint == '^') {
496
- negate = true;
497
- continue;
498
- }
499
- }
500
-
501
- switch (codepoint) {
502
- case ']':
503
- if (start == kRegexInvalidRange)
504
- return false; // Error: nothing inside []
505
- if (step == 2) { // Add trailing '-'
506
- SizeType r = NewRange('-');
507
- CEREAL_RAPIDJSON_ASSERT(current != kRegexInvalidRange);
508
- GetRange(current).next = r;
509
- }
510
- if (negate)
511
- GetRange(start).start |= kRangeNegationFlag;
512
- *range = start;
513
- return true;
514
-
515
- case '\\':
516
- if (ds.Peek() == 'b') {
517
- ds.Take();
518
- codepoint = 0x0008; // Escape backspace character
519
- }
520
- else if (!CharacterEscape(ds, &codepoint))
521
- return false;
522
- // fall through to default
523
-
524
- default:
525
- switch (step) {
526
- case 1:
527
- if (codepoint == '-') {
528
- step++;
529
- break;
530
- }
531
- // fall through to step 0 for other characters
532
-
533
- case 0:
534
- {
535
- SizeType r = NewRange(codepoint);
536
- if (current != kRegexInvalidRange)
537
- GetRange(current).next = r;
538
- if (start == kRegexInvalidRange)
539
- start = r;
540
- current = r;
541
- }
542
- step = 1;
543
- break;
544
-
545
- default:
546
- CEREAL_RAPIDJSON_ASSERT(step == 2);
547
- GetRange(current).end = codepoint;
548
- step = 0;
549
- }
550
- }
551
- }
552
- return false;
553
- }
554
-
555
- SizeType NewRange(unsigned codepoint) {
556
- Range* r = ranges_.template Push<Range>();
557
- r->start = r->end = codepoint;
558
- r->next = kRegexInvalidRange;
559
- return rangeCount_++;
560
- }
561
-
562
- template <typename InputStream>
563
- bool CharacterEscape(DecodedStream<InputStream, Encoding>& ds, unsigned* escapedCodepoint) {
564
- unsigned codepoint;
565
- switch (codepoint = ds.Take()) {
566
- case '^':
567
- case '$':
568
- case '|':
569
- case '(':
570
- case ')':
571
- case '?':
572
- case '*':
573
- case '+':
574
- case '.':
575
- case '[':
576
- case ']':
577
- case '{':
578
- case '}':
579
- case '\\':
580
- *escapedCodepoint = codepoint; return true;
581
- case 'f': *escapedCodepoint = 0x000C; return true;
582
- case 'n': *escapedCodepoint = 0x000A; return true;
583
- case 'r': *escapedCodepoint = 0x000D; return true;
584
- case 't': *escapedCodepoint = 0x0009; return true;
585
- case 'v': *escapedCodepoint = 0x000B; return true;
586
- default:
587
- return false; // Unsupported escape character
588
- }
589
- }
590
-
591
- Allocator* ownAllocator_;
592
- Allocator* allocator_;
593
- Stack<Allocator> states_;
594
- Stack<Allocator> ranges_;
595
- SizeType root_;
596
- SizeType stateCount_;
597
- SizeType rangeCount_;
598
-
599
- static const unsigned kInfinityQuantifier = ~0u;
600
-
601
- // For SearchWithAnchoring()
602
- bool anchorBegin_;
603
- bool anchorEnd_;
604
- };
605
-
606
- template <typename RegexType, typename Allocator = CrtAllocator>
607
- class GenericRegexSearch {
608
- public:
609
- typedef typename RegexType::EncodingType Encoding;
610
- typedef typename Encoding::Ch Ch;
611
-
612
- GenericRegexSearch(const RegexType& regex, Allocator* allocator = 0) :
613
- regex_(regex), allocator_(allocator), ownAllocator_(0),
614
- state0_(allocator, 0), state1_(allocator, 0), stateSet_()
615
- {
616
- CEREAL_RAPIDJSON_ASSERT(regex_.IsValid());
617
- if (!allocator_)
618
- ownAllocator_ = allocator_ = CEREAL_RAPIDJSON_NEW(Allocator)();
619
- stateSet_ = static_cast<unsigned*>(allocator_->Malloc(GetStateSetSize()));
620
- state0_.template Reserve<SizeType>(regex_.stateCount_);
621
- state1_.template Reserve<SizeType>(regex_.stateCount_);
622
- }
623
-
624
- ~GenericRegexSearch() {
625
- Allocator::Free(stateSet_);
626
- CEREAL_RAPIDJSON_DELETE(ownAllocator_);
627
- }
628
-
629
- template <typename InputStream>
630
- bool Match(InputStream& is) {
631
- return SearchWithAnchoring(is, true, true);
632
- }
633
-
634
- bool Match(const Ch* s) {
635
- GenericStringStream<Encoding> is(s);
636
- return Match(is);
637
- }
638
-
639
- template <typename InputStream>
640
- bool Search(InputStream& is) {
641
- return SearchWithAnchoring(is, regex_.anchorBegin_, regex_.anchorEnd_);
642
- }
643
-
644
- bool Search(const Ch* s) {
645
- GenericStringStream<Encoding> is(s);
646
- return Search(is);
647
- }
648
-
649
- private:
650
- typedef typename RegexType::State State;
651
- typedef typename RegexType::Range Range;
652
-
653
- template <typename InputStream>
654
- bool SearchWithAnchoring(InputStream& is, bool anchorBegin, bool anchorEnd) {
655
- DecodedStream<InputStream, Encoding> ds(is);
656
-
657
- state0_.Clear();
658
- Stack<Allocator> *current = &state0_, *next = &state1_;
659
- const size_t stateSetSize = GetStateSetSize();
660
- std::memset(stateSet_, 0, stateSetSize);
661
-
662
- bool matched = AddState(*current, regex_.root_);
663
- unsigned codepoint;
664
- while (!current->Empty() && (codepoint = ds.Take()) != 0) {
665
- std::memset(stateSet_, 0, stateSetSize);
666
- next->Clear();
667
- matched = false;
668
- for (const SizeType* s = current->template Bottom<SizeType>(); s != current->template End<SizeType>(); ++s) {
669
- const State& sr = regex_.GetState(*s);
670
- if (sr.codepoint == codepoint ||
671
- sr.codepoint == RegexType::kAnyCharacterClass ||
672
- (sr.codepoint == RegexType::kRangeCharacterClass && MatchRange(sr.rangeStart, codepoint)))
673
- {
674
- matched = AddState(*next, sr.out) || matched;
675
- if (!anchorEnd && matched)
676
- return true;
677
- }
678
- if (!anchorBegin)
679
- AddState(*next, regex_.root_);
680
- }
681
- internal::Swap(current, next);
682
- }
683
-
684
- return matched;
685
- }
686
-
687
- size_t GetStateSetSize() const {
688
- return (regex_.stateCount_ + 31) / 32 * 4;
689
- }
690
-
691
- // Return whether the added states is a match state
692
- bool AddState(Stack<Allocator>& l, SizeType index) {
693
- CEREAL_RAPIDJSON_ASSERT(index != kRegexInvalidState);
694
-
695
- const State& s = regex_.GetState(index);
696
- if (s.out1 != kRegexInvalidState) { // Split
697
- bool matched = AddState(l, s.out);
698
- return AddState(l, s.out1) || matched;
699
- }
700
- else if (!(stateSet_[index >> 5] & (1u << (index & 31)))) {
701
- stateSet_[index >> 5] |= (1u << (index & 31));
702
- *l.template PushUnsafe<SizeType>() = index;
703
- }
704
- return s.out == kRegexInvalidState; // by using PushUnsafe() above, we can ensure s is not validated due to reallocation.
705
- }
706
-
707
- bool MatchRange(SizeType rangeIndex, unsigned codepoint) const {
708
- bool yes = (regex_.GetRange(rangeIndex).start & RegexType::kRangeNegationFlag) == 0;
709
- while (rangeIndex != kRegexInvalidRange) {
710
- const Range& r = regex_.GetRange(rangeIndex);
711
- if (codepoint >= (r.start & ~RegexType::kRangeNegationFlag) && codepoint <= r.end)
712
- return yes;
713
- rangeIndex = r.next;
714
- }
715
- return !yes;
716
- }
717
-
718
- const RegexType& regex_;
719
- Allocator* allocator_;
720
- Allocator* ownAllocator_;
721
- Stack<Allocator> state0_;
722
- Stack<Allocator> state1_;
723
- uint32_t* stateSet_;
724
- };
725
-
726
- typedef GenericRegex<UTF8<> > Regex; //!< GenericRegex over the UTF8<> encoding with the default allocator (CrtAllocator)
727
- typedef GenericRegexSearch<Regex> RegexSearch; //!< Search/match engine for Regex with the default allocator (CrtAllocator)
728
-
729
- } // namespace internal
730
- CEREAL_RAPIDJSON_NAMESPACE_END
731
-
732
- #ifdef __GNUC__
733
- CEREAL_RAPIDJSON_DIAG_POP
734
- #endif
735
-
736
- #if defined(__clang__) || defined(_MSC_VER)
737
- CEREAL_RAPIDJSON_DIAG_POP
738
- #endif
739
-
740
- #endif // CEREAL_RAPIDJSON_INTERNAL_REGEX_H_