@grain/stdlib 0.4.0 → 0.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +65 -0
- package/LICENSE +21 -0
- package/README.md +34 -0
- package/array.gr +136 -44
- package/array.md +97 -21
- package/buffer.gr +495 -424
- package/buffer.md +850 -0
- package/bytes.gr +512 -407
- package/bytes.md +621 -0
- package/char.gr +11 -3
- package/hash.gr +26 -3
- package/hash.md +44 -0
- package/list.gr +54 -0
- package/number.gr +24 -6
- package/number.md +49 -17
- package/option.gr +244 -37
- package/option.md +579 -0
- package/package.json +33 -29
- package/queue.gr +98 -29
- package/queue.md +191 -0
- package/range.md +1 -1
- package/regex.gr +3055 -0
- package/regex.md +449 -0
- package/result.gr +216 -70
- package/result.md +446 -0
- package/runtime/gc.gr +2 -2
- package/runtime/string.gr +56 -24
- package/runtime/stringUtils.gr +172 -0
- package/runtime/unsafe/conv.gr +43 -0
- package/set.gr +172 -5
- package/set.md +502 -0
- package/stack.md +143 -0
- package/string.gr +444 -230
- package/string.md +815 -0
- package/sys/file.gr +3 -2
- package/sys/file.md +2 -2
package/regex.gr
ADDED
|
@@ -0,0 +1,3055 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @module Regex: Regular Expressions.
|
|
3
|
+
* @example import Regex from "regex"
|
|
4
|
+
*
|
|
5
|
+
* @since 0.4.3
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
/*
|
|
9
|
+
This library provides support for regular expressions in Grain.
|
|
10
|
+
Its parser and analyzer are largely ported from Racket (https://racket-lang.org/),
|
|
11
|
+
which is licensed under Apache 2.0. Racket's regular expression
|
|
12
|
+
engine is itself inspired by the Spencer engine, as found in Tcl.
|
|
13
|
+
*/
|
|
14
|
+
import Array from "array"
|
|
15
|
+
import Char from "char"
|
|
16
|
+
import List from "list"
|
|
17
|
+
import Map from "map"
|
|
18
|
+
import Option from "option"
|
|
19
|
+
import Result from "result"
|
|
20
|
+
import String from "string"
|
|
21
|
+
import Float32 from "float32"
|
|
22
|
+
import { min, max } from "number"
|
|
23
|
+
|
|
24
|
+
/*
|
|
25
|
+
|
|
26
|
+
===============================
|
|
27
|
+
REGEX PARSER CONFIG DEFINITIONS
|
|
28
|
+
===============================
|
|
29
|
+
|
|
30
|
+
*/
|
|
31
|
+
|
|
32
|
+
/*
|
|
33
|
+
We use boxes in these records in order to share
|
|
34
|
+
references across multiple objects.
|
|
35
|
+
For example, when a user types `(?i:...)`, we
|
|
36
|
+
want to create a new configuration which is
|
|
37
|
+
case-insensitive while still having the same group
|
|
38
|
+
number and reference counter.
|
|
39
|
+
*/
|
|
40
|
+
|
|
41
|
+
// Parser settings carried alongside the input buffer. The two boxed fields
// are shared by reference between a config and the configs derived from it,
// so group numbering and reference tracking stay global to the whole pattern.
record RegExParserConfig {
  // true when Perl-based regexp syntax is in use
  isPerlRegExp: Bool,
  // true when matching is case-sensitive
  caseSensitive: Bool,
  // true when multi-line mode is on
  multiline: Bool,
  // running total of groups in the regular expression (shared box)
  groupNumber: Box<Number>,
  // set once the regular expression contains any references (shared box)
  references: Box<Bool>,
}
|
|
53
|
+
|
|
54
|
+
// Builds the default configuration: Perl syntax, case-sensitive, single-line,
// with a fresh group counter (0) and a fresh "has references" flag (false).
let makeRegExParserConfig = () => {
  let groupCounter = box(0)
  let hasReferences = box(false)
  {
    isPerlRegExp: true,
    caseSensitive: true,
    multiline: false,
    groupNumber: groupCounter,
    references: hasReferences,
  }
}
|
|
63
|
+
|
|
64
|
+
// Returns a copy of `config` with `caseSensitive` replaced. The boxes are
// passed through as-is, so the copy shares its counters with `config`.
let configWithCaseSensitive = (config: RegExParserConfig, caseSensitive: Bool) => {
  let sharedGroupNumber = config.groupNumber
  let sharedReferences = config.references
  {
    isPerlRegExp: config.isPerlRegExp,
    caseSensitive: caseSensitive,
    multiline: config.multiline,
    groupNumber: sharedGroupNumber,
    references: sharedReferences,
  }
}
|
|
73
|
+
|
|
74
|
+
// Returns a copy of `config` with `multiline` replaced. The boxes are
// passed through as-is, so the copy shares its counters with `config`.
let configWithMultiLine = (config: RegExParserConfig, multiline: Bool) => {
  let sharedGroupNumber = config.groupNumber
  let sharedReferences = config.references
  {
    isPerlRegExp: config.isPerlRegExp,
    caseSensitive: config.caseSensitive,
    multiline: multiline,
    groupNumber: sharedGroupNumber,
    references: sharedReferences,
  }
}
|
|
83
|
+
|
|
84
|
+
// Reads the current value of the shared group counter.
let configGroupNumber = (config: RegExParserConfig) => {
  let counter = config.groupNumber
  unbox(counter)
}
|
|
85
|
+
|
|
86
|
+
// Bumps the shared group counter by one (in place) and hands back the same
// config so calls can be chained.
let configIncGroupNumber = (config: RegExParserConfig) => {
  let counter = config.groupNumber
  counter := unbox(counter) + 1
  config
}
|
|
90
|
+
|
|
91
|
+
// Parser input state: the pattern text, the same text as an array of
// characters, a mutable read position into that array, and the active config.
record RegExBuf {
  // the pattern as given
  input: String,
  // the pattern split into characters; `cursor` indexes into this array
  inputExploded: Array<Char>,
  // index of the next unread character (boxed so it can be advanced in place)
  cursor: Box<Number>,
  // parser settings in effect for this buffer
  config: RegExParserConfig,
}
|
|
97
|
+
|
|
98
|
+
// Creates a buffer over `s` with the cursor at 0 and a default config.
let makeRegExBuf = (s) => {
  let chars = String.explode(s)
  let startCursor = box(0)
  let defaultConfig = makeRegExParserConfig()
  {input: s, inputExploded: chars, cursor: startCursor, config: defaultConfig}
}
|
|
101
|
+
|
|
102
|
+
// Returns a buffer that reads the same input through the same cursor box as
// `buf` (advancing one advances the other) but carries `config` instead.
let withConfig = (buf: RegExBuf, config: RegExParserConfig) => {
  let sharedCursor = buf.cursor
  {
    input: buf.input,
    inputExploded: buf.inputExploded,
    cursor: sharedCursor,
    config: config,
  }
}
|
|
105
|
+
|
|
106
|
+
// Parsing internals for recursive descent
|
|
107
|
+
|
|
108
|
+
// Formats a parse error message. The reported position is the buffer's
// current cursor offset by `posShift`.
let parseErr = (buf: RegExBuf, msg: String, posShift) => {
  let position = unbox(buf.cursor) + posShift
  let prefix = "Invalid Regular Expression: " ++ msg
  prefix ++ " (position " ++ toString(position) ++ ")"
}
|
|
111
|
+
|
|
112
|
+
// Consumes and returns the character under the cursor, or an `Err` carrying
// an "end of buffer reached" message when the input is exhausted.
let next = (buf: RegExBuf) => {
  let pos = unbox(buf.cursor)
  if (pos < Array.length(buf.inputExploded)) {
    let ch = buf.inputExploded[pos]
    buf.cursor := pos + 1
    Ok(ch)
  } else {
    Err(parseErr(buf, "end of buffer reached", 0))
  }
}
|
|
122
|
+
|
|
123
|
+
// Returns the character under the cursor without consuming it, or an `Err`
// carrying an "end of buffer reached" message when the input is exhausted.
let peek = (buf: RegExBuf) => {
  let pos = unbox(buf.cursor)
  if (pos < Array.length(buf.inputExploded)) {
    Ok(buf.inputExploded[pos])
  } else {
    Err(parseErr(buf, "end of buffer reached", 0))
  }
}
|
|
131
|
+
|
|
132
|
+
// Returns the character `n` positions past the cursor without consuming
// anything, or an `Err` when that position lies beyond the end of the input.
let peekN = (buf: RegExBuf, n) => {
  let target = unbox(buf.cursor) + n
  if (target < Array.length(buf.inputExploded)) {
    Ok(buf.inputExploded[target])
  } else {
    Err(parseErr(buf, "end of buffer reached", 0))
  }
}
|
|
140
|
+
|
|
141
|
+
// Consumes the character under the cursor if it equals `char`.
// Returns `Ok(char)` and advances the cursor on a match. Returns an `Err`
// (cursor untouched) when the input is exhausted or a different character is
// found.
let eat = (buf: RegExBuf, char: Char) => {
  let cursor = unbox(buf.cursor)
  if (cursor >= Array.length(buf.inputExploded)) {
    Err(parseErr(buf, "end of buffer reached", 0))
  } else {
    let found = buf.inputExploded[cursor]
    if (found == char) {
      buf.cursor := cursor + 1
      Ok(found)
    } else {
      // Both characters are wrapped in single quotes; the closing quote after
      // the expected character was previously missing.
      Err(parseErr(buf, "Expected character '" ++ Char.toString(char) ++ "', but found character '" ++ Char.toString(found) ++ "'", 0))
    }
  }
}
|
|
155
|
+
|
|
156
|
+
/**
 * Reports whether the given regex buffer still has unread characters.
 * @param buf: The buffer to check
 * @returns `true` if at least one character remains after the cursor, `false` if the buffer is exhausted.
 */
let more = (buf: RegExBuf) => {
  let remaining = Array.length(buf.inputExploded) - unbox(buf.cursor)
  remaining > 0
}
|
|
164
|
+
|
|
165
|
+
// Reports whether the position `n` characters past the cursor is still
// inside the input.
let moreN = (buf: RegExBuf, n) => {
  let target = unbox(buf.cursor) + n
  Array.length(buf.inputExploded) > target
}
|
|
168
|
+
|
|
169
|
+
// END Parsing internals for recursive descent
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
/*
|
|
173
|
+
|
|
174
|
+
=================================
|
|
175
|
+
REGEX RANGE DEFINITIONS AND UTILS
|
|
176
|
+
=================================
|
|
177
|
+
|
|
178
|
+
Based on https://github.com/racket/racket/blob/0a9c70e95a69743dd5d219a395e995be4a4bfd41/racket/src/regexp/common/range.rkt
|
|
179
|
+
|
|
180
|
+
*/
|
|
181
|
+
|
|
182
|
+
// [TODO] alias type RERange as List<(Number, Number)>
|
|
183
|
+
|
|
184
|
+
// Computes the complement of `rng` within `0..limitC` (both inclusive).
// `rng` is a list of inclusive `(start, end)` pairs; the walk below assumes
// the pairs are sorted and non-overlapping.
let rangeInvert = (rng, limitC) => {
  // `start` is the lowest value not yet known to be covered by `rng`
  let rec help = (rng, start) => {
    match (rng) {
      [] when start > limitC => [],
      [] => [(start, limitC)],
      // The subrange begins exactly at `start`, so there is no gap in front
      // of it. Without this case a degenerate pair (start, start - 1) would
      // be emitted, e.g. (0, -1) when inverting a range that begins at 0.
      [(subrangeStart, subrangeEnd), ...tl] when subrangeStart == start => help(tl, subrangeEnd + 1),
      [(subrangeStart, subrangeEnd), ...tl] => [(start, subrangeStart - 1), ...help(tl, subrangeEnd + 1)],
    }
  }
  help(rng, 0)
}
|
|
194
|
+
|
|
195
|
+
// True when `v` falls inside any inclusive `(start, end)` pair of `rng`.
let rec rangeContains = (rng, v) => {
  match (rng) {
    [] => false,
    [(lo, hi), ...rest] => {
      if (lo <= v && v <= hi) {
        true
      } else {
        rangeContains(rest, v)
      }
    },
  }
}
|
|
202
|
+
|
|
203
|
+
// Adds the single value `v` to `rng`; returns `rng` itself when `v` is
// already covered.
let rec rangeAdd = (rng, v) => {
  if (rangeContains(rng, v)) {
    rng
  } else {
    rangeUnion(rng, [(v, v)])
  }
},

// Merges two range lists, coalescing pairs that overlap or are directly
// adjacent (end + 1 == next start).
rangeUnion = (rng1, rng2) => {
  match ((rng1, rng2)) {
    ([], _) => rng2,
    (_, []) => rng1,
    ([(start1, end1), ...rest1], [(start2, end2), ...rest2]) => {
      if (start1 > start2) {
        // keep the list with the smaller leading start on the left
        rangeUnion(rng2, rng1)
      } else if (end1 + 1 < start2) {
        // the first pair ends before the second begins, with a gap: emit it
        [(start1, end1), ...rangeUnion(rest1, rng2)]
      } else if (end1 <= end2) {
        // the pairs touch or overlap and the second reaches further: fuse them
        rangeUnion([(start1, end2), ...rest2], rest1)
      } else {
        // the second pair is swallowed entirely by the first
        rangeUnion(rng1, rest2)
      }
    },
  }
}
|
|
228
|
+
|
|
229
|
+
// Adds the inclusive span `fromC..toC` to `rng`.
let rangeAddSpan = (rng, fromC, toC) => {
  let span = [(fromC, toC)]
  rangeUnion(rng, span)
}
|
|
232
|
+
|
|
233
|
+
// Returns `Some(v)` when `rng` consists of exactly one pair covering the
// single value `v`, and `None` otherwise.
let rangeSingleton = (rng) => {
  match (rng) {
    [(lo, hi)] => {
      if (lo == hi) {
        Some(lo)
      } else {
        None
      }
    },
    _ => None,
  }
}
|
|
239
|
+
|
|
240
|
+
// True when the whole span `lo..hi` lies inside a single pair of `rng`.
// Pairs that end before `lo` are skipped; the first remaining pair decides.
let rec rangeIncludes = (rng, lo, hi) => {
  match (rng) {
    [] => false,
    [(first, last), ...rest] => {
      if (lo > last) {
        rangeIncludes(rest, lo, hi)
      } else {
        first <= lo && hi <= last
      }
    },
  }
}
|
|
247
|
+
|
|
248
|
+
// True when every pair of `rng` lies inside `lo..hi` (inclusive). An empty
// range is trivially within any bounds.
let rec rangeWithin = (rng, lo, hi) => {
  match (rng) {
    [] => true,
    [(first, last), ...rest] => {
      if (first < lo || last > hi) {
        false
      } else {
        rangeWithin(rest, lo, hi)
      }
    },
  }
}
|
|
256
|
+
|
|
257
|
+
// True when the span `lo..hi` (inclusive, lo <= hi) shares at least one value
// with some pair of `rng`. `rng` is assumed sorted by start.
//
// The previous condition required both `lo` and `hi` to fall inside the same
// pair, which is containment rather than overlap; partial overlaps such as
// 15..30 against (10, 20) were reported as `false`.
let rec rangeOverlaps = (rng, lo, hi) => {
  match (rng) {
    [] => false,
    // this pair ends before the span starts; look at later pairs
    [(_, c2), ...tl] when lo > c2 => rangeOverlaps(tl, lo, hi),
    // here lo <= c2, so the two intervals intersect exactly when hi >= c1
    [(c1, _), ..._] => hi >= c1,
  }
}
|
|
264
|
+
|
|
265
|
+
// Adds the optional code `c` to `rng`, honoring `config.caseSensitive`.
// `None` leaves `rng` unchanged. Case-insensitive configs currently always
// produce an `Err`, because the case-folding helpers are not available yet.
let rangeAddCaseAware = (rng, c, config) => {
  match (c) {
    None => Ok(rng),
    Some(code) when config.caseSensitive => Ok(rangeAdd(rng, code)),
    Some(_) => {
      // Needs Char.upcase and friends (once it's added, change return type from Result<RERange> to RERange) [see #661]:
      /*
      let rng = rangeAdd(rng, c)
      let rng = rangeAdd(rng, Char.code(Char.upcase(Char.fromCode(c))))
      let rng = rangeAdd(rng, Char.code(Char.foldcase(Char.fromCode(c))))
      let rng = rangeAdd(rng, Char.code(Char.downcase(Char.fromCode(c))))
      Ok(rng)
      */
      Err("NYI: Case-insensitive matching is not supported until grain-lang/grain#661 is resolved.")
    },
  }
}
|
|
285
|
+
|
|
286
|
+
// Adds the inclusive span `fromC..toC` to `rng`, honoring
// `config.caseSensitive`. The case-insensitive path adds the codes one at a
// time through `rangeAddCaseAware` and stops at the first error.
let rangeAddSpanCaseAware = (rng, fromC, toC, config) => {
  if (config.caseSensitive) {
    Ok(rangeAddSpan(rng, fromC, toC))
  } else {
    let rec addFrom = (acc, code) => {
      if (code > toC) {
        Ok(acc)
      } else {
        match (rangeAddCaseAware(acc, Some(code), config)) {
          Ok(extended) => addFrom(extended, code + 1),
          Err(e) => Err(e),
        }
      }
    }
    addFrom(rng, fromC)
  }
}
|
|
300
|
+
|
|
301
|
+
/*
|
|
302
|
+
|
|
303
|
+
=====================
|
|
304
|
+
REGEX AST DEFINITIONS
|
|
305
|
+
=====================
|
|
306
|
+
|
|
307
|
+
*/
|
|
308
|
+
|
|
309
|
+
// Repetition quantifier kinds.
// NOTE(review): presumably these correspond to `*`, `+` and `?`; confirm
// against the parser code that constructs them.
enum RepeatQuantifier {
  ZeroOrMore,
  OnceOrMore,
  ZeroOrOne,
}
|
|
314
|
+
|
|
315
|
+
// Mode switches that a group can carry: case sensitivity on/off and
// multi-line mode off/on.
enum GroupModeFlag {
  GMFCaseSensitive,
  GMFCaseInsensitive,
  GMFNotMulti,
  GMFMulti,
}
|
|
321
|
+
|
|
322
|
+
// Look-around test kinds: match / no match, each either at the current
// position or against the preceding text.
enum LookMode {
  LMMatches,
  LMDoesntMatch,
  LMMatchesPreceding,
  LMDoesntMatchPreceding,
}
|
|
328
|
+
|
|
329
|
+
// Parsing modes for PCEs.
// NOTE(review): the exact meaning of each mode is defined by the PCE parser,
// which is not visible here.
enum PCEMode {
  PCEOnce,
  PCELongest,
  PCEShortest,
}
|
|
334
|
+
|
|
335
|
+
// Character categories usable in `REUnicodeCategories`.
// NOTE(review): the names appear to mirror the Unicode general categories
// (letters, numbers, punctuation, marks, symbols, separators, other).
enum UnicodeCategory {
  // letters
  LetterLowercase,
  LetterUppercase,
  LetterTitlecase,
  LetterModifier,
  LetterOther,
  // numbers
  NumberDecimalDigit,
  NumberLetter,
  NumberOther,
  // punctuation
  PunctuationOpen,
  PunctuationClose,
  PunctuationInitialQuote,
  PunctuationFinalQuote,
  PunctuationConnector,
  PunctuationDash,
  PunctuationOther,
  // marks
  MarkNonSpacing,
  MarkSpacingCombining,
  MarkEnclosing,
  // symbols
  SymbolCurrency,
  SymbolModifier,
  SymbolMath,
  SymbolOther,
  // separators
  SeparatorLine,
  SeparatorParagraph,
  SeparatorSpace,
  // other
  OtherControl,
  OtherFormat,
  OtherSurrogate,
  OtherNotAssigned,
  OtherPrivateUse
}
|
|
367
|
+
|
|
368
|
+
// AST produced by the regex parser.
enum ParsedRegularExpression {
  RENever,
  REEmpty,
  REAny,
  REStart,
  REEnd,
  RELineStart,
  RELineEnd,
  REWordBoundary,
  RENotWordBoundary,
  RELiteral(Char),
  // runs of literals are flattened into a single string
  RELiteralString(String),
  REAlts(ParsedRegularExpression, ParsedRegularExpression),
  // (sequence elements, needs-backtrack)
  RESequence(List<ParsedRegularExpression>, Bool),
  // (regex, group ID)
  REGroup(ParsedRegularExpression, Number),
  // (regex, min, max [None = infinity], true = non-greedy)
  RERepeat(ParsedRegularExpression, Number, Option<Number>, Bool),
  // (regex, true = non-greedy)
  REMaybe(ParsedRegularExpression, Bool),
  // (test, if-true, if-false, n-start, num-n, needs-backtrack)
  REConditional(ParsedRegularExpression, ParsedRegularExpression, Option<ParsedRegularExpression>, Number, Number, Bool),
  // (regex, is-match, n-start, num-n)
  RELookahead(ParsedRegularExpression, Bool, Number, Number),
  // (regex, is-match, lb-min, lb-max, n-start, num-n); the lb-* boxes are patched in later
  RELookbehind(ParsedRegularExpression, Bool, Box<Number>, Box<Number>, Number, Number),
  // (regex, n-start, num-n, needs-backtrack)
  RECut(ParsedRegularExpression, Number, Number, Bool),
  // (n, case-sensitive)
  REReference(Number, Bool),
  RERange(List<(Number, Number)>),
  // (category list, true = match / false = does-not-match)
  REUnicodeCategories(List<UnicodeCategory>, Bool)
}
|
|
393
|
+
|
|
394
|
+
// Tells whether `rx` is flagged as needing backtracking. Some node kinds
// always are; sequences, conditionals and cuts carry the answer in their
// trailing Bool field; every other node kind does not.
let needsBacktrack = (rx: ParsedRegularExpression) => {
  match (rx) {
    // nodes that store a precomputed flag
    RESequence(_, flag) => flag,
    REConditional(_, _, _, _, _, flag) => flag,
    RECut(_, _, _, flag) => flag,
    // nodes that always need backtracking
    REAlts(_, _) => true,
    REGroup(_, _) => true,
    RERepeat(_, _, _, _) => true,
    REMaybe(_, _) => true,
    REUnicodeCategories(_, _) => true,
    // everything else
    _ => false,
  }
}
|
|
407
|
+
|
|
408
|
+
// Wraps `rng` in the simplest equivalent node: a literal for a one-value
// range, `REAny` when a single pair spans `0..limitC`, otherwise `RERange`.
let makeRERange = (rng, limitC) => {
  match (rangeSingleton(rng)) {
    Some(code) => RELiteral(Char.fromCode(code)),
    None => {
      if (rangeIncludes(rng, 0, limitC)) {
        REAny
      } else {
        RERange(rng)
      }
    },
  }
}
|
|
415
|
+
|
|
416
|
+
// Accumulation mode used while merging adjacent literals. Only character
// mode exists at present.
enum MergeMode {
  MMChar,
}
|
|
419
|
+
|
|
420
|
+
// Normalizes a list of sequence elements: nested sequences are spliced in,
// `REEmpty` and empty literal strings are dropped, and each run of adjacent
// literals / literal strings is fused into one `RELiteralString`.
let mergeAdjacent = (lst) => {
  // Decides whether the pending literals must be emitted now: at the end of
  // the input, or when a run is in progress (mode is Some) and the next
  // element is not a literal.
  let readyForAccum = (l, mode) => {
    match ((l, mode)) {
      ([], _) => true,
      (_, None) => false,
      ([RELiteral(_), ..._], Some(MMChar)) => false,
      ([RELiteralString(_), ..._], Some(MMChar)) => false,
      (_, Some(MMChar)) => true,
    }
  }
  // `pending` holds the pieces of the current literal run, newest first.
  let rec loop = (mode, pending, rest) => {
    match (rest) {
      // flatten nested sequences
      [RESequence(inner, _), ...tl] => loop(mode, pending, List.append(inner, tl)),
      // drop empty elements
      [REEmpty, ...tl] => loop(mode, pending, tl),
      [RELiteralString(""), ...tl] => loop(mode, pending, tl),
      // [TODO] Clean up with or-patterns (grain-lang/grain#696)
      _ when readyForAccum(rest, mode) => {
        match (pending) {
          [] => [],
          [only] => [RELiteralString(only), ...loop(None, [], rest)],
          _ => {
            let merged = match (mode) {
              // MMByte would go here, if supported
              Some(MMChar) => List.join("", List.reverse(pending)),
              None => fail "internal error (mergeAdjacent)",
            }
            [RELiteralString(merged), ...loop(None, [], rest)]
          },
        }
      },
      [] => fail "impossible (mergeAdjacent)", // avoid warning (can delete once TODO is resolved)
      // a literal either extends the current run or starts a new one
      [RELiteralString(x), ...tl] => {
        match (mode) {
          Some(_) => loop(mode, [x, ...pending], tl),
          None => loop(Some(MMChar), [x], tl),
        }
      },
      [RELiteral(c), ...tl] => {
        match (mode) {
          Some(_) => loop(mode, [Char.toString(c), ...pending], tl),
          None => loop(Some(MMChar), [Char.toString(c)], tl),
        }
      },
      // any other element passes through unchanged
      [hd, ...tl] => [hd, ...loop(None, [], tl)],
    }
  }
  loop(None, [], lst)
}
|
|
471
|
+
|
|
472
|
+
// Builds the node for a sequence of elements: `REEmpty` for none, the
// element itself for one, otherwise the merged list wrapped in `RESequence`
// (or the lone survivor when merging collapses the list to one element).
let makeRESequence = (lst) => {
  match (lst) {
    [] => REEmpty,
    [only] => only,
    _ => {
      let merged = mergeAdjacent(lst)
      match (merged) {
        [only] => only,
        _ => RESequence(merged, List.some(needsBacktrack, merged)),
      }
    },
  }
}
|
|
484
|
+
|
|
485
|
+
// Builds the node for `rx1 | rx2`. `RENever` on either side yields the other
// side; two alternatives that are ranges or single literals are folded into
// one range node; anything else becomes `REAlts`.
let makeREAlts = (rx1, rx2, limitC) => {
  let fromRange = (rng) => makeRERange(rng, limitC)
  match ((rx1, rx2)) {
    (RENever, _) => rx2,
    (_, RENever) => rx1,
    (RERange(left), RERange(right)) => fromRange(rangeUnion(left, right)),
    (RERange(left), RELiteral(ch)) => fromRange(rangeAdd(left, Char.code(ch))),
    (RELiteral(ch), RERange(right)) => fromRange(rangeAdd(right, Char.code(ch))),
    (RELiteral(ch1), RELiteral(ch2)) => {
      let first = rangeAdd([], Char.code(ch1))
      fromRange(rangeAdd(first, Char.code(ch2)))
    },
    _ => REAlts(rx1, rx2),
  }
}
|
|
496
|
+
|
|
497
|
+
// Wraps `rx` in an `RECut`, recording whether `rx` needs backtracking.
let makeRECut = (rx, nStart, numN) => {
  let backtracks = needsBacktrack(rx)
  RECut(rx, nStart, numN, backtracks)
}
|
|
500
|
+
|
|
501
|
+
// Builds an `REConditional`. The node is flagged as needing backtracking
// when the if-true branch, or the optional if-false branch, needs it.
let makeREConditional = (tst, pces1, pces2, nStart, numN) => {
  let backtracks = if (needsBacktrack(pces1)) {
    true
  } else {
    match (pces2) {
      Some(elseBranch) => needsBacktrack(elseBranch),
      None => false,
    }
  }
  REConditional(tst, pces1, pces2, nStart, numN, backtracks)
}
|
|
508
|
+
|
|
509
|
+
/*
|
|
510
|
+
|
|
511
|
+
=========================
|
|
512
|
+
REGEX PARSING DEFINITIONS
|
|
513
|
+
=========================
|
|
514
|
+
|
|
515
|
+
*/
|
|
516
|
+
|
|
517
|
+
// Range parsing ("[a-z]")
|
|
518
|
+
|
|
519
|
+
// Upper bound (inclusive) used when inverting character ranges: the largest
// Unicode code point.
// [TODO] (#769) When byte-based regexes are supported, we'll need another limit of 255 for those.
let rangeLimit = 0x10FFFF
|
|
521
|
+
|
|
522
|
+
// These are snake-cased to avoid confusion with their capitalized counterparts
|
|
523
|
+
|
|
524
|
+
// Range for `\d`: the ASCII digits '0' through '9'.
let range_d = () => {
  let zero = Char.code('0')
  let nine = Char.code('9')
  rangeAddSpan([], zero, nine)
}
|
|
527
|
+
|
|
528
|
+
// Range for `\w`: ASCII digits, lowercase and uppercase letters, and '_'.
let range_w = () => {
  let digits = range_d()
  let withLower = rangeAddSpan(digits, Char.code('a'), Char.code('z'))
  let withUpper = rangeAddSpan(withLower, Char.code('A'), Char.code('Z'))
  rangeAdd(withUpper, Char.code('_'))
}
|
|
531
|
+
|
|
532
|
+
// Range for `\s`: space, then tab (9), newline (10), page (12), return (13).
let range_s = () => {
  let whitespaceCodes = [Char.code(' '), 9, 10, 12, 13]
  // fold left so the codes are added in the order listed
  List.reduce(rangeAdd, [], whitespaceCodes)
}
|
|
536
|
+
|
|
537
|
+
// Parses the inside of a bracket expression, handling an optional leading
// `^`: when present, the parsed range is inverted against `rangeLimit`.
let rec parseRangeNot = (buf: RegExBuf) => {
  if (more(buf)) {
    match (peek(buf)) {
      Err(e) => Err(e),
      Ok('^') => {
        ignore(eat(buf, '^'))
        Result.map((rng) => rangeInvert(rng, rangeLimit), parseRange(buf))
      },
      Ok(_) => parseRange(buf),
    }
  } else {
    Err(parseErr(buf, "Missing closing `]`", 0))
  }
},
|
|
554
|
+
|
|
555
|
+
// Parses a bracket expression body. A `]` or `-` in first position is taken
// literally: it is consumed, the remainder is parsed, and the character is
// then added to the resulting range.
parseRange = (buf: RegExBuf) => {
  let restWithLeading = (leading) => {
    ignore(eat(buf, leading))
    match (parseRangeRest(buf, [], None, None)) {
      Err(e) => Err(e),
      Ok(rng) => Ok(rangeAdd(rng, Char.code(leading))),
    }
  }
  if (more(buf)) {
    match (peek(buf)) {
      Err(e) => Err(e),
      Ok(']') => restWithLeading(']'),
      Ok('-') => restWithLeading('-'),
      Ok(_) => parseRangeRest(buf, [], None, None),
    }
  } else {
    Err(parseErr(buf, "Missing closing `]`", 0))
  }
},
|
|
579
|
+
|
|
580
|
+
// Parses the letter of a class escape (`d`, `D`, `w`, `W`, `s`, `S`) and
// returns its range; the uppercase forms are the inverted ranges. On an
// unknown letter nothing is consumed and an `Err` is returned.
parseClass = (buf: RegExBuf) => {
  // consume the class letter and return the given range
  let accept = (letter, rng) => {
    ignore(eat(buf, letter))
    Ok(rng)
  }
  let inverted = (rng) => rangeInvert(rng, rangeLimit)
  if (more(buf)) {
    match (peek(buf)) {
      Err(e) => Err(e),
      Ok('d') => accept('d', range_d()),
      Ok('D') => accept('D', inverted(range_d())),
      Ok('w') => accept('w', range_w()),
      Ok('W') => accept('W', inverted(range_w())),
      Ok('s') => accept('s', range_s()),
      Ok('S') => accept('S', inverted(range_s())),
      Ok(c) => Err("unknown class: " ++ toString(c)),
    }
  } else {
    Err("no chars") // caught in handler (we use a Result to cleanly mesh with the Result type below)
  }
},
|
|
614
|
+
|
|
615
|
+
// Parses a POSIX character class of the form `:name:]` (the opening `[` has
// already been consumed by the caller) and returns the range for `name`.
// Returns an `Err` when the syntax is malformed or the name is unknown or
// unsupported.
parsePosixCharClass = (buf: RegExBuf) => {
  if (!more(buf)) {
    Err(parseErr(buf, "Missing POSIX character class after `[`", 0))
  } else {
    match (peek(buf)) {
      Err(e) => Err(e),
      Ok(':') => {
        ignore(eat(buf, ':'))
        // Collects lowercase ASCII letters (newest first in `acc`) until the
        // closing `:]`, then returns them joined into the class name.
        let rec loop = (acc) => {
          match (peek(buf)) {
            Err(e) => Err(e),
            Ok(':') => {
              ignore(eat(buf, ':'))
              match (eat(buf, ']')) {
                Err(_) => Err(parseErr(buf, "Missing closing `]`", 0)),
                Ok(_) => Ok(List.join("", List.reverse(acc))),
              }
            },
            Ok(c) when (Char.code('a') <= Char.code(c) && Char.code(c) <= Char.code('z')) => {
              ignore(eat(buf, c))
              loop([Char.toString(c), ...acc])
            },
            Ok(_) => Err(parseErr(buf, "Invalid character in POSIX character class", 0)),
          }
        }
        match (loop([])) {
          Err(e) => Err(e),
          Ok(s) => {
            match (s) {
              "alpha" => Ok(rangeAddSpan(rangeAddSpan([], Char.code('a'), Char.code('z')), Char.code('A'), Char.code('Z'))),
              "upper" => Ok(rangeAddSpan([], Char.code('A'), Char.code('Z'))),
              "lower" => Ok(rangeAddSpan([], Char.code('a'), Char.code('z'))),
              "digit" => Ok(rangeAddSpan([], Char.code('0'), Char.code('9'))),
              "xdigit" => Ok(rangeAddSpan(rangeAddSpan(rangeAddSpan([], Char.code('0'), Char.code('9')), Char.code('a'), Char.code('f')), Char.code('A'), Char.code('F'))),
              "alnum" => Ok(rangeAddSpan(rangeAddSpan(rangeAddSpan([], Char.code('0'), Char.code('9')), Char.code('a'), Char.code('z')), Char.code('A'), Char.code('Z'))),
              // [:word:] is letters, digits and '_', i.e. the same set as `\w`.
              // It was previously built from a-f / A-F only, which looks
              // copied from the xdigit entry.
              "word" => Ok(range_w()),
              "blank" => Ok(rangeAdd(rangeAdd([], 0x20), 0x9)), // space and tab
              "space" => Ok(range_s()),
              "graph" => Err(parseErr(buf, "the [:graph:] character class is not currently supported. For more information, see https://github.com/grain-lang/grain/issues/661", 0)),
              "print" => Err(parseErr(buf, "the [:print:] character class is not currently supported. For more information, see https://github.com/grain-lang/grain/issues/661", 0)),
              "cntrl" => Ok(rangeAddSpan([], 0, 31)),
              "ascii" => Ok(rangeAddSpan([], 0, 127)),
              _ => Err(parseErr(buf, "Invalid POSIX character class: " ++ s, 0)),
            }
          },
        }
      },
      Ok(c) => Err(parseErr(buf, "Expected `:` after `[`. Found: `" ++ Char.toString(c) ++ "`", 0)),
    }
  }
},
|
|
666
|
+
|
|
667
|
+
// Parses the remainder of a bracket expression up to and including the
// closing `]`, accumulating into `rng`.
//   spanFrom:     the most recently read character, not yet added to `rng`
//                 because it may turn out to be the start of an `a-z` span
//   mustSpanFrom: set right after a `-` has been consumed; holds the span's
//                 start, and the next character must supply the span's end
// Returns the completed range, or an `Err` describing the syntax error.
parseRangeRest = (buf: RegExBuf, rng, spanFrom: Option<Number>, mustSpanFrom: Option<Number>) => {
  if (!more(buf)) {
    Err(parseErr(buf, "Missing closing `]`", 0))
  } else {
    match (peek(buf)) {
      Err(e) => Err(e),
      Ok(']') => {
        // end of the bracket expression: flush the pending character
        ignore(eat(buf, ']'))
        rangeAddCaseAware(rng, spanFrom, buf.config)
      },
      Ok('-') => {
        if (!moreN(buf, 1)) {
          Err(parseErr(buf, "Missing closing `]`", 1))
        } else {
          match (peekN(buf, 1)) {
            Err(e) => Err(e),
            Ok(']') => {
              // a `-` directly before `]` is a literal hyphen, unless a span
              // is still waiting for its end character
              match (mustSpanFrom) {
                Some(_) => Err(parseErr(buf, "misplaced hyphen within square brackets in pattern", 1)),
                None => {
                  ignore(eat(buf, '-'))
                  ignore(eat(buf, ']'))
                  match (rangeAddCaseAware(rng, spanFrom, buf.config)) {
                    Err(e) => Err(e),
                    Ok(rng) => Ok(rangeAdd(rng, Char.code('-'))),
                  }
                },
              }
            },
            // a span needs a start character on the left of the hyphen
            Ok(_) when Option.isNone(spanFrom) => Err(parseErr(buf, "misplaced hyphen within square brackets in pattern", 1)),
            Ok(_) => {
              ignore(eat(buf, '-'))
              parseRangeRest(buf, rng, None, spanFrom)
            },
          }
        }
      },
      Ok('\\') => {
        ignore(eat(buf, '\\'))
        if (!(buf.config.isPerlRegExp)) {
          // outside Perl syntax a backslash is an ordinary character
          parseRangeRestSpan(buf, Char.code('\\'), rng, spanFrom, mustSpanFrom)
        } else {
          if (!more(buf)) {
            Err(parseErr(buf, "escaping backslash at end pattern (within square brackets)", 0))
          } else {
            match (peek(buf)) {
              Err(e) => Err(e),
              Ok(c) when ((Char.code('a') <= Char.code(c) && Char.code(c) <= Char.code('z')) || (Char.code('A') <= Char.code(c) && Char.code(c) <= Char.code('Z'))) => {
                // alphabetic escape: must be a class such as \d or \w, and a
                // class cannot serve as the end of a span
                match (mustSpanFrom) {
                  Some(_) => Err(parseErr(buf, "misplaced hyphen within square brackets in pattern", 0)),
                  None => {
                    let curPos = unbox(buf.cursor)
                    match (parseClass(buf)) {
                      Err(_) => Err("Invalid Regular Expression: illegal alphabetic escape (position " ++ toString(curPos) ++ ")"),
                      Ok(range1) => {
                        match (rangeAddCaseAware(rng, spanFrom, buf.config)) {
                          Err(e) => Err(e),
                          // The pending character has just been flushed into
                          // `r`, so continue with no pending span start (same
                          // as the POSIX class branch below). Passing
                          // `spanFrom` on again would let `[a\d-z]` be read as
                          // the span a-z.
                          Ok(r) => parseRangeRest(buf, rangeUnion(range1, r), None, None),
                        }
                      },
                    }
                  },
                }
              },
              Ok(c) => {
                // any other escaped character stands for itself
                ignore(next(buf))
                parseRangeRestSpan(buf, Char.code(c), rng, spanFrom, mustSpanFrom)
              },
            }
          }
        }
      },
      Ok('[') => {
        ignore(eat(buf, '['))
        let curPos = unbox(buf.cursor)
        match (parsePosixCharClass(buf)) {
          // NOTE: Based on the spec, we don't propagate out
          // the errors here. Instead, we treat malformed
          // POSIX classes as being simple sequences of characters.
          Err(e) => {
            // rewind to just after the `[` and treat it as a literal
            buf.cursor := curPos
            parseRangeRestSpan(buf, Char.code('['), rng, spanFrom, mustSpanFrom)
          },
          Ok(rngNew) => {
            match (rangeAddCaseAware(rng, spanFrom, buf.config)) {
              Err(e) => Err(e),
              Ok(rng) => parseRangeRest(buf, rangeUnion(rngNew, rng), None, None),
            }
          },
        }
      },
      Ok(c) => {
        ignore(next(buf))
        parseRangeRestSpan(buf, Char.code(c), rng, spanFrom, mustSpanFrom)
      },
    }
  }
},
|
|
765
|
+
|
|
766
|
+
// Handles one just-read character code `c` inside a bracket expression.
// With a span start waiting in `mustSpanFrom`, `c` closes the span (its
// start must not exceed `c`). Otherwise the previously pending character is
// flushed into the range and `c` becomes the new pending character.
parseRangeRestSpan = (buf: RegExBuf, c, rng, spanFrom: Option<Number>, mustSpanFrom: Option<Number>) => {
  match (mustSpanFrom) {
    Some(spanStart) when spanStart > c => {
      Err(parseErr(buf, "invalid range within square brackets in pattern", 0))
    },
    Some(spanStart) => {
      match (rangeAddSpanCaseAware(rng, spanStart, c, buf.config)) {
        Ok(extended) => parseRangeRest(buf, extended, None, None),
        Err(e) => Err(e),
      }
    },
    None => {
      match (rangeAddCaseAware(rng, spanFrom, buf.config)) {
        Ok(flushed) => parseRangeRest(buf, flushed, Some(c), None),
        Err(e) => Err(e),
      }
    },
  }
}
|
|
786
|
+
|
|
787
|
+
// Main parsing
|
|
788
|
+
|
|
789
|
+
/**
 * Parses a single atom at the cursor: a parenthesized group (plain, cut,
 * conditional, mode-scoped, or lookaround), a bracketed range, `.`, `^`, `$`,
 * or, failing those, a literal.
 *
 * Returns `Ok(node)` on success, or the first parse error encountered.
 */
let rec parseAtom = (buf: RegExBuf) => {
  // Eats the `)` that closes a group. `build` only runs once the `)` has been
  // consumed, so a missing parenthesis is reported instead of being ignored.
  let closeGroup = (build) => {
    match (eat(buf, ')')) {
      Err(e) => Err(e),
      Ok(_) => Ok(build()),
    }
  }
  match (peek(buf)) {
    Err(e) => Err(e),
    Ok('(') => {
      if (!moreN(buf, 1)) {
        Err(parseErr(buf, "Parentheses not closed", 1))
      } else if (peekN(buf, 1) == Ok('?')) {
        // fancy group: the character after `(?` selects the kind of group
        if (!moreN(buf, 2)) {
          Err(parseErr(buf, "Parentheses not closed", 2))
        } else {
          match (peekN(buf, 2)) {
            Err(e) => Err(e),
            Ok('>') => {
              // cut
              ignore(eat(buf, '('))
              ignore(eat(buf, '?'))
              ignore(eat(buf, '>'))
              let preNumGroups = unbox(buf.config.groupNumber)
              match (parseRegex(buf)) {
                Err(e) => Err(e),
                Ok(rx) => {
                  // number of groups opened inside the cut
                  let postNumGroups = unbox(buf.config.groupNumber)
                  closeGroup(() => makeRECut(rx, preNumGroups, postNumGroups - preNumGroups))
                },
              }
            },
            Ok('(') => {
              // conditional
              ignore(eat(buf, '('))
              ignore(eat(buf, '?'))
              ignore(eat(buf, '('))
              let tstPreNumGroups = unbox(buf.config.groupNumber)
              match (parseTest(buf)) {
                Err(e) => Err(e),
                Ok(test) => {
                  let tstSpanNumGroups = unbox(buf.config.groupNumber) - tstPreNumGroups
                  match (parsePCEs(buf, false)) {
                    Err(e) => Err(e),
                    Ok(pces) => {
                      if (!more(buf)) {
                        Err(parseErr(buf, "Parentheses not closed", 0))
                      } else {
                        match (peek(buf)) {
                          Err(e) => Err(e),
                          Ok('|') => {
                            ignore(eat(buf, '|'))
                            match (parsePCEs(buf, false)) {
                              Err(e) => Err(e),
                              Ok(pces2) => {
                                match (peek(buf)) {
                                  Err(_) => Err(parseErr(buf, "Parentheses not closed", 0)),
                                  // The else branch must be followed directly by `)`.
                                  // A failed eat (e.g. a third `|` alternative) is now
                                  // reported rather than silently dropped.
                                  Ok(_) => closeGroup(() => makeREConditional(test, makeRESequence(pces), Some(makeRESequence(pces2)), tstPreNumGroups, tstSpanNumGroups)),
                                }
                              },
                            }
                          },
                          Ok(')') => {
                            ignore(eat(buf, ')'))
                            Ok(makeREConditional(test, makeRESequence(pces), None, tstPreNumGroups, tstSpanNumGroups))
                          },
                          Ok(_) => {
                            Err(parseErr(buf, "Failed to parse condition", 0))
                          },
                        }
                      }
                    },
                  }
                },
              }
            },
            Ok(c) when (c == 'i' || c == 's' || c == 'm' || c == '-' || c == ':') => {
              // match with mode
              ignore(eat(buf, '('))
              ignore(eat(buf, '?'))
              match (parseMode(buf)) {
                Err(e) => Err(e),
                Ok(config) => {
                  if (!more(buf)) {
                    Err(parseErr(buf, "Parentheses not closed", 0))
                  } else {
                    match (peek(buf)) {
                      Err(e) => Err(e),
                      Ok(':') => {
                        ignore(eat(buf, ':'))
                        // the body is parsed under the config returned by parseMode
                        match (parseRegex(withConfig(buf, config))) {
                          Err(e) => Err(e),
                          Ok(rx) => closeGroup(() => rx),
                        }
                      },
                      Ok(_) => {
                        Err(parseErr(buf, "expected `:` or another mode after `(?` and a mode sequence; a mode is `i`, `-i`, `m`, `-m`, `s`, or `-s`", 0))
                      },
                    }
                  }
                },
              }
            },
            Ok(_) => {
              // anything else after `(?` is handed to the lookaround parser
              ignore(eat(buf, '('))
              ignore(eat(buf, '?'))
              parseLook(buf)
            },
          }
        }
      } else {
        // simple group
        ignore(eat(buf, '('))
        let groupNum = unbox(buf.config.groupNumber)
        // Note that this inc operation is side-effecting
        match (parseRegex(withConfig(buf, configIncGroupNumber(buf.config)))) {
          Err(e) => Err(e),
          Ok(r) => closeGroup(() => REGroup(r, groupNum)),
        }
      }
    },
    Ok('[') => {
      ignore(eat(buf, '['))
      match (parseRangeNot(buf)) {
        Err(e) => Err(e),
        Ok(rng) => Ok(makeRERange(rng, rangeLimit)),
      }
    },
    Ok('.') => {
      ignore(eat(buf, '.'))
      if (buf.config.multiline) {
        // if in multiline mode, '.' matches everything but \n
        Ok(makeRERange(rangeInvert(rangeAdd([], Char.code('\n')), rangeLimit), rangeLimit))
      } else {
        Ok(REAny)
      }
    },
    Ok('^') => {
      ignore(eat(buf, '^'))
      Ok(if (buf.config.multiline) { RELineStart } else { REStart })
    },
    Ok('$') => {
      ignore(eat(buf, '$'))
      Ok(if (buf.config.multiline) { RELineEnd } else { REEnd })
    },
    Ok(_) => parseLiteral(buf),
  }
},
|
|
950
|
+
|
|
951
|
+
/**
 * Parses a lookaround with the cursor just after `(?`: `=` / `!` introduce a
 * lookahead, `<=` / `<!` a lookbehind. Consumes the body and the closing `)`.
 */
parseLook = (buf: RegExBuf) => {
  let groupsBefore = unbox(buf.config.groupNumber)
  // Turns an already-peeked character into the (isMatch, isAhead) pair and
  // consumes it: `=` means a matching look, `!` a non-matching one.
  let polarity = (peeked, isAhead) => {
    match (peeked) {
      Err(err) => Err(err),
      Ok('=') => {
        ignore(eat(buf, '='))
        Ok((true, isAhead))
      },
      Ok('!') => {
        ignore(eat(buf, '!'))
        Ok((false, isAhead))
      },
      Ok(_) => Err(parseErr(buf, "Invalid look sequence", 0)),
    }
  }
  let flags = match (peek(buf)) {
    Ok('<') => {
      // lookbehind: the polarity character follows the `<`
      ignore(eat(buf, '<'))
      if (more(buf)) {
        polarity(peek(buf), false)
      } else {
        Err(parseErr(buf, "Unterminated look sequence", 0))
      }
    },
    other => polarity(other, true),
  }
  match (flags) {
    Err(err) => Err(err),
    Ok((isMatch, isAhead)) => {
      match (parseRegex(buf)) {
        Err(err) => Err(err),
        Ok(body) => {
          match (eat(buf, ')')) {
            Err(err) => Err(err),
            Ok(_) => {
              // groups opened while parsing the look body
              let groupSpan = unbox(buf.config.groupNumber) - groupsBefore
              if (isAhead) {
                Ok(RELookahead(body, isMatch, groupsBefore, groupSpan))
              } else {
                // the two boxes start at 0; validate later stores the
                // lookbehind's min/max length in them
                Ok(RELookbehind(body, isMatch, box(0), box(0), groupsBefore, groupSpan))
              }
            },
          }
        },
      }
    },
  }
},
|
|
1009
|
+
|
|
1010
|
+
/**
 * Parses the test of a conditional, with the cursor just after `(?(`.
 * The test is either a lookaround (introduced by `?`) or a run of decimal
 * digits followed by `)`, which yields an `REReference`.
 */
parseTest = (buf: RegExBuf) => {
  let isDigit = (ch) => {
    Char.code(ch) >= Char.code('0') && Char.code(ch) <= Char.code('9')
  }
  if (more(buf)) {
    match (peek(buf)) {
      Err(err) => Err(err),
      Ok('?') => {
        ignore(eat(buf, '?'))
        parseLook(buf)
      },
      Ok(ch) when isDigit(ch) => {
        // flag the config as containing a group reference
        buf.config.references := true
        let startPos = unbox(buf.cursor)
        match (parseInteger(buf, 0)) {
          Err(err) => Err(err),
          Ok(groupNum) => {
            if (unbox(buf.cursor) != startPos) {
              match (eat(buf, ')')) {
                Err(err) => Err(err),
                Ok(_) => Ok(REReference(groupNum, false)),
              }
            } else {
              // parseInteger consumed nothing
              Err(parseErr(buf, "expected `)` after `(?(` followed by digits", 0))
            }
          },
        }
      },
      Ok(_) => Err(parseErr(buf, "expected `(?=`, `(?!`, `(?<`, or digit after `(?(`", 0)),
    }
  } else {
    Err(parseErr(buf, "Expected test", 0))
  }
},
|
|
1041
|
+
|
|
1042
|
+
/**
 * Consumes a run of decimal digits at the cursor, accumulating them onto `n`
 * (pass 0 to start a fresh number). Stops at the first non-digit or at the
 * end of input and returns `Ok` with the accumulated value; if no digit is
 * present, `n` is returned unchanged and nothing is consumed.
 */
parseInteger = (buf: RegExBuf, n) => {
  if (more(buf)) {
    match (peek(buf)) {
      Err(err) => Err(err),
      Ok(ch) => {
        let code = Char.code(ch)
        if (code >= Char.code('0') && code <= Char.code('9')) {
          ignore(next(buf))
          // shift the accumulated value one decimal place and add the digit
          parseInteger(buf, (10 * n) + (code - Char.code('0')))
        } else {
          Ok(n)
        }
      },
    }
  } else {
    Ok(n)
  }
},
|
|
1056
|
+
|
|
1057
|
+
/**
 * Parses a mode sequence (`i`, `s`, `m`, each optionally prefixed by `-`) and
 * returns `Ok` with a config derived from `buf.config` with those modes
 * applied. Parsing stops at the first character that is not part of a mode.
 */
parseMode = (buf: RegExBuf) => {
  // Applies the collected overrides; `None` leaves the setting as it is
  let processState = ((cs, ml)) => {
    let base = match (cs) {
      Some(sensitive) => configWithCaseSensitive(buf.config, sensitive),
      None => buf.config,
    }
    match (ml) {
      Some(multi) => configWithMultiLine(base, multi),
      None => base,
    }
  }
  // cs / ml are the pending case-sensitivity and multiline overrides
  let rec help = ((cs, ml)) => {
    let finish = () => Ok(processState((cs, ml)))
    // Handles one mode letter from an already-peeked result. `negated` is
    // true when the letter was preceded by `-`:
    //   i => case-insensitive, -i => case-sensitive
    //   s => multiline off,    -s => multiline on
    //   m => multiline on,     -m => multiline off
    let letter = (peeked, negated) => {
      match (peeked) {
        Err(err) => Err(err),
        Ok('i') => {
          ignore(eat(buf, 'i'))
          help((Some(negated), ml))
        },
        Ok('s') => {
          ignore(eat(buf, 's'))
          help((cs, Some(negated)))
        },
        Ok('m') => {
          ignore(eat(buf, 'm'))
          help((cs, Some(!negated)))
        },
        _ => finish(),
      }
    }
    if (more(buf)) {
      let peeked = peek(buf)
      match (peeked) {
        Ok('-') => {
          ignore(eat(buf, '-'))
          if (more(buf)) {
            letter(peek(buf), true)
          } else {
            finish()
          }
        },
        _ => letter(peeked, false),
      }
    } else {
      finish()
    }
  }
  help((None, None))
},
|
|
1117
|
+
|
|
1118
|
+
/**
 * Parses the `{...}` part of a `\p{...}` / `\P{...}` escape; `pC` is the
 * escape letter and is only used in error messages.
 *
 * Returns `Ok((categories, negated))`, where `negated` is true when the name
 * inside the braces was prefixed with `^`.
 */
parseUnicodeCategories = (buf: RegExBuf, pC: String) => {
  // Maps a property name to its general categories.
  // In case anyone is curious where these codes originate from:
  // https://www.unicode.org/reports/tr44/#General_Category_Values
  let categoriesFor = (name) => {
    match (name) {
      "Ll" => Ok([LetterLowercase]),
      "Lu" => Ok([LetterUppercase]),
      "Lt" => Ok([LetterTitlecase]),
      "Lm" => Ok([LetterModifier]),
      "L&" => Ok([LetterLowercase, LetterUppercase, LetterTitlecase, LetterModifier]),
      "Lo" => Ok([LetterOther]),
      "L" => Ok([LetterLowercase, LetterUppercase, LetterTitlecase, LetterModifier, LetterOther]),
      "Nd" => Ok([NumberDecimalDigit]),
      "Nl" => Ok([NumberLetter]),
      "No" => Ok([NumberOther]),
      "N" => Ok([NumberDecimalDigit, NumberLetter, NumberOther]),
      "Ps" => Ok([PunctuationOpen]),
      "Pe" => Ok([PunctuationClose]),
      "Pi" => Ok([PunctuationInitialQuote]),
      "Pf" => Ok([PunctuationFinalQuote]),
      "Pc" => Ok([PunctuationConnector]),
      "Pd" => Ok([PunctuationDash]),
      "Po" => Ok([PunctuationOther]),
      "P" => Ok([PunctuationOpen, PunctuationClose, PunctuationInitialQuote, PunctuationFinalQuote, PunctuationConnector, PunctuationDash, PunctuationOther]),
      "Mn" => Ok([MarkNonSpacing]),
      "Mc" => Ok([MarkSpacingCombining]),
      "Me" => Ok([MarkEnclosing]),
      "M" => Ok([MarkNonSpacing, MarkSpacingCombining, MarkEnclosing]),
      "Sc" => Ok([SymbolCurrency]),
      "Sk" => Ok([SymbolModifier]),
      "Sm" => Ok([SymbolMath]),
      "So" => Ok([SymbolOther]),
      "S" => Ok([SymbolCurrency, SymbolModifier, SymbolMath, SymbolOther]),
      "Zl" => Ok([SeparatorLine]),
      "Zp" => Ok([SeparatorParagraph]),
      "Zs" => Ok([SeparatorSpace]),
      "Z" => Ok([SeparatorLine, SeparatorParagraph, SeparatorSpace]),
      "Cc" => Ok([OtherControl]),
      "Cf" => Ok([OtherFormat]),
      "Cs" => Ok([OtherSurrogate]),
      "Cn" => Ok([OtherNotAssigned]),
      "Co" => Ok([OtherPrivateUse]),
      "C" => Ok([OtherControl, OtherFormat, OtherSurrogate, OtherNotAssigned, OtherPrivateUse]),
      "." => Ok([
        LetterLowercase, LetterUppercase, LetterTitlecase, LetterModifier, LetterOther,
        NumberDecimalDigit, NumberLetter, NumberOther,
        PunctuationOpen, PunctuationClose, PunctuationInitialQuote, PunctuationFinalQuote, PunctuationConnector, PunctuationDash, PunctuationOther,
        MarkNonSpacing, MarkSpacingCombining, MarkEnclosing,
        SymbolCurrency, SymbolModifier, SymbolMath, SymbolOther,
        SeparatorLine, SeparatorParagraph, SeparatorSpace,
        OtherControl, OtherFormat, OtherSurrogate, OtherNotAssigned, OtherPrivateUse
      ]),
      unknown => Err(parseErr(buf, "Unrecognized property name in `\\" ++ pC ++ "`: `" ++ unknown ++ "`", 0)),
    }
  }
  if (more(buf)) {
    match (peek(buf)) {
      Err(err) => Err(err),
      Ok('{') => {
        ignore(eat(buf, '{'))
        // an optional leading `^` negates the category set
        let catNegated = peek(buf) == Ok('^')
        if (catNegated) {
          ignore(eat(buf, '^'))
        }
        // Collects characters (in reverse) up to the closing `}`
        let rec readName = (revChars) => {
          match (peek(buf)) {
            Err(_) => Err(parseErr(buf, "Missing `}` to close `\\" ++ pC ++ "`", 0)),
            Ok('}') => {
              ignore(eat(buf, '}'))
              Ok(List.join("", List.reverse(revChars)))
            },
            Ok(ch) => {
              ignore(eat(buf, ch))
              readName([Char.toString(ch), ...revChars])
            },
          }
        }
        match (readName([])) {
          Err(err) => Err(err),
          Ok(name) => {
            match (categoriesFor(name)) {
              Err(err) => Err(err),
              Ok(cats) => Ok((cats, catNegated)),
            }
          },
        }
      },
      Ok(_) => Err(parseErr(buf, "Expected `{` after `\\" ++ pC ++ "`", 0)),
    }
  } else {
    Err(parseErr(buf, "Expected unicode category", 0))
  }
},
|
|
1209
|
+
|
|
1210
|
+
/**
 * Parses a literal character at the cursor. Quantifiers with nothing to
 * apply to and unmatched closers are rejected; a backslash is delegated to
 * `parseBackslashLiteral`. In case-insensitive mode the character is turned
 * into a range via `rangeAddCaseAware` instead of an `RELiteral`.
 */
parseLiteral = (buf: RegExBuf) => {
  if (more(buf)) {
    match (peek(buf)) {
      Err(err) => Err(err),
      Ok('*') => Err(parseErr(buf, "`*` follows nothing in pattern", 0)),
      Ok('+') => Err(parseErr(buf, "`+` follows nothing in pattern", 0)),
      Ok('?') => Err(parseErr(buf, "`?` follows nothing in pattern", 0)),
      Ok('{') when buf.config.isPerlRegExp => Err(parseErr(buf, "`{` follows nothing in pattern", 0)),
      Ok('\\') => {
        ignore(eat(buf, '\\'))
        parseBackslashLiteral(buf)
      },
      Ok(')') => Err(parseErr(buf, "Unmatched `)` in pattern", 0)),
      Ok(ch) when (buf.config.isPerlRegExp) && (ch == ']' || ch == '}') => Err(parseErr(buf, "unmatched `" ++ Char.toString(ch) ++ "` in pattern", 0)),
      // [TODO] case-insensitive (#691)
      Ok(ch) => {
        if (buf.config.caseSensitive) {
          ignore(next(buf))
          Ok(RELiteral(ch))
        } else {
          ignore(next(buf))
          match (rangeAddCaseAware([], Some(Char.code(ch)), buf.config)) {
            Err(err) => Err(err),
            Ok(rng) => Ok(makeRERange(rng, rangeLimit)),
          }
        }
      },
    }
  } else {
    Err(parseErr(buf, "Expected literal", 0))
  }
},
|
|
1241
|
+
|
|
1242
|
+
/**
 * Parses what follows a backslash (the backslash itself is already consumed).
 *
 * With `isPerlRegExp` set: digits form a backreference, `p` / `P` introduce
 * unicode categories, `b` / `B` are word-boundary assertions, and any other
 * ASCII letter is handed to `parseClass`. Everything else, and every
 * character when `isPerlRegExp` is off, is taken as a literal.
 */
parseBackslashLiteral = (buf: RegExBuf) => {
  let perl = buf.config.isPerlRegExp
  let between = (ch, lo, hi) => {
    Char.code(ch) >= Char.code(lo) && Char.code(ch) <= Char.code(hi)
  }
  // Shared by `\p` and `\P`; `invert` flips the negation reported by the parser
  let unicodeCategories = (letter, letterName, invert) => {
    ignore(eat(buf, letter))
    match (parseUnicodeCategories(buf, letterName)) {
      Err(err) => Err(err),
      Ok((cats, negated)) => {
        Ok(REUnicodeCategories(cats, if (invert) { !negated } else { negated }))
      },
    }
  }
  if (more(buf)) {
    match (peek(buf)) {
      Err(err) => Err(err),
      // pregexp:
      Ok(ch) when perl && between(ch, '0', '9') => {
        // flag the config as containing a group reference
        buf.config.references := true
        match (parseInteger(buf, 0)) {
          Err(err) => Err(err),
          Ok(groupNum) => Ok(REReference(groupNum, buf.config.caseSensitive)),
        }
      },
      Ok('p') when perl => unicodeCategories('p', "p", false),
      Ok('P') when perl => unicodeCategories('P', "P", true),
      Ok('b') when perl => {
        ignore(eat(buf, 'b'))
        Ok(REWordBoundary)
      },
      Ok('B') when perl => {
        ignore(eat(buf, 'B'))
        Ok(RENotWordBoundary)
      },
      Ok(ch) when perl && (between(ch, 'a', 'z') || between(ch, 'A', 'Z')) => {
        // any failure from parseClass is reported as an illegal escape
        match (parseClass(buf)) {
          Err(_) => Err(parseErr(buf, "illegal alphabetic escape", 0)),
          Ok(rng) => Ok(makeRERange(rng, rangeLimit)),
        }
      },
      Ok(ch) => {
        ignore(next(buf))
        Ok(RELiteral(ch))
      },
    }
  } else {
    // a pattern may not end right after a backslash
    Err(parseErr(buf, "Expected to find escaped value after backslash", 0))
  }
},
|
|
1298
|
+
|
|
1299
|
+
/**
 * Called right after a quantifier. Consumes an optional `?` and returns
 * `Ok(true)` if it was present (non-greedy), `Ok(false)` otherwise. A further
 * `?`, `*`, or `+` immediately afterwards is rejected as a nested quantifier.
 */
parseNonGreedy = (buf: RegExBuf) => {
  let isQuantifier = (ch) => ch == '?' || ch == '*' || ch == '+'
  // Passes `res` through unless the next character is another quantifier
  let checkNotNested = (res) => {
    if (more(buf)) {
      match (peek(buf)) {
        Err(err) => Err(err),
        Ok(ch) => {
          if (isQuantifier(ch)) {
            Err(parseErr(buf, "nested '" ++ toString(ch) ++ "' in pattern", 0))
          } else {
            res
          }
        },
      }
    } else {
      res
    }
  }
  if (more(buf)) {
    match (peek(buf)) {
      Err(err) => Err(err),
      Ok('?') => {
        ignore(eat(buf, '?'))
        checkNotNested(Ok(true))
      },
      Ok(_) => checkNotNested(Ok(false)),
    }
  } else {
    Ok(false)
  }
},
|
|
1326
|
+
|
|
1327
|
+
/**
 * Parses one atom followed by an optional quantifier: `*`, `+`, `?`, or,
 * when `isPerlRegExp` is set, a `{n}`, `{n,}` or `{n,m}` repetition. Each
 * quantifier may carry a trailing `?` to make it non-greedy.
 *
 * Returns the bare atom when no quantifier follows.
 */
parsePCE = (buf: RegExBuf) => {
  // Reads the optional non-greedy marker, then builds the quantified node
  let quantified = (build) => {
    match (parseNonGreedy(buf)) {
      Err(e) => Err(e),
      Ok(ng) => Ok(build(ng)),
    }
  }
  match (parseAtom(buf)) {
    Err(e) => Err(e),
    Ok(atom) => {
      if (!more(buf)) {
        Ok(atom)
      } else {
        match (peek(buf)) {
          Err(e) => Err(e),
          Ok('*') => {
            ignore(eat(buf, '*'))
            quantified((ng) => RERepeat(atom, 0, None, ng))
          },
          Ok('+') => {
            ignore(eat(buf, '+'))
            quantified((ng) => RERepeat(atom, 1, None, ng))
          },
          Ok('?') => {
            ignore(eat(buf, '?'))
            quantified((ng) => REMaybe(atom, ng))
          },
          Ok('{') when buf.config.isPerlRegExp => {
            ignore(eat(buf, '{'))
            match (parseInteger(buf, 0)) {
              Err(e) => Err(e),
              Ok(n1) => {
                match (peek(buf)) {
                  Ok(',') => {
                    ignore(eat(buf, ','))
                    // remember where the upper bound starts so that an
                    // omitted bound can be told apart from an explicit 0
                    let curPos = unbox(buf.cursor)
                    match (parseInteger(buf, 0)) {
                      Err(e) => Err(e),
                      Ok(n2) => {
                        match (peek(buf)) {
                          Err(e) => Err(e),
                          Ok('}') => {
                            // for `{n,}`, we match >= n times, so n2adj should be infinity
                            let n2adj = if (curPos == unbox(buf.cursor)) { None } else { Some(n2) }
                            ignore(eat(buf, '}'))
                            quantified((ng) => RERepeat(atom, n1, n2adj, ng))
                          },
                          Ok(_) => Err(parseErr(buf, "expected digit or `}` to end repetition specification started with `{`", 0)),
                        }
                      },
                    }
                  },
                  Ok('}') => {
                    ignore(eat(buf, '}'))
                    // match exactly n1 times
                    quantified((ng) => RERepeat(atom, n1, Some(n1), ng))
                  },
                  // also reached when peek fails, e.g. at the end of input
                  _ => Err(parseErr(buf, "expected digit, `,`, or `}` for repetition specification started with `{`", 0)),
                }
              },
            }
          },
          Ok(_) => Ok(atom),
        }
      }
    },
  }
},
|
|
1404
|
+
|
|
1405
|
+
/**
 * Parses a sequence of quantified atoms, stopping at the end of input, at a
 * `|`, or at a `)`. The stopping character is not consumed. When `toplevel`
 * is true a `)` is reported as unmatched instead of ending the sequence.
 *
 * Returns `Ok` with the parsed elements in order (empty at end of input).
 */
parsePCEs = (buf: RegExBuf, toplevel: Bool) => {
  if (more(buf)) {
    match (parsePCE(buf)) {
      Err(err) => Err(err),
      Ok(first) => {
        if (more(buf)) {
          match (peek(buf)) {
            Err(err) => Err(err),
            Ok('|') => Ok([first]),
            Ok(')') => {
              if (toplevel) {
                Err(parseErr(buf, "Unmatched `)`", 0))
              } else {
                Ok([first])
              }
            },
            Ok(_) => {
              match (parsePCEs(buf, toplevel)) {
                Err(err) => Err(err),
                Ok(rest) => Ok([first, ...rest]),
              }
            },
          }
        } else {
          Ok([first])
        }
      },
    }
  } else {
    Ok([])
  }
},
|
|
1432
|
+
|
|
1433
|
+
/**
 * Parses a (possibly empty) regex nested inside a group: a sequence,
 * optionally followed by `|` and further alternatives. Yields `REEmpty` at
 * the end of input or directly before a `)`; the `)` itself is left for the
 * caller to consume.
 */
parseRegex = (buf: RegExBuf) => {
  if (more(buf)) {
    match (peek(buf)) {
      Err(err) => Err(err),
      Ok(')') => Ok(REEmpty),
      Ok(_) => {
        match (parsePCEs(buf, false)) {
          Err(err) => Err(err),
          Ok(elts) => {
            if (more(buf)) {
              match (peek(buf)) {
                Err(err) => Err(err),
                Ok('|') => {
                  ignore(eat(buf, '|'))
                  // the right-hand side may itself be empty or another alternation
                  match (parseRegex(buf)) {
                    Err(err) => Err(err),
                    Ok(alternative) => Ok(makeREAlts(makeRESequence(elts), alternative, rangeLimit)),
                  }
                },
                Ok(_) => Ok(makeRESequence(elts)),
              }
            } else {
              Ok(makeRESequence(elts))
            }
          },
        }
      },
    }
  } else {
    Ok(REEmpty)
  }
},
|
|
1469
|
+
|
|
1470
|
+
/**
 * Like the nested-level `parseRegex`, but without the up-front check for an
 * empty pattern: it goes straight to `parsePCEs`, and the right-hand side of
 * each `|` is parsed the same way.
 */
parseRegexNonEmpty = (buf: RegExBuf) => {
  match (parsePCEs(buf, false)) {
    Err(err) => Err(err),
    Ok(elts) => {
      if (more(buf)) {
        match (peek(buf)) {
          Err(err) => Err(err),
          Ok('|') => {
            ignore(eat(buf, '|'))
            match (parseRegexNonEmpty(buf)) {
              Err(err) => Err(err),
              Ok(alternative) => Ok(makeREAlts(makeRESequence(elts), alternative, rangeLimit)),
            }
          },
          Ok(_) => Ok(makeRESequence(elts)),
        }
      } else {
        Ok(makeRESequence(elts))
      }
    },
  }
}
|
|
1494
|
+
|
|
1495
|
+
/**
 * Top-level entry point of the parser. Parses the first alternative with
 * `toplevel = true`, so that a stray `)` is reported as unmatched, and hands
 * everything after a `|` to the nested-level `parseRegex` from the `let rec`
 * chain above (this binding is not recursive, so the inner call resolves to
 * that earlier definition).
 *
 * Returns `Ok(parsedRegex)` or the first parse error.
 */
let parseRegex = (buf: RegExBuf) => {
  match (parsePCEs(buf, true)) {
    Err(e) => Err(e),
    Ok(pces) => {
      if (!more(buf)) {
        Ok(makeRESequence(pces))
      } else {
        match (peek(buf)) {
          Err(e) => Err(e),
          Ok('|') => {
            ignore(eat(buf, '|'))
            match (parseRegex(buf)) {
              Err(e) => Err(e),
              Ok(rx2) => {
                // The nested-level parser only stops before the end of input
                // when it reaches a `)`. At the top level no group is open, so
                // that `)` is unmatched (e.g. `a|b)`) and must be reported
                // rather than left unconsumed.
                if (more(buf)) {
                  Err(parseErr(buf, "Unmatched `)`", 0))
                } else {
                  Ok(makeREAlts(makeRESequence(pces), rx2, rangeLimit))
                }
              },
            }
          },
          Ok(_) => Ok(makeRESequence(pces)),
        }
      }
    },
  }
}
|
|
1519
|
+
|
|
1520
|
+
|
|
1521
|
+
/*
|
|
1522
|
+
|
|
1523
|
+
REGEX ANALYSIS
|
|
1524
|
+
-------
|
|
1525
|
+
|
|
1526
|
+
In addition to the parse tree, we take four analyses from Racket:
|
|
1527
|
+
- isAnchored, which checks if a matching string must match at the beginning (avoids useless backtracking)
|
|
1528
|
+
- mustString, which determines if there is a substring which must appear in matches that we can use to filter out non-matching strings
|
|
1529
|
+
- startRange, which determines if there is a closed set of characters which must appear at the beginning of any match
|
|
1530
|
+
- validate, which performs consistency checks across the groups defined in the regex.
|
|
1531
|
+
|
|
1532
|
+
*/
|
|
1533
|
+
|
|
1534
|
+
// is-anchored:
|
|
1535
|
+
|
|
1536
|
+
/**
 * Reports whether the expression is anchored with `REStart`. A sequence is
 * judged by its first element that is not a lookahead/lookbehind; an
 * alternation needs both sides anchored; a conditional needs its true branch
 * anchored and an anchored false branch to be present.
 */
let rec isAnchored = (re: ParsedRegularExpression) => {
  // Skips leading lookarounds and tests the first remaining element
  let rec afterLookarounds = (elts) => {
    match (elts) {
      [] => false,
      [RELookahead(_, _, _, _), ...rest] => afterLookarounds(rest),
      [RELookbehind(_, _, _, _, _, _), ...rest] => afterLookarounds(rest),
      [first, ..._] => isAnchored(first),
    }
  }
  match (re) {
    REStart => true,
    REGroup(inner, _) => isAnchored(inner),
    RECut(inner, _, _, _) => isAnchored(inner),
    RESequence(elts, _) => afterLookarounds(elts),
    REAlts(left, right) => isAnchored(left) && isAnchored(right),
    REConditional(_, whenTrue, whenFalse, _, _, _) => {
      isAnchored(whenTrue) && match (whenFalse) {
        Some(branch) => isAnchored(branch),
        None => false,
      }
    },
    _ => false,
  }
}
|
|
1561
|
+
|
|
1562
|
+
// must-string:
|
|
1563
|
+
|
|
1564
|
+
/**
 * Computes a literal string that the expression is taken to require, if one
 * can be determined. For a sequence the longest candidate among its elements
 * is chosen; on equal length the earlier one is kept.
 */
let rec mustString = (re: ParsedRegularExpression) => {
  match (re) {
    RELiteralString(str) => Some(str),
    RELiteral(ch) => Some(Char.toString(ch)),
    REGroup(inner, _) => mustString(inner),
    RECut(inner, _, _, _) => mustString(inner),
    // only matching (non-negated) lookarounds contribute a candidate
    RELookahead(inner, true, _, _) => mustString(inner),
    RELookbehind(inner, true, _, _, _, _) => mustString(inner),
    // a repeat that may match zero times falls through to `None` below
    RERepeat(inner, min, _, _) when min != 0 => mustString(inner),
    RESequence(elts, _) => {
      List.reduce((best, elt) => {
        let candidate = mustString(elt)
        match (best) {
          None => candidate,
          Some(bestStr) => {
            match (candidate) {
              None => best,
              Some(candStr) => {
                if (String.length(candStr) > String.length(bestStr)) {
                  candidate
                } else {
                  best
                }
              },
            }
          },
        }
      }, None, elts)
    },
    _ => None,
  }
}
|
|
1592
|
+
|
|
1593
|
+
// start-range
|
|
1594
|
+
|
|
1595
|
+
/**
 * Reports whether the node is one of the zero-width kinds listed below.
 * Groups and cuts are judged by their contents. `startRange` uses this to
 * skip leading elements of a sequence.
 */
let rec zeroSized = (re) => {
  match (re) {
    // wrappers: look inside
    REGroup(inner, _) => zeroSized(inner),
    RECut(inner, _, _, _) => zeroSized(inner),
    // lookarounds
    RELookahead(_, _, _, _) => true,
    RELookbehind(_, _, _, _, _, _) => true,
    // the empty pattern and position assertions
    REEmpty => true,
    REStart => true,
    RELineStart => true,
    REWordBoundary => true,
    RENotWordBoundary => true,
    _ => false,
  }
}
|
|
1609
|
+
|
|
1610
|
+
/**
 * Computes the range of characters a match of the expression can start with,
 * or `None` when no such range can be derived. Alternations and conditionals
 * yield the union of their branches, and only when every branch has a range.
 */
let rec startRange = (re) => {
  // Unions `rng` with the start range of `other`, if `other` has one
  let unionWith = (rng, other) => {
    match (startRange(other)) {
      Some(otherRng) => Some(rangeUnion(rng, otherRng)),
      None => None,
    }
  }
  match (re) {
    RERange(rng) => Some(rng),
    RELiteral(ch) => Some(rangeAdd([], Char.code(ch))),
    RELiteralString(str) => Some(rangeAdd([], Char.code(String.charAt(0, str)))),
    REGroup(inner, _) => startRange(inner),
    RECut(inner, _, _, _) => startRange(inner),
    // a repeat that may match zero times falls through to `None` below
    RERepeat(inner, min, _, _) when min > 0 => startRange(inner),
    RESequence(elts, _) => {
      // the first element that is not zero-sized decides
      let rec firstSized = (remaining) => {
        match (remaining) {
          [] => None,
          [first, ...rest] => {
            if (zeroSized(first)) {
              firstSized(rest)
            } else {
              startRange(first)
            }
          },
        }
      }
      firstSized(elts)
    },
    REAlts(left, right) => {
      match (startRange(left)) {
        None => None,
        Some(leftRng) => unionWith(leftRng, right),
      }
    },
    REConditional(_, whenTrue, whenFalse, _, _, _) => {
      match (startRange(whenTrue)) {
        None => None,
        Some(trueRng) => {
          match (whenFalse) {
            None => None,
            Some(branch) => unionWith(trueRng, branch),
          }
        },
      }
    },
    _ => None,
  }
}
|
|
1658
|
+
|
|
1659
|
+
// validate:
|
|
1660
|
+
|
|
1661
|
+
// Failure kinds recorded by `validate` while it walks the parsed regex.
enum ValidateError {
|
|
1662
|
+
// Set when the operand of a repeat has a minimum length of 0; reported as
// "`*`, `+`, or `{...}` operand could be empty".
MightBeEmpty,
|
|
1663
|
+
// Set when a lookbehind's maximum length is infinite; reported as
// "lookbehind pattern does not match a bounded length".
DoesNotMatchBounded,
|
|
1664
|
+
// Set when a backreference number is greater than the number of groups.
BackreferenceTooBig,
|
|
1665
|
+
// NOTE(review): not produced anywhere in the visible part of `validate`;
// presumably carries the offending node for unexpected cases (confirm).
InternalError(ParsedRegularExpression),
|
|
1666
|
+
}
|
|
1667
|
+
|
|
1668
|
+
let rec validate = (re: ParsedRegularExpression, numGroups) => {
|
|
1669
|
+
let groupSizes = Map.make()
|
|
1670
|
+
let mut dependsSizes = Map.make()
|
|
1671
|
+
let mut mustSizes = Map.make()
|
|
1672
|
+
// to avoid excess allocations inside of `loop`, we set a flag
|
|
1673
|
+
// which is checked at the end of the function.
|
|
1674
|
+
let mut thrownError = None
|
|
1675
|
+
let rec mergeDependsSizes = (ht1, ht2) => {
|
|
1676
|
+
if (Map.size(ht1) == 0) {
|
|
1677
|
+
ht2
|
|
1678
|
+
} else if (Map.size(ht1) > Map.size(ht2)) {
|
|
1679
|
+
mergeDependsSizes(ht2, ht1)
|
|
1680
|
+
} else {
|
|
1681
|
+
Map.forEach((k, v) => Map.set(k, v, ht2), ht1)
|
|
1682
|
+
ht2
|
|
1683
|
+
}
|
|
1684
|
+
}
|
|
1685
|
+
/**
|
|
1686
|
+
Computes the range of possible UTF-8 byte lengths for the given character range
|
|
1687
|
+
*/
|
|
1688
|
+
let rangeUtf8EncodingLengths = (rng) => {
|
|
1689
|
+
let (min, max, _) = List.reduce(((min1, max1, n), (segStart, segEnd)) => {
|
|
1690
|
+
if (rangeOverlaps(rng, segStart, segEnd)) {
|
|
1691
|
+
(min(min1, n), max(max1, n), n + 1)
|
|
1692
|
+
} else {
|
|
1693
|
+
(min1, max1, n + 1)
|
|
1694
|
+
}
|
|
1695
|
+
}, (4, 0, 1), [(0, 127), (128, 0x7ff), (0x800, 0x7fff), (0x10000, 0x10ffff)])
|
|
1696
|
+
(min, max)
|
|
1697
|
+
}
|
|
1698
|
+
let rec loop = (re) => {
|
|
1699
|
+
match(re) {
|
|
1700
|
+
RENever => (1, 1, 0),
|
|
1701
|
+
REAny => (1, 1, 0),
|
|
1702
|
+
RELiteral(_) => (1, 1, 0),
|
|
1703
|
+
RERange(_) => (1, 1, 0),
|
|
1704
|
+
RELiteralString(s) => {
|
|
1705
|
+
let ls = String.length(s)
|
|
1706
|
+
(ls, ls, 0)
|
|
1707
|
+
},
|
|
1708
|
+
REEmpty => (0, 0, 0),
|
|
1709
|
+
REEnd => (0, 0, 0),
|
|
1710
|
+
RELineEnd => (0, 0, 0),
|
|
1711
|
+
REStart => (0, 0, 1),
|
|
1712
|
+
RELineStart => (0, 0, 1),
|
|
1713
|
+
REWordBoundary => (0, 0, 1),
|
|
1714
|
+
RENotWordBoundary => (0, 0, 1),
|
|
1715
|
+
REAlts(re1, re2) => {
|
|
1716
|
+
let (min1, max1, maxL1) = loop(re1)
|
|
1717
|
+
let (min2, max2, maxL2) = loop(re2)
|
|
1718
|
+
(min(min1, min2), max(max1, max2), max(maxL1, maxL2))
|
|
1719
|
+
},
|
|
1720
|
+
RESequence(elts, _) => {
|
|
1721
|
+
List.reduce(((accMin, accMax, accMaxL), e) => {
|
|
1722
|
+
let (minE, maxE, maxLE) = loop(e)
|
|
1723
|
+
(accMin + minE, accMax + maxE, max(accMaxL, maxLE))
|
|
1724
|
+
}, (0, 0, 0), elts)
|
|
1725
|
+
},
|
|
1726
|
+
REGroup(re, n) => {
|
|
1727
|
+
let (min1, max1, maxL1) = loop(re)
|
|
1728
|
+
Map.set(n, min1, groupSizes)
|
|
1729
|
+
(min1, max1, maxL1)
|
|
1730
|
+
},
|
|
1731
|
+
RERepeat(re, repeatMin, repeatMax, nonGreedy) => {
|
|
1732
|
+
let oldDependsSizes = dependsSizes
|
|
1733
|
+
dependsSizes = Map.make()
|
|
1734
|
+
let oldMustSizes = mustSizes
|
|
1735
|
+
mustSizes = Map.make()
|
|
1736
|
+
let (min1, max1, maxL1) = loop(re)
|
|
1737
|
+
if (min1 == 0) {
|
|
1738
|
+
thrownError = Some(MightBeEmpty)
|
|
1739
|
+
(0, 0, 0)
|
|
1740
|
+
} else {
|
|
1741
|
+
mustSizes = mergeDependsSizes(oldMustSizes, mustSizes)
|
|
1742
|
+
dependsSizes = mergeDependsSizes(oldDependsSizes, dependsSizes)
|
|
1743
|
+
let repeatMax = match(repeatMax) {
|
|
1744
|
+
None => Float32.toNumber(Float32.infinity),
|
|
1745
|
+
Some(n) => n
|
|
1746
|
+
}
|
|
1747
|
+
(min1 * repeatMin, max1 * repeatMax, maxL1)
|
|
1748
|
+
}
|
|
1749
|
+
},
|
|
1750
|
+
REMaybe(re, nonGreedy) => {
|
|
1751
|
+
let (_, max1, maxL1) = loop(re)
|
|
1752
|
+
(0, max1, maxL1)
|
|
1753
|
+
},
|
|
1754
|
+
REConditional(reTest, reTrue, reFalse, _, _, _) => {
|
|
1755
|
+
let (min1, max1, maxL1) = loop(reTest)
|
|
1756
|
+
let (min2, max2, maxL2) = loop(reTrue)
|
|
1757
|
+
let (min3, max3, maxL3) = Option.mapWithDefault(loop, (0, 0, 0), reFalse)
|
|
1758
|
+
(min(min2, min3), max(max2, max3), max(max(maxL1, maxL2), maxL3))
|
|
1759
|
+
},
|
|
1760
|
+
RELookahead(re, _, _, _) => {
|
|
1761
|
+
let (_, _, maxL1) = loop(re)
|
|
1762
|
+
(0, 0, maxL1)
|
|
1763
|
+
},
|
|
1764
|
+
RELookbehind(re, _, lbMin, lbMax, _, _) => {
|
|
1765
|
+
let (min1, max1, maxL1) = loop(re)
|
|
1766
|
+
if (max1 == Float32.toNumber(Float32.infinity)) {
|
|
1767
|
+
thrownError = Some(DoesNotMatchBounded)
|
|
1768
|
+
(0, 0, 0)
|
|
1769
|
+
} else {
|
|
1770
|
+
lbMin := min1
|
|
1771
|
+
lbMax := max1
|
|
1772
|
+
(0, 0, max(max1, maxL1))
|
|
1773
|
+
}
|
|
1774
|
+
},
|
|
1775
|
+
RECut(re, _, _, _) => {
|
|
1776
|
+
loop(re)
|
|
1777
|
+
},
|
|
1778
|
+
REReference(n, _) => {
|
|
1779
|
+
if (n > numGroups) {
|
|
1780
|
+
thrownError = Some(BackreferenceTooBig)
|
|
1781
|
+
(0, 0, 0)
|
|
1782
|
+
} else {
|
|
1783
|
+
match(Map.get(n, groupSizes)) {
|
|
1784
|
+
Some(minSize) => (minSize, Float32.toNumber(Float32.infinity), 0),
|
|
1785
|
+
None => {
|
|
1786
|
+
Map.set(n - 1, true, dependsSizes)
|
|
1787
|
+
(1, Float32.toNumber(Float32.infinity), 0)
|
|
1788
|
+
}
|
|
1789
|
+
}
|
|
1790
|
+
}
|
|
1791
|
+
},
|
|
1792
|
+
REUnicodeCategories(_, _) => (1, 4, 0)
|
|
1793
|
+
}
|
|
1794
|
+
}
|
|
1795
|
+
let (minLen, maxLen, maxLookbehind) = loop(re)
|
|
1796
|
+
Map.forEach((k, _) => {
|
|
1797
|
+
match(Map.get(k, groupSizes)) {
|
|
1798
|
+
None => void,
|
|
1799
|
+
Some(sz) => {
|
|
1800
|
+
if (sz <= 0) {
|
|
1801
|
+
thrownError = Some(MightBeEmpty)
|
|
1802
|
+
}
|
|
1803
|
+
}
|
|
1804
|
+
}
|
|
1805
|
+
}, mustSizes)
|
|
1806
|
+
match(thrownError) {
|
|
1807
|
+
Some(MightBeEmpty) => Err("`*`, `+`, or `{...}` operand could be empty"),
|
|
1808
|
+
Some(DoesNotMatchBounded) => Err("lookbehind pattern does not match a bounded length"),
|
|
1809
|
+
Some(BackreferenceTooBig) => Err("backreference number is larger than the highest-numbered cluster"),
|
|
1810
|
+
Some(InternalError(re)) => Err("regex validate: Internal error: " ++ toString(re)),
|
|
1811
|
+
None => Ok(maxLookbehind)
|
|
1812
|
+
}
|
|
1813
|
+
}
|
|
1814
|
+
|
|
1815
|
+
|
|
1816
|
+
/*
|
|
1817
|
+
|
|
1818
|
+
=========================
|
|
1819
|
+
REGEX MATCHER COMPILATION
|
|
1820
|
+
=========================
|
|
1821
|
+
|
|
1822
|
+
*/
|
|
1823
|
+
|
|
1824
|
+
|
|
1825
|
+
// Subject of a match attempt: the input string plus the same text as an
// array of characters (see `makeMatchBuffer`).
record MatchBuf {
  matchInput: String,
  matchInputExploded: Array<Char>,
}
|
|
1829
|
+
|
|
1830
|
+
// Builds a MatchBuf for `s`, exploding the string into characters once.
let makeMatchBuffer = (s) => {
  let exploded = String.explode(s)
  {
    matchInput: s,
    matchInputExploded: exploded,
  }
}
|
|
1836
|
+
|
|
1837
|
+
// True while `pos` still indexes a character of the buffer.
let matchBufMore = (buf: MatchBuf, pos: Number) => {
  let total = Array.length(buf.matchInputExploded)
  pos < total
}
|
|
1840
|
+
|
|
1841
|
+
// Reads the character at `pos`, or returns `Err` when `pos` is at or past
// the end of the buffer.
let matchBufChar = (buf: MatchBuf, pos: Number) => {
  let chars = buf.matchInputExploded
  if (pos < Array.length(chars)) {
    Ok(chars[pos])
  } else {
    Err("end of match buffer reached")
  }
}
|
|
1848
|
+
|
|
1849
|
+
// Entries of the matcher stack:
// - SEPositionProducer: a continuation invoked with the current position
//   (see `continue_m`).
// - SESavedGroup: the position at which a group was entered together with
//   the span previously recorded for that group (see `groupPushMatcher`).
enum StackElt {
  SEPositionProducer(Number -> Option<Number>),
  SESavedGroup(Number, Option<(Number, Number)>),
}
|
|
1853
|
+
|
|
1854
|
+
// Terminal matcher: succeeds immediately at the current position.
let done_m = (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  Some(pos)
}
|
|
1855
|
+
// Resumes matching through the continuation on top of the stack.
let continue_m = (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  match(stack) {
    [SEPositionProducer(resume), ..._] => resume(pos),
    _ => fail "Impossible: continue_m",
  }
}
|
|
1861
|
+
// Terminal matcher that succeeds only when the position is exactly `limit`.
let limit_m = (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  if (pos == limit) {
    Some(pos)
  } else {
    None
  }
}
|
|
1862
|
+
|
|
1863
|
+
|
|
1864
|
+
// Applies the boolean matcher `m` repeatedly, advancing `size` positions per
// hit. Returns `(final position, number of hits, size)`.
let iterateMatcher = (m, size, max) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  // With a maximum count, the repetition may not reach past `max * size`.
  let limit = match(max) {
    None => limit,
    Some(maxCount) => min(limit, pos + (maxCount * size)),
  }
  let rec advance = (cur, count) => {
    let next = cur + size
    if (next <= limit && m(buf, cur, start, limit, end, state, stack)) {
      advance(next, count + 1)
    } else {
      (cur, count, size)
    }
  }
  advance(pos, 0)
}
|
|
1879
|
+
|
|
1880
|
+
// single-char matching
|
|
1881
|
+
|
|
1882
|
+
// Matches the single character `toMatch`, then continues with `next_m`.
let charMatcher = (toMatch, next_m) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  let hit = pos < limit && matchBufChar(buf, pos) == Ok(toMatch)
  if (hit) {
    next_m(buf, pos + 1, start, limit, end, state, stack)
  } else {
    None
  }
}
|
|
1890
|
+
|
|
1891
|
+
// Tail form of `charMatcher`: returns the position after the character
// directly instead of calling a continuation.
let charTailMatcher = (toMatch) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  let hit = pos < limit && matchBufChar(buf, pos) == Ok(toMatch)
  if (hit) {
    Some(pos + 1)
  } else {
    None
  }
}
|
|
1899
|
+
|
|
1900
|
+
// Repeated form of the single-character matcher, driven by `iterateMatcher`.
let charMatcherIterated = (toMatch, max) => {
  let isHit = (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
    matchBufChar(buf, pos) == Ok(toMatch)
  }
  iterateMatcher(isHit, 1, max)
}
|
|
1906
|
+
|
|
1907
|
+
// string matching
|
|
1908
|
+
|
|
1909
|
+
// Compares `length` elements of `arr1` starting at `start1` with the same
// number of elements of `arr2` starting at `start2`. Returns false when
// either array is too short to supply `length` elements.
let subArraysEqual = (arr1, start1, arr2, start2, length) => {
  let fits1 = Array.length(arr1) - start1 >= length
  let fits2 = Array.length(arr2) - start2 >= length
  if (fits1 && fits2) {
    let rec sameFrom = (i) => {
      if (i >= length) {
        true
      } else if (arr1[start1 + i] != arr2[start2 + i]) {
        false
      } else {
        sameFrom(i + 1)
      }
    }
    sameFrom(0)
  } else {
    false
  }
}
|
|
1923
|
+
|
|
1924
|
+
// Matches the literal string `toMatch` (of `len` characters), then continues
// with `next_m`.
let stringMatcher = (toMatch, len, next_m) => {
  // Explode the literal once when the matcher is built rather than on every
  // match attempt; the resulting array is only read.
  let needle = String.explode(toMatch)
  let matcher = (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
    if (pos + len <= limit && subArraysEqual(buf.matchInputExploded, pos, needle, 0, len)) {
      next_m(buf, pos + len, start, limit, end, state, stack)
    } else {
      None
    }
  }
  matcher
}
|
|
1929
|
+
|
|
1930
|
+
// Tail form of `stringMatcher`: returns the position after the literal
// directly instead of calling a continuation.
let stringTailMatcher = (toMatch, len) => {
  // Explode the literal once when the matcher is built rather than on every
  // match attempt; the resulting array is only read.
  let needle = String.explode(toMatch)
  let matcher = (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
    if (pos + len <= limit && subArraysEqual(buf.matchInputExploded, pos, needle, 0, len)) {
      Some(pos + len)
    } else {
      None
    }
  }
  matcher
}
|
|
1935
|
+
|
|
1936
|
+
// Repeated form of the literal-string matcher, driven by `iterateMatcher`
// with a step of `len` characters.
let stringMatcherIterated = (toMatch, len, max) => {
  // Explode the literal once; previously this ran for every iteration.
  let needle = String.explode(toMatch)
  let matchesAt = (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
    subArraysEqual(buf.matchInputExploded, pos, needle, 0, len)
  }
  iterateMatcher(matchesAt, len, max)
}
|
|
1939
|
+
|
|
1940
|
+
|
|
1941
|
+
// match nothing
|
|
1942
|
+
|
|
1943
|
+
// Matcher that fails unconditionally.
let neverMatcher = (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => None
|
|
1946
|
+
|
|
1947
|
+
// match any character
|
|
1948
|
+
|
|
1949
|
+
// Consumes one position if any remains before `limit`, then continues with
// `next_m`.
let anyMatcher = (next_m) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  if (pos < limit) {
    next_m(buf, pos + 1, start, limit, end, state, stack)
  } else {
    None
  }
}
|
|
1954
|
+
|
|
1955
|
+
// Tail form of `anyMatcher`: returns the next position directly.
let anyTailMatcher = () => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  if (pos < limit) {
    Some(pos + 1)
  } else {
    None
  }
}
|
|
1960
|
+
|
|
1961
|
+
// Repeated "any" matcher: takes everything up to `limit`, capped at `max`
// positions when a maximum is given. Returns `(final position, count, 1)`.
let anyMatcherIterated = (max) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  let available = limit - pos
  let n = match(max) {
    Some(maxCount) => min(maxCount, available),
    None => available,
  }
  (pos + n, n, 1)
}
|
|
1968
|
+
|
|
1969
|
+
// match character in set (range)
|
|
1970
|
+
|
|
1971
|
+
// Matches one character whose code is contained in `rng`, then continues
// with `next_m`.
let rangeMatcher = (rng, next_m) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  let inRange = if (pos < limit) {
    match(matchBufChar(buf, pos)) {
      Ok(ch) => rangeContains(rng, Char.code(ch)),
      Err(_) => false,
    }
  } else {
    false
  }
  if (inRange) {
    next_m(buf, pos + 1, start, limit, end, state, stack)
  } else {
    None
  }
}
|
|
1979
|
+
|
|
1980
|
+
// Tail form of `rangeMatcher`: returns the next position directly.
let rangeTailMatcher = (rng) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  let inRange = if (pos < limit) {
    match(matchBufChar(buf, pos)) {
      Ok(ch) => rangeContains(rng, Char.code(ch)),
      Err(_) => false,
    }
  } else {
    false
  }
  if (inRange) {
    Some(pos + 1)
  } else {
    None
  }
}
|
|
1988
|
+
|
|
1989
|
+
// Repeated form of the range matcher, driven by `iterateMatcher`.
let rangeMatcherIterated = (rng, max) => {
  let inRange = (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
    match(matchBufChar(buf, pos)) {
      Ok(ch) => rangeContains(rng, Char.code(ch)),
      Err(_) => false,
    }
  }
  iterateMatcher(inRange, 1, max)
}
|
|
1995
|
+
|
|
1996
|
+
// zero-width matchers
|
|
1997
|
+
|
|
1998
|
+
// Zero-width: continues only when the position equals `start`.
let startMatcher = (next_m) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  if (pos == start) {
    next_m(buf, pos, start, limit, end, state, stack)
  } else {
    None
  }
}
|
|
2001
|
+
|
|
2002
|
+
// Zero-width: continues only when the position equals `end`.
let endMatcher = (next_m) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  if (pos == end) {
    next_m(buf, pos, start, limit, end, state, stack)
  } else {
    None
  }
}
|
|
2005
|
+
|
|
2006
|
+
// Zero-width: continues at `start` or right after a newline character.
let lineStartMatcher = (next_m) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  let atLineStart = pos == start || matchBufChar(buf, pos - 1) == Ok('\n')
  if (atLineStart) {
    next_m(buf, pos, start, limit, end, state, stack)
  } else {
    None
  }
}
|
|
2009
|
+
|
|
2010
|
+
// Zero-width: continues at `end` or right before a newline character.
let lineEndMatcher = (next_m) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  let atLineEnd = pos == end || matchBufChar(buf, pos) == Ok('\n')
  if (atLineEnd) {
    next_m(buf, pos, start, limit, end, state, stack)
  } else {
    None
  }
}
|
|
2013
|
+
|
|
2014
|
+
// Decides whether a buffer read holds a word character: an ASCII digit, an
// ASCII letter, or an underscore. A failed read (`Err`) is never a word
// character.
let isWordChar = (c) => {
  match(c) {
    Err(_) => false,
    Ok(ch) => {
      let code = Char.code(ch)
      let isDigit = Char.code('0') <= code && code <= Char.code('9')
      let isLower = Char.code('a') <= code && code <= Char.code('z')
      let isUpper = Char.code('A') <= code && code <= Char.code('Z')
      // The underscore must be compared for equality; the previous `<=` test
      // accepted every character with a code of 95 or above.
      let isUnderscore = code == Char.code('_')
      isDigit || isLower || isUpper || isUnderscore
    }
  }
}
|
|
2024
|
+
|
|
2025
|
+
// A word boundary exists when exactly one side of `pos` lacks a word
// character (the edges `start` and `end` count as lacking one).
let isWordBoundary = (buf, pos, start, limit, end) => {
  let noWordBefore = pos == start || !isWordChar(matchBufChar(buf, pos - 1))
  let noWordAfter = pos == end || !isWordChar(matchBufChar(buf, pos))
  noWordBefore != noWordAfter
}
|
|
2028
|
+
|
|
2029
|
+
// Zero-width: continues only at a word boundary.
let wordBoundaryMatcher = (next_m) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  let atBoundary = isWordBoundary(buf, pos, start, limit, end)
  if (atBoundary) {
    next_m(buf, pos, start, limit, end, state, stack)
  } else {
    None
  }
}
|
|
2032
|
+
|
|
2033
|
+
// Zero-width: continues only where there is no word boundary.
let notWordBoundaryMatcher = (next_m) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  let atBoundary = isWordBoundary(buf, pos, start, limit, end)
  if (atBoundary) {
    None
  } else {
    next_m(buf, pos, start, limit, end, state, stack)
  }
}
|
|
2036
|
+
|
|
2037
|
+
// Alternatives
|
|
2038
|
+
|
|
2039
|
+
// Tries `m1`; falls back to `m2` from the same position when `m1` fails.
let altsMatcher = (m1, m2) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  let first = m1(buf, pos, start, limit, end, state, stack)
  if (Option.isSome(first)) {
    first
  } else {
    m2(buf, pos, start, limit, end, state, stack)
  }
}
|
|
2045
|
+
|
|
2046
|
+
// repeats, greedy (default) and non-greedy
|
|
2047
|
+
|
|
2048
|
+
// Greedy repetition of `r_m` between `min` and `max` times (no upper bound
// when `max` is None). `r_m` re-enters the loop through the continuation
// pushed on the stack.
let repeatMatcher = (r_m, min, max, next_m) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  let rec rloop = (pos, n) => {
    // Stack whose top entry resumes the loop with one more iteration counted.
    let pushNext = () => [SEPositionProducer(p => rloop(p, n + 1)), ...stack]
    let atMax = match(max) {
      Some(maxCount) => maxCount == n,
      None => false,
    }
    if (n < min) {
      r_m(buf, pos, start, limit, end, state, pushNext())
    } else if (atMax) {
      next_m(buf, pos, start, limit, end, state, stack)
    } else {
      // Prefer another iteration; fall back to the continuation.
      match(r_m(buf, pos, start, limit, end, state, pushNext())) {
        None => next_m(buf, pos, start, limit, end, state, stack),
        found => found,
      }
    }
  }
  rloop(pos, 0)
}
|
|
2065
|
+
|
|
2066
|
+
// Stack holding a single continuation that returns the position it is given.
let rStack = [SEPositionProducer(reached => Some(reached))]
|
|
2067
|
+
|
|
2068
|
+
// Copies `src[srcStart .. srcEnd)` into `dest`, beginning at `destStart`.
let arrayCopy = (dest, destStart, src, srcStart, srcEnd) => {
  for (let mut i = srcStart; i < srcEnd; i += 1) {
    dest[destStart + (i - srcStart)] = src[i]
  }
}
|
|
2075
|
+
|
|
2076
|
+
// Snapshots `numN` group slots of `state` starting at `nStart`. Returns an
// empty array when there is nothing to save.
let saveGroups = (state, nStart, numN) => {
  if (numN == 0 || Array.length(state) == 0) {
    Array.make(0, None)
  } else {
    let saved = Array.make(numN, None)
    arrayCopy(saved, 0, state, nStart, nStart + numN)
    saved
  }
}
|
|
2087
|
+
|
|
2088
|
+
// Writes a snapshot taken by `saveGroups` back into `state` at `nStart`.
// An empty snapshot leaves `state` untouched.
let restoreGroups = (state, oldState, nStart, numN) => {
  let savedLen = Array.length(oldState)
  if (savedLen > 0) {
    arrayCopy(state, nStart, oldState, 0, savedLen)
  }
}
|
|
2093
|
+
|
|
2094
|
+
// Records the span of the last repetition for group `groupN` (None when
// there were zero repetitions), then runs `callback` with a function that
// restores the previous span. When there is no group or no state array,
// `callback` receives a no-op.
let addRepeatedGroup = (groupN, state, pos, n, backAmt, callback) => {
  match(groupN) {
    Some(idx) when Array.length(state) > 0 => {
      let previous = state[idx]
      let span = if (n == 0) None else Some((pos - backAmt, pos))
      state[idx] = span
      callback(() => { state[idx] = previous })
    },
    _ => callback(() => void)
  }
}
|
|
2108
|
+
|
|
2109
|
+
// Greedy repetition for sub-patterns compiled without backtracking: run
// `r_m` as often as allowed, then step back one repetition at a time until
// `next_m` succeeds or fewer than `min` repetitions remain.
let repeatSimpleMatcher = (r_m, min, max, groupN, next_m) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  let rec rloop = (pos, n, backAmt) => {
    let mayRepeat = match(max) {
      Some(maxCount) => n < maxCount,
      None => true,
    }
    let advanced = if (mayRepeat) {
      r_m(buf, pos, start, limit, end, state, rStack)
    } else {
      None
    }
    match(advanced) {
      Some(pos2) => rloop(pos2, n + 1, pos2 - pos),
      None => {
        // Backtrack, stepping back by the size of the last repetition.
        let rec bloop = (pos, n) => {
          if (n >= min) {
            addRepeatedGroup(groupN, state, pos, n, backAmt, (groupRevert) => {
              match(next_m(buf, pos, start, limit, end, state, stack)) {
                None => {
                  groupRevert()
                  bloop(pos - backAmt, n - 1)
                },
                found => found,
              }
            })
          } else {
            None
          }
        }
        bloop(pos, n)
      }
    }
  }
  rloop(pos, 0, 0)
}
|
|
2141
|
+
|
|
2142
|
+
// Greedy repetition driven by an iterated matcher: `r_m` reports the final
// position, the repetition count and the step size in one call, after which
// the loop steps back until `next_m` succeeds or the count drops below `min`.
let repeatSimpleManyMatcher = (r_m, min, max, groupN, next_m) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  let (farthest, count, backAmt) = r_m(buf, pos, start, limit, end, state, stack)
  let rec bloop = (pos, n) => {
    if (n >= min) {
      addRepeatedGroup(groupN, state, pos, n, backAmt, (groupRevert) => {
        match(next_m(buf, pos, start, limit, end, state, stack)) {
          None => {
            groupRevert()
            bloop(pos - backAmt, n - 1)
          },
          found => found,
        }
      })
    } else {
      None
    }
  }
  bloop(farthest, count)
}
|
|
2161
|
+
|
|
2162
|
+
// Non-greedy repetition: once `min` repetitions are done, try `next_m`
// first and only require one more repetition when it fails.
let lazyRepeatMatcher = (r_m, min, max, next_m) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  let rec rloop = (pos, n, min) => {
    let atMax = match(max) {
      Some(maxCount) => maxCount == n,
      None => false,
    }
    if (n < min) {
      let newStack = [SEPositionProducer(p => rloop(p, n + 1, min)), ...stack]
      r_m(buf, pos, start, limit, end, state, newStack)
    } else if (atMax) {
      next_m(buf, pos, start, limit, end, state, stack)
    } else {
      let attempt = next_m(buf, pos, start, limit, end, state, stack)
      if (Option.isSome(attempt)) {
        attempt
      } else {
        // Raise the required count by one and retry from the same position.
        rloop(pos, n, min + 1)
      }
    }
  }
  rloop(pos, 0, min)
}
|
|
2176
|
+
|
|
2177
|
+
// Non-greedy repetition for sub-patterns compiled without backtracking:
// `r_m` returns the new position directly instead of going through the stack.
let lazyRepeatSimpleMatcher = (r_m, min, max, next_m) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  let rec rloop = (pos, n, min) => {
    let atMax = match(max) {
      Some(maxCount) => maxCount == n,
      None => false,
    }
    if (n < min) {
      match(r_m(buf, pos, start, limit, end, state, stack)) {
        None => None,
        Some(reached) => rloop(reached, n + 1, min),
      }
    } else if (atMax) {
      next_m(buf, pos, start, limit, end, state, stack)
    } else {
      let attempt = next_m(buf, pos, start, limit, end, state, stack)
      if (Option.isSome(attempt)) {
        attempt
      } else {
        // Raise the required count by one and retry from the same position.
        rloop(pos, n, min + 1)
      }
    }
  }
  rloop(pos, 0, min)
}
|
|
2193
|
+
|
|
2194
|
+
// Recording and referencing group matches
|
|
2195
|
+
|
|
2196
|
+
// Entering group `n`: push the current position and the group's previous
// span onto the stack, then continue.
let groupPushMatcher = (n, next_m) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  let previous = if (Array.length(state) > 0) state[n] else None
  let newStack = [SESavedGroup(pos, previous), ...stack]
  next_m(buf, pos, start, limit, end, state, newStack)
}
|
|
2200
|
+
|
|
2201
|
+
// Leaving group `n`: pop the entry pushed by `groupPushMatcher`, record the
// group's span, and continue. The previous span is put back if the rest of
// the match fails.
let groupSetMatcher = (n, next_m) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  match(stack) {
    [SESavedGroup(groupStart, previous), ...rest] => {
      let tracking = Array.length(state) > 0
      if (tracking) {
        state[n] = Some((groupStart, pos))
      }
      let result = next_m(buf, pos, start, limit, end, state, rest)
      if (Option.isNone(result) && tracking) {
        state[n] = previous
      }
      result
    },
    _ => fail "Impossible: groupSetMatcher"
  }
}
|
|
2220
|
+
|
|
2221
|
+
// Builds a backreference matcher factory. `eq` receives a pair of characters
// (captured character, input character) and decides whether they count as
// equal. The resulting matcher fails when group `n` has no recorded span.
let makeReferenceMatcher = (eq) => (n, next_m) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  match(state[n]) {
    None => None,
    Some((refStart, refEnd)) => {
      let len = refEnd - refStart
      let chars = buf.matchInputExploded
      let total = Array.length(chars)
      // Both the captured span and the candidate span must lie inside the
      // buffer, and the candidate may not run past `limit`.
      let mut same = pos + len <= limit && total - refStart >= len && total - pos >= len
      if (same) {
        // Compare through `eq`; it was previously ignored, which made the
        // case-insensitive matcher compare exactly.
        for (let mut i = 0; i < len; i += 1) {
          if (!eq((chars[refStart + i], chars[pos + i]))) {
            same = false
            break
          }
        }
      }
      if (same) {
        next_m(buf, pos + len, start, limit, end, state, stack)
      } else {
        None
      }
    }
  }
}
|
|
2232
|
+
|
|
2233
|
+
// Backreference matcher using exact character equality.
let referenceMatcher = makeReferenceMatcher(((left, right)) => left == right)
|
|
2234
|
+
|
|
2235
|
+
// Lowercases ASCII uppercase letters `A` through `Z`; every other character
// is returned unchanged.
let asciiCharToLower = (c) => {
  let code = Char.code(c)
  // The lower bound must be 'A'; it was 'Z', so only `Z` was converted.
  if (Char.code('A') <= code && code <= Char.code('Z')) {
    Char.fromCode(code + (Char.code('a') - Char.code('A')))
  } else {
    c
  }
}
|
|
2242
|
+
|
|
2243
|
+
// Backreference matcher whose equality function lowercases both characters
// with `asciiCharToLower` before comparing them.
let referenceMatcherCaseInsensitive = makeReferenceMatcher(((left, right)) => {
  asciiCharToLower(left) == asciiCharToLower(right)
})
|
|
2244
|
+
|
|
2245
|
+
// Lookahead, Lookbehind, Conditionals, and Cut
|
|
2246
|
+
|
|
2247
|
+
// Lookahead: runs `sub_m` at the current position without consuming input.
// `isMatch` selects positive (sub-match required) or negative (sub-match
// forbidden) lookahead. Saved groups are restored on the failure paths that
// follow a sub-match attempt.
let lookaheadMatcher = (isMatch, sub_m, nStart, numN, next_m) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  let oldState = saveGroups(state, nStart, numN)
  let subMatched = Option.isSome(sub_m(buf, pos, start, limit, end, state, stack))
  if (subMatched && isMatch) {
    match(next_m(buf, pos, start, limit, end, state, stack)) {
      None => {
        restoreGroups(state, oldState, nStart, numN)
        None
      },
      found => found,
    }
  } else if (subMatched || isMatch) {
    // Positive lookahead without a sub-match, or negative lookahead with one.
    restoreGroups(state, oldState, nStart, numN)
    None
  } else {
    next_m(buf, pos, start, limit, end, state, stack)
  }
}
|
|
2262
|
+
|
|
2263
|
+
// Lookbehind: tries `sub_m` from candidate start positions between
// `pos - lbMin` and `pos - lbMax` (never before `start`), using `pos` as the
// sub-match limit. `isMatch` selects positive or negative lookbehind.
let lookbehindMatcher = (isMatch, lbMin, lbMax, sub_m, nStart, numN, next_m) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  let lbMinPos = max(start, pos - lbMax)
  let rec loop = (lbPos) => {
    if (lbPos < lbMinPos) {
      // Every candidate start has been tried without a sub-match.
      if (isMatch) {
        None
      } else {
        next_m(buf, pos, start, limit, end, state, stack)
      }
    } else {
      let oldState = saveGroups(state, nStart, numN)
      match(sub_m(buf, lbPos, start, pos, end, state, stack)) {
        Some(_) when isMatch => {
          match(next_m(buf, pos, start, limit, end, state, stack)) {
            Some(p) => Some(p),
            None => { restoreGroups(state, oldState, nStart, numN); None },
          }
        },
        _ when isMatch => {
          loop(lbPos - 1)
        },
        // Negative lookbehind: a sub-match that ends exactly at `pos` rules
        // the position out.
        Some(subEnd) when subEnd == pos => { restoreGroups(state, oldState, nStart, numN); None },
        // Negative lookbehind, sub-match ended elsewhere: undo its group
        // writes and keep checking the remaining candidate starts.
        Some(_) => { restoreGroups(state, oldState, nStart, numN); loop(lbPos - 1) },
        // Negative lookbehind, no sub-match from this start: the other
        // candidate starts must be checked too. Previously the matcher
        // accepted here, so a longer preceding match was never seen.
        None => loop(lbPos - 1)
      }
    }
  }
  loop(pos - lbMin)
}
|
|
2291
|
+
|
|
2292
|
+
// Conditional on a group: runs `m1` when group `n` has a recorded span and
// `m2` otherwise.
let conditionalReferenceMatcher = (n, m1, m2) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  let branch = if (Option.isSome(state[n])) m1 else m2
  branch(buf, pos, start, limit, end, state, stack)
}
|
|
2299
|
+
|
|
2300
|
+
// Conditional on a test matcher: `tst_m` runs with an empty stack at the
// current position; `m1` follows when it succeeds and `m2` when it fails.
// Saved groups are restored if the chosen branch fails.
let conditionalLookMatcher = (tst_m, m1, m2, nStart, numN) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  let oldState = saveGroups(state, nStart, numN)
  let testPassed = Option.isSome(tst_m(buf, pos, start, limit, end, state, []))
  let chosen = if (testPassed) m1 else m2
  match(chosen(buf, pos, start, limit, end, state, stack)) {
    None => {
      restoreGroups(state, oldState, nStart, numN)
      None
    },
    found => found,
  }
}
|
|
2311
|
+
|
|
2312
|
+
// Cut (atomic group): `sub_m` is matched once with an empty stack, so the
// rest of the pattern cannot backtrack into it. The continuation then runs
// from the position where the sub-match ended. Saved groups are restored
// when the continuation fails.
let cutMatcher = (sub_m, nStart, numN, next_m) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  let oldState = saveGroups(state, nStart, numN)
  match(sub_m(buf, pos, start, limit, end, state, [])) {
    None => None,
    Some(subEnd) => {
      // Continue after the input consumed by the sub-match; continuing from
      // `pos` would silently discard what the group matched.
      match(next_m(buf, subEnd, start, limit, end, state, stack)) {
        None => { restoreGroups(state, oldState, nStart, numN); None },
        Some(p) => Some(p)
      }
    }
  }
}
|
|
2324
|
+
|
|
2325
|
+
// Unicode characters in UTF-8 encoding
|
|
2326
|
+
|
|
2327
|
+
// Placeholder: always fails with a not-yet-implemented message.
let unicodeCategoriesMatcher = (cats, isMatch, next_m) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  fail "NYI: unicodeCategoriesMatcher is not supported until grain-lang/grain#661 is resolved."
}
|
|
2330
|
+
|
|
2331
|
+
// -------
|
|
2332
|
+
// Regex matcher compilation
|
|
2333
|
+
// -------
|
|
2334
|
+
|
|
2335
|
+
// Length of the prefix of `l` that ends with its last element needing
// backtracking: the total element count minus the trailing run of elements
// for which `needsBacktrack` is false.
let countBacktrackPrefix = (l) => {
  let (total, trailingSimple) = List.reduce(((seen, simpleRun), elt) => {
    if (needsBacktrack(elt)) {
      (seen + 1, 0)
    } else {
      (seen + 1, simpleRun + 1)
    }
  }, (0, 0), l)
  total - trailingSimple
}
|
|
2345
|
+
|
|
2346
|
+
// Returns an iterated matcher for the pattern kinds that have one (literal
// character, literal string, any, range), or None for everything else.
let compileMatcherRepeater = (rx, min, max) => {
  match(rx) {
    RERange(rng) => Some(rangeMatcherIterated(rng, max)),
    REAny => Some(anyMatcherIterated(max)),
    RELiteralString(str) => Some(stringMatcherIterated(str, String.length(str), max)),
    RELiteral(ch) => Some(charMatcherIterated(ch, max)),
    _ => None
  }
}
|
|
2355
|
+
|
|
2356
|
+
// Compiles a parsed regular expression into a matcher closure. `compile`
// builds the matcher for `re` so that it hands control to `next_m` once its
// own part has matched.
let compileRegexToMatcher = (re: ParsedRegularExpression) => {
  let rec compile = (re: ParsedRegularExpression, next_m) => {
    // When nothing follows, the tail matchers return the end position
    // directly instead of calling a continuation.
    let useTail = next_m is done_m
    match(re) {
      RELiteral(c) when useTail => charTailMatcher(c),
      RELiteral(c) => charMatcher(c, next_m),
      RELiteralString(s) when useTail => stringTailMatcher(s, String.length(s)),
      RELiteralString(s) => stringMatcher(s, String.length(s), next_m),
      REEmpty => next_m,
      RENever => neverMatcher,
      REAny when useTail => anyTailMatcher(),
      REAny => anyMatcher(next_m),
      RERange(rng) when useTail => rangeTailMatcher(rng),
      RERange(rng) => rangeMatcher(rng, next_m),
      REStart => startMatcher(next_m),
      REEnd => endMatcher(next_m),
      RELineStart => lineStartMatcher(next_m),
      RELineEnd => lineEndMatcher(next_m),
      REWordBoundary => wordBoundaryMatcher(next_m),
      RENotWordBoundary => notWordBoundaryMatcher(next_m),
      RESequence(res, _) => {
        // Build from the last element backwards so each element continues
        // into the matcher of the one after it.
        List.reduceRight(compile, next_m, res)
      },
      REAlts(re1, re2) => altsMatcher(compile(re1, next_m), compile(re2, next_m)),
      REMaybe(re, true) => altsMatcher(next_m, compile(re, next_m)), // non-greedy
      REMaybe(re, _) => altsMatcher(compile(re, next_m), next_m),
      RERepeat(actualRe, min, max, nonGreedy) => {
        // Special case: group around simple pattern in non-lazy repeat
        let re = match(actualRe) {
          REGroup(groupRe, n) when !nonGreedy && !needsBacktrack(groupRe) => groupRe,
          _ => actualRe
        }
        let simple = !needsBacktrack(re)
        // The group index is only tracked by the simple repeat matchers.
        let groupN = if (simple) match(actualRe) {
          REGroup(_, n) => Some(n),
          _ => None
        } else None
        match(compileMatcherRepeater(re, min, max)) {
          Some(matcher) when !nonGreedy => repeatSimpleManyMatcher(matcher, min, max, groupN, next_m),
          _ => {
            let r_m = compile(re, if (simple) done_m else continue_m)
            if (nonGreedy) {
              if (simple) {
                lazyRepeatSimpleMatcher(r_m, min, max, next_m)
              } else {
                lazyRepeatMatcher(r_m, min, max, next_m)
              }
            } else {
              if (simple) {
                repeatSimpleMatcher(r_m, min, max, groupN, next_m)
              } else {
                repeatMatcher(r_m, min, max, next_m)
              }
            }
          }
        }
      },
      REGroup(re, n) => groupPushMatcher(n, compile(re, groupSetMatcher(n, next_m))),
      REReference(0, _) => neverMatcher,
      REReference(n, true) => referenceMatcher(n - 1, next_m), // case-sensitive
      REReference(n, _) => referenceMatcherCaseInsensitive(n - 1, next_m),
      RECut(re, nStart, numN, _) => cutMatcher(compile(re, done_m), nStart, numN, next_m),
      REConditional(tst, reTrue, reFalse, nStart, numN, _) => {
        let m1 = compile(reTrue, next_m)
        let m2 = compile(Option.unwrapWithDefault(REEmpty, reFalse), next_m)
        match(tst) {
          REReference(n, _) => conditionalReferenceMatcher(n - 1, m1, m2),
          _ => conditionalLookMatcher(compile(tst, done_m), m1, m2, nStart, numN)
        }
      },
      RELookahead(re, isMatch, nStart, numN) => lookaheadMatcher(isMatch, compile(re, done_m), nStart, numN, next_m),
      // The lookbehind sub-pattern must end exactly where the lookbehind is
      // anchored. lookbehindMatcher passes that position as the sub-match
      // limit, and limit_m succeeds only when the sub-match stops at the
      // limit; done_m accepted sub-matches that stopped earlier.
      RELookbehind(re, isMatch, lbMin, lbMax, nStart, numN) => lookbehindMatcher(isMatch, unbox(lbMin), unbox(lbMax), compile(re, limit_m), nStart, numN, next_m),
      REUnicodeCategories(cats, isMatch) => unicodeCategoriesMatcher(cats, isMatch, next_m)
    }
  }
  compile(re, done_m)
}
|
|
2433
|
+
|
|
2434
|
+
// Runs a compiled matcher with an empty stack, using `limitOrEnd` as both
// the limit and the end position.
let interp = (compiledRe, matchBuffer, pos, start, limitOrEnd, state) => {
  let emptyStack = []
  compiledRe(matchBuffer, pos, start, limitOrEnd, limitOrEnd, state, emptyStack)
}
|
|
2437
|
+
|
|
2438
|
+
// Should be exported as abstract type when possible
|
|
2439
|
+
// Compiled regular expression. `reCompiled` is the matcher closure; its
// arguments are the match buffer, four positions (passed as pos, start,
// limit and end by `interp`), the array of group spans, and the matcher
// stack.
record RegularExpression {
  reParsed: ParsedRegularExpression,
  reNumGroups: Number,
  reReferences: Bool,
  reMaxLookbehind: Number,
  reCompiled: ((MatchBuf, Number, Number, Number, Number, Array<Option<(Number, Number)>>, List<StackElt>) -> Option<Number>),
  reMustString: Option<String>,
  reIsAnchored: Bool,
  reStartRange: Option<List<(Number, Number)>>,
}
|
|
2449
|
+
|
|
2450
|
+
/**
|
|
2451
|
+
* @section Values: Functions for working with regular expressions.
|
|
2452
|
+
*/
|
|
2453
|
+
|
|
2454
|
+
// [TODO] When #661 is resolved, re-add the following pieces of documentation:
|
|
2455
|
+
/*
|
|
2456
|
+
[Under POSIX character classes]
|
|
2457
|
+
|
|
2458
|
+
* - `[:graph:]` - Matches all ASCII characters which use ink when printed
|
|
2459
|
+
* - `[:print:]` - Matches space, tab, and all ASCII ink users
|
|
2460
|
+
|
|
2461
|
+
[At end of documentation]
|
|
2462
|
+
|
|
2463
|
+
* Finally, the following is the list of supported Unicode properties.
|
|
2464
|
+
* These class codes come from this portion of the Unicode standard:
|
|
2465
|
+
* https://www.unicode.org/reports/tr44/#General_Category_Values
|
|
2466
|
+
*
|
|
2467
|
+
* - `Ll` - Letter, lowercase
|
|
2468
|
+
* - `Lu` - Letter, uppercase
|
|
2469
|
+
* - `Lt` - Letter, titlecase
|
|
2470
|
+
* - `Lm` - Letter, modifier
|
|
2471
|
+
* - `L&` - Union of `Ll`, `Lu`, `Lt`, and `Lm`
|
|
2472
|
+
* - `Lo` - Letter, other
|
|
2473
|
+
* - `L` - Union of `L&` and `Lo`
|
|
2474
|
+
* - `Nd` - Number, decimal digit
|
|
2475
|
+
* - `Nl` - Number, letter
|
|
2476
|
+
* - `No` - Number, other
|
|
2477
|
+
* - `N` - Union of `Nd`, `Nl`, and `No`
|
|
2478
|
+
* - `Ps` - Punctuation, open
|
|
2479
|
+
* - `Pe` - Punctuation, close
|
|
2480
|
+
* - `Pi` - Punctuation, initial quote
|
|
2481
|
+
* - `Pf` - Punctuation, final quote
|
|
2482
|
+
* - `Pc` - Punctuation, connector
|
|
2483
|
+
* - `Pd` - Punctuation, dash
|
|
2484
|
+
* - `Po` - Punctuation, other
|
|
2485
|
+
* - `P` - Union of `Ps`, `Pe`, `Pi`, `Pf`, `Pc`, `Pd`, and `Po`
|
|
2486
|
+
* - `Mn` - Mark, non-spacing
|
|
2487
|
+
* - `Mc` - Mark, spacing combining
|
|
2488
|
+
* - `Me` - Mark, enclosing
|
|
2489
|
+
* - `M` - Union of `Mn`, `Mc`, and `Me`
|
|
2490
|
+
* - `Sc` - Symbol, currency
|
|
2491
|
+
* - `Sk` - Symbol, modifier
|
|
2492
|
+
* - `Sm` - Symbol, math
|
|
2493
|
+
* - `So` - Symbol, other
|
|
2494
|
+
* - `S` - Union of `Sc`, `Sk`, `Sm`, and `So`
|
|
2495
|
+
* - `Zl` - Separator, line
|
|
2496
|
+
* - `Zp` - Separator, paragraph
|
|
2497
|
+
* - `Zs` - Separator, space
|
|
2498
|
+
* - `Z` - Union of `Zl`, `Zp`, and `Zs`
|
|
2499
|
+
* - `Cc` - Other, control
|
|
2500
|
+
* - `Cf` - Other, format
|
|
2501
|
+
* - `Cs` - Other, surrogate
|
|
2502
|
+
* - `Cn` - Other, not assigned
|
|
2503
|
+
* - `Co` - Other, private use
|
|
2504
|
+
* - `C` - Union of `Cc`, `Cf`, `Cs`, `Cn`, and `Co`
|
|
2505
|
+
* - `.` - Union of all Unicode categories
|
|
2506
|
+
*/
|
|
2507
|
+
|
|
2508
|
+
/**
|
|
2509
|
+
* Compiles the given pattern string into a regular expression object.
|
|
2510
|
+
*
|
|
2511
|
+
* For a general overview of regular expressions, refer to
|
|
2512
|
+
* ["Mastering Regular Expressions"](http://regex.info/book.html) by Friedl, or other online resources.
|
|
2513
|
+
*
|
|
2514
|
+
* Regular expressions are a combination of normal and special characters. A normal
|
|
2515
|
+
* character in a pattern will match a one-character string containing that character.
|
|
2516
|
+
* Moreover, if there are two regular expressions `A` and `B`, they can be concatenated
|
|
2517
|
+
* into a regular expression `AB`. If a string `p` matches `A` and `q` matches `B`,
|
|
2518
|
+
* then `pq` will match `AB`.
|
|
2519
|
+
*
|
|
2520
|
+
* The special character sequences are as follows:
|
|
2521
|
+
*
|
|
2522
|
+
* - `.` - Matches any character, except for a newline in multi-line mode
|
|
2523
|
+
* - `^` - Matches the beginning of the input, or after a newline (`\n`) in multi-line mode
|
|
2524
|
+
* - `$` - Matches the end of the input, or right before a newline (`\n`) in multi-line mode
|
|
2525
|
+
* - `«re»*` - Matches `«re»` zero or more times
|
|
2526
|
+
* - `«re»+` - Matches `«re»` one or more times
|
|
2527
|
+
* - `«re»?` - Matches `«re»` zero or one times
|
|
2528
|
+
* - `«re»{«n»}` - Matches `«re»` exactly `«n»` times
|
|
2529
|
+
* - `«re»{«n»,}` - Matches `«re»` `«n»` or more times
|
|
2530
|
+
* - `«re»{,«m»}` - Matches `«re»` zero to `«m»` times
|
|
2531
|
+
* - `«re»{«n»,«m»}` - Matches `«re»` between `«n»` and `«m»` times
|
|
2532
|
+
* - `«re»{}` - Matches `«re»` zero or more times
|
|
2533
|
+
* - `[«rng»]` - Matches any character in `«rng»` (see below)
|
|
2534
|
+
* - `[^«rng»]` - Matches any character not in `«rng»` (see below)
|
|
2535
|
+
* - `\«n»` - Matches the latest match for group `«n»` (one-indexed)
|
|
2536
|
+
* - `\b` - Matches the boundary of `\w*` (`\w` defined below, under "basic classes")
|
|
2537
|
+
* - `\B` - Matches where `\b` does not
|
|
2538
|
+
* - `\p{«property»}` - Matches any character with Unicode property `«property»` (see below)
|
|
2539
|
+
* - `\P{«property»}` - Matches any character without Unicode property `«property»` (see below)
|
|
2540
|
+
* - `(«re»)` - Matches `«re»`, storing the result in a group
|
|
2541
|
+
* - `(?:«re»)` - Matches `«re»` without storing the result in a group
|
|
2542
|
+
* - `(?«mode»:«re»)` - Matches `«re»` with the mode settings specified by `«mode»` using the following syntax:
|
|
2543
|
+
* - `«mode»i` - The same as `«mode»`, but with case-insensitivity enabled (temporarily not supported until grain-lang/grain#661 is resolved)
|
|
2544
|
+
* - `«mode»-i` - The same as `«mode»`, but with case-insensitivity disabled (the default)
|
|
2545
|
+
* - `«mode»m` / `«mode»-s` - The same as `«mode»`, but with multi-line mode enabled
|
|
2546
|
+
* - `«mode»-m` / `«mode»s` - The same as `«mode»`, but with multi-line mode disabled
|
|
2547
|
+
* - An empty string, which will not change any mode settings
|
|
2548
|
+
* - `(?«tst»«re1»|«re2»)` - Will match `«re1»` if `«tst»`, otherwise will match `«re2»`. The following options are available for `«tst»`
|
|
2549
|
+
* - `(«n»)` - Will be true if group `«n»` has a match
|
|
2550
|
+
* - `(?=«re»)` - Will be true if `«re»` matches the next sequence
|
|
2551
|
+
* - `(?!«re»)` - Will be true if `«re»` does not match the next sequence
|
|
2552
|
+
* - `(?<=«re»)` - Will be true if `«re»` matches the preceding sequence
|
|
2553
|
+
* - `(?<!«re»)` - Will be true if `«re»` does not match the preceding sequence
|
|
2554
|
+
* - `(?«tst»«re»)` - Equivalent to `(?«tst»«re»|)`
|
|
2555
|
+
* - Finally, basic classes (defined below) can also appear outside of character ranges.
|
|
2556
|
+
*
|
|
2557
|
+
* Character ranges (referred to as `«rng»` above) have the following syntax:
|
|
2558
|
+
* - `«c»` - Matches the character `«c»` exactly
|
|
2559
|
+
* - `«c1»-«c2»` - Matches any character with a character code between the character code for `«c1»` and the code for `«c2»`
|
|
2560
|
+
*
|
|
2561
|
+
* These forms can be repeated any number of times, which will construct a range of their union. That is, `[ba-c]` and `[a-c]` are equivalent ranges.
|
|
2562
|
+
* Additionally, there are the following special cases:
|
|
2563
|
+
* - A `]` as the first character of the range will match a `]`
|
|
2564
|
+
* - A `-` as the first or last character of the range will match a `-`
|
|
2565
|
+
* - A `^` in any position other than the first position will match a `^`
|
|
2566
|
+
* - `\«c»`, where `«c»` is a non-alphabetic character, will match `«c»`
|
|
2567
|
+
*
|
|
2568
|
+
* Furthermore, ranges can include character classes, which are predefined commonly-used
|
|
2569
|
+
* sets of characters. There are two "flavors" of these: *basic* classes and *POSIX* classes.
|
|
2570
|
+
* Both are provided for ease of use and to maximize compatibility with other regular
|
|
2571
|
+
* expression engines, so feel free to use whichever is most convenient.
|
|
2572
|
+
*
|
|
2573
|
+
* The *basic* classes are as follows:
|
|
2574
|
+
* - `\d` - Matches `0-9`
|
|
2575
|
+
* - `\D` - Matches characters not in `\d`
|
|
2576
|
+
* - `\w` - Matches `a-z`, `A-Z`, `0-9`, and `_`
|
|
2577
|
+
* - `\W` - Matches characters not in `\w`
|
|
2578
|
+
* - `\s` - Matches space, tab, formfeed, and return
|
|
2579
|
+
* - `\S` - Matches characters not in `\s`
|
|
2580
|
+
* The *POSIX* classes are as follows:
|
|
2581
|
+
* - `[:alpha:]` - Matches `a-z` and `A-Z`
|
|
2582
|
+
* - `[:upper:]` - Matches `A-Z`
|
|
2583
|
+
* - `[:lower:]` - Matches `a-z`
|
|
2584
|
+
* - `[:digit:]` - Matches `0-9`
|
|
2585
|
+
* - `[:xdigit:]` - Matches `0-9`, `a-f`, and `A-F`
|
|
2586
|
+
* - `[:alnum:]` - Matches `a-z`, `A-Z`, and `0-9`
|
|
2587
|
+
* - `[:word:]` - Matches `a-z`, `A-Z`, `0-9`, and `_`
|
|
2588
|
+
* - `[:blank:]` - Matches space and tab
|
|
2589
|
+
* - `[:space:]` - Matches space, tab, newline, formfeed, and return
|
|
2590
|
+
* - `[:cntrl:]` - Contains all characters with code points < 32
|
|
2591
|
+
* - `[:ascii:]` - Contains all ASCII characters
|
|
2592
|
+
*
|
|
2593
|
+
*
|
|
2594
|
+
*
|
|
2595
|
+
* @param regexString: The regular expression to compile
|
|
2596
|
+
* @returns The compiled regular expression
|
|
2597
|
+
*
|
|
2598
|
+
* @example Regex.make("(foo|bar)[0-9]+")
|
|
2599
|
+
*
|
|
2600
|
+
* @since 0.4.3
|
|
2601
|
+
*/
|
|
2602
|
+
export let make = (regexString: String) => {
  let regexBuf = makeRegExBuf(regexString)
  let parseResult = parseRegex(regexBuf)
  match (parseResult) {
    Ok(ast) => {
      // These configuration boxes are only read once parsing has succeeded
      let groupCount = unbox(regexBuf.config.groupNumber)
      let usesReferences = unbox(regexBuf.config.references)
      let validation = validate(ast, groupCount)
      match (validation) {
        Ok(lookbehindLimit) => {
          let compiledMatcher = compileRegexToMatcher(ast)
          // Pre-computed hints that let searches fail fast
          let requiredString = mustString(ast)
          let anchored = isAnchored(ast)
          let firstCharRange = startRange(ast)
          Ok({
            reParsed: ast,
            reNumGroups: groupCount,
            reReferences: usesReferences,
            reMaxLookbehind: lookbehindLimit,
            reCompiled: compiledMatcher,
            reMustString: requiredString,
            reIsAnchored: anchored,
            reStartRange: firstCharRange,
          })
        },
        // Re-wrap so the error flows into this function's result type
        Err(problem) => Err(problem)
      }
    },
    Err(problem) => Err(problem)
  }
}
|
|
2628
|
+
|
|
2629
|
+
|
|
2630
|
+
//
|
|
2631
|
+
//
|
|
2632
|
+
// ============
|
|
2633
|
+
// REGEX SEARCH
|
|
2634
|
+
// ============
|
|
2635
|
+
//
|
|
2636
|
+
//
|
|
2637
|
+
|
|
2638
|
+
// Speeds up failures using the must-string: returns `false` only when a
// must-string is present and does not occur in the searched window of the input
// (the text between `pos` and `endPos`).
let checkMustString = (ms, buf: MatchBuf, pos, endPos) => {
  match (ms) {
    Some(required) => {
      // Avoid slicing when the window already spans the entire input
      let spansWholeInput = pos == 0 && endPos == Array.length(buf.matchInputExploded)
      let haystack = if (spansWholeInput) {
        buf.matchInput
      } else {
        String.slice(pos, endPos, buf.matchInput)
      }
      match (String.indexOf(required, haystack)) {
        Some(_) => true,
        None => false
      }
    },
    None => true
  }
}
|
|
2652
|
+
|
|
2653
|
+
// Speeds up failures using the start-range: takes the character code of the
// input character at `pos` and asks `rangeContains` whether `startRange` holds it.
// `endPos` is accepted but not consulted here.
let checkStartRange = (startRange, buf, pos, endPos) => {
  let candidate = buf.matchInputExploded[pos]
  let candidateCode = Char.code(candidate)
  rangeContains(startRange, candidateCode)
}
|
|
2657
|
+
|
|
2658
|
+
|
|
2659
|
+
// Searches `buf` for the first position at or after `pos` where the compiled
// matcher of `rx` succeeds. Returns `Some((matchStart, matchEnd))`, or `None`
// when no position matches.
let searchMatch = (rx: RegularExpression, buf: MatchBuf, pos, startPos, endPos, state) => {
  if (checkMustString(rx.reMustString, buf, pos, endPos)) {
    let compiled = rx.reCompiled
    let anchoredOnly = rx.reIsAnchored
    let firstCharRange = rx.reStartRange
    let rec scanFrom = (candidate) => {
      if (anchoredOnly && candidate != startPos) {
        // An anchored pattern is only ever attempted at the starting position
        None
      } else if (Option.isSome(firstCharRange) && candidate == endPos) {
        // Can't possibly match if chars are required and we are at EOS
        None
      } else {
        let plausibleStart = match (firstCharRange) {
          Some(rng) => checkStartRange(rng, buf, candidate, endPos),
          None => true
        }
        if (plausibleStart) {
          match (interp(compiled, buf, candidate, startPos, endPos, state)) {
            Some(matchEnd) => Some((candidate, matchEnd)),
            None => if (candidate < endPos) scanFrom(candidate + 1) else None
          }
        } else {
          // The character here cannot begin a match; move on without running the matcher
          scanFrom(candidate + 1)
        }
      }
    }
    scanFrom(pos)
  } else {
    None
  }
}
|
|
2687
|
+
|
|
2688
|
+
/**
|
|
2689
|
+
* This object contains the results
|
|
2690
|
+
* of a regular expression match. The results can be obtained using
|
|
2691
|
+
* the following accessors:
|
|
2692
|
+
*
|
|
2693
|
+
* ```grain
|
|
2694
|
+
* group : Number -> Option<String>
|
|
2695
|
+
* ```
|
|
2696
|
+
*
|
|
2697
|
+
* Returns the contents of the given group. Note that group 0 contains
|
|
2698
|
+
* the entire matched substring, and group 1 contains the first parenthesized group.
|
|
2699
|
+
*
|
|
2700
|
+
* ```grain
|
|
2701
|
+
* groupPosition : Number -> Option<(Number, Number)>
|
|
2702
|
+
* ```
|
|
2703
|
+
*
|
|
2704
|
+
* Returns the position of the given group.
|
|
2705
|
+
*
|
|
2706
|
+
* ```grain
|
|
2707
|
+
* numGroups : Number
|
|
2708
|
+
* ```
|
|
2709
|
+
*
|
|
2710
|
+
* The number of defined groups in this match object (including group 0).
|
|
2711
|
+
*
|
|
2712
|
+
* ```grain
|
|
2713
|
+
* allGroups : () -> Array<Option<String>>
|
|
2714
|
+
* ```
|
|
2715
|
+
*
|
|
2716
|
+
* Returns the contents of all groups matched in this match object.
|
|
2717
|
+
*
|
|
2718
|
+
* ```grain
|
|
2719
|
+
* allGroupPositions : () -> Array<Option<(Number, Number)>>
|
|
2720
|
+
* ```
|
|
2721
|
+
*
|
|
2722
|
+
* Returns the positions of all groups matched in this match object.
|
|
2723
|
+
*
|
|
2724
|
+
* @since 0.4.3
|
|
2725
|
+
*/
|
|
2726
|
+
export record MatchResult {
  /**
   * Returns the contents of the given group
   */
  group: Number -> Option<String>,
  /**
   * Returns the position of the given group
   */
  groupPosition: Number -> Option<(Number, Number)>,
  /**
   * Returns the number of defined groups in this match object (includes group 0)
   */
  numGroups: Number,
  /**
   * Returns the contents of all groups matched in this match object
   */
  allGroups: () -> Array<Option<String>>,
  /**
   * Returns the positions of all groups matched in this match object
   */
  allGroupPositions: () -> Array<Option<(Number, Number)>>,
}

// Builds the `MatchResult` for a match spanning `start` to `end` in `origString`.
// `state` holds the position of each parenthesized group: index 0 of `state`
// corresponds to group 1, because group 0 is the whole match.
let makeMatchResult = (origString, start, end, state) => {
  let groupCount = Array.length(state)
  // Turns an optional position into the optional text it covers
  let textAt = (span) => match (span) {
    Some((from, to)) => Some(String.slice(from, to, origString)),
    None => None
  }
  let getMatchGroupPosition = (n) => {
    if (n == 0) {
      Some((start, end))
    } else if (n < 0 || n - 1 >= groupCount) {
      // Group numbers outside 0..groupCount have no position. The comparison must
      // be `>=`: `n - 1 == groupCount` is already one past the last slot of `state`.
      None
    } else {
      state[n - 1]
    }
  }
  let getMatchGroup = (n) => textAt(getMatchGroupPosition(n))
  let getAllMatchGroupPositions = () => {
    // Slot 0 is the whole match; the group positions follow in order
    let ret = Array.make(groupCount + 1, None)
    ret[0] = Some((start, end))
    for (let mut i = 0; i < groupCount; i += 1) {
      ret[i + 1] = state[i]
    }
    ret
  }
  let getAllMatchGroups = () => Array.map(textAt, getAllMatchGroupPositions())
  {
    group: getMatchGroup,
    groupPosition: getMatchGroupPosition,
    numGroups: groupCount + 1,
    allGroupPositions: getAllMatchGroupPositions,
    allGroups: getAllMatchGroups
  }
}
|
|
2788
|
+
|
|
2789
|
+
// Helpers for user-facing match functionality
|
|
2790
|
+
|
|
2791
|
+
// Reports whether `rx` matches anywhere in the part of `string` between
// `startOffset` and `endOffset`, without building a match result.
let fastDriveRegexIsMatch = (rx, string, startOffset, endOffset) => {
  // Group slots are only allocated when the pattern contains back-references
  let slotCount = if (rx.reReferences) rx.reNumGroups else 0
  let groupState = Array.make(slotCount, None)
  let coversWholeString = startOffset == 0 && endOffset == String.length(string)
  let window = if (coversWholeString) {
    string
  } else {
    String.slice(startOffset, endOffset, string)
  }
  let buf = makeMatchBuffer(window)
  let windowLength = Array.length(buf.matchInputExploded)
  match (searchMatch(rx, buf, 0, 0, windowLength, groupState)) {
    Some(_) => true,
    None => false
  }
}
|
|
2797
|
+
|
|
2798
|
+
// Collects match results for `rx` in the part of `string` between `startOffset`
// and `endOffset`. After each match the search resumes one character after that
// match's start, so successive results may overlap.
let rec fastDriveRegexMatchAll = (rx, string, startOffset, endOffset) => {
  if (startOffset >= endOffset) {
    []
  } else {
    let groupState = Array.make(rx.reNumGroups, None)
    let coversWholeString = startOffset == 0 && endOffset == String.length(string)
    let window = if (coversWholeString) {
      string
    } else {
      String.slice(startOffset, endOffset, string)
    }
    let buf = makeMatchBuffer(window)
    let windowLength = Array.length(buf.matchInputExploded)
    match (searchMatch(rx, buf, 0, 0, windowLength, groupState)) {
      Some((relStart, relEnd)) => {
        // Positions are relative to the window; shift them back onto `string`
        let toAbsolute = (span) => match (span) {
          Some((from, to)) => Some((from + startOffset, to + startOffset)),
          None => None
        }
        let absoluteStart = relStart + startOffset
        let absoluteEnd = relEnd + startOffset
        let current = makeMatchResult(string, absoluteStart, absoluteEnd, Array.map(toAbsolute, groupState))
        let remaining = fastDriveRegexMatchAll(rx, string, absoluteStart + 1, endOffset)
        [current, ...remaining]
      },
      None => []
    }
  }
}
|
|
2816
|
+
|
|
2817
|
+
// Finds the first match for `rx` in the part of `string` between `startOffset`
// and `endOffset`, returning its match result with positions expressed relative
// to the full `string`.
let fastDriveRegexMatch = (rx, string, startOffset, endOffset) => {
  let groupState = Array.make(rx.reNumGroups, None)
  let coversWholeString = startOffset == 0 && endOffset == String.length(string)
  let window = if (coversWholeString) {
    string
  } else {
    String.slice(startOffset, endOffset, string)
  }
  let buf = makeMatchBuffer(window)
  let windowLength = Array.length(buf.matchInputExploded)
  match (searchMatch(rx, buf, 0, 0, windowLength, groupState)) {
    Some((relStart, relEnd)) => {
      // Positions are relative to the window; shift them back onto `string`
      let toAbsolute = (span) => match (span) {
        Some((from, to)) => Some((from + startOffset, to + startOffset)),
        None => None
      }
      let absoluteGroups = Array.map(toAbsolute, groupState)
      Some(makeMatchResult(string, relStart + startOffset, relEnd + startOffset, absoluteGroups))
    },
    None => None
  }
}
|
|
2833
|
+
|
|
2834
|
+
/**
|
|
2835
|
+
* Determines if the given regular expression has a match in the given string.
|
|
2836
|
+
* @param rx: The regular expression to search for
|
|
2837
|
+
* @param string: The string to search within
|
|
2838
|
+
* @returns `true` if the RegExp matches the string, otherwise `false`
|
|
2839
|
+
*
|
|
2840
|
+
* @example assert Regex.isMatch(Result.unwrap(Regex.make("ca+[at]")), "caaat") == true
|
|
2841
|
+
*
|
|
2842
|
+
* @since 0.4.3
|
|
2843
|
+
*/
|
|
2844
|
+
export let isMatch = (rx: RegularExpression, string: String) => {
  // Search the entire input: from offset 0 up to the string's length
  let fullLength = String.length(string)
  fastDriveRegexIsMatch(rx, string, 0, fullLength)
}
|
|
2847
|
+
|
|
2848
|
+
/**
|
|
2849
|
+
* Determines if the given regular expression has a match in the given string between the given start/end offsets.
|
|
2850
|
+
* @param rx: The regular expression to search for
|
|
2851
|
+
* @param string: The string to search
|
|
2852
|
+
* @param start: The start offset to search between
|
|
2853
|
+
* @param end: The end offset to search between
|
|
2854
|
+
* @returns `true` if the RegExp matches the string in the given range, otherwise `false`
|
|
2855
|
+
*
|
|
2856
|
+
* @example assert Regex.isMatchRange(Result.unwrap(Regex.make("ca+[at]")), "caaat", 0, 5) == true
|
|
2857
|
+
* @example assert Regex.isMatchRange(Result.unwrap(Regex.make("ca+[at]")), "caaat", 1, 5) == false
|
|
2858
|
+
*
|
|
2859
|
+
* @since 0.4.3
|
|
2860
|
+
*/
|
|
2861
|
+
export let isMatchRange = (rx: RegularExpression, string: String, start: Number, end: Number) => {
  // The caller's offsets are forwarded unchanged to the shared driver
  let matched = fastDriveRegexIsMatch(rx, string, start, end)
  matched
}
|
|
2864
|
+
|
|
2865
|
+
/**
|
|
2866
|
+
* Returns the first match for the given regular expression contained within the given string.
|
|
2867
|
+
* @param rx: The regular expression to search for
|
|
2868
|
+
* @param string: The string to search
|
|
2869
|
+
* @returns The match result, if any
|
|
2870
|
+
*
|
|
2871
|
+
* @example Regex.find(Result.unwrap(Regex.make("ca+[at]")), "caaat")
|
|
2872
|
+
*
|
|
2873
|
+
* @since 0.4.3
|
|
2874
|
+
*/
|
|
2875
|
+
export let find = (rx: RegularExpression, string: String) => {
  // Search the entire input: from offset 0 up to the string's length
  let fullLength = String.length(string)
  fastDriveRegexMatch(rx, string, 0, fullLength)
}
|
|
2878
|
+
|
|
2879
|
+
/**
|
|
2880
|
+
* Returns the first match for the given regular expression contained within the given string
|
|
2881
|
+
* between the given start/end range.
|
|
2882
|
+
* @param rx: The regular expression to search for
|
|
2883
|
+
* @param string: The string to search
|
|
2884
|
+
* @param start: The start offset to search between
|
|
2885
|
+
* @param end: The end offset to search between
|
|
2886
|
+
* @returns The match result, if any
|
|
2887
|
+
*
|
|
2888
|
+
* @example Regex.findRange(Result.unwrap(Regex.make("ca+[at]")), "caaat", 0, 5)
|
|
2889
|
+
*
|
|
2890
|
+
* @since 0.4.3
|
|
2891
|
+
*/
|
|
2892
|
+
export let findRange = (rx: RegularExpression, string: String, start: Number, end: Number) => {
  // The caller's offsets are forwarded unchanged to the shared driver
  let firstMatch = fastDriveRegexMatch(rx, string, start, end)
  firstMatch
}
|
|
2895
|
+
|
|
2896
|
+
/**
|
|
2897
|
+
* Returns all matches for the given regular expression contained within the given string.
|
|
2898
|
+
* @param rx: The regular expression to search for
|
|
2899
|
+
* @param string: The string to search
|
|
2900
|
+
* @returns The list of matches
*
* @example Regex.findAll(Result.unwrap(Regex.make("ca+[at]")), "caaat")
*
* @since 0.4.3
|
|
2901
|
+
*/
|
|
2902
|
+
export let findAll = (rx: RegularExpression, string: String) => {
  // Search the entire input: from offset 0 up to the string's length
  let fullLength = String.length(string)
  fastDriveRegexMatchAll(rx, string, 0, fullLength)
}
|
|
2905
|
+
|
|
2906
|
+
/**
|
|
2907
|
+
* Returns all matches for the given regular expression contained within the given string
|
|
2908
|
+
* between the given start/end range.
|
|
2909
|
+
* @param rx: The regular expression to search for
|
|
2910
|
+
* @param string: The string to search
|
|
2911
|
+
* @param start: The start offset to search between
|
|
2912
|
+
* @param end: The end offset to search between
|
|
2913
|
+
* @returns The list of matches
|
|
2914
|
+
*
|
|
2915
|
+
* @example Regex.findAllRange(Result.unwrap(Regex.make("ca+[at]")), "caaat", 0, 5)
|
|
2916
|
+
*
|
|
2917
|
+
* @since 0.4.3
|
|
2918
|
+
*/
|
|
2919
|
+
export let findAllRange = (rx: RegularExpression, string: String, start: Number, end: Number) => {
  // The caller's offsets are forwarded unchanged to the shared driver
  let matches = fastDriveRegexMatchAll(rx, string, start, end)
  matches
}
|
|
2922
|
+
|
|
2923
|
+
|
|
2924
|
+
// Expands `replacementString` for the match spanning `start` to `end` in the
// input held by `matchBuf`. A `$` introduces an escape:
//   `$&` - the matched text         `$$` - a literal `$`
//   `` $` `` - the input before the match   `$.` - nothing
//   `$'` - the input after the match
// Any other `$` reads the decimal digits that follow as a group number and
// inserts that group's text; with no digits the number is 0, i.e. the whole match.
let computeReplacement = (matchBuf: MatchBuf, replacementString: String, start, end, state) => {
  let input = matchBuf.matchInput
  let template = String.explode(replacementString)
  let templateLength = Array.length(template)
  let zeroCode = Char.code('0')
  let nineCode = Char.code('9')
  // Text of group `n`; group 0 is the whole match, unmatched or unknown groups yield ""
  let groupText = (n) => {
    if (n == 0) {
      String.slice(start, end, input)
    } else if (n - 1 < Array.length(state)) {
      match (state[n - 1]) {
        Some((groupStart, groupEnd)) => String.slice(groupStart, groupEnd, input),
        None => ""
      }
    } else {
      ""
    }
  }
  // Prepends the literal template text between `from` and `to`, unless it is empty
  let withLiteral = (from, to, pieces) => {
    if (from == to) {
      pieces
    } else {
      [String.slice(from, to, replacementString), ...pieces]
    }
  }
  // `since` marks where the pending run of literal template text began
  let rec scan = (pos, since) => {
    if (pos == templateLength) {
      withLiteral(since, pos, [])
    } else if (template[pos] == '$') {
      let next = if ((pos + 1) < templateLength) Some(template[pos + 1]) else None
      let afterEscape = pos + 2
      let tail = if (next == Some('&')) {
        [groupText(0), ...scan(afterEscape, afterEscape)]
      } else if (next == Some('`')) {
        [String.slice(0, start, input), ...scan(afterEscape, afterEscape)]
      } else if (next == Some('\'')) {
        [String.slice(end, String.length(input), input), ...scan(afterEscape, afterEscape)]
      } else if (next == Some('$')) {
        // Restart the literal run at the second `$` so exactly one `$` is emitted
        scan(afterEscape, pos + 1)
      } else if (next == Some('.')) {
        scan(afterEscape, afterEscape)
      } else {
        let rec readGroupNumber = (digitPos, groupNumber) => {
          if (digitPos == templateLength) {
            [groupText(groupNumber)]
          } else {
            let code = Char.code(template[digitPos])
            if (zeroCode <= code && code <= nineCode) {
              readGroupNumber(digitPos + 1, (10 * groupNumber) + (code - zeroCode))
            } else {
              [groupText(groupNumber), ...scan(digitPos, digitPos)]
            }
          }
        }
        readGroupNumber(pos + 1, 0)
      }
      withLiteral(since, pos, tail)
    } else {
      scan(pos + 1, since)
    }
  }
  let pieces = scan(0, 0)
  List.reduceRight((piece, rest) => String.concat(piece, rest), "", pieces)
}
|
|
2986
|
+
|
|
2987
|
+
|
|
2988
|
+
// Shared implementation of `replace` and `replaceAll`.
// Replaces the first match of `rx` in `toSearch` with the expansion of
// `replacement`; when `all` is true, keeps searching after each match and
// replaces every match found.
let regexReplaceHelp = (rx: RegularExpression, toSearch: String, replacement: String, all: Bool) => {
  let buf = makeMatchBuffer(toSearch)
  let inputLength = String.length(toSearch)
  let rec loop = (searchPos) => {
    // Each search gets fresh group state
    let state = Array.make(rx.reNumGroups, None)
    let poss = searchMatch(rx, buf, searchPos, searchPos, Array.length(buf.matchInputExploded), state)
    // Produces the remainder of the output once the match from `start` to `end`
    // has been replaced
    let recur = (start, end) => {
      if (start == end) {
        // Empty match: searching again from `end` would find this same empty match
        // and insert the replacement a second time. Copy one character through and
        // resume after it. Testing `start == end` (not `end == searchPos`)
        // also covers empty matches located beyond `searchPos`, such as `$`.
        if (end == inputLength) {
          ""
        } else {
          String.concat(String.slice(end, end + 1, toSearch), loop(end + 1))
        }
      } else {
        loop(end)
      }
    }
    match (poss) {
      // No further match: emit the rest of the input unchanged
      None => if (searchPos == 0) toSearch else String.slice(searchPos, inputLength, toSearch),
      Some((start, end)) => {
        let untouched = String.slice(searchPos, start, toSearch)
        let substituted = computeReplacement(buf, replacement, start, end, state)
        let rest = if (all) recur(start, end) else String.slice(end, inputLength, toSearch)
        String.concat(untouched, String.concat(substituted, rest))
      }
    }
  }
  loop(0)
}
|
|
3015
|
+
|
|
3016
|
+
/**
|
|
3017
|
+
* Replaces the first match for the given regular expression contained within the given string with the specified replacement.
|
|
3018
|
+
* Replacement strings support the following syntax:
|
|
3019
|
+
* - `$&` - Replaced with the text of the matching portion of input (e.g. for `(foo)`, the search string `foo bar`, and the replacement `baz $&`, the result will be `baz foo bar`)
|
|
3020
|
+
* - `$n` / `$nn` (where `n` is a digit) - Replaced with the text of group `nn`
|
|
3021
|
+
* - `$$` - Replaced with a literal `$`
|
|
3022
|
+
* - `$.` - Does nothing (this exists to support replacement strings such as `$4$.0`, which will place the contents of group 4 prior to a zero)
|
|
3023
|
+
* - `$\`` - Replaced with the text preceding the matched substring
|
|
3024
|
+
* - `$'` - Replaced with the text following the matched substring
|
|
3025
|
+
* - Any other character will be placed as-is in the replaced output.
|
|
3026
|
+
*
|
|
3027
|
+
* @param rx: The regular expression to search for
|
|
3028
|
+
* @param toSearch: The string to search
|
|
3029
|
+
* @param replacement: The string that replaces matches
|
|
3030
|
+
* @returns The given string with the appropriate replacements, if any
|
|
3031
|
+
*
|
|
3032
|
+
* @example assert Regex.replace(Result.unwrap(Regex.make("o")), "foo", "a") == "fao"
|
|
3033
|
+
*
|
|
3034
|
+
* @since 0.4.3
|
|
3035
|
+
*/
|
|
3036
|
+
export let replace = (rx: RegularExpression, toSearch: String, replacement: String) => {
  // Only the first match is replaced
  let replaceEveryMatch = false
  regexReplaceHelp(rx, toSearch, replacement, replaceEveryMatch)
}
|
|
3039
|
+
|
|
3040
|
+
/**
|
|
3041
|
+
* Replaces all matches for the given regular expression contained within the given string with the specified replacement.
|
|
3042
|
+
* See `replace` for replacement string syntax.
|
|
3043
|
+
*
|
|
3044
|
+
* @param rx: The regular expression to search for
|
|
3045
|
+
* @param toSearch: The string to search
|
|
3046
|
+
* @param replacement: The string that replaces matches
|
|
3047
|
+
* @returns The input string with the appropriate replacements, if any
|
|
3048
|
+
*
|
|
3049
|
+
* @example assert Regex.replaceAll(Result.unwrap(Regex.make("o")), "skoot", "r") == "skrrt"
|
|
3050
|
+
*
|
|
3051
|
+
* @since 0.4.3
|
|
3052
|
+
*/
|
|
3053
|
+
export let replaceAll = (rx: RegularExpression, toSearch: String, replacement: String) => {
  // Every match is replaced, not just the first
  let replaceEveryMatch = true
  regexReplaceHelp(rx, toSearch, replacement, replaceEveryMatch)
}
|