@grain/stdlib 0.4.0 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/regex.gr ADDED
@@ -0,0 +1,3055 @@
1
+ /**
2
+ * @module Regex: Regular Expressions.
3
+ * @example import Regex from "regex"
4
+ *
5
+ * @since 0.4.3
6
+ */
7
+
8
+ /*
9
+ This library provides support for regular expressions in Grain.
10
+ Its parser and analyzer are largely ported from Racket (https://racket-lang.org/),
11
+ which is licensed under Apache 2.0. Racket's regular expression
12
+ engine is itself inspired by the Spencer engine, as found in Tcl.
13
+ */
14
+ import Array from "array"
15
+ import Char from "char"
16
+ import List from "list"
17
+ import Map from "map"
18
+ import Option from "option"
19
+ import Result from "result"
20
+ import String from "string"
21
+ import Float32 from "float32"
22
+ import { min, max } from "number"
23
+
24
+ /*
25
+
26
+ ===============================
27
+ REGEX PARSER CONFIG DEFINITIONS
28
+ ===============================
29
+
30
+ */
31
+
32
+ /*
33
+ We use boxes in these records in order to share
34
+ references across multiple objects.
35
+ For example, when a user types `(?i:...)`, we
36
+ want to create a new configuration which is
37
+ case-insensitive while still having the same group
38
+ number and reference counter.
39
+ */
40
+
41
// Settings threaded through the regex parser.
// `groupNumber` and `references` are boxes: the `configWith*` helpers copy
// the box references rather than their contents, so every derived config
// observes the same counters.
record RegExParserConfig {
  // Whether to use Perl-based regexp syntax
  isPerlRegExp: Bool,
  // Whether the regexp is case-sensitive
  caseSensitive: Bool,
  // Whether multi-line mode is enabled
  multiline: Bool,
  // The number of total groups in this regular expression
  groupNumber: Box<Number>,
  // Whether this regular expression contains any references
  references: Box<Bool>,
}
53
+
54
// Builds the default parser configuration: Perl syntax, case-sensitive,
// multi-line mode off, with a fresh group counter (0) and a fresh
// "references" flag (false).
let makeRegExParserConfig = () => {
  let defaults = {
    references: box(false),
    groupNumber: box(0),
    multiline: false,
    caseSensitive: true,
    isPerlRegExp: true,
  }
  defaults
}
63
+
64
// Returns a copy of `config` whose `caseSensitive` flag is replaced.
// The boxed `groupNumber` and `references` are shared with the original.
let configWithCaseSensitive = (config: RegExParserConfig, caseSensitive: Bool) => {
  let updated = {
    references: config.references,
    groupNumber: config.groupNumber,
    multiline: config.multiline,
    caseSensitive: caseSensitive,
    isPerlRegExp: config.isPerlRegExp,
  }
  updated
}
73
+
74
// Returns a copy of `config` whose `multiline` flag is replaced.
// The boxed `groupNumber` and `references` are shared with the original.
let configWithMultiLine = (config: RegExParserConfig, multiline: Bool) => {
  let updated = {
    references: config.references,
    groupNumber: config.groupNumber,
    multiline: multiline,
    caseSensitive: config.caseSensitive,
    isPerlRegExp: config.isPerlRegExp,
  }
  updated
}
83
+
84
// Reads the current value of the shared group counter.
let configGroupNumber = (config: RegExParserConfig) => {
  let counter = config.groupNumber
  unbox(counter)
}
85
+
86
// Increments the shared group counter in place and returns the same config.
// Because the counter is boxed, the change is visible through every config
// that shares this box.
let configIncGroupNumber = (config: RegExParserConfig) => {
  let current = unbox(config.groupNumber)
  config.groupNumber := current + 1
  config
}
90
+
91
// Parser input buffer.
record RegExBuf {
  // The pattern text as given
  input: String,
  // The same text split into characters (see makeRegExBuf) for indexed access
  inputExploded: Array<Char>,
  // Index into `inputExploded` of the next unread character; boxed so that
  // buffers derived via withConfig share one read position
  cursor: Box<Number>,
  // Parser settings in effect for this buffer
  config: RegExParserConfig,
}
97
+
98
// Creates a buffer over `s` with the cursor at position 0 and the default
// parser configuration.
let makeRegExBuf = (s) => {
  let chars = String.explode(s)
  let config = makeRegExParserConfig()
  {input: s, inputExploded: chars, cursor: box(0), config: config}
}
101
+
102
// Returns a buffer over the same input and the same (shared) cursor box as
// `buf`, but using `config`.
let withConfig = (buf: RegExBuf, config: RegExParserConfig) => {
  {
    config: config,
    cursor: buf.cursor,
    inputExploded: buf.inputExploded,
    input: buf.input,
  }
}
105
+
106
+ // Parsing internals for recursive descent
107
+
108
// Formats a parse error message. The reported position is the current
// cursor plus `posShift`.
let parseErr = (buf: RegExBuf, msg: String, posShift) => {
  let position = unbox(buf.cursor) + posShift
  let suffix = " (position " ++ toString(position) ++ ")"
  "Invalid Regular Expression: " ++ msg ++ suffix
}
111
+
112
// Consumes and returns the character under the cursor, or an Err if the
// cursor is already past the end of the input.
let next = (buf: RegExBuf) => {
  let pos = unbox(buf.cursor)
  if (pos < Array.length(buf.inputExploded)) {
    let ch = buf.inputExploded[pos]
    buf.cursor := pos + 1
    Ok(ch)
  } else {
    Err(parseErr(buf, "end of buffer reached", 0))
  }
}
122
+
123
// Returns the character under the cursor without consuming it, or an Err
// if the cursor is past the end of the input.
let peek = (buf: RegExBuf) => {
  let pos = unbox(buf.cursor)
  if (pos < Array.length(buf.inputExploded)) {
    Ok(buf.inputExploded[pos])
  } else {
    Err(parseErr(buf, "end of buffer reached", 0))
  }
}
131
+
132
// Returns the character `n` positions after the cursor without consuming
// anything, or an Err if that position is past the end of the input.
let peekN = (buf: RegExBuf, n) => {
  let target = unbox(buf.cursor) + n
  if (target < Array.length(buf.inputExploded)) {
    Ok(buf.inputExploded[target])
  } else {
    Err(parseErr(buf, "end of buffer reached", 0))
  }
}
140
+
141
// Consumes the character under the cursor if it equals `char`.
// Returns Ok(char) on success. Returns an Err (leaving the cursor untouched)
// if the input is exhausted or the next character differs.
let eat = (buf: RegExBuf, char: Char) => {
  let cursor = unbox(buf.cursor)
  if (cursor >= Array.length(buf.inputExploded)) {
    Err(parseErr(buf, "end of buffer reached", 0))
  } else {
    let ret = buf.inputExploded[cursor]
    if (ret == char) {
      buf.cursor := cursor + 1
      Ok(ret)
    } else {
      // Both characters are wrapped in matching single quotes
      let expected = "'" ++ Char.toString(char) ++ "'"
      let found = "'" ++ Char.toString(ret) ++ "'"
      Err(parseErr(buf, "Expected character " ++ expected ++ ", but found character " ++ found, 0))
    }
  }
}
155
+
156
/**
 * Reports whether any unread input remains at the buffer's cursor.
 * @param buf: The buffer to check
 * @returns `true` if at least one character is left to read, `false` otherwise.
 */
let more = (buf: RegExBuf) => {
  let remainingFrom = unbox(buf.cursor)
  Array.length(buf.inputExploded) > remainingFrom
}
164
+
165
// Reports whether the position `n` characters after the cursor is still
// inside the input.
let moreN = (buf: RegExBuf, n) => {
  let target = unbox(buf.cursor) + n
  Array.length(buf.inputExploded) > target
}
168
+
169
+ // END Parsing internals for recursive descent
170
+
171
+
172
+ /*
173
+
174
+ =================================
175
+ REGEX RANGE DEFINITIONS AND UTILS
176
+ =================================
177
+
178
+ Based on https://github.com/racket/racket/blob/0a9c70e95a69743dd5d219a395e995be4a4bfd41/racket/src/regexp/common/range.rkt
179
+
180
+ */
181
+
182
+ // [TODO] alias type RERange as List<(Number, Number)>
183
+
184
// Returns the complement of `rng` within [0, limitC].
// `rng` is a list of inclusive (start, end) spans; the code assumes the
// spans are sorted and non-overlapping (as rangeUnion produces them).
let rangeInvert = (rng, limitC) => {
  let rec help = (rng, start) => {
    match (rng) {
      [] when start > limitC => [],
      [] => [(start, limitC)],
      // A span that begins exactly at `start` leaves no gap before it;
      // without this arm a bogus (start, start - 1) span was emitted
      // (e.g. (0, -1) when the range begins at 0).
      [(subrangeStart, subrangeEnd), ...tl] when subrangeStart == start => help(tl, subrangeEnd + 1),
      [(subrangeStart, subrangeEnd), ...tl] => [(start, subrangeStart - 1), ...help(tl, subrangeEnd + 1)],
    }
  }
  help(rng, 0)
}
194
+
195
// Returns true if `v` falls inside any (start, end) span of `rng`
// (both ends inclusive).
let rec rangeContains = (rng, v) => {
  match (rng) {
    [] => false,
    [(start, end), ...rest] => {
      if (v < start || v > end) {
        rangeContains(rest, v)
      } else {
        true
      }
    },
  }
}
202
+
203
// rangeAdd: returns `rng` with the single value `v` included.
let rec rangeAdd = (rng, v) => {
  if (rangeContains(rng, v)) {
    rng
  } else {
    rangeUnion(rng, [(v, v)])
  }
},

// rangeUnion: merges two span lists into one, joining spans that overlap
// or touch (end + 1 == next start). The arguments are swapped as needed so
// that the list whose first span starts earlier is always processed first.
rangeUnion = (rng1, rng2) => {
  match ((rng1, rng2)) {
    ([], _) => rng2,
    (_, []) => rng1,
    ([(aStart, aEnd), ...aRest], [(bStart, bEnd), ...bRest]) when aStart <= bStart => {
      let connected = aEnd + 1 >= bStart
      if (!connected) {
        // Gap between the two leading spans: emit the earlier one as-is
        [(aStart, aEnd), ...rangeUnion(aRest, rng2)]
      } else if (aEnd <= bEnd) {
        // The second span reaches further: fuse the two and continue
        rangeUnion([(aStart, bEnd), ...bRest], aRest)
      } else {
        // The first span swallows the second entirely
        rangeUnion(rng1, bRest)
      }
    },
    (_, _) => rangeUnion(rng2, rng1),
  }
}
228
+
229
// Returns `rng` with the inclusive span fromC..toC merged in.
let rangeAddSpan = (rng, fromC, toC) => {
  let span = [(fromC, toC)]
  rangeUnion(rng, span)
}
232
+
233
// If `rng` consists of exactly one span covering exactly one value, returns
// Some(value); otherwise None.
let rangeSingleton = (rng) => {
  match (rng) {
    [(lo, hi)] => if (lo == hi) Some(lo) else None,
    _ => None,
  }
}
239
+
240
// Returns true if lo..hi lies entirely inside a single span of `rng`.
// Spans ending before `lo` are skipped; the first remaining span decides.
let rec rangeIncludes = (rng, lo, hi) => {
  match (rng) {
    [] => false,
    [(c1, c2), ...tl] => {
      if (lo > c2) {
        rangeIncludes(tl, lo, hi)
      } else {
        c1 <= lo && hi <= c2
      }
    },
  }
}
247
+
248
// Returns true if every span of `rng` lies inside lo..hi (inclusive).
// An empty range is trivially within any bounds.
let rec rangeWithin = (rng, lo, hi) => {
  match (rng) {
    [] => true,
    [(c1, c2), ...tl] => c1 >= lo && c2 <= hi && rangeWithin(tl, lo, hi),
  }
}
256
+
257
// Returns true if the inclusive interval lo..hi shares at least one value
// with some span of `rng` (spans assumed sorted, as rangeUnion produces).
//
// The previous body required BOTH endpoints to fall inside the same span,
// which is containment (what rangeIncludes tests), not overlap.
let rec rangeOverlaps = (rng, lo, hi) => {
  match (rng) {
    [] => false,
    // Span ends before the interval starts: keep looking
    [(_, c2), ...tl] when lo > c2 => rangeOverlaps(tl, lo, hi),
    // Here lo <= c2, so the two intersect exactly when hi reaches c1.
    // If it does not, later spans start even further right.
    [(c1, _), ..._] => hi >= c1,
  }
}
264
+
265
// Adds the optional code point `c` to `rng`, honoring case sensitivity.
// - None: nothing to add, `rng` is returned unchanged.
// - Some(code), case-sensitive config: `code` is added.
// - Some(code), case-insensitive config: not implemented yet, returns Err.
let rangeAddCaseAware = (rng, c, config) => {
  match (c) {
    None => Ok(rng),
    Some(code) when config.caseSensitive => Ok(rangeAdd(rng, code)),
    Some(_) => {
      // Needs Char.upcase and friends (once it's added, change return type from Result<RERange> to RERange) [see #661]:
      /*
      let rng = rangeAdd(rng, Char.code(Char.upcase(Char.fromCode(c))))
      let rng = rangeAdd(rng, Char.code(Char.foldcase(Char.fromCode(c))))
      let rng = rangeAdd(rng, Char.code(Char.downcase(Char.fromCode(c))))
      Ok(rng)
      */
      Err("NYI: Case-insensitive matching is not supported until grain-lang/grain#661 is resolved.")
    },
  }
}
285
+
286
// Adds the inclusive span fromC..toC to `rng`, honoring case sensitivity.
// Case-sensitive: the span is merged in one step. Otherwise each code point
// is added through rangeAddCaseAware, stopping at the first error.
let rangeAddSpanCaseAware = (rng, fromC, toC, config) => {
  let rec addFrom = (acc, i) => {
    if (i > toC) {
      Ok(acc)
    } else {
      match (rangeAddCaseAware(acc, Some(i), config)) {
        Ok(grown) => addFrom(grown, i + 1),
        Err(e) => Err(e),
      }
    }
  }
  if (config.caseSensitive) {
    Ok(rangeAddSpan(rng, fromC, toC))
  } else {
    addFrom(rng, fromC)
  }
}
300
+
301
+ /*
302
+
303
+ =====================
304
+ REGEX AST DEFINITIONS
305
+ =====================
306
+
307
+ */
308
+
309
// Kinds of repetition quantifier.
// NOTE(review): presumably these correspond to `*`, `+` and `?`; the visible
// part of the file does not reference this enum — confirm against later code.
enum RepeatQuantifier {
  ZeroOrMore,
  OnceOrMore,
  ZeroOrOne,
}
314
+
315
// Flags for case-sensitivity and multi-line mode (on and off variants).
// NOTE(review): not referenced in the visible part of the file; parseMode
// tracks the same settings with Option<Bool> values instead.
enum GroupModeFlag {
  GMFCaseSensitive,
  GMFCaseInsensitive,
  GMFNotMulti,
  GMFMulti,
}
321
+
322
// Lookaround kinds: (matches / doesn't match) x (following / preceding).
// NOTE(review): not referenced in the visible part of the file; parseLook
// encodes the same choice as an (isMatch, isAhead) tuple.
enum LookMode {
  LMMatches,
  LMDoesntMatch,
  LMMatchesPreceding,
  LMDoesntMatchPreceding,
}
328
+
329
// Modes for a parsed PCE.
// NOTE(review): usage is not visible in this part of the file; presumably
// once / greedy (longest) / non-greedy (shortest) — confirm against parsePCE.
enum PCEMode {
  PCEOnce,
  PCELongest,
  PCEShortest,
}
334
+
335
// Unicode character categories usable in REUnicodeCategories.
// Variant names are grouped by their major class prefix (Letter, Number,
// Punctuation, Mark, Symbol, Separator, Other).
enum UnicodeCategory {
  LetterLowercase,
  LetterUppercase,
  LetterTitlecase,
  LetterModifier,
  LetterOther,
  NumberDecimalDigit,
  NumberLetter,
  NumberOther,
  PunctuationOpen,
  PunctuationClose,
  PunctuationInitialQuote,
  PunctuationFinalQuote,
  PunctuationConnector,
  PunctuationDash,
  PunctuationOther,
  MarkNonSpacing,
  MarkSpacingCombining,
  MarkEnclosing,
  SymbolCurrency,
  SymbolModifier,
  SymbolMath,
  SymbolOther,
  SeparatorLine,
  SeparatorParagraph,
  SeparatorSpace,
  OtherControl,
  OtherFormat,
  OtherSurrogate,
  OtherNotAssigned,
  OtherPrivateUse
}
367
+
368
// AST produced by the regex parser. Nodes are normally built through the
// make* helpers (makeRERange, makeRESequence, makeREAlts, makeRECut,
// makeREConditional), which simplify the tree and fill in the
// "needs backtrack" flags.
enum ParsedRegularExpression {
  RENever,
  REEmpty,
  REAny,
  REStart,
  REEnd,
  RELineStart,
  RELineEnd,
  REWordBoundary,
  RENotWordBoundary,
  RELiteral(Char),
  RELiteralString(String), // <- sequences of literals are flattened into a string
  REAlts(ParsedRegularExpression, ParsedRegularExpression),
  RESequence(List<ParsedRegularExpression>, Bool), // seq elts, needs backtrack
  REGroup(ParsedRegularExpression, Number), // regex, group ID
  RERepeat(ParsedRegularExpression, Number, Option<Number>, Bool), // regex, min, max (None for infinity), true=non-greedy
  REMaybe(ParsedRegularExpression, Bool), // regex, true=non-greedy
  REConditional(ParsedRegularExpression, ParsedRegularExpression, Option<ParsedRegularExpression>, Number, Number, Bool), // test, if-true, if-false, n-start, num-n, needs-backtrack
  RELookahead(ParsedRegularExpression, Bool, Number, Number), // regex, is-match, n-start, num-n
  RELookbehind(ParsedRegularExpression, Bool, Box<Number>, Box<Number>, Number, Number), // regex, is-match, lb-min, lb-max, n-start, num-n (lb-xx values patched in later)
  RECut(ParsedRegularExpression, Number, Number, Bool), // regex, n-start, num-n, needs-backtrack
  REReference(Number, Bool), // n, case-sensitive
  RERange(List<(Number, Number)>),
  REUnicodeCategories(List<UnicodeCategory>, Bool) // symlist, true=match/false=does-not-match
}
393
+
394
// Reports whether `rx` is marked as needing backtracking.
// Sequences, conditionals and cuts carry a precomputed flag. Alternations,
// groups, repeats, maybes and unicode-category nodes always answer true.
// Every other node answers false.
let needsBacktrack = (rx: ParsedRegularExpression) => {
  match (rx) {
    // Nodes that store their own flag
    RESequence(_, flag) => flag,
    REConditional(_, _, _, _, _, flag) => flag,
    RECut(_, _, _, flag) => flag,
    // Nodes that always need backtracking
    REAlts(_, _) => true,
    REGroup(_, _) => true,
    RERepeat(_, _, _, _) => true,
    REMaybe(_, _) => true,
    REUnicodeCategories(_, _) => true,
    _ => false,
  }
}
407
+
408
// Builds the simplest node equivalent to the range `rng`:
// a single-value range becomes RELiteral, a range with one span covering
// all of 0..limitC becomes REAny, and anything else stays an RERange.
let makeRERange = (rng, limitC) => {
  match (rng) {
    [(c1, c2)] when c1 == c2 => RELiteral(Char.fromCode(c1)),
    _ => {
      if (rangeIncludes(rng, 0, limitC)) {
        REAny
      } else {
        RERange(rng)
      }
    },
  }
}
415
+
416
// Accumulation mode used by mergeAdjacent while joining adjacent literals.
// Only character mode exists; a comment in mergeAdjacent notes where a byte
// mode would go if it were supported.
enum MergeMode {
  MMChar,
}
419
+
420
// Normalizes a list of sequence elements:
// - nested RESequence nodes are spliced into the list,
// - REEmpty and empty RELiteralString elements are dropped,
// - runs of adjacent RELiteral / RELiteralString elements are joined into a
//   single RELiteralString.
// The order of the match arms in `loop` matters: the flatten/drop arms must
// run before the accumulator flush, which must run before the literal arms.
let mergeAdjacent = (lst) => {
  // see [TODO] below
  // True when the pending accumulator should be flushed: either the input
  // is exhausted, or we are accumulating (mode is Some) and the next
  // element is not a literal.
  let readyForAccum = (l, mode) => {
    match(l) {
      [] => true,
      [hd, ..._] => {
        match(mode) {
          None => false,
          Some(MMChar) => {
            match(hd) {
              RELiteral(x) => false,
              RELiteralString(x) => false,
              _ => true
            }
          }
        }
      }
    }
  }
  // mode:  Some(MMChar) while a run of literals is being collected
  // accum: the collected pieces, most recent first (hence the reverse below)
  // l:     the remaining input
  let rec loop = (mode, accum, l) => {
    match(l) {
      // flatten nested sequences
      [(RESequence(rxs1, _)), ...tl] => loop(mode, accum, List.append(rxs1, tl)),
      // drop empty elements
      [REEmpty, ...tl] => loop(mode, accum, tl),
      [RELiteralString(""), ...tl] => loop(mode, accum, tl),
      // [TODO] Clean up with or-patterns (grain-lang/grain#696)
      _ when readyForAccum(l, mode) => {
        match(accum) {
          // Nothing pending; readyForAccum with mode None only holds for
          // an empty `l`, so the result ends here
          [] => [],
          // A single pending piece is emitted as a string as-is
          [hd] => [RELiteralString(hd), ...loop(None, [], l)],
          [hd, ...tl] => {
            let newHd = match(mode) {
              // MMByte would go here, if supported
              Some(MMChar) => List.join("", List.reverse(accum)),
              None => fail "internal error (mergeAdjacent)",
            }
            [RELiteralString(newHd), ...loop(None, [], l)]
          },
        }
      },
      [] => fail "impossible (mergeAdjacent)", // avoid warning (can delete once TODO is resolved)
      // Already accumulating: push the literal onto the run
      [RELiteralString(x), ...tl] when Option.isSome(mode) => loop(mode, [x, ...accum], tl),
      [RELiteral(c), ...tl] when Option.isSome(mode) => loop(mode, [Char.toString(c), ...accum], tl),
      // Not accumulating yet: this literal starts a new run
      [RELiteralString(x), ...tl] => loop(Some(MMChar), [x], tl),
      [RELiteral(c), ...tl] => loop(Some(MMChar), [Char.toString(c)], tl),
      // Any other element passes through unchanged
      [hd, ...tl] => [hd, ...loop(None, [], tl)],
    }
  }
  loop(None, [], lst)
}
471
+
472
// Builds a node for a sequence of elements.
// An empty list yields REEmpty and a single element is returned as-is.
// Longer lists are normalized with mergeAdjacent; if that leaves exactly one
// element it is returned directly, otherwise an RESequence is built whose
// flag is true when any element needs backtracking.
let makeRESequence = (lst) => {
  match (lst) {
    [] => REEmpty,
    [only] => only,
    _ => {
      let merged = mergeAdjacent(lst)
      match (merged) {
        [only] => only,
        _ => RESequence(merged, List.some(needsBacktrack, merged)),
      }
    },
  }
}
484
+
485
// Builds a node matching either `rx1` or `rx2`.
// RENever on either side is dropped. Two single-character matchers
// (ranges and/or literals) are folded into one range via makeRERange.
// Everything else becomes REAlts.
let makeREAlts = (rx1, rx2, limitC) => {
  let fromRange = (rng) => makeRERange(rng, limitC)
  match ((rx1, rx2)) {
    (RENever, _) => rx2,
    (_, RENever) => rx1,
    (RERange(r1), RERange(r2)) => fromRange(rangeUnion(r1, r2)),
    (RERange(r1), RELiteral(c2)) => fromRange(rangeAdd(r1, Char.code(c2))),
    (RELiteral(c1), RERange(r2)) => fromRange(rangeAdd(r2, Char.code(c1))),
    (RELiteral(c1), RELiteral(c2)) => {
      let first = rangeAdd([], Char.code(c1))
      fromRange(rangeAdd(first, Char.code(c2)))
    },
    _ => REAlts(rx1, rx2),
  }
}
496
+
497
// Builds an RECut node, recording whether the wrapped regex needs
// backtracking.
let makeRECut = (rx, nStart, numN) => {
  let nb = needsBacktrack(rx)
  RECut(rx, nStart, numN, nb)
}
500
+
501
// Builds an REConditional node. Its needs-backtrack flag is true when the
// if-true branch, or the optional if-false branch, needs backtracking.
let makeREConditional = (tst, pces1, pces2, nStart, numN) => {
  let nb = if (needsBacktrack(pces1)) {
    true
  } else {
    match (pces2) {
      Some(p2) => needsBacktrack(p2),
      None => false,
    }
  }
  REConditional(tst, pces1, pces2, nStart, numN, nb)
}
508
+
509
+ /*
510
+
511
+ =========================
512
+ REGEX PARSING DEFINITIONS
513
+ =========================
514
+
515
+ */
516
+
517
+ // Range parsing ("[a-z]")
518
+
519
// [TODO] (#769) When byte-based regexes are supported, we'll need another limit of 255 for those.
// Upper bound used when inverting or simplifying character ranges:
// 0x10FFFF is the largest Unicode code point.
let rangeLimit = 0x10FFFF
521
+
522
+ // These are snake-cased to avoid confusion with their capitalized counterparts
523
+
524
// Range for the `\d` class: the characters '0' through '9'.
let range_d = () => {
  let zero = Char.code('0')
  let nine = Char.code('9')
  rangeAddSpan([], zero, nine)
}
527
+
528
// Range for the `\w` class: digits, a-z, A-Z and underscore.
let range_w = () => {
  let digits = range_d()
  let withLower = rangeAddSpan(digits, Char.code('a'), Char.code('z'))
  let withUpper = rangeAddSpan(withLower, Char.code('A'), Char.code('Z'))
  rangeAdd(withUpper, Char.code('_'))
}
531
+
532
// Range for the `\s` class.
let range_s = () => {
  // space, tab (9), newline (10), page/form feed (12), return (13)
  let whitespaceCodes = [Char.code(' '), 9, 10, 12, 13]
  List.reduce(rangeAdd, [], whitespaceCodes)
}
536
+
537
// Parses the inside of a bracket expression (the cursor sits just after
// `[`). A leading `^` inverts the parsed range against rangeLimit.
let rec parseRangeNot = (buf: RegExBuf) => {
  if (more(buf)) {
    match (peek(buf)) {
      Ok('^') => {
        ignore(eat(buf, '^'))
        match (parseRange(buf)) {
          Ok(rng) => Ok(rangeInvert(rng, rangeLimit)),
          Err(e) => Err(e),
        }
      },
      Ok(_) => parseRange(buf),
      Err(e) => Err(e),
    }
  } else {
    Err(parseErr(buf, "Missing closing `]`", 0))
  }
},
554
+
555
// Parses a bracket expression body after any `^`.
// A `]` or `-` in first position is taken literally: it is consumed, the
// rest is parsed, and the literal is then added to the result.
parseRange = (buf: RegExBuf) => {
  // Consumes the leading literal, parses the remainder, then adds the literal
  let restPlusLeading = (leading) => {
    ignore(eat(buf, leading))
    match (parseRangeRest(buf, [], None, None)) {
      Ok(rng) => Ok(rangeAdd(rng, Char.code(leading))),
      Err(e) => Err(e),
    }
  }
  if (more(buf)) {
    match (peek(buf)) {
      Err(e) => Err(e),
      Ok(']') => restPlusLeading(']'),
      Ok('-') => restPlusLeading('-'),
      Ok(_) => parseRangeRest(buf, [], None, None),
    }
  } else {
    Err(parseErr(buf, "Missing closing `]`", 0))
  }
},
579
+
580
// Parses the class letter that follows a backslash (`d`, `D`, `w`, `W`,
// `s`, `S`) and returns its range. Uppercase letters give the inverse of
// the lowercase class. Errors here are plain strings; callers replace them
// with their own messages.
parseClass = (buf: RegExBuf) => {
  // Consumes the class letter and yields the given range
  let accept = (letter, rng) => {
    ignore(eat(buf, letter))
    Ok(rng)
  }
  let inverted = (rng) => rangeInvert(rng, rangeLimit)
  if (more(buf)) {
    match (peek(buf)) {
      Err(e) => Err(e),
      Ok('d') => accept('d', range_d()),
      Ok('D') => accept('D', inverted(range_d())),
      Ok('w') => accept('w', range_w()),
      Ok('W') => accept('W', inverted(range_w())),
      Ok('s') => accept('s', range_s()),
      Ok('S') => accept('S', inverted(range_s())),
      Ok(c) => Err("unknown class: " ++ toString(c)),
    }
  } else {
    Err("no chars") // caught in handler (we use a Result to cleanly mesh with the Result type below)
  }
},
614
+
615
// Parses a POSIX character class of the form `:name:]` (the opening `[` has
// already been consumed by the caller) and returns the range it denotes.
// Returns an Err for malformed syntax, unknown names, and the unsupported
// `graph` / `print` classes.
parsePosixCharClass = (buf: RegExBuf) => {
  if (!more(buf)) {
    Err(parseErr(buf, "Missing POSIX character class after `[`", 0))
  } else {
    match (peek(buf)) {
      Err(e) => Err(e),
      Ok(':') => {
        ignore(eat(buf, ':'))
        // Collects lowercase letters up to the closing `:]`. `acc` holds
        // the letters most recent first.
        let rec loop = (acc) => {
          match (peek(buf)) {
            Err(e) => Err(e),
            Ok(':') => {
              ignore(eat(buf, ':'))
              match (eat(buf, ']')) {
                Err(_) => Err(parseErr(buf, "Missing closing `]`", 0)),
                Ok(_) => Ok(List.join("", List.reverse(acc))),
              }
            },
            Ok(c) when (Char.code('a') <= Char.code(c) && Char.code(c) <= Char.code('z')) => {
              ignore(eat(buf, c))
              loop([Char.toString(c), ...acc])
            },
            Ok(_) => Err(parseErr(buf, "Invalid character in POSIX character class", 0)),
          }
        }
        match (loop([])) {
          Err(e) => Err(e),
          Ok(s) => {
            match (s) {
              "alpha" => Ok(rangeAddSpan(rangeAddSpan([], Char.code('a'), Char.code('z')), Char.code('A'), Char.code('Z'))),
              "upper" => Ok(rangeAddSpan([], Char.code('A'), Char.code('Z'))),
              "lower" => Ok(rangeAddSpan([], Char.code('a'), Char.code('z'))),
              "digit" => Ok(rangeAddSpan([], Char.code('0'), Char.code('9'))),
              "xdigit" => Ok(rangeAddSpan(rangeAddSpan(rangeAddSpan([], Char.code('0'), Char.code('9')), Char.code('a'), Char.code('f')), Char.code('A'), Char.code('F'))),
              "alnum" => Ok(rangeAddSpan(rangeAddSpan(rangeAddSpan([], Char.code('0'), Char.code('9')), Char.code('a'), Char.code('z')), Char.code('A'), Char.code('Z'))),
              // All ASCII letters plus underscore. This previously covered only
              // a-f / A-F, apparently copied from the `xdigit` spans.
              "word" => Ok(rangeAdd(rangeAddSpan(rangeAddSpan([], Char.code('a'), Char.code('z')), Char.code('A'), Char.code('Z')), Char.code('_'))),
              "blank" => Ok(rangeAdd(rangeAdd([], 0x20), 0x9)), // space and tab
              "space" => Ok(range_s()),
              "graph" => Err(parseErr(buf, "the [:graph:] character class is not currently supported. For more information, see https://github.com/grain-lang/grain/issues/661", 0)),
              "print" => Err(parseErr(buf, "the [:print:] character class is not currently supported. For more information, see https://github.com/grain-lang/grain/issues/661", 0)),
              "cntrl" => Ok(rangeAddSpan([], 0, 31)),
              "ascii" => Ok(rangeAddSpan([], 0, 127)),
              _ => Err(parseErr(buf, "Invalid POSIX character class: " ++ s, 0)),
            }
          },
        }
      },
      Ok(c) => Err(parseErr(buf, "Expected `:` after `[`. Found: `" ++ Char.toString(c) ++ "`", 0)),
    }
  }
},
666
+
667
// Parses the remainder of a bracket expression up to and including the
// closing `]`, and returns the accumulated range.
//
// rng:          the range built so far
// spanFrom:     the most recently read single character, not yet added to
//               `rng`; it is added once we know it does not start a span
// mustSpanFrom: set right after a `-` has been consumed; holds the start of
//               the span whose end character must come next
parseRangeRest = (buf: RegExBuf, rng, spanFrom: Option<Number>, mustSpanFrom: Option<Number>) => {
  if (!more(buf)) {
    Err(parseErr(buf, "Missing closing `]`", 0))
  } else {
    match(peek(buf)) {
      Err(e) => Err(e),
      // End of the bracket expression: flush the pending character
      Ok(']') => {
        ignore(eat(buf, ']'))
        rangeAddCaseAware(rng, spanFrom, buf.config)
      },
      Ok('-') => {
        if (!moreN(buf, 1)) {
          Err(parseErr(buf, "Missing closing `]`", 1))
        } else {
          match(peekN(buf, 1)) {
            Err(e) => Err(e),
            // `-]`: a trailing hyphen is a literal, unless a span end is
            // still owed
            Ok(']') => {
              match(mustSpanFrom) {
                Some(_) => Err(parseErr(buf, "misplaced hyphen within square brackets in pattern", 1)),
                None => {
                  ignore(eat(buf, '-'))
                  ignore(eat(buf, ']'))
                  match(rangeAddCaseAware(rng, spanFrom, buf.config)) {
                    Err(e) => Err(e),
                    Ok(rng) => Ok(rangeAdd(rng, Char.code('-')))
                  }
                }
              }
            },
            // A hyphen with no pending start character cannot form a span
            Ok(_) when Option.isNone(spanFrom) => Err(parseErr(buf, "misplaced hyphen within square brackets in pattern", 1)),
            // Start of a span: the pending character becomes mustSpanFrom
            Ok(_) => {
              ignore(eat(buf, '-'))
              parseRangeRest(buf, rng, None, spanFrom)
            }
          }
        }
      },
      Ok('\\') => {
        ignore(eat(buf, '\\'))
        if (!(buf.config.isPerlRegExp)) {
          // Outside Perl syntax the backslash is an ordinary character
          parseRangeRestSpan(buf, Char.code('\\'), rng, spanFrom, mustSpanFrom)
        } else {
          if (!more(buf)) {
            Err(parseErr(buf, "escaping backslash at end pattern (within square brackets)", 0))
          } else {
            match(peek(buf)) {
              Err(e) => Err(e),
              // Backslash + ASCII letter: must be a class such as \d or \w
              Ok(c) when ((Char.code('a') <= Char.code(c) && Char.code(c) <= Char.code('z')) || (Char.code('A') <= Char.code(c) && Char.code(c) <= Char.code('Z'))) => {
                match(mustSpanFrom) {
                  Some(_) => Err(parseErr(buf, "misplaced hyphen within square brackets in pattern", 0)),
                  None => {
                    let curPos = unbox(buf.cursor)
                    match(parseClass(buf)) {
                      // parseClass errors are replaced by a positioned message
                      Err(e) => Err("Invalid Regular Expression: illegal alphebetic escape (position " ++ toString(curPos) ++ ")"),
                      Ok(range1) => {
                        match(rangeAddCaseAware(rng, spanFrom, buf.config)) {
                          Err(e) => Err(e),
                          Ok(r) => parseRangeRest(buf, rangeUnion(range1, r), spanFrom, mustSpanFrom)
                        }
                      }
                    }
                  }
                }
              },
              // Backslash + anything else: the escaped character itself
              Ok(c) => {
                ignore(next(buf))
                parseRangeRestSpan(buf, Char.code(c), rng, spanFrom, mustSpanFrom)
              }
            }
          }
        }
      },
      Ok('[') => {
        ignore(eat(buf, '['))
        // Remember where we were so a failed class parse can be rewound
        let curPos = unbox(buf.cursor)
        match(parsePosixCharClass(buf)) {
          // NOTE: Based on the spec, we don't propagate out
          // the errors here. Instead, we treat malformed
          // POSIX classes as being simple sequences of characters.
          Err(e) => {
            buf.cursor := curPos
            parseRangeRestSpan(buf, Char.code('['), rng, spanFrom, mustSpanFrom)
          },
          Ok(rngNew) => {
            match(rangeAddCaseAware(rng, spanFrom, buf.config)) {
              Err(e) => Err(e),
              Ok(rng) => parseRangeRest(buf, rangeUnion(rngNew, rng), None, None)
            }
          }
        }
      },
      // Any other character is a literal (or the end of a pending span)
      Ok(c) => {
        ignore(next(buf))
        parseRangeRestSpan(buf, Char.code(c), rng, spanFrom, mustSpanFrom)
      }
    }
  }
},
765
+
766
// Handles a just-read character code `c` inside a bracket expression.
// With no span pending, the previous pending character is flushed into the
// range and `c` becomes the new pending character. With a span pending
// (mustSpanFrom = Some(n)), `c` closes the span n..c; a reversed span is an
// error.
parseRangeRestSpan = (buf: RegExBuf, c, rng, spanFrom: Option<Number>, mustSpanFrom: Option<Number>) => {
  match (mustSpanFrom) {
    None => {
      match (rangeAddCaseAware(rng, spanFrom, buf.config)) {
        Ok(flushed) => parseRangeRest(buf, flushed, Some(c), None),
        Err(e) => Err(e),
      }
    },
    Some(n) => {
      if (n <= c) {
        match (rangeAddSpanCaseAware(rng, n, c, buf.config)) {
          Ok(spanned) => parseRangeRest(buf, spanned, None, None),
          Err(e) => Err(e),
        }
      } else {
        Err(parseErr(buf, "invalid range within square brackets in pattern", 0))
      }
    },
  }
}
786
+
787
+ // Main parsing
788
+
789
// Parses a single atom at the cursor:
// - `(...)`   capturing group, or one of the `(?...)` forms: cut `(?>`,
//             conditional `(?(`, mode group `(?i:` etc., or lookaround
// - `[...]`   bracket expression
// - `.`       any character (any but newline in multiline mode)
// - `^` / `$` start / end anchors (line-based in multiline mode)
// - anything else is handed to parseLiteral
//
// Fix over the previous version: in a conditional with an else branch the
// closing `)` was consumed with `ignore(eat(...))`, so a different character
// in that position was silently accepted. The `eat` result is now checked.
let rec parseAtom = (buf: RegExBuf) => {
  match (peek(buf)) {
    Err(e) => Err(e),
    Ok(c) => match (c) {
      '(' => {
        if (!moreN(buf, 1)) {
          Err(parseErr(buf, "Parentheses not closed", 1))
        } else if (peekN(buf, 1) == Ok('?')) {
          // fancy group
          if (!moreN(buf, 2)) {
            Err(parseErr(buf, "Parentheses not closed", 2))
          } else {
            match (peekN(buf, 2)) {
              Err(e) => Err(e),
              Ok('>') => {
                // cut
                ignore(eat(buf, '('))
                ignore(eat(buf, '?'))
                ignore(eat(buf, '>'))
                let preNumGroups = unbox(buf.config.groupNumber)
                match (parseRegex(buf)) {
                  Err(e) => Err(e),
                  Ok(rx) => {
                    let postNumGroups = unbox(buf.config.groupNumber)
                    match (eat(buf, ')')) {
                      Err(e) => Err(e),
                      Ok(_) => Ok(makeRECut(rx, preNumGroups, postNumGroups - preNumGroups)),
                    }
                  },
                }
              },
              Ok('(') => {
                // conditional
                ignore(eat(buf, '('))
                ignore(eat(buf, '?'))
                ignore(eat(buf, '('))
                let tstPreNumGroups = unbox(buf.config.groupNumber)
                match (parseTest(buf)) {
                  Err(e) => Err(e),
                  Ok(test) => {
                    // Number of groups opened by the test itself
                    let tstSpanNumGroups = unbox(buf.config.groupNumber) - tstPreNumGroups
                    match (parsePCEs(buf, false)) {
                      Err(e) => Err(e),
                      Ok(pces) => {
                        if (!more(buf)) {
                          Err(parseErr(buf, "Parentheses not closed", 0))
                        } else {
                          match (peek(buf)) {
                            Err(e) => Err(e),
                            Ok('|') => {
                              ignore(eat(buf, '|'))
                              match (parsePCEs(buf, false)) {
                                Err(e) => Err(e),
                                Ok(pces2) => {
                                  if (!more(buf)) {
                                    Err(parseErr(buf, "Parentheses not closed", 0))
                                  } else {
                                    // Require the closing paren instead of
                                    // ignoring a failed eat
                                    match (eat(buf, ')')) {
                                      Err(e) => Err(e),
                                      Ok(_) => Ok(makeREConditional(test, makeRESequence(pces), Some(makeRESequence(pces2)), tstPreNumGroups, tstSpanNumGroups)),
                                    }
                                  }
                                },
                              }
                            },
                            Ok(')') => {
                              ignore(eat(buf, ')'))
                              Ok(makeREConditional(test, makeRESequence(pces), None, tstPreNumGroups, tstSpanNumGroups))
                            },
                            Ok(_) => {
                              Err(parseErr(buf, "Failed to parse condition", 0))
                            },
                          }
                        }
                      },
                    }
                  },
                }
              },
              Ok(c) when (c == 'i' || c == 's' || c == 'm' || c == '-' || c == ':') => {
                // match with mode
                ignore(eat(buf, '('))
                ignore(eat(buf, '?'))
                match (parseMode(buf)) {
                  Err(e) => Err(e),
                  Ok(config) => {
                    if (!more(buf)) {
                      Err(parseErr(buf, "Parentheses not closed", 0))
                    } else {
                      match (peek(buf)) {
                        Err(e) => Err(e),
                        Ok(':') => {
                          ignore(eat(buf, ':'))
                          // The group body is parsed under the new config
                          match (parseRegex(withConfig(buf, config))) {
                            Err(e) => Err(e),
                            Ok(rx) => {
                              match (eat(buf, ')')) {
                                Err(e) => Err(e),
                                Ok(_) => Ok(rx),
                              }
                            },
                          }
                        },
                        Ok(_) => {
                          Err(parseErr(buf, "expected `:` or another mode after `(?` and a mode sequence; a mode is `i`, `-i`, `m`, `-m`, `s`, or `-s`", 0))
                        },
                      }
                    }
                  },
                }
              },
              Ok(_) => {
                ignore(eat(buf, '('))
                ignore(eat(buf, '?'))
                parseLook(buf)
              },
            }
          }
        } else {
          // simple group
          ignore(eat(buf, '('))
          let groupNum = unbox(buf.config.groupNumber)
          // Note that this inc operation is side-effecting
          match (parseRegex(withConfig(buf, configIncGroupNumber(buf.config)))) {
            Err(e) => Err(e),
            Ok(r) => {
              match (eat(buf, ')')) {
                Err(e) => Err(e),
                Ok(_) => Ok(REGroup(r, groupNum)),
              }
            },
          }
        }
      },
      '[' => {
        ignore(eat(buf, '['))
        match (parseRangeNot(buf)) {
          Err(e) => Err(e),
          Ok(rng) => Ok(makeRERange(rng, rangeLimit)),
        }
      },
      '.' => {
        ignore(eat(buf, '.'))
        if (buf.config.multiline) {
          // if in multiline mode, '.' matches everything but \n
          Ok(makeRERange(rangeInvert(rangeAdd([], Char.code('\n')), rangeLimit), rangeLimit))
        } else {
          Ok(REAny)
        }
      },
      '^' => {
        ignore(eat(buf, '^'))
        Ok(if (buf.config.multiline) { RELineStart } else { REStart })
      },
      '$' => {
        ignore(eat(buf, '$'))
        Ok(if (buf.config.multiline) { RELineEnd } else { REEnd })
      },
      _ => parseLiteral(buf),
    }
  }
},
950
+
951
// Parses a lookaround once `(?` has been consumed: `=` / `!` select
// positive / negative lookahead, `<=` / `<!` select positive / negative
// lookbehind. The body is parsed as a regex and the closing `)` is required.
// The node records the group count before the body and the number of groups
// the body added. Lookbehind bounds start as box(0) placeholders.
parseLook = (buf: RegExBuf) => {
  let groupsBefore = unbox(buf.config.groupNumber)
  // Consumes `ch` and yields the (isMatch, isAhead) pair it selects
  let choose = (ch, isMatch, isAhead) => {
    ignore(eat(buf, ch))
    Ok((isMatch, isAhead))
  }
  let flags = match (peek(buf)) {
    Err(e) => Err(e),
    Ok('=') => choose('=', true, true),
    Ok('!') => choose('!', false, true),
    Ok('<') => {
      ignore(eat(buf, '<'))
      if (more(buf)) {
        match (peek(buf)) {
          Err(e) => Err(e),
          Ok('=') => choose('=', true, false),
          Ok('!') => choose('!', false, false),
          Ok(_) => Err(parseErr(buf, "Invalid look sequence", 0)),
        }
      } else {
        Err(parseErr(buf, "Unterminated look sequence", 0))
      }
    },
    Ok(_) => Err(parseErr(buf, "Invalid look sequence", 0)),
  }
  match (flags) {
    Err(e) => Err(e),
    Ok((isMatch, isAhead)) => {
      match (parseRegex(buf)) {
        Err(e) => Err(e),
        Ok(rx) => {
          match (eat(buf, ')')) {
            Err(e) => Err(e),
            Ok(_) => {
              // Groups opened while parsing the lookaround body
              let groupsInside = unbox(buf.config.groupNumber) - groupsBefore
              if (isAhead) {
                Ok(RELookahead(rx, isMatch, groupsBefore, groupsInside))
              } else {
                Ok(RELookbehind(rx, isMatch, box(0), box(0), groupsBefore, groupsInside))
              }
            },
          }
        },
      }
    },
  }
},
1009
+
1010
// Parses the test of a conditional once `(?(` has been consumed.
// The test is either a lookaround (starting with `?`) or a decimal group
// number followed by `)`, which yields an REReference and marks the config
// as containing references.
parseTest = (buf: RegExBuf) => {
  if (more(buf)) {
    match (peek(buf)) {
      Err(e) => Err(e),
      Ok('?') => {
        ignore(eat(buf, '?'))
        parseLook(buf)
      },
      Ok(c) when (Char.code(c) >= Char.code('0') && Char.code(c) <= Char.code('9')) => {
        buf.config.references := true
        let startPos = unbox(buf.cursor)
        match (parseInteger(buf, 0)) {
          Err(e) => Err(e),
          Ok(n) => {
            let consumedDigits = unbox(buf.cursor) != startPos
            if (consumedDigits) {
              match (eat(buf, ')')) {
                Err(e) => Err(e),
                Ok(_) => Ok(REReference(n, false)),
              }
            } else {
              Err(parseErr(buf, "expected `)` after `(?(` followed by digits", 0))
            }
          },
        }
      },
      Ok(_) => Err(parseErr(buf, "expected `(?=`, `(?!`, `(?<`, or digit after `(?(`", 0)),
    }
  } else {
    Err(parseErr(buf, "Expected test", 0))
  }
},
1041
+
1042
// Reads consecutive decimal digits at the cursor, accumulating them onto
// `n` (pass 0 to start). Stops without consuming at the first non-digit or
// at the end of input and returns the accumulated value.
parseInteger = (buf: RegExBuf, n) => {
  let isDigit = (ch) => {
    let code = Char.code(ch)
    code >= Char.code('0') && code <= Char.code('9')
  }
  if (more(buf)) {
    match (peek(buf)) {
      Err(c) => Err(c),
      Ok(c) => {
        if (isDigit(c)) {
          ignore(next(buf))
          let digit = Char.code(c) - Char.code('0')
          parseInteger(buf, (10 * n) + digit)
        } else {
          Ok(n)
        }
      },
    }
  } else {
    Ok(n)
  }
},
1056
+
1057
// Parses a mode sequence (after `(?`) and returns the resulting config.
// `i` turns case sensitivity off and `-i` turns it on. `m` turns multi-line
// mode on and `-m` turns it off. `s` turns multi-line mode off and `-s`
// turns it on. Parsing stops, without error, at the first character that is
// not part of a mode, and a `-` followed by a non-mode character stays
// consumed. Settings never mentioned keep the values from buf.config.
parseMode = (buf: RegExBuf) => {
  // Applies the collected overrides (None = leave unchanged) to buf.config
  let finish = (cs, ml) => {
    let withCs = match (cs) {
      Some(flag) => configWithCaseSensitive(buf.config, flag),
      None => buf.config,
    }
    match (ml) {
      Some(flag) => configWithMultiLine(withCs, flag),
      None => withCs,
    }
  }
  let rec scan = (cs, ml) => {
    if (!more(buf)) {
      Ok(finish(cs, ml))
    } else {
      match (peek(buf)) {
        Err(e) => Err(e),
        Ok('i') => {
          ignore(eat(buf, 'i'))
          scan(Some(false), ml)
        },
        Ok('s') => {
          ignore(eat(buf, 's'))
          scan(cs, Some(false))
        },
        Ok('m') => {
          ignore(eat(buf, 'm'))
          scan(cs, Some(true))
        },
        Ok('-') => {
          ignore(eat(buf, '-'))
          if (!more(buf)) {
            Ok(finish(cs, ml))
          } else {
            // Negated modes
            match (peek(buf)) {
              Err(e) => Err(e),
              Ok('i') => {
                ignore(eat(buf, 'i'))
                scan(Some(true), ml)
              },
              Ok('s') => {
                ignore(eat(buf, 's'))
                scan(cs, Some(true))
              },
              Ok('m') => {
                ignore(eat(buf, 'm'))
                scan(cs, Some(false))
              },
              _ => Ok(finish(cs, ml)),
            }
          }
        },
        _ => Ok(finish(cs, ml)),
      }
    }
  }
  scan(None, None)
},
1117
+
1118
+ // Parses the `{...}` property name that follows `\p` or `\P`; `pC` is the
+ // letter that introduced it and is only used in error messages.
+ // Returns the list of general categories named by the property, paired with
+ // whether the name was prefixed with `^`.
+ parseUnicodeCategories = (buf: RegExBuf, pC: String) => {
+   if (more(buf)) {
+     match (peek(buf)) {
+       Ok('{') => {
+         ignore(eat(buf, '{'))
+         let catNegated = peek(buf) == Ok('^')
+         if (catNegated) {
+           ignore(eat(buf, '^'))
+         }
+         // Collects characters up to the closing `}` (which is consumed but not included)
+         let rec readName = (revChars) => {
+           match (peek(buf)) {
+             Ok('}') => {
+               ignore(eat(buf, '}'))
+               Ok(List.join("", List.reverse(revChars)))
+             },
+             Ok(ch) => {
+               ignore(eat(buf, ch))
+               readName([Char.toString(ch), ...revChars])
+             },
+             Err(_) => Err(parseErr(buf, "Missing `}` to close `\\" ++ pC ++ "`", 0)),
+           }
+         }
+         // Category families, listed in the order used by the combined names below
+         let casedLetters = [LetterLowercase, LetterUppercase, LetterTitlecase, LetterModifier]
+         let letters = List.append(casedLetters, [LetterOther])
+         let numbers = [NumberDecimalDigit, NumberLetter, NumberOther]
+         let punctuation = [PunctuationOpen, PunctuationClose, PunctuationInitialQuote, PunctuationFinalQuote, PunctuationConnector, PunctuationDash, PunctuationOther]
+         let marks = [MarkNonSpacing, MarkSpacingCombining, MarkEnclosing]
+         let symbols = [SymbolCurrency, SymbolModifier, SymbolMath, SymbolOther]
+         let separators = [SeparatorLine, SeparatorParagraph, SeparatorSpace]
+         let others = [OtherControl, OtherFormat, OtherSurrogate, OtherNotAssigned, OtherPrivateUse]
+         let categories = Result.flatMap(name => {
+           // In case anyone is curious where these codes originate from:
+           // https://www.unicode.org/reports/tr44/#General_Category_Values
+           match (name) {
+             "Ll" => Ok([LetterLowercase]),
+             "Lu" => Ok([LetterUppercase]),
+             "Lt" => Ok([LetterTitlecase]),
+             "Lm" => Ok([LetterModifier]),
+             "L&" => Ok(casedLetters),
+             "Lo" => Ok([LetterOther]),
+             "L" => Ok(letters),
+             "Nd" => Ok([NumberDecimalDigit]),
+             "Nl" => Ok([NumberLetter]),
+             "No" => Ok([NumberOther]),
+             "N" => Ok(numbers),
+             "Ps" => Ok([PunctuationOpen]),
+             "Pe" => Ok([PunctuationClose]),
+             "Pi" => Ok([PunctuationInitialQuote]),
+             "Pf" => Ok([PunctuationFinalQuote]),
+             "Pc" => Ok([PunctuationConnector]),
+             "Pd" => Ok([PunctuationDash]),
+             "Po" => Ok([PunctuationOther]),
+             "P" => Ok(punctuation),
+             "Mn" => Ok([MarkNonSpacing]),
+             "Mc" => Ok([MarkSpacingCombining]),
+             "Me" => Ok([MarkEnclosing]),
+             "M" => Ok(marks),
+             "Sc" => Ok([SymbolCurrency]),
+             "Sk" => Ok([SymbolModifier]),
+             "Sm" => Ok([SymbolMath]),
+             "So" => Ok([SymbolOther]),
+             "S" => Ok(symbols),
+             "Zl" => Ok([SeparatorLine]),
+             "Zp" => Ok([SeparatorParagraph]),
+             "Zs" => Ok([SeparatorSpace]),
+             "Z" => Ok(separators),
+             "Cc" => Ok([OtherControl]),
+             "Cf" => Ok([OtherFormat]),
+             "Cs" => Ok([OtherSurrogate]),
+             "Cn" => Ok([OtherNotAssigned]),
+             "Co" => Ok([OtherPrivateUse]),
+             "C" => Ok(others),
+             "." => Ok(List.flatten([letters, numbers, punctuation, marks, symbols, separators, others])),
+             unknown => Err(parseErr(buf, "Unrecognized property name in `\\" ++ pC ++ "`: `" ++ unknown ++ "`", 0)),
+           }
+         }, readName([]))
+         Result.map(cats => (cats, catNegated), categories)
+       },
+       Ok(_) => Err(parseErr(buf, "Expected `{` after `\\" ++ pC ++ "`", 0)),
+       Err(e) => Err(e),
+     }
+   } else {
+     Err(parseErr(buf, "Expected unicode category", 0))
+   }
+ },
1209
+
1210
+ // Parses a single literal atom at the cursor.
+ // Quantifier characters with nothing before them and unmatched closers are
+ // rejected, a backslash is delegated to `parseBackslashLiteral`, and any
+ // other character becomes an `RELiteral` (case-sensitive config) or a
+ // case-aware range (case-insensitive config).
+ parseLiteral = (buf: RegExBuf) => {
+   if (more(buf)) {
+     match (peek(buf)) {
+       Err(e) => Err(e),
+       Ok('*') => Err(parseErr(buf, "`*` follows nothing in pattern", 0)),
+       Ok('+') => Err(parseErr(buf, "`+` follows nothing in pattern", 0)),
+       Ok('?') => Err(parseErr(buf, "`?` follows nothing in pattern", 0)),
+       Ok('{') when buf.config.isPerlRegExp => Err(parseErr(buf, "`{` follows nothing in pattern", 0)),
+       Ok('\\') => {
+         ignore(eat(buf, '\\'))
+         parseBackslashLiteral(buf)
+       },
+       Ok(')') => Err(parseErr(buf, "Unmatched `)` in pattern", 0)),
+       Ok(c) when (buf.config.isPerlRegExp) && (c == ']' || c == '}') => Err(parseErr(buf, "unmatched `" ++ Char.toString(c) ++ "` in pattern", 0)),
+       // [TODO] case-insensitive (#691)
+       Ok(c) => {
+         ignore(next(buf))
+         if (buf.config.caseSensitive) {
+           Ok(RELiteral(c))
+         } else {
+           Result.map(rng => makeRERange(rng, rangeLimit), rangeAddCaseAware([], Some(Char.code(c)), buf.config))
+         }
+       },
+     }
+   } else {
+     Err(parseErr(buf, "Expected literal", 0))
+   }
+ },
1241
+
1242
+ // Parses what follows a backslash (the backslash itself is already consumed).
+ // In Perl-style mode a digit starts a backreference, `p`/`P` introduce
+ // unicode categories, `b`/`B` are (non-)word-boundaries and any other ASCII
+ // letter is parsed as a character class. Every other character is taken
+ // literally. A backslash at the end of the buffer is a parse error.
+ parseBackslashLiteral = (buf: RegExBuf) => {
+   if (more(buf)) {
+     match (peek(buf)) {
+       Err(e) => Err(e),
+       // pregexp:
+       Ok(c) when (buf.config.isPerlRegExp) && (Char.code('0') <= Char.code(c) && Char.code(c) <= Char.code('9')) => {
+         buf.config.references := true
+         Result.map(n => REReference(n, buf.config.caseSensitive), parseInteger(buf, 0))
+       },
+       Ok(c) when (buf.config.isPerlRegExp) && ((Char.code('a') <= Char.code(c) && Char.code(c) <= Char.code('z')) || (Char.code('A') <= Char.code(c) && Char.code(c) <= Char.code('Z'))) => {
+         match (c) {
+           'p' => {
+             ignore(eat(buf, 'p'))
+             Result.map(((cats, negated)) => REUnicodeCategories(cats, negated), parseUnicodeCategories(buf, "p"))
+           },
+           'P' => {
+             ignore(eat(buf, 'P'))
+             // `\P` inverts whatever negation the property name itself carried
+             Result.map(((cats, negated)) => REUnicodeCategories(cats, !negated), parseUnicodeCategories(buf, "P"))
+           },
+           'b' => {
+             ignore(eat(buf, 'b'))
+             Ok(REWordBoundary)
+           },
+           'B' => {
+             ignore(eat(buf, 'B'))
+             Ok(RENotWordBoundary)
+           },
+           _ => {
+             match (parseClass(buf)) {
+               Ok(rng) => Ok(makeRERange(rng, rangeLimit)),
+               // the class parser's own error is replaced by a generic one
+               Err(_) => Err(parseErr(buf, "illegal alphabetic escape", 0)),
+             }
+           },
+         }
+       },
+       Ok(c) => {
+         ignore(next(buf))
+         Ok(RELiteral(c))
+       },
+     }
+   } else {
+     Err(parseErr(buf, "Expected to find escaped value after backslash", 0))
+   }
+ },
1298
+
1299
+ // Reads the optional `?` that marks a quantifier as non-greedy.
+ // Returns Ok(true) when the marker was present and Ok(false) otherwise; a
+ // further `?`, `*` or `+` right after is reported as a nested quantifier.
+ parseNonGreedy = (buf: RegExBuf) => {
+   // Yields Ok(isNonGreedy) unless another quantifier character follows
+   let checkNotNested = (isNonGreedy) => {
+     if (more(buf)) {
+       match (peek(buf)) {
+         Err(e) => Err(e),
+         Ok(c) when (c == '?' || c == '*' || c == '+') => {
+           Err(parseErr(buf, "nested '" ++ toString(c) ++ "' in pattern", 0))
+         },
+         Ok(_) => Ok(isNonGreedy),
+       }
+     } else {
+       Ok(isNonGreedy)
+     }
+   }
+   if (more(buf)) {
+     match (peek(buf)) {
+       Err(e) => Err(e),
+       Ok('?') => {
+         ignore(eat(buf, '?'))
+         checkNotNested(true)
+       },
+       Ok(_) => checkNotNested(false),
+     }
+   } else {
+     Ok(false)
+   }
+ },
1326
+
1327
+ // Parses one atom together with an optional quantifier: `*`, `+`, `?`, or
+ // (Perl-style mode only) a `{n}`, `{n,}` or `{n,m}` repetition. Each
+ // quantifier may be followed by the non-greedy marker.
+ parsePCE = (buf: RegExBuf) => {
+   Result.flatMap(atom => {
+     // Reads the optional non-greedy marker and builds the quantified node from it
+     let withGreediness = (build) => Result.map(build, parseNonGreedy(buf))
+     if (more(buf)) {
+       match (peek(buf)) {
+         Err(e) => Err(e),
+         Ok('*') => {
+           ignore(eat(buf, '*'))
+           withGreediness(ng => RERepeat(atom, 0, None, ng))
+         },
+         Ok('+') => {
+           ignore(eat(buf, '+'))
+           withGreediness(ng => RERepeat(atom, 1, None, ng))
+         },
+         Ok('?') => {
+           ignore(eat(buf, '?'))
+           withGreediness(ng => REMaybe(atom, ng))
+         },
+         Ok('{') when buf.config.isPerlRegExp => {
+           ignore(eat(buf, '{'))
+           Result.flatMap(n1 => {
+             match (peek(buf)) {
+               Ok(',') => {
+                 ignore(eat(buf, ','))
+                 let upperStart = unbox(buf.cursor)
+                 Result.flatMap(n2 => {
+                   match (peek(buf)) {
+                     Err(e) => Err(e),
+                     Ok('}') => {
+                       // for `{n,}`, we match >= n times, so the upper bound is unbounded
+                       let n2adj = if (upperStart == unbox(buf.cursor)) None else Some(n2)
+                       ignore(eat(buf, '}'))
+                       withGreediness(ng => RERepeat(atom, n1, n2adj, ng))
+                     },
+                     Ok(_) => Err(parseErr(buf, "expected digit or `}` to end repetition specification started with `{`", 0)),
+                   }
+                 }, parseInteger(buf, 0))
+               },
+               Ok('}') => {
+                 ignore(eat(buf, '}'))
+                 // match exactly n1 times
+                 withGreediness(ng => RERepeat(atom, n1, Some(n1), ng))
+               },
+               _ => Err(parseErr(buf, "expected digit, `,`, or `}' for repetition specification started with `{`", 0)),
+             }
+           }, parseInteger(buf, 0))
+         },
+         Ok(_) => Ok(atom),
+       }
+     } else {
+       Ok(atom)
+     }
+   }, parseAtom(buf))
+ },
1404
+
1405
+ // Parses a sequence of quantified atoms up to (not including) the next `|`
+ // or `)`, or up to the end of the buffer, and returns them in order.
+ // When `toplevel` is true a `)` following an element is reported as unmatched.
+ parsePCEs = (buf: RegExBuf, toplevel: Bool) => {
+   if (more(buf)) {
+     Result.flatMap(pce => {
+       if (more(buf)) {
+         match (peek(buf)) {
+           Err(e) => Err(e),
+           Ok('|') => Ok([pce]),
+           Ok(')') when toplevel => Err(parseErr(buf, "Unmatched `)`", 0)),
+           Ok(')') => Ok([pce]),
+           Ok(_) => Result.map(rest => [pce, ...rest], parsePCEs(buf, toplevel)),
+         }
+       } else {
+         Ok([pce])
+       }
+     }, parsePCE(buf))
+   } else {
+     Ok([])
+   }
+ },
1432
+
1433
+ // Parses a possibly empty regex made of `|`-separated sequences.
+ // An exhausted buffer or an immediate `)` yields `REEmpty`; otherwise the
+ // first sequence is parsed and, if a `|` follows, combined with the
+ // recursively parsed remainder.
+ parseRegex = (buf: RegExBuf) => {
+   if (more(buf)) {
+     match (peek(buf)) {
+       Err(e) => Err(e),
+       Ok(')') => Ok(REEmpty),
+       Ok(_) => {
+         Result.flatMap(pces => {
+           if (more(buf)) {
+             match (peek(buf)) {
+               Err(e) => Err(e),
+               Ok('|') => {
+                 ignore(eat(buf, '|'))
+                 Result.map(rx2 => makeREAlts(makeRESequence(pces), rx2, rangeLimit), parseRegex(buf))
+               },
+               Ok(_) => Ok(makeRESequence(pces)),
+             }
+           } else {
+             Ok(makeRESequence(pces))
+           }
+         }, parsePCEs(buf, false))
+       },
+     }
+   } else {
+     Ok(REEmpty)
+   }
+ },
1469
+
1470
+ // Like the `parseRegex` member of this chain, but without the up-front
+ // checks that return `REEmpty`: it parses a sequence straight away and
+ // recurses on itself after each `|`.
+ parseRegexNonEmpty = (buf: RegExBuf) => {
+   Result.flatMap(pces => {
+     if (more(buf)) {
+       match (peek(buf)) {
+         Err(e) => Err(e),
+         Ok('|') => {
+           ignore(eat(buf, '|'))
+           Result.map(rx2 => makeREAlts(makeRESequence(pces), rx2, rangeLimit), parseRegexNonEmpty(buf))
+         },
+         Ok(_) => Ok(makeRESequence(pces)),
+       }
+     } else {
+       Ok(makeRESequence(pces))
+     }
+   }, parsePCEs(buf, false))
+ }
1494
+
1495
+ // Top-level entry point: parses the first sequence with `toplevel` set, so a
+ // stray `)` after an element is reported as unmatched.
+ // NOTE(review): this binding is not `rec`, so the `parseRegex` call after `|`
+ // presumably resolves to the earlier, non-toplevel `parseRegex` — confirm
+ // that this is intended.
+ let parseRegex = (buf: RegExBuf) => {
+   Result.flatMap(pces => {
+     if (more(buf)) {
+       match (peek(buf)) {
+         Err(e) => Err(e),
+         Ok('|') => {
+           ignore(eat(buf, '|'))
+           Result.map(rx2 => makeREAlts(makeRESequence(pces), rx2, rangeLimit), parseRegex(buf))
+         },
+         Ok(_) => Ok(makeRESequence(pces)),
+       }
+     } else {
+       Ok(makeRESequence(pces))
+     }
+   }, parsePCEs(buf, true))
+ }
1519
+
1520
+
1521
+ /*
1522
+
1523
+ REGEX ANALYSIS
1524
+ -------
1525
+
1526
+ In addition to the parse tree, we take four analyses from Racket:
1527
+ - isAnchored, which checks if a matching string must match at the beginning (avoids useless backtracking)
1528
+ - mustString, which determines if there is a substring which must appear in matches that we can use to filter out non-matching strings
1529
+ - startRange, which determines if there is a closed set of characters which must appear at the beginning of any match
1530
+ - validate, which performs consistency checks across the groups defined in the regex.
1531
+
1532
+ */
1533
+
1534
+ // is-anchored:
1535
+
1536
+ // Reports whether every match of `re` has to begin at the start anchor.
+ // In a sequence, leading lookaheads/lookbehinds are skipped and the first
+ // remaining element decides; alternatives and conditionals need every
+ // branch anchored (a conditional without an else-branch is not anchored).
+ let rec isAnchored = (re: ParsedRegularExpression) => {
+   match (re) {
+     REStart => true,
+     RESequence(elts, _) => {
+       let rec firstRealElt = (rest) => {
+         match (rest) {
+           [] => false,
+           [RELookahead(_, _, _, _), ...tl] => firstRealElt(tl),
+           [RELookbehind(_, _, _, _, _, _), ...tl] => firstRealElt(tl),
+           [hd, ..._] => isAnchored(hd),
+         }
+       }
+       firstRealElt(elts)
+     },
+     REAlts(a, b) => isAnchored(a) && isAnchored(b),
+     REConditional(_, rx1, rx2, _, _, _) => {
+       isAnchored(rx1) && match (rx2) {
+         Some(elseRx) => isAnchored(elseRx),
+         None => false,
+       }
+     },
+     REGroup(rx, _) => isAnchored(rx),
+     RECut(rx, _, _, _) => isAnchored(rx),
+     _ => false,
+   }
+ }
1561
+
1562
+ // must-string:
1563
+
1564
+ // Finds a literal string that must occur in every match of `re`, if any.
+ // For a sequence the longest candidate among its elements is kept (the
+ // earlier one wins ties); a repetition only contributes when its minimum
+ // count is non-zero; only positive lookarounds contribute.
+ let rec mustString = (re: ParsedRegularExpression) => {
+   match (re) {
+     RELiteral(c) => Some(Char.toString(c)),
+     RELiteralString(s) => Some(s),
+     RESequence(pces, _) => {
+       List.reduce((best, pce) => {
+         let candidate = mustString(pce)
+         match (best) {
+           None => candidate,
+           Some(bestStr) => {
+             match (candidate) {
+               None => best,
+               Some(candStr) => if (String.length(candStr) > String.length(bestStr)) candidate else best,
+             }
+           },
+         }
+       }, None, pces)
+     },
+     RERepeat(_, minCount, _, _) when minCount == 0 => None,
+     RERepeat(inner, _, _, _) => mustString(inner),
+     REGroup(inner, _) => mustString(inner),
+     RECut(inner, _, _, _) => mustString(inner),
+     RELookahead(inner, true, _, _) => mustString(inner),
+     RELookbehind(inner, true, _, _, _, _) => mustString(inner),
+     _ => None,
+   }
+ }
1592
+
1593
+ // start-range
1594
+
1595
+ // True for the node kinds listed below, which `startRange` skips when it
+ // looks for the first consuming element of a sequence: the empty pattern,
+ // start anchors, word-boundary assertions and lookarounds. Groups and
+ // cuts are judged by what they wrap; everything else is false.
+ let rec zeroSized = (re) => {
+   match (re) {
+     REGroup(inner, _) => zeroSized(inner),
+     RECut(inner, _, _, _) => zeroSized(inner),
+     RELookahead(_, _, _, _) => true,
+     RELookbehind(_, _, _, _, _, _) => true,
+     REWordBoundary => true,
+     RENotWordBoundary => true,
+     REStart => true,
+     RELineStart => true,
+     REEmpty => true,
+     _ => false,
+   }
+ }
1609
+
1610
+ // Computes the set of characters a match of `re` can start with, or None
+ // when no such closed set is known. Sequences use their first element that
+ // is not zero-sized; alternatives and conditionals need a range from every
+ // branch and return the union; repetitions count only with a minimum > 0.
+ let rec startRange = (re) => {
+   match (re) {
+     RELiteral(c) => Some(rangeAdd([], Char.code(c))),
+     RELiteralString(s) => Some(rangeAdd([], Char.code(String.charAt(0, s)))),
+     RESequence(elts, _) => {
+       let rec firstConsuming = (rest) => {
+         match (rest) {
+           [] => None,
+           [hd, ...tl] when zeroSized(hd) => firstConsuming(tl),
+           [hd, ..._] => startRange(hd),
+         }
+       }
+       firstConsuming(elts)
+     },
+     REAlts(re1, re2) => {
+       Option.flatMap(rng1 => {
+         Option.map(rng2 => rangeUnion(rng1, rng2), startRange(re2))
+       }, startRange(re1))
+     },
+     REConditional(_, re1, re2, _, _, _) => {
+       Option.flatMap(rng1 => {
+         Option.flatMap(elseRe => {
+           Option.map(rng2 => rangeUnion(rng1, rng2), startRange(elseRe))
+         }, re2)
+       }, startRange(re1))
+     },
+     REGroup(inner, _) => startRange(inner),
+     RECut(inner, _, _, _) => startRange(inner),
+     RERepeat(inner, minCount, _, _) when minCount > 0 => startRange(inner),
+     RERange(rng) => Some(rng),
+     _ => None,
+   }
+ }
1658
+
1659
+ // validate:
1660
+
1661
+ // Failure kinds recorded by `validate`; each one is turned into an error message at the end of `validate`.
+ enum ValidateError {
1662
+ // the operand of a `*`, `+`, or `{...}` repetition could be empty
+ MightBeEmpty,
1663
+ // a lookbehind pattern does not match a bounded length
+ DoesNotMatchBounded,
1664
+ // a backreference number is larger than the number of groups
+ BackreferenceTooBig,
1665
+ // carries the offending expression; reported as an internal error
+ InternalError(ParsedRegularExpression),
1666
+ }
1667
+
1668
+ /**
+   Performs consistency checks on a parsed regex and fills in the bounds of
+   every lookbehind node (its `lbMin` / `lbMax` boxes).
+
+   `re` is the parsed expression and `numGroups` the number of groups it
+   defines. Returns `Ok(maxLookbehind)` on success, or `Err(message)` when a
+   repetition operand could be empty, a lookbehind has no bounded length, or
+   a backreference number is larger than `numGroups`.
+ */
+ let rec validate = (re: ParsedRegularExpression, numGroups) => {
+   // minimum match length of each group, keyed by group number
+   let groupSizes = Map.make()
+   let mut dependsSizes = Map.make()
+   let mut mustSizes = Map.make()
+   // to avoid excess allocations inside of `loop`, we set a flag
+   // which is checked at the end of the function.
+   let mut thrownError = None
+   // Merges the entries of the smaller map into the larger one and returns the larger
+   let rec mergeDependsSizes = (ht1, ht2) => {
+     if (Map.size(ht1) == 0) {
+       ht2
+     } else if (Map.size(ht1) > Map.size(ht2)) {
+       mergeDependsSizes(ht2, ht1)
+     } else {
+       Map.forEach((k, v) => Map.set(k, v, ht2), ht1)
+       ht2
+     }
+   }
+   /**
+     Computes the range of possible UTF-8 byte lengths for the given character range
+   */
+   // NOTE: not called anywhere in `validate` at the moment.
+   let rangeUtf8EncodingLengths = (rng) => {
+     // The segments are the code point ranges encoded in 1, 2, 3 and 4 bytes;
+     // the 3-byte range ends at 0xffff (it was mistakenly cut off at 0x7fff).
+     let (shortest, longest, _) = List.reduce(((min1, max1, n), (segStart, segEnd)) => {
+       if (rangeOverlaps(rng, segStart, segEnd)) {
+         (min(min1, n), max(max1, n), n + 1)
+       } else {
+         (min1, max1, n + 1)
+       }
+     }, (4, 0, 1), [(0, 127), (128, 0x7ff), (0x800, 0xffff), (0x10000, 0x10ffff)])
+     (shortest, longest)
+   }
+   // Returns (minimum length, maximum length, maximum lookbehind) for a node
+   let rec loop = (re) => {
+     match(re) {
+       RENever => (1, 1, 0),
+       REAny => (1, 1, 0),
+       RELiteral(_) => (1, 1, 0),
+       RERange(_) => (1, 1, 0),
+       RELiteralString(s) => {
+         let ls = String.length(s)
+         (ls, ls, 0)
+       },
+       REEmpty => (0, 0, 0),
+       REEnd => (0, 0, 0),
+       RELineEnd => (0, 0, 0),
+       REStart => (0, 0, 1),
+       RELineStart => (0, 0, 1),
+       REWordBoundary => (0, 0, 1),
+       RENotWordBoundary => (0, 0, 1),
+       REAlts(re1, re2) => {
+         let (min1, max1, maxL1) = loop(re1)
+         let (min2, max2, maxL2) = loop(re2)
+         (min(min1, min2), max(max1, max2), max(maxL1, maxL2))
+       },
+       RESequence(elts, _) => {
+         List.reduce(((accMin, accMax, accMaxL), e) => {
+           let (minE, maxE, maxLE) = loop(e)
+           (accMin + minE, accMax + maxE, max(accMaxL, maxLE))
+         }, (0, 0, 0), elts)
+       },
+       REGroup(re, n) => {
+         let (min1, max1, maxL1) = loop(re)
+         Map.set(n, min1, groupSizes)
+         (min1, max1, maxL1)
+       },
+       RERepeat(re, repeatMin, repeatMax, nonGreedy) => {
+         // analyse the operand with fresh maps, then merge them back
+         let oldDependsSizes = dependsSizes
+         dependsSizes = Map.make()
+         let oldMustSizes = mustSizes
+         mustSizes = Map.make()
+         let (min1, max1, maxL1) = loop(re)
+         if (min1 == 0) {
+           thrownError = Some(MightBeEmpty)
+           (0, 0, 0)
+         } else {
+           mustSizes = mergeDependsSizes(oldMustSizes, mustSizes)
+           dependsSizes = mergeDependsSizes(oldDependsSizes, dependsSizes)
+           // an absent upper bound means "unbounded"
+           let repeatMax = match(repeatMax) {
+             None => Float32.toNumber(Float32.infinity),
+             Some(n) => n
+           }
+           (min1 * repeatMin, max1 * repeatMax, maxL1)
+         }
+       },
+       REMaybe(re, nonGreedy) => {
+         let (_, max1, maxL1) = loop(re)
+         (0, max1, maxL1)
+       },
+       REConditional(reTest, reTrue, reFalse, _, _, _) => {
+         let (min1, max1, maxL1) = loop(reTest)
+         let (min2, max2, maxL2) = loop(reTrue)
+         let (min3, max3, maxL3) = Option.mapWithDefault(loop, (0, 0, 0), reFalse)
+         (min(min2, min3), max(max2, max3), max(max(maxL1, maxL2), maxL3))
+       },
+       RELookahead(re, _, _, _) => {
+         let (_, _, maxL1) = loop(re)
+         (0, 0, maxL1)
+       },
+       RELookbehind(re, _, lbMin, lbMax, _, _) => {
+         let (min1, max1, maxL1) = loop(re)
+         if (max1 == Float32.toNumber(Float32.infinity)) {
+           thrownError = Some(DoesNotMatchBounded)
+           (0, 0, 0)
+         } else {
+           // record the bounds on the lookbehind node itself
+           lbMin := min1
+           lbMax := max1
+           (0, 0, max(max1, maxL1))
+         }
+       },
+       RECut(re, _, _, _) => {
+         loop(re)
+       },
+       REReference(n, _) => {
+         if (n > numGroups) {
+           thrownError = Some(BackreferenceTooBig)
+           (0, 0, 0)
+         } else {
+           match(Map.get(n, groupSizes)) {
+             Some(minSize) => (minSize, Float32.toNumber(Float32.infinity), 0),
+             None => {
+               Map.set(n - 1, true, dependsSizes)
+               (1, Float32.toNumber(Float32.infinity), 0)
+             }
+           }
+         }
+       },
+       REUnicodeCategories(_, _) => (1, 4, 0)
+     }
+   }
+   let (minLen, maxLen, maxLookbehind) = loop(re)
+   // every key in `mustSizes` that has a recorded group size needs a positive one
+   Map.forEach((k, _) => {
+     match(Map.get(k, groupSizes)) {
+       None => void,
+       Some(sz) => {
+         if (sz <= 0) {
+           thrownError = Some(MightBeEmpty)
+         }
+       }
+     }
+   }, mustSizes)
+   match(thrownError) {
+     Some(MightBeEmpty) => Err("`*`, `+`, or `{...}` operand could be empty"),
+     Some(DoesNotMatchBounded) => Err("lookbehind pattern does not match a bounded length"),
+     Some(BackreferenceTooBig) => Err("backreference number is larger than the highest-numbered cluster"),
+     Some(InternalError(re)) => Err("regex validate: Internal error: " ++ toString(re)),
+     None => Ok(maxLookbehind)
+   }
+ }
1814
+
1815
+
1816
+ /*
1817
+
1818
+ =========================
1819
+ REGEX MATCHER COMPILATION
1820
+ =========================
1821
+
1822
+ */
1823
+
1824
+
1825
+ // Input handed to the matchers: the original string together with its
+ // characters exploded into an array.
+ record MatchBuf {
+   matchInput: String,
+   matchInputExploded: Array<Char>,
+ }
+
+ // Builds a MatchBuf for `s`, exploding the string once up front
+ let makeMatchBuffer = (s) => {
+   let chars = String.explode(s)
+   { matchInput: s, matchInputExploded: chars }
+ }
+
+ // True while `pos` is below the number of characters in the buffer
+ let matchBufMore = (buf: MatchBuf, pos: Number) => {
+   Array.length(buf.matchInputExploded) > pos
+ }
+
+ // Returns the character at `pos`, or an Err once `pos` reaches the end of the buffer
+ let matchBufChar = (buf: MatchBuf, pos: Number) => {
+   if (pos < Array.length(buf.matchInputExploded)) {
+     Ok(buf.matchInputExploded[pos])
+   } else {
+     Err("end of match buffer reached")
+   }
+ }
1848
+
1849
+ // Entries of the `stack` argument that is threaded through the matchers.
+ enum StackElt {
1850
+ // wraps a function from a position to an optional position; `continue_m` calls the one at the top of the stack with the current position
+ SEPositionProducer(Number -> Option<Number>),
1851
+ // a number plus an optional pair of numbers — presumably a group index and its saved span; TODO confirm against the code that builds it
+ SESavedGroup(Number, Option<(Number, Number)>),
1852
+ }
1853
+
1854
+ // Terminal matcher: always succeeds at the current position
+ let done_m = (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
+   Some(pos)
+ }
+ // Terminal matcher: passes the current position to the position producer on
+ // top of the stack; any other stack shape is treated as impossible
+ let continue_m = (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
+   match (stack) {
+     [SEPositionProducer(produce), ..._] => produce(pos),
+     _ => fail "Impossible: continue_m",
+   }
+ }
+ // Terminal matcher: succeeds only when the position is exactly `limit`
+ let limit_m = (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
+   if (pos == limit) {
+     Some(pos)
+   } else {
+     None
+   }
+ }
1862
+
1863
+
1864
+ // Builds a matcher that applies the fixed-width test `m` (each success
+ // covers `size` positions) as many times in a row as possible, at most `max`
+ // times when `max` is given and never past `limit`.
+ // The result is (end position, number of successes, size).
+ let iterateMatcher = (m, size, max) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
+   // tighten the limit so that at most `max` repetitions fit
+   let cap = match (max) {
+     None => limit,
+     Some(maxCount) => min(limit, pos + (maxCount * size)),
+   }
+   let mut cur = pos
+   let mut count = 0
+   while (cur + size <= cap && m(buf, cur, start, cap, end, state, stack)) {
+     cur += size
+     count += 1
+   }
+   (cur, count, size)
+ }
1879
+
1880
+ // single-char matching
1881
+
1882
+ // Matches the single character `toMatch` at the current position (which
+ // must be below `limit`) and continues with `next_m` one position further
+ let charMatcher = (toMatch, next_m) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
+   let hit = pos < limit && matchBufChar(buf, pos) == Ok(toMatch)
+   if (hit) {
+     next_m(buf, pos + 1, start, limit, end, state, stack)
+   } else {
+     None
+   }
+ }
1890
+
1891
+ // Like `charMatcher`, but with nothing after it: on success the result is
+ // simply the position after the matched character
+ let charTailMatcher = (toMatch) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
+   let hit = pos < limit && matchBufChar(buf, pos) == Ok(toMatch)
+   if (hit) {
+     Some(pos + 1)
+   } else {
+     None
+   }
+ }
1899
+
1900
+ // Repeated form of the single-character match: counts consecutive
+ // occurrences of `toMatch` via `iterateMatcher` (width 1, at most `max`)
+ let charMatcherIterated = (toMatch, max) => {
+   let matchesHere = (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
+     matchBufChar(buf, pos) == Ok(toMatch)
+   }
+   iterateMatcher(matchesHere, 1, max)
+ }
1906
+
1907
+ // string matching
1908
+
1909
+ // Compares `length` elements of `arr1` starting at `start1` with `length`
+ // elements of `arr2` starting at `start2`. Returns false when either array
+ // has fewer than `length` elements left from its start index.
+ let subArraysEqual = (arr1, start1, arr2, start2, length) => {
+   let bothFit = Array.length(arr1) - start1 >= length && Array.length(arr2) - start2 >= length
+   if (bothFit) {
+     // advance while the elements agree; reaching `length` means no mismatch
+     let mut offset = 0
+     while (offset < length && arr1[start1 + offset] == arr2[start2 + offset]) {
+       offset += 1
+     }
+     offset >= length
+   } else {
+     false
+   }
+ }
1923
+
1924
+ // Matches the literal `toMatch` (of `len` characters) at the current
+ // position, provided it ends at or before `limit`, and continues with
+ // `next_m` right after it.
+ // The literal is exploded once when the matcher is built rather than on
+ // every match attempt.
+ let stringMatcher = (toMatch, len, next_m) => {
+   let needle = String.explode(toMatch)
+   let matcher = (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
+     if ({
+       pos + len <= limit && subArraysEqual(buf.matchInputExploded, pos, needle, 0, len)
+     }) next_m(buf, pos + len, start, limit, end, state, stack) else None
+   }
+   matcher
+ }
1929
+
1930
+ // Like `stringMatcher`, but with nothing after it: on success the result is
+ // the position right after the literal.
+ // The literal is exploded once when the matcher is built rather than on
+ // every match attempt.
+ let stringTailMatcher = (toMatch, len) => {
+   let needle = String.explode(toMatch)
+   let matcher = (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
+     if ({
+       pos + len <= limit && subArraysEqual(buf.matchInputExploded, pos, needle, 0, len)
+     }) Some(pos + len) else None
+   }
+   matcher
+ }
1935
+
1936
+ // Repeated form of the literal-string match: counts consecutive occurrences
+ // of `toMatch` via `iterateMatcher` (width `len`, at most `max`).
+ // The literal is exploded once here instead of on every iteration.
+ let stringMatcherIterated = (toMatch, len, max) => {
+   let needle = String.explode(toMatch)
+   iterateMatcher((buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
+     subArraysEqual(buf.matchInputExploded, pos, needle, 0, len)
+   }, len, max)
+ }
1939
+
1940
+
1941
+ // match nothing
1942
+
1943
+ // Matcher that fails at every position
+ let neverMatcher = (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => None
1946
+
1947
+ // match any character
1948
+
1949
+ // Accepts whatever is at the current position, as long as it is below
+ // `limit`, and continues with `next_m` one position further
+ let anyMatcher = (next_m) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
+   if (pos < limit) {
+     next_m(buf, pos + 1, start, limit, end, state, stack)
+   } else {
+     None
+   }
+ }
1954
+
1955
+ // Like `anyMatcher`, but with nothing after it: yields the next position
+ // when the current one is below `limit`
+ let anyTailMatcher = () => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
+   if (pos < limit) {
+     Some(pos + 1)
+   } else {
+     None
+   }
+ }
1960
+
1961
+ // Repeated form of the match-anything matcher: takes everything up to
+ // `limit`, or at most `max` positions when `max` is given.
+ // The result is (end position, number taken, 1).
+ let anyMatcherIterated = (max) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
+   let available = limit - pos
+   let taken = match (max) {
+     Some(cap) => min(cap, available),
+     None => available,
+   }
+   (pos + taken, taken, 1)
+ }
1968
+
1969
+ // match character in set (range)
1970
+
1971
+ // Matches one character whose code is contained in `rng` at the current
+ // position (which must be below `limit`) and continues with `next_m`
+ let rangeMatcher = (rng, next_m) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
+   let inRange = pos < limit && match (matchBufChar(buf, pos)) {
+     Ok(c) => rangeContains(rng, Char.code(c)),
+     Err(_) => false,
+   }
+   if (inRange) {
+     next_m(buf, pos + 1, start, limit, end, state, stack)
+   } else {
+     None
+   }
+ }
1979
+
1980
+ // Like `rangeMatcher`, but with nothing after it: on success the result is
+ // the position after the matched character
+ let rangeTailMatcher = (rng) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
+   let inRange = pos < limit && match (matchBufChar(buf, pos)) {
+     Ok(c) => rangeContains(rng, Char.code(c)),
+     Err(_) => false,
+   }
+   if (inRange) {
+     Some(pos + 1)
+   } else {
+     None
+   }
+ }
1988
+
1989
+ // Repeated form of the range match: counts consecutive characters whose
+ // code lies in `rng` via `iterateMatcher` (width 1, at most `max`)
+ let rangeMatcherIterated = (rng, max) => {
+   let inRangeHere = (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
+     match (matchBufChar(buf, pos)) {
+       Ok(c) => rangeContains(rng, Char.code(c)),
+       Err(_) => false,
+     }
+   }
+   iterateMatcher(inRangeHere, 1, max)
+ }
1995
+
1996
+ // zero-width matchers
1997
+
1998
+ // Zero-width: continues with `next_m` only when positioned at `start`
+ let startMatcher = (next_m) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
+   if (pos == start) {
+     next_m(buf, pos, start, limit, end, state, stack)
+   } else {
+     None
+   }
+ }
+
+ // Zero-width: continues with `next_m` only when positioned at `end`
+ let endMatcher = (next_m) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
+   if (pos == end) {
+     next_m(buf, pos, start, limit, end, state, stack)
+   } else {
+     None
+   }
+ }
+
+ // Zero-width: continues at `start` or right after a newline character
+ let lineStartMatcher = (next_m) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
+   let atLineStart = pos == start || matchBufChar(buf, pos - 1) == Ok('\n')
+   if (atLineStart) {
+     next_m(buf, pos, start, limit, end, state, stack)
+   } else {
+     None
+   }
+ }
+
+ // Zero-width: continues at `end` or right before a newline character
+ let lineEndMatcher = (next_m) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
+   let atLineEnd = pos == end || matchBufChar(buf, pos) == Ok('\n')
+   if (atLineEnd) {
+     next_m(buf, pos, start, limit, end, state, stack)
+   } else {
+     None
+   }
+ }
2013
+
2014
+ // Decides whether the result of reading a character is a word character,
+ // i.e. an ASCII digit, an ASCII letter, or `_`. An `Err` (nothing to read at
+ // that position) is never a word character.
+ // The underscore is compared for equality: the previous `<=` comparison
+ // also accepted every character with a code above `_`, such as `{` or `~`.
+ let isWordChar = (c) => {
+   match (c) {
+     Err(_) => false,
+     Ok(ch) => {
+       let code = Char.code(ch)
+       (Char.code('0') <= code && code <= Char.code('9')) ||
+       (Char.code('a') <= code && code <= Char.code('z')) ||
+       (Char.code('A') <= code && code <= Char.code('Z')) ||
+       ch == '_'
+     },
+   }
+ }
2024
+
2025
+ // A position is a word boundary when exactly one of its two sides is a
+ // non-word side; the start/end of the input count as non-word sides.
+ let isWordBoundary = (buf, pos, start, limit, end) => {
+   let nonWordBefore = pos == start || !isWordChar(matchBufChar(buf, pos - 1))
+   let nonWordAfter = pos == end || !isWordChar(matchBufChar(buf, pos))
+   nonWordBefore != nonWordAfter
+ }
2028
+
2029
+ // Zero-width: continues with `next_m` only at a word boundary
+ let wordBoundaryMatcher = (next_m) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
+   if (isWordBoundary(buf, pos, start, limit, end)) {
+     next_m(buf, pos, start, limit, end, state, stack)
+   } else {
+     None
+   }
+ }
+
+ // Zero-width: continues with `next_m` only where there is no word boundary
+ let notWordBoundaryMatcher = (next_m) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
+   if (isWordBoundary(buf, pos, start, limit, end)) {
+     None
+   } else {
+     next_m(buf, pos, start, limit, end, state, stack)
+   }
+ }
2036
+
2037
+ // Alternatives
2038
+
2039
+ // Tries `m1` first and falls back to `m2` from the same position when `m1` fails
+ let altsMatcher = (m1, m2) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
+   let first = m1(buf, pos, start, limit, end, state, stack)
+   match (first) {
+     Some(_) => first,
+     None => m2(buf, pos, start, limit, end, state, stack),
+   }
+ }
2045
+
2046
+ // repeats, greedy (default) and non-greedy
2047
+
2048
+ // Greedy repetition of `r_m` between `min` and `max` times (`max` of None
+ // means no upper bound), followed by `next_m`.
+ // Each run of `r_m` gets a stack topped with a position producer that
+ // re-enters the loop with the count increased by one. Below `min` another
+ // repetition is mandatory; at `max` only `next_m` is tried; in between
+ // another repetition is tried first and `next_m` is the fallback.
+ let repeatMatcher = (r_m, min, max, next_m) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
+   let rec rloop = (pos, n) => {
+     let repeatOnce = () => {
+       let newStack = [SEPositionProducer(nextPos => rloop(nextPos, n + 1)), ...stack]
+       r_m(buf, pos, start, limit, end, state, newStack)
+     }
+     let moveOn = () => next_m(buf, pos, start, limit, end, state, stack)
+     let reachedMax = match (max) {
+       Some(bound) => bound == n,
+       None => false,
+     }
+     if (n < min) {
+       repeatOnce()
+     } else if (reachedMax) {
+       moveOn()
+     } else {
+       match (repeatOnce()) {
+         None => moveOn(),
+         found => found,
+       }
+     }
+   }
+   rloop(pos, 0)
+ }
2065
+
2066
+ // One-element stack whose position producer returns the position it is given
+ let rStack = [SEPositionProducer(reached => Some(reached))]
2067
+
2068
+ // Copies `src[srcStart]` up to (not including) `src[srcEnd]` into `dest`,
+ // beginning at `dest[destStart]`
+ let arrayCopy = (dest, destStart, src, srcStart, srcEnd) => {
+   for (let mut offset = 0; srcStart + offset < srcEnd; offset += 1) {
+     dest[destStart + offset] = src[srcStart + offset]
+   }
+ }
2075
+
2076
// Snapshots `numN` group slots of `state`, starting at index `nStart`.
// Returns an empty array when there is nothing to save (numN is 0) or when
// `state` itself is empty.
let saveGroups = (state, nStart, numN) => {
  if (numN == 0 || Array.length(state) == 0) {
    Array.make(0, None)
  } else {
    let snapshot = Array.make(numN, None)
    arrayCopy(snapshot, 0, state, nStart, nStart + numN)
    snapshot
  }
}
2087
+
2088
// Writes a snapshot taken by `saveGroups` back into `state` at `nStart`.
// An empty snapshot is a no-op. `numN` is accepted but the snapshot's own
// length decides how many slots are restored.
let restoreGroups = (state, oldState, nStart, numN) => {
  let savedCount = Array.length(oldState)
  if (savedCount > 0) {
    arrayCopy(state, nStart, oldState, 0, savedCount)
  }
}
2093
+
2094
// Records the span of the last iteration of a repeated group, then invokes
// `callback` with a thunk that undoes the recording.
//
// When `groupN` is Some and `state` is non-empty, the slot is set to None
// for zero iterations, otherwise to (pos - backAmt, pos). In every other
// case nothing is recorded and the revert thunk does nothing.
let addRepeatedGroup = (groupN, state, pos, n, backAmt, callback) => {
  match (groupN) {
    Some(groupIdx) when Array.length(state) > 0 => {
      let previousSpan = state[groupIdx]
      let lastIterationSpan = if (n == 0) None else Some((pos - backAmt, pos))
      state[groupIdx] = lastIterationSpan
      callback(() => { state[groupIdx] = previousSpan })
    },
    _ => callback(() => void)
  }
}
2108
+
2109
// Greedy repetition for a pattern that is run with `rStack`, so each run of
// `r_m` yields the position it reached (or None).
//
// Phase 1 (rloop): consume as many iterations as allowed by `max`,
// remembering the width of the most recent iteration in `backAmt`.
// Phase 2 (bloop): try `next_m`; on failure step back by `backAmt` and
// retry with one fewer iteration, giving up once fewer than `min` remain.
let repeatSimpleMatcher = (r_m, min, max, groupN, next_m) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  let rec rloop = (pos, n, backAmt) => {
    let mayRepeat = match (max) {
      Some(cap) => n < cap,
      None => true
    }
    let advanced = if (mayRepeat) {
      r_m(buf, pos, start, limit, end, state, rStack)
    } else {
      None
    }
    match (advanced) {
      Some(nextPos) => rloop(nextPos, n + 1, nextPos - pos),
      None => {
        // Perform backtracking
        let rec bloop = (tailPos, count) => {
          if (count >= min) {
            addRepeatedGroup(groupN, state, tailPos, count, backAmt, (groupRevert) => {
              match (next_m(buf, tailPos, start, limit, end, state, stack)) {
                None => {
                  groupRevert()
                  bloop(tailPos - backAmt, count - 1)
                },
                found => found
              }
            })
          } else {
            None
          }
        }
        bloop(pos, n)
      }
    }
  }
  rloop(pos, 0, 0)
}
2141
+
2142
// Greedy repetition driven by an "iterated" matcher: `r_m` is called once
// and returns (position reached, iteration count, width of one step back).
// From there `next_m` is tried, backing off one iteration at a time until
// it succeeds or fewer than `min` iterations remain. `max` is not consulted
// here.
let repeatSimpleManyMatcher = (r_m, min, max, groupN, next_m) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  let (farthest, iterations, backAmt) = r_m(buf, pos, start, limit, end, state, stack)
  let rec bloop = (tailPos, count) => {
    if (count >= min) {
      addRepeatedGroup(groupN, state, tailPos, count, backAmt, (groupRevert) => {
        match (next_m(buf, tailPos, start, limit, end, state, stack)) {
          None => {
            groupRevert()
            bloop(tailPos - backAmt, count - 1)
          },
          found => found
        }
      })
    } else {
      None
    }
  }
  bloop(farthest, iterations)
}
2161
+
2162
// Non-greedy repetition of `r_m` between `min` and `max` times (`max` of
// None means unbounded). Once the required count is met, `next_m` is
// preferred; only if it fails is the required count raised by one and the
// loop retried.
let lazyRepeatMatcher = (r_m, min, max, next_m) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  let atMax = (n) => {
    match (max) {
      Some(cap) => cap == n,
      None => false
    }
  }
  let rec rloop = (pos, n, required) => {
    if (n < required) {
      let continuation = SEPositionProducer(nextPos => rloop(nextPos, n + 1, required))
      r_m(buf, pos, start, limit, end, state, [continuation, ...stack])
    } else if (atMax(n)) {
      next_m(buf, pos, start, limit, end, state, stack)
    } else {
      match (next_m(buf, pos, start, limit, end, state, stack)) {
        None => rloop(pos, n, required + 1),
        found => found
      }
    }
  }
  rloop(pos, 0, min)
}
2176
+
2177
// Non-greedy repetition where `r_m` directly yields the position it
// reached. Required iterations are consumed first; afterwards `next_m` is
// preferred, and only when it fails is one more iteration demanded.
let lazyRepeatSimpleMatcher = (r_m, min, max, next_m) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  let atMax = (n) => {
    match (max) {
      Some(cap) => cap == n,
      None => false
    }
  }
  let rec rloop = (pos, n, required) => {
    if (n < required) {
      match (r_m(buf, pos, start, limit, end, state, stack)) {
        None => None,
        Some(nextPos) => rloop(nextPos, n + 1, required)
      }
    } else if (atMax(n)) {
      next_m(buf, pos, start, limit, end, state, stack)
    } else {
      match (next_m(buf, pos, start, limit, end, state, stack)) {
        None => rloop(pos, n, required + 1),
        found => found
      }
    }
  }
  rloop(pos, 0, min)
}
2193
+
2194
+ // Recording and referencing group matches
2195
+
2196
// Opens group `n`: pushes the current position together with the group's
// previous span (None when `state` is empty) onto the stack, then continues
// with `next_m`. The matching `groupSetMatcher` pops this entry.
let groupPushMatcher = (n, next_m) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  let previousSpan = if (Array.length(state) > 0) state[n] else None
  let savedEntry = SESavedGroup(pos, previousSpan)
  next_m(buf, pos, start, limit, end, state, [savedEntry, ...stack])
}
2200
+
2201
// Closes group `n`: pops the SESavedGroup entry left by `groupPushMatcher`,
// records (saved position, current position) as the group's span when
// `state` is non-empty, and continues with `next_m`. If the continuation
// fails, the group's previous span is put back before reporting None.
let groupSetMatcher = (n, next_m) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  match (stack) {
    [SESavedGroup(groupStart, previousSpan), ...remainingStack] => {
      let tracksGroups = Array.length(state) > 0
      if (tracksGroups) {
        state[n] = Some((groupStart, pos))
      }
      let outcome = next_m(buf, pos, start, limit, end, state, remainingStack)
      if (Option.isNone(outcome) && tracksGroups) {
        state[n] = previousSpan
      }
      outcome
    },
    _ => fail "Impossible: groupSetMatcher"
  }
}
2220
+
2221
// Builds a back-reference matcher factory.
//
// `eq` receives a pair of characters as a tuple and decides whether they
// count as equal. The resulting matcher looks up the span recorded for
// group `n` in `state`; it fails when the group has no span, when the span
// would run past `limit`, or when any character of the input at `pos`
// differs (according to `eq`) from the referenced text. On success it
// continues with `next_m` just past the repeated text.
//
// Previously `eq` was ignored and the comparison was always exact, which
// made the case-insensitive variant behave like the case-sensitive one.
let makeReferenceMatcher = (eq) => (n, next_m) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  match (state[n]) {
    None => None,
    Some((refStart, refEnd)) => {
      let len = refEnd - refStart
      let chars = buf.matchInputExploded
      // Compares the referenced span with the input at `pos`, one char at a time
      let rec sameFrom = (i) => {
        if (i >= len) {
          true
        } else if (eq((chars[refStart + i], chars[pos + i]))) {
          sameFrom(i + 1)
        } else {
          false
        }
      }
      // The bounds check must come first so `sameFrom` never reads past `limit`
      if (pos + len <= limit && sameFrom(0)) {
        next_m(buf, pos + len, start, limit, end, state, stack)
      } else {
        None
      }
    }
  }
}
2232
+
2233
// Back-reference matcher factory using plain `==` on the character pair.
let referenceMatcher = makeReferenceMatcher(((left, right)) => left == right)
2234
+
2235
// Lowercases an ASCII uppercase letter ('A'..'Z'); every other character is
// returned unchanged.
//
// The lower bound of the range check used to be 'Z', so only 'Z' itself was
// ever converted.
let asciiCharToLower = (c) => {
  let code = Char.code(c)
  if (Char.code('A') <= code && code <= Char.code('Z')) {
    // Upper- and lowercase ASCII letters are a fixed distance apart
    Char.fromCode(code + (Char.code('a') - Char.code('A')))
  } else {
    c
  }
}
2242
+
2243
// Back-reference matcher factory whose equality lowercases both characters
// with `asciiCharToLower` before comparing them.
let referenceMatcherCaseInsensitive = makeReferenceMatcher(((left, right)) => asciiCharToLower(left) == asciiCharToLower(right))
2244
+
2245
+ // Lookahead, Lookbehind, Conditionals, and Cut
2246
+
2247
// Lookahead: runs `sub_m` at `pos` without consuming input, then continues
// with `next_m` from the same `pos`.
//
// `isMatch` true  -> `sub_m` must succeed (positive lookahead).
// `isMatch` false -> `sub_m` must fail (negative lookahead).
// Groups [nStart, nStart + numN) are snapshotted first and restored on every
// failing path except the one where a negative lookahead passes and
// `next_m` is simply returned.
let lookaheadMatcher = (isMatch, sub_m, nStart, numN, next_m) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  let savedGroups = saveGroups(state, nStart, numN)
  let rollback = () => {
    restoreGroups(state, savedGroups, nStart, numN)
    None
  }
  let subMatched = Option.isSome(sub_m(buf, pos, start, limit, end, state, stack))
  if (subMatched != isMatch) {
    // Lookahead condition not satisfied
    rollback()
  } else if (isMatch) {
    match (next_m(buf, pos, start, limit, end, state, stack)) {
      None => rollback(),
      found => found
    }
  } else {
    next_m(buf, pos, start, limit, end, state, stack)
  }
}
2262
+
2263
// Lookbehind: tries `sub_m` at candidate start positions before `pos`,
// using `pos` as the sub-match's limit, then continues with `next_m` at
// `pos`.
//
// Candidates run from `pos - lbMin` downwards to max(start, pos - lbMax).
// Positive form (`isMatch` true): keeps scanning backwards until `sub_m`
// succeeds; fails when candidates run out or when `next_m` fails after a
// hit (groups are restored in the latter case).
// Negative form: the first candidate decides — a hit restores groups and
// fails, a miss continues with `next_m`.
let lookbehindMatcher = (isMatch, lbMin, lbMax, sub_m, nStart, numN, next_m) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  let earliest = max(start, pos - lbMax)
  let rec scan = (lbPos) => {
    if (lbPos < earliest) {
      // No candidate positions left
      if (isMatch) None else next_m(buf, pos, start, limit, end, state, stack)
    } else {
      let savedGroups = saveGroups(state, nStart, numN)
      let rollback = () => {
        restoreGroups(state, savedGroups, nStart, numN)
        None
      }
      let hit = Option.isSome(sub_m(buf, lbPos, start, pos, end, state, stack))
      if (isMatch) {
        if (hit) {
          match (next_m(buf, pos, start, limit, end, state, stack)) {
            None => rollback(),
            found => found
          }
        } else {
          scan(lbPos - 1)
        }
      } else if (hit) {
        rollback()
      } else {
        next_m(buf, pos, start, limit, end, state, stack)
      }
    }
  }
  scan(pos - lbMin)
}
2291
+
2292
// Conditional on a group: runs `m1` when slot `n` of `state` holds a span,
// otherwise runs `m2`.
let conditionalReferenceMatcher = (n, m1, m2) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  let chosen = if (Option.isSome(state[n])) m1 else m2
  chosen(buf, pos, start, limit, end, state, stack)
}
2299
+
2300
// Conditional on a look-around test: `tst_m` is run at `pos` with an empty
// stack; `m1` is used when it succeeds and `m2` when it fails. Groups
// [nStart, nStart + numN) are snapshotted up front and restored if the
// chosen branch fails.
let conditionalLookMatcher = (tst_m, m1, m2, nStart, numN) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  let savedGroups = saveGroups(state, nStart, numN)
  let branch = match (tst_m(buf, pos, start, limit, end, state, [])) {
    Some(_) => m1,
    None => m2
  }
  let outcome = branch(buf, pos, start, limit, end, state, stack)
  if (Option.isNone(outcome)) {
    restoreGroups(state, savedGroups, nStart, numN)
  }
  outcome
}
2311
+
2312
// Cut: `sub_m` is run once at `pos` with an empty stack. If it fails the
// whole matcher fails; if it succeeds, `next_m` is tried from the original
// `pos`, and groups [nStart, nStart + numN) are restored
// should `next_m` fail.
let cutMatcher = (sub_m, nStart, numN, next_m) => (buf: MatchBuf, pos: Number, start: Number, limit: Number, end: Number, state, stack) => {
  let savedGroups = saveGroups(state, nStart, numN)
  if (Option.isNone(sub_m(buf, pos, start, limit, end, state, []))) {
    None
  } else {
    let outcome = next_m(buf, pos, start, limit, end, state, stack)
    if (Option.isNone(outcome)) {
      restoreGroups(state, savedGroups, nStart, numN)
    }
    outcome
  }
}
2324
+
2325
+ // Unicode characters in UTF-8 encoding
2326
+
2327
// Placeholder for Unicode-category matching. The returned matcher always
// aborts with a "not yet implemented" failure when it is run; none of its
// arguments are inspected.
let unicodeCategoriesMatcher = (cats, isMatch, next_m) => (
  buf: MatchBuf,
  pos: Number,
  start: Number,
  limit: Number,
  end: Number,
  state,
  stack
) => {
  fail "NYI: unicodeCategoriesMatcher is not supported until grain-lang/grain#661 is resolved."
}
2330
+
2331
+ // -------
2332
+ // Regex matcher compilation
2333
+ // -------
2334
+
2335
// Returns the length of the shortest prefix of `l` that contains every
// element for which `needsBacktrack` is true — i.e. the 1-based index of
// the last such element, or 0 when there is none.
let countBacktrackPrefix = (l) => {
  let rec scan = (remaining, seen, lastBacktracking) => {
    match (remaining) {
      [] => lastBacktracking,
      [hd, ...tl] => {
        let index = seen + 1
        scan(tl, index, if (needsBacktrack(hd)) index else lastBacktracking)
      }
    }
  }
  scan(l, 0, 0)
}
2345
+
2346
// Picks an "iterated" matcher for patterns that have one: single literals,
// literal strings, the any-character pattern, and character ranges. Every
// other pattern yields None. `min` is accepted but only `max` is forwarded.
let compileMatcherRepeater = (rx, min, max) => {
  match (rx) {
    REAny => Some(anyMatcherIterated(max)),
    RERange(range) => Some(rangeMatcherIterated(range, max)),
    RELiteral(ch) => Some(charMatcherIterated(ch, max)),
    RELiteralString(str) => Some(stringMatcherIterated(str, String.length(str), max)),
    _ => None
  }
}
2355
+
2356
// Turns a parsed regular expression into a matcher function by composing
// the matcher combinators defined in this file.
//
// `compile(re, next_m)` builds a matcher for `re` that continues with
// `next_m`. When the continuation is physically `done_m`, the dedicated
// "tail" variants of the literal / any / range matchers are used instead.
let compileRegexToMatcher = (re: ParsedRegularExpression) => {
  let rec compile = (re: ParsedRegularExpression, next_m) => {
    let useTail = next_m is done_m
    match (re) {
      // Trivial patterns
      REEmpty => next_m,
      RENever => neverMatcher,

      // Single-step patterns, with tail-specialised variants
      RELiteral(c) => {
        if (useTail) charTailMatcher(c) else charMatcher(c, next_m)
      },
      RELiteralString(s) => {
        let len = String.length(s)
        if (useTail) stringTailMatcher(s, len) else stringMatcher(s, len, next_m)
      },
      REAny => {
        if (useTail) anyTailMatcher() else anyMatcher(next_m)
      },
      RERange(rng) => {
        if (useTail) rangeTailMatcher(rng) else rangeMatcher(rng, next_m)
      },

      // Zero-width assertions
      REStart => startMatcher(next_m),
      REEnd => endMatcher(next_m),
      RELineStart => lineStartMatcher(next_m),
      RELineEnd => lineEndMatcher(next_m),
      REWordBoundary => wordBoundaryMatcher(next_m),
      RENotWordBoundary => notWordBoundaryMatcher(next_m),

      // Composition: the sequence is folded from the right so that each
      // element's continuation is the matcher for everything after it
      RESequence(res, _) => List.reduceRight(compile, next_m, res),
      REAlts(re1, re2) => altsMatcher(compile(re1, next_m), compile(re2, next_m)),
      REMaybe(inner, nonGreedy) => {
        let inner_m = compile(inner, next_m)
        // Non-greedy tries skipping first; greedy tries the pattern first
        if (nonGreedy) altsMatcher(next_m, inner_m) else altsMatcher(inner_m, next_m)
      },

      RERepeat(actualRe, min, max, nonGreedy) => {
        // Special case: group around simple pattern in non-lazy repeat
        let re = match (actualRe) {
          REGroup(groupRe, _) when !nonGreedy && !needsBacktrack(groupRe) => groupRe,
          _ => actualRe
        }
        let simple = !needsBacktrack(re)
        // The group index is only tracked by the "simple" repeat matchers
        let groupN = if (simple) {
          match (actualRe) {
            REGroup(_, n) => Some(n),
            _ => None
          }
        } else {
          None
        }
        match (compileMatcherRepeater(re, min, max)) {
          Some(matcher) when !nonGreedy => repeatSimpleManyMatcher(matcher, min, max, groupN, next_m),
          _ => {
            let r_m = compile(re, if (simple) done_m else continue_m)
            match ((nonGreedy, simple)) {
              (true, true) => lazyRepeatSimpleMatcher(r_m, min, max, next_m),
              (true, false) => lazyRepeatMatcher(r_m, min, max, next_m),
              (false, true) => repeatSimpleMatcher(r_m, min, max, groupN, next_m),
              (false, false) => repeatMatcher(r_m, min, max, next_m)
            }
          }
        }
      },

      // Groups and back-references (references are one-indexed in the
      // parsed form; group state is zero-indexed)
      REGroup(inner, n) => groupPushMatcher(n, compile(inner, groupSetMatcher(n, next_m))),
      REReference(n, caseSensitive) => {
        if (n == 0) {
          neverMatcher
        } else if (caseSensitive) {
          referenceMatcher(n - 1, next_m)
        } else {
          referenceMatcherCaseInsensitive(n - 1, next_m)
        }
      },

      // Cut, conditionals and look-around: sub-patterns end in `done_m`
      RECut(inner, nStart, numN, _) => cutMatcher(compile(inner, done_m), nStart, numN, next_m),
      REConditional(tst, reTrue, reFalse, nStart, numN, _) => {
        let m1 = compile(reTrue, next_m)
        // A missing "else" branch behaves like the empty pattern
        let m2 = compile(Option.unwrapWithDefault(REEmpty, reFalse), next_m)
        match (tst) {
          REReference(n, _) => conditionalReferenceMatcher(n - 1, m1, m2),
          _ => conditionalLookMatcher(compile(tst, done_m), m1, m2, nStart, numN)
        }
      },
      RELookahead(inner, isMatch, nStart, numN) => {
        lookaheadMatcher(isMatch, compile(inner, done_m), nStart, numN, next_m)
      },
      RELookbehind(inner, isMatch, lbMin, lbMax, nStart, numN) => {
        lookbehindMatcher(isMatch, unbox(lbMin), unbox(lbMax), compile(inner, done_m), nStart, numN, next_m)
      },
      REUnicodeCategories(cats, isMatch) => unicodeCategoriesMatcher(cats, isMatch, next_m)
    }
  }
  compile(re, done_m)
}
2433
+
2434
// Runs a compiled matcher at `pos`. The single `limitOrEnd` value is passed
// as both the matcher's limit and its end, and the stack starts out empty.
let interp = (compiledRe, matchBuffer, pos, start, limitOrEnd, state) => {
  let limit = limitOrEnd
  let end = limitOrEnd
  compiledRe(matchBuffer, pos, start, limit, end, state, [])
}
2437
+
2438
+ // Should be exported as abstract type when possible
2439
record RegularExpression {
  // Parsed form of the pattern
  reParsed: ParsedRegularExpression,
  // Group count, read from the parser configuration
  reNumGroups: Number,
  // Whether the parser configuration flagged any references
  reReferences: Bool,
  // Value produced by validation of the parsed pattern
  reMaxLookbehind: Number,
  // Compiled matcher. Arguments: match buffer, position, start, limit, end,
  // group state, stack. Yields the position reached on success.
  reCompiled: (MatchBuf, Number, Number, Number, Number, Array<Option<(Number, Number)>>, List<StackElt>) -> Option<Number>,
  // When present, a search is abandoned early unless the input contains it
  reMustString: Option<String>,
  // When true, a search only attempts a match at its start position
  reIsAnchored: Bool,
  // When present, positions whose character falls outside it are skipped
  reStartRange: Option<List<(Number, Number)>>,
}
2449
+
2450
+ /**
2451
+ * @section Values: Functions for working with regular expressions.
2452
+ */
2453
+
2454
+ // [TODO] When #661 is resolved, re-add the following pieces of documentation:
2455
+ /*
2456
+ [Under POSIX character classes]
2457
+
2458
+ * - `[:graph:]` - Matches all ASCII characters which use ink when printed
2459
+ * - `[:print:]` - Matches space, tab, and all ASCII ink users
2460
+
2461
+ [At end of documentation]
2462
+
2463
+ * Finally, the following is the list of supported Unicode properties.
2464
+ * These class codes come from this portion of the Unicode standard:
2465
+ * https://www.unicode.org/reports/tr44/#General_Category_Values
2466
+ *
2467
+ * - `Ll` - Letter, lowercase
2468
+ * - `Lu` - Letter, uppercase
2469
+ * - `Lt` - Letter, titlecase
2470
+ * - `Lm` - Letter, modifier
2471
+ * - `L&` - Union of `Ll`, `Lu`, `Lt`, and `Lm`
2472
+ * - `Lo` - Letter, other
2473
+ * - `L` - Union of `L&` and `Lo`
2474
+ * - `Nd` - Number, decimal digit
2475
+ * - `Nl` - Number, letter
2476
+ * - `No` - Number, other
2477
+ * - `N` - Union of `Nd`, `Nl`, and `No`
2478
+ * - `Ps` - Punctuation, open
2479
+ * - `Pe` - Punctuation, close
2480
+ * - `Pi` - Punctuation, initial quote
2481
+ * - `Pf` - Punctuation, final quote
2482
+ * - `Pc` - Punctuation, connector
2483
+ * - `Pd` - Punctuation, dash
2484
+ * - `Po` - Punctuation, other
2485
+ * - `P` - Union of `Ps`, `Pe`, `Pi`, `Pf`, `Pc`, `Pd`, and `Po`
2486
+ * - `Mn` - Mark, non-spacing
2487
+ * - `Mc` - Mark, spacing combining
2488
+ * - `Me` - Mark, enclosing
2489
+ * - `M` - Union of `Mn`, `Mc`, and `Me`
2490
+ * - `Sc` - Symbol, currency
2491
+ * - `Sk` - Symbol, modifier
2492
+ * - `Sm` - Symbol, math
2493
+ * - `So` - Symbol, other
2494
+ * - `S` - Union of `Sc`, `Sk`, `Sm`, and `So`
2495
+ * - `Zl` - Separator, line
2496
+ * - `Zp` - Separator, paragraph
2497
+ * - `Zs` - Separator, space
2498
+ * - `Z` - Union of `Zl`, `Zp`, and `Zs`
2499
+ * - `Cc` - Other, control
2500
+ * - `Cf` - Other, format
2501
+ * - `Cs` - Other, surrogate
2502
+ * - `Cn` - Other, not assigned
2503
+ * - `Co` - Other, private use
2504
+ * - `C` - Union of `Cc`, `Cf`, `Cs`, `Cn`, and `Co`
2505
+ * - `.` - Union of all Unicode categories
2506
+ */
2507
+
2508
+ /**
2509
+ * Compiles the given pattern string into a regular expression object.
2510
+ *
2511
+ * For a general overview of regular expressions, refer to
2512
+ * ["Mastering Regular Expressions"](http://regex.info/book.html) by Friedl, or other online resources.
2513
+ *
2514
+ * Regular expressions are a combination of normal and special characters. A normal
2515
+ * character in a pattern will match a one-character string containing that character.
2516
+ * Moreover, if there are two regular expressions `A` and `B`, they can be concatenated
2517
+ * into a regular expression `AB`. If a string `p` matches `A` and `q` matches `B`,
2518
+ * then `pq` will match `AB`.
2519
+ *
2520
+ * The special character sequences are as follows:
2521
+ *
2522
+ * - `.` - Matches any character, except for a newline in multi-line mode
2523
+ * - `^` - Matches the beginning of the input, or after a newline (`\n`) in multi-line mode
2524
+ * - `$` - Matches the end of the input, or right before a newline (`\n`) in multi-line mode
2525
+ * - `«re»*` - Matches `«re»` zero or more times
2526
+ * - `«re»+` - Matches `«re»` one or more times
2527
+ * - `«re»?` - Matches `«re»` zero or one times
2528
+ * - `«re»{«n»}` - Matches `«re»` exactly `«n»` times
2529
+ * - `«re»{«n»,}` - Matches `«re»` `«n»` or more times
2530
+ * - `«re»{,«m»}` - Matches `«re»` zero to `«m»` times
2531
+ * - `«re»{«n»,«m»}` - Matches `«re»` between `«n»` and `«m»` times
2532
+ * - `«re»{}` - Matches `«re»` zero or more times
2533
+ * - `[«rng»]` - Matches any character in `«rng»` (see below)
2534
+ * - `[^«rng»]` - Matches any character not in `«rng»` (see below)
2535
+ * - `\«n»` - Matches the latest match for group `«n»` (one-indexed)
2536
+ * - `\b` - Matches the boundary of `\w*` (`\w` defined below, under "basic classes")
2537
+ * - `\B` - Matches where `\b` does not
2538
+ * - `\p{«property»}` - Matches any character with Unicode property `«property»` (see below)
2539
+ * - `\P{«property»}` - Matches any character without Unicode property `«property»` (see below)
2540
+ * - `(«re»)` - Matches `«re»`, storing the result in a group
2541
+ * - `(?:«re»)` - Matches `«re»` without storing the result in a group
2542
+ * - `(?«mode»:«re»)` - Matches `«re»` with the mode settings specified by `«mode»` using the following syntax:
2543
+ * - `«mode»i` - The same as `«mode»`, but with case-insensitivity enabled (temporarily not supported until grain-lang/grain#661 is resolved)
2544
+ * - `«mode»-i` - The same as `«mode»`, but with case-insensitivity disabled (the default)
2545
+ * - `«mode»m` / `«mode»-s` - The same as `«mode»`, but with multi-line mode enabled
2546
+ * - `«mode»-m` / `«mode»s` - The same as `«mode»`, but with multi-line mode disabled
2547
+ * - An empty string, which will not change any mode settings
2548
+ * - `(?«tst»«re1»|«re2»)` - Will match `«re1»` if `«tst»`, otherwise will match `«re2»`. The following options are available for `«tst»`
2549
+ * - `(«n»)` - Will be true if group `«n»` has a match
2550
+ * - `(?=«re»)` - Will be true if `«re»` matches the next sequence
2551
+ * - `(?!«re»)` - Will be true if `«re»` does not match the next sequence
2552
+ * - `(?<=«re»)` - Will be true if `«re»` matches the preceding sequence
2553
+ * - `(?<!«re»)` - Will be true if `«re»` does not match the preceding sequence
2554
+ * - `(?«tst»«re»)` - Equivalent to `(?«tst»«re»|)`
2555
+ * - Finally, basic classes (defined below) can also appear outside of character ranges.
2556
+ *
2557
+ * Character ranges (referred to as `«rng»` above) have the following syntax:
2558
+ * - `«c»` - Matches the character `«c»` exactly
2559
+ * - `«c1»-«c2»` - Matches any character with a character code between the character code for `«c1»` and the code for `«c2»`
2560
+ *
2561
+ * These forms can be repeated any number of times, which will construct a range of their union. That is, `[ba-c]` and `[a-c]` are equivalent ranges.
2562
+ * Additionally, there are the following special cases:
2563
+ * - A `]` as the first character of the range will match a `]`
2564
+ * - A `-` as the first or last character of the range will match a `-`
2565
+ * - A `^` in any position other than the first position will match a `^`
2566
+ * - `\«c»`, where `«c»` is a non-alphabetic character, will match `«c»`
2567
+ *
2568
+ * Furthermore, ranges can include character classes, which are predefined commonly-used
2569
+ * sets of characters. There are two "flavors" of these: *basic* classes and *POSIX* classes.
2570
+ * Both are provided for ease of use and to maximize compatibility with other regular
2571
+ * expression engines, so feel free to use whichever is most convenient.
2572
+ *
2573
+ * The *basic* classes are as follows:
2574
+ * - `\d` - Matches `0-9`
2575
+ * - `\D` - Matches characters not in `\d`
2576
+ * - `\w` - Matches `a-z`, `A-Z`, `0-9`, and `_`
2577
+ * - `\W` - Matches characters not in `\w`
2578
+ * - `\s` - Matches space, tab, formfeed, and return
2579
+ * - `\S` - Matches characters not in `\s`
2580
+ * The *POSIX* classes are as follows:
2581
+ * - `[:alpha:]` - Matches `a-z` and `A-Z`
2582
+ * - `[:upper:]` - Matches `A-Z`
2583
+ * - `[:lower:]` - Matches `a-z`
2584
+ * - `[:digit:]` - Matches `0-9`
2585
+ * - `[:xdigit:]` - Matches `0-9`, `a-f`, and `A-F`
2586
+ * - `[:alnum:]` - Matches `a-z`, `A-Z`, and `0-9`
2587
+ * - `[:word:]` - Matches `a-z`, `A-Z`, `0-9`, and `_`
2588
+ * - `[:blank:]` - Matches space and tab
2589
+ * - `[:space:]` - Matches space, tab, newline, formfeed, and return
2590
+ * - `[:cntrl:]` - Contains all characters with code points < 32
2591
+ * - `[:ascii:]` - Contains all ASCII characters
2592
+ *
2593
+ *
2594
+ *
2595
+ * @param regexString: The regular expression to compile
2596
+ * @returns The compiled regular expression
2597
+ *
2598
+ * @example Regex.make("(foo|bar)[0-9]+")
2599
+ *
2600
+ * @since 0.4.3
2601
+ */
2602
export let make = (regexString: String) => {
  let buf = makeRegExBuf(regexString)
  match (parseRegex(buf)) {
    Ok(parsed) => {
      // The parser records the group count and reference flag on its config
      let groupCount = unbox(buf.config.groupNumber)
      let hasReferences = unbox(buf.config.references)
      match (validate(parsed, groupCount)) {
        Ok(maxLookbehind) => {
          let compiled = compileRegexToMatcher(parsed)
          Ok({
            reParsed: parsed,
            reNumGroups: groupCount,
            reReferences: hasReferences,
            reMaxLookbehind: maxLookbehind,
            reCompiled: compiled,
            // Search-time shortcuts derived from the parsed pattern
            reMustString: mustString(parsed),
            reIsAnchored: isAnchored(parsed),
            reStartRange: startRange(parsed),
          })
        },
        // Validation errors are passed through unchanged
        Err(e) => Err(e)
      }
    },
    // Parse errors are passed through unchanged
    Err(e) => Err(e)
  }
}
2628
+
2629
+
2630
+ //
2631
+ //
2632
+ // ============
2633
+ // REGEX SEARCH
2634
+ // ============
2635
+ //
2636
+ //
2637
+
2638
+ // speed up failures using must-string
2639
// Cheap pre-check: when a must-string is known, the searched portion of the
// input has to contain it for any match to be possible. Returns true when
// there is no must-string. The input is only sliced when [pos, endPos) is
// narrower than the whole buffer.
let checkMustString = (ms, buf: MatchBuf, pos, endPos) => {
  match (ms) {
    None => true,
    Some(required) => {
      let coversWholeInput = pos == 0 && endPos == Array.length(buf.matchInputExploded)
      let haystack = if (coversWholeInput) {
        buf.matchInput
      } else {
        String.slice(pos, endPos, buf.matchInput)
      }
      match (String.indexOf(required, haystack)) {
        Some(_) => true,
        None => false
      }
    }
  }
}
2652
+
2653
+ // speed up failures using start-range
2654
// Cheap pre-check: is the character at `pos` inside `startRange`?
// `endPos` is accepted but not consulted; the character at `pos` is read
// unconditionally.
let checkStartRange = (startRange, buf, pos, endPos) => {
  let candidate = buf.matchInputExploded[pos]
  rangeContains(startRange, Char.code(candidate))
}
2657
+
2658
+
2659
// Finds the first position at or after `pos` where the compiled matcher
// succeeds, returning Some((match start, match end)) or None.
//
// Shortcuts applied before running the matcher:
//  - the must-string check can reject the whole search up front;
//  - an anchored expression is only tried at `startPos`;
//  - with a start range, end-of-input fails immediately and positions whose
//    character is outside the range are skipped.
let searchMatch = (rx: RegularExpression, buf: MatchBuf, pos, startPos, endPos, state) => {
  if (checkMustString(rx.reMustString, buf, pos, endPos)) {
    let matcher = rx.reCompiled
    let anchored = rx.reIsAnchored
    let rec scan = (at) => {
      // Runs the matcher at `at`, moving on to the next position on failure
      let attempt = () => {
        match (interp(matcher, buf, at, startPos, endPos, state)) {
          Some(matchEnd) => Some((at, matchEnd)),
          None => if (at < endPos) scan(at + 1) else None
        }
      }
      if (anchored && at != startPos) {
        None
      } else {
        match (rx.reStartRange) {
          None => attempt(),
          Some(rng) => {
            if (at == endPos) {
              // Can't possibly match if chars are required and we are at EOS
              None
            } else if (!checkStartRange(rng, buf, at, endPos)) {
              scan(at + 1)
            } else {
              attempt()
            }
          }
        }
      }
    }
    scan(pos)
  } else {
    None
  }
}
2687
+
2688
+ /**
2689
+ * This object contains the results
2690
+ * of a regular expression match. The results can be obtained using
2691
+ * the following accessors:
2692
+ *
2693
+ * ```grain
2694
+ * group : Number -> Option<String>
2695
+ * ```
2696
+ *
2697
+ * Returns the contents of the given group. Note that group 0 contains
2698
+ * the entire matched substring, and group 1 contains the first parenthesized group.
2699
+ *
2700
+ * ```grain
2701
+ * groupPosition : Number -> Option<(Number, Number)>
2702
+ * ```
2703
+ *
2704
+ * Returns the position of the given group.
2705
+ *
2706
+ * ```grain
2707
+ * numGroups : Number
2708
+ * ```
2709
+ *
2710
+ * The number of defined groups in this match object (including group 0).
2711
+ *
2712
+ * ```grain
2713
+ * allGroups : () -> Array<Option<String>>
2714
+ * ```
2715
+ *
2716
+ * Returns the contents of all groups matched in this match object.
2717
+ *
2718
+ * ```grain
2719
+ * allGroupPositions : () -> Array<Option<(Number, Number)>>
2720
+ * ```
2721
+ *
2722
+ * Returns the positions of all groups matched in this match object.
2723
+ *
2724
+ * @since 0.4.3
2725
+ */
2726
export record MatchResult {
  /**
   * Looks up the text of a group by number; group 0 is the whole match
   */
  group: Number -> Option<String>,
  /**
   * Looks up the (start, end) position of a group by number
   */
  groupPosition: Number -> Option<(Number, Number)>,
  /**
   * How many groups this match object defines, counting group 0
   */
  numGroups: Number,
  /**
   * Produces the text of every group, indexed by group number
   */
  allGroups: () -> Array<Option<String>>,
  /**
   * Produces the (start, end) position of every group, indexed by group number
   */
  allGroupPositions: () -> Array<Option<(Number, Number)>>,
}
2748
+
2749
// Builds the user-facing MatchResult for a successful match.
//
// `origString` is the string the positions refer to, (`start`, `end`) is
// the span of the whole match (group 0), and `state` holds the spans of
// groups 1..N at indices 0..N-1.
//
// Group lookups return None for any group number outside 0..N. The upper
// bound check used to let N + 1 through, which indexed one past the end of
// `state`.
let makeMatchResult = (origString, start, end, state) => {
  let capturedCount = Array.length(state)
  // Text covered by a span, if the span exists
  let textOf = (span) => {
    match (span) {
      None => None,
      Some((spanStart, spanEnd)) => Some(String.slice(spanStart, spanEnd, origString))
    }
  }
  let getMatchGroupPosition = (n) => {
    if (n == 0) {
      Some((start, end))
    } else if (n < 0 || n > capturedCount) {
      None
    } else {
      // Group n lives at index n - 1
      state[n - 1]
    }
  }
  let getMatchGroup = (n) => textOf(getMatchGroupPosition(n))
  let getAllMatchGroupPositions = () => {
    // Slot 0 is the whole match; the captured groups follow
    let positions = Array.make(capturedCount + 1, None)
    positions[0] = Some((start, end))
    for (let mut i = 0; i < capturedCount; i += 1) {
      positions[i + 1] = state[i]
    }
    positions
  }
  let getAllMatchGroups = () => Array.map(textOf, getAllMatchGroupPositions())
  {
    group: getMatchGroup,
    groupPosition: getMatchGroupPosition,
    numGroups: capturedCount + 1,
    allGroupPositions: getAllMatchGroupPositions,
    allGroups: getAllMatchGroups
  }
}
2788
+
2789
+ // Helpers for user-facing match functionality
2790
+
2791
// Yes/no search over string[startOffset, endOffset). Group state is only
// allocated when the expression uses references; the string is only sliced
// when the offsets do not already cover all of it.
let fastDriveRegexIsMatch = (rx, string, startOffset, endOffset) => {
  let groupSlots = if (rx.reReferences) rx.reNumGroups else 0
  let state = Array.make(groupSlots, None)
  let coversWholeString = startOffset == 0 && endOffset == String.length(string)
  let window = if (coversWholeString) {
    string
  } else {
    String.slice(startOffset, endOffset, string)
  }
  let buf = makeMatchBuffer(window)
  let windowLength = Array.length(buf.matchInputExploded)
  match (searchMatch(rx, buf, 0, 0, windowLength, state)) {
    Some(_) => true,
    None => false
  }
}
2797
+
2798
// Collects match results over string[startOffset, endOffset). Each round
// searches a fresh window; positions found inside the window are shifted by
// `startOffset` so they refer to the original string.
//
// The next round starts one character after the START of the match just
// found, so reported matches may overlap one another.
let rec fastDriveRegexMatchAll = (rx, string, startOffset, endOffset) => {
  if (startOffset < endOffset) {
    let state = Array.make(rx.reNumGroups, None)
    let coversWholeString = startOffset == 0 && endOffset == String.length(string)
    let window = if (coversWholeString) {
      string
    } else {
      String.slice(startOffset, endOffset, string)
    }
    let buf = makeMatchBuffer(window)
    let windowLength = Array.length(buf.matchInputExploded)
    // Window-relative span -> span in the original string
    let shift = (span) => {
      match (span) {
        Some((spanStart, spanEnd)) => Some((spanStart + startOffset, spanEnd + startOffset)),
        None => None
      }
    }
    match (searchMatch(rx, buf, 0, 0, windowLength, state)) {
      Some((startPos, endPos)) => {
        let current = makeMatchResult(string, startPos + startOffset, endPos + startOffset, Array.map(shift, state))
        let remaining = fastDriveRegexMatchAll(rx, string, startPos + startOffset + 1, endOffset)
        [current, ...remaining]
      },
      None => []
    }
  } else {
    []
  }
}
2816
+
2817
// Finds the first match over string[startOffset, endOffset) and wraps it in
// a MatchResult. Positions found inside the searched window are shifted by
// `startOffset` so they refer to the original string.
let fastDriveRegexMatch = (rx, string, startOffset, endOffset) => {
  let state = Array.make(rx.reNumGroups, None)
  let coversWholeString = startOffset == 0 && endOffset == String.length(string)
  let window = if (coversWholeString) {
    string
  } else {
    String.slice(startOffset, endOffset, string)
  }
  let buf = makeMatchBuffer(window)
  let windowLength = Array.length(buf.matchInputExploded)
  // Window-relative span -> span in the original string
  let shift = (span) => {
    match (span) {
      Some((spanStart, spanEnd)) => Some((spanStart + startOffset, spanEnd + startOffset)),
      None => None
    }
  }
  match (searchMatch(rx, buf, 0, 0, windowLength, state)) {
    Some((startPos, endPos)) => {
      Some(makeMatchResult(string, startPos + startOffset, endPos + startOffset, Array.map(shift, state)))
    },
    None => None
  }
}
2833
+
2834
+ /**
2835
+ * Determines if the given regular expression has a match in the given string.
2836
+ * @param rx: The regular expression to search for
2837
+ * @param string: The string to search within
2838
+ * @returns `true` if the RegExp matches the string, otherwise `false`
2839
+ *
2840
+ * @example assert Regex.isMatch(Result.unwrap(Regex.make("ca+[at]")), "caaat") == true
2841
+ *
2842
+ * @since 0.4.3
2843
+ */
2844
export let isMatch = (rx: RegularExpression, string: String) => {
  // Search the entire string: offsets 0 through its length
  let wholeLength = String.length(string)
  fastDriveRegexIsMatch(rx, string, 0, wholeLength)
}
2847
+
2848
+ /**
2849
+ * Determines if the given regular expression has a match in the given string between the given start/end offsets.
2850
+ * @param rx: The regular expression to search for
2851
+ * @param string: The string to search
2852
+ * @param start: The start offset to search between
2853
+ * @param end: The end offset to search between
2854
+ * @returns `true` if the RegExp matches the string in the given range, otherwise `false`
2855
+ *
2856
+ * @example assert Regex.isMatchRange(Result.unwrap(Regex.make("ca+[at]")), "caaat", 0, 5) == true
2857
+ * @example assert Regex.isMatchRange(Result.unwrap(Regex.make("ca+[at]")), "caaat", 1, 5) == false
2858
+ *
2859
+ * @since 0.4.3
2860
+ */
2861
export let isMatchRange = (
  rx: RegularExpression,
  string: String,
  start: Number,
  end: Number
) => {
  // The offsets are forwarded as given; no clamping happens here
  fastDriveRegexIsMatch(rx, string, start, end)
}
2864
+
2865
+ /**
2866
+  * Finds the first match of a regular expression within a string.
2867
+  * The whole string, from offset 0 up to its length, is searched.
2868
+  *
2869
+  * @param rx: The regular expression to search for
2870
+  * @param string: The string to search
2871
+  * @returns The match result, if any
2872
+  *
2873
+  * @example Regex.find(Result.unwrap(Regex.make("ca+[at]")), "caaat")
2874
+  *
2875
+  * @since 0.4.3
2876
+  */
2877
+ export let find = (rx: RegularExpression, string: String) => fastDriveRegexMatch(rx, string, 0, String.length(string))
2878
+
2879
+ /**
2880
+  * Finds the first match of a regular expression within a bounded region of a string.
2881
+  * Only the portion of the string between the `start` and `end` offsets is searched.
2882
+  * Use `find` to search the entire string.
2883
+  *
2884
+  * @param rx: The regular expression to search for
2885
+  * @param string: The string to search
2886
+  * @param start: The start offset to search between
2887
+  * @param end: The end offset to search between
2888
+  * @returns The match result, if any
2889
+  *
2890
+  * @example Regex.findRange(Result.unwrap(Regex.make("ca+[at]")), "caaat", 0, 5)
2891
+  *
2892
+  * @since 0.4.3
2893
+  */
2894
+ export let findRange = (rx: RegularExpression, string: String, start: Number, end: Number) => fastDriveRegexMatch(rx, string, start, end)
2895
+
2896
+ /**
2897
+  * Finds every match of a regular expression within a string, searching from offset 0 up to its length.
2898
+  * @param rx: The regular expression to search for
2899
+  * @param string: The string to search
2900
+  * @returns The list of matches
2901
+  * @example Regex.findAll(Result.unwrap(Regex.make("ca+[at]")), "caaat")
2902
+  * @since 0.4.3
2903
+  */
2904
+ export let findAll = (rx: RegularExpression, string: String) => fastDriveRegexMatchAll(rx, string, 0, String.length(string))
2905
+
2906
+ /**
2907
+  * Finds every match of a regular expression within a bounded region of a string.
2908
+  * Only the portion of the string between the `start` and `end` offsets is searched.
2909
+  * Use `findAll` to search the entire string.
2910
+  *
2911
+  * @param rx: The regular expression to search for
2912
+  * @param string: The string to search
2913
+  * @param start: The start offset to search between
2914
+  * @param end: The end offset to search between
2915
+  * @returns The list of matches
2916
+  *
2917
+  * @example Regex.findAllRange(Result.unwrap(Regex.make("ca+[at]")), "caaat", 0, 5)
2918
+  *
2919
+  * @since 0.4.3
2920
+  */
2921
+ export let findAllRange = (rx: RegularExpression, string: String, start: Number, end: Number) => fastDriveRegexMatchAll(rx, string, start, end)
2922
+
2923
+
2924
+ // Expands the `$` directives of `replacementString` for a single match of `matchBuf.matchInput` located at [start, end).
2925
+ // `state[n - 1]` is read as the optional (start, end) range of group `n`; the expanded replacement text is returned.
2926
+ let computeReplacement = (matchBuf: MatchBuf, replacementString: String, start, end, state) => {
2927
+   let replacementExploded = String.explode(replacementString)
2928
+   let len = Array.length(replacementExploded)
2929
+   // Text of group `n`: group 0 is the whole match; `None` or out-of-range groups yield "".
2930
+   let getInputSubstr = (n) => {
2931
+     if (n == 0) {
2932
+       String.slice(start, end, matchBuf.matchInput)
2933
+     } else if (n - 1 < Array.length(state)) {
2934
+       match (state[n - 1]) {
2935
+         Some((groupStart, groupEnd)) => String.slice(groupStart, groupEnd, matchBuf.matchInput),
2936
+         None => ""
2937
+       }
2938
+     } else {
2939
+       ""
2940
+     }
2941
+   }
2942
+   // Prepends the literal replacement text in [from, to) to `rest`, skipping empty ranges.
2943
+   let consRange = (from, to, rest) => {
2944
+     if (from == to) rest else [String.slice(from, to, replacementString), ...rest]
2945
+   }
2946
+   // `pos` is the scan position; `since` marks where the pending run of literal text began.
2947
+   let rec loop = (pos, since) => {
2948
+     if (pos == len) {
2949
+       consRange(since, pos, [])
2950
+     } else if (replacementExploded[pos] == '$') {
2951
+       let c = if ((pos + 1) < len) Some(replacementExploded[pos + 1]) else None
2952
+       if (c == Some('&')) {
2953
+         consRange(since, pos, [getInputSubstr(0), ...loop(pos + 2, pos + 2)])
2954
+       } else if (c == Some('`')) {
2955
+         consRange(since, pos, [String.slice(0, start, matchBuf.matchInput), ...loop(pos + 2, pos + 2)])
2956
+       } else if (c == Some('\'')) {
2957
+         consRange(since, pos, [String.slice(end, String.length(matchBuf.matchInput), matchBuf.matchInput), ...loop(pos + 2, pos + 2)])
2958
+       } else if (c == Some('$')) {
2959
+         // `$$`: restart the literal run at the second `$` so that exactly one `$` is emitted.
2960
+         consRange(since, pos, loop(pos + 2, pos + 1))
2961
+       } else if (c == Some('.')) {
2962
+         // `$.`: emits nothing; both characters are skipped.
2963
+         consRange(since, pos, loop(pos + 2, pos + 2))
2964
+       } else {
2965
+         // `$` followed by digits: accumulate the decimal group number (no digits leaves it at 0).
2966
+         let rec dLoop = (dPos, accum) => {
2967
+           if (dPos == len) {
2968
+             [getInputSubstr(accum)]
2969
+           } else {
2970
+             let d = replacementExploded[dPos]
2971
+             if (Char.code('0') <= Char.code(d) && Char.code(d) <= Char.code('9')) {
2972
+               dLoop(dPos + 1, (10 * accum) + (Char.code(d) - Char.code('0')))
2973
+             } else {
2974
+               [getInputSubstr(accum), ...loop(dPos, dPos)]
2975
+             }
2976
+           }
2977
+         }
2978
+         consRange(since, pos, dLoop(pos + 1, 0))
2979
+       }
2980
+     } else {
2981
+       loop(pos + 1, since)
2982
+     }
2983
+   }
2984
+   List.reduceRight(String.concat, "", loop(0, 0))
2985
+ }
2986
+
2987
+
2988
+ // Replaces the first match of `rx` in `toSearch` (every match when `all` is true) with the expansion of `replacement`.
2989
+ let regexReplaceHelp = (rx: RegularExpression, toSearch: String, replacement: String, all: Bool) => {
2990
+   let buf = makeMatchBuffer(toSearch)
2991
+   let rec loop = (searchPos) => {
2992
+     let state = Array.make(rx.reNumGroups, None)
2993
+     let poss = searchMatch(rx, buf, searchPos, searchPos, Array.length(buf.matchInputExploded), state)
2994
+     // Continues after a match. A zero-width match (wherever it was found) must consume one input character
2995
+     // first; otherwise the next search would find the same empty match and insert the replacement twice.
2996
+     let recur = (start, end) => {
2997
+       if (start != end) {
2998
+         loop(end)
2999
+       } else if (end == String.length(toSearch)) {
3000
+         ""
3001
+       } else {
3002
+         String.concat(String.slice(end, end + 1, toSearch), loop(end + 1))
3003
+       }
3004
+     }
3005
+     match (poss) {
3006
+       None => if (searchPos == 0) toSearch else String.slice(searchPos, String.length(toSearch), toSearch),
3007
+       Some((start, end)) =>
3008
+         String.concat(String.slice(searchPos, start, toSearch),
3009
+           String.concat(computeReplacement(buf, replacement, start, end, state),
3010
+             if (all) recur(start, end) else String.slice(end, String.length(toSearch), toSearch))),
3011
+     }
3012
+   }
3013
+   loop(0)
3014
+ }
3015
+
3016
+ /**
3017
+  * Replaces the first match of a regular expression within a string with the given replacement.
3018
+  * The replacement string may contain the following directives:
3019
+  * - `$&` - Replaced with the text of the matching portion of input (e.g. for `(foo)`, the search string `foo bar`, and the replacement `baz $&`, the result will be `baz foo bar`)
3020
+  * - `$n` / `$nn` (where each `n` is a digit; any number of digits is read) - Replaced with the text of group `nn`
3021
+  * - `$$` - Replaced with a literal `$`
3022
+  * - `$.` - Does nothing (this exists to support replacement strings such as `$4$.0`, which will place the contents of group 4 prior to a zero)
3023
+  * - `$\`` - Replaced with the text preceding the matched substring
3024
+  * - `$'` - Replaced with the text following the matched substring
3025
+  * - Any other character is copied as-is into the replaced output.
3026
+  *
3027
+  * When the regular expression does not match, the string is returned unchanged.
3028
+  *
3029
+  * @param rx: The regular expression to search for
3030
+  * @param toSearch: The string to search
3031
+  * @param replacement: The string that replaces matches
3032
+  * @returns The given string with the appropriate replacements, if any
3033
+  *
3034
+  * @example assert Regex.replace(Result.unwrap(Regex.make("o")), "foo", "a") == "fao"
3035
+  *
3036
+  * @since 0.4.3
3037
+  */
3038
+ export let replace = (rx: RegularExpression, toSearch: String, replacement: String) => regexReplaceHelp(rx, toSearch, replacement, false)
3039
+
3040
+ /**
3041
+  * Replaces every match of a regular expression within a string with the given replacement.
3042
+  *
3043
+  * Matches are located from left to right, each search resuming where the previous match ended.
3044
+  * See `replace` for replacement string syntax.
3045
+  *
3046
+  * @param rx: The regular expression to search for
3047
+  * @param toSearch: The string to search
3048
+  * @param replacement: The string that replaces matches
3049
+  * @returns The input string with the appropriate replacements, if any
3050
+  *
3051
+  * @example assert Regex.replaceAll(Result.unwrap(Regex.make("o")), "skoot", "r") == "skrrt"
3052
+  *
3053
+  * @since 0.4.3
3054
+  */
3055
+ export let replaceAll = (rx: RegularExpression, toSearch: String, replacement: String) => regexReplaceHelp(rx, toSearch, replacement, true)