@futpib/parser 1.0.3 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,580 @@
1
+ import { createUnionParser } from './unionParser.js';
2
+ import { createExactSequenceParser } from './exactSequenceParser.js';
3
+ import { promiseCompose } from './promiseCompose.js';
4
+ import { createTupleParser } from './tupleParser.js';
5
+ import { createArrayParser } from './arrayParser.js';
6
+ import { createParserAccessorParser } from './parserAccessorParser.js';
7
+ import { createElementParser } from './elementParser.js';
8
+ import { parserCreatorCompose } from './parserCreatorCompose.js';
9
+ import { createOptionalParser } from './optionalParser.js';
10
+ import { createFixedLengthSequenceParser } from './fixedLengthSequenceParser.js';
11
+ import { createTerminatedArrayParser } from './terminatedArrayParser.js';
12
+ import { createDisjunctionParser } from './disjunctionParser.js';
13
+ import { createNegativeLookaheadParser } from './negativeLookaheadParser.js';
14
+ // CharacterSet helpers
15
/** Leaf value of the character-set tree: a set that contains no code points. */
const emptyCharacterSet = { type: 'empty' };
16
/**
 * Tells whether an inclusive code point range holds no code points at all.
 *
 * @param {{start: number, end: number}} range - Inclusive range to inspect.
 * @returns {boolean} `true` when `start` lies after `end`.
 */
function codePointRangeIsEmpty(range) {
    const { start, end } = range;
    return start > end;
}
19
/**
 * Tells whether `rangeA` ends before `rangeB` starts with at least one
 * code point of gap between them (ranges that merely touch do not qualify).
 *
 * @param {{start: number, end: number}} rangeA
 * @param {{start: number, end: number}} rangeB
 * @returns {boolean}
 */
function codePointRangeIsStrictlyBefore(rangeA, rangeB) {
    const firstPointPastA = rangeA.end + 1;
    return firstPointPastA < rangeB.start;
}
22
/**
 * Tells whether `rangeA` starts after `rangeB` ends with at least one
 * code point of gap between them (the mirror image of "strictly before").
 *
 * @param {{start: number, end: number}} rangeA
 * @param {{start: number, end: number}} rangeB
 * @returns {boolean}
 */
function codePointRangeIsStrictlyAfter(rangeA, rangeB) {
    const firstPointPastB = rangeB.end + 1;
    return firstPointPastB < rangeA.start;
}
25
/**
 * Computes the smallest range that covers both arguments.
 *
 * An empty argument (start > end) contributes nothing: the other argument
 * is returned as-is (same object), even if it is empty itself.
 *
 * @param {{start: number, end: number}} rangeA
 * @param {{start: number, end: number}} rangeB
 * @returns {{start: number, end: number}}
 */
function codePointRangeLeastUpperBound(rangeA, rangeB) {
    const isEmpty = ({ start, end }) => start > end;
    if (isEmpty(rangeA)) {
        return rangeB;
    }
    if (isEmpty(rangeB)) {
        return rangeA;
    }
    const start = Math.min(rangeA.start, rangeB.start);
    const end = Math.max(rangeA.end, rangeB.end);
    return { start, end };
}
35
/**
 * Tells whether two ranges neither overlap nor touch, i.e. one lies
 * entirely before the other with at least one code point between them.
 *
 * @param {{start: number, end: number}} rangeA
 * @param {{start: number, end: number}} rangeB
 * @returns {boolean}
 */
function codePointRangeStrictlyDisjoint(rangeA, rangeB) {
    const aComesFirst = rangeA.end + 1 < rangeB.start;
    const bComesFirst = rangeB.end + 1 < rangeA.start;
    return aComesFirst || bComesFirst;
}
38
/**
 * Builds an inner node of the character-set tree.
 *
 * @param {{start: number, end: number}} range - Range stored at this node.
 * @param {object} left - Subtree passed through unchanged as `left`.
 * @param {object} right - Subtree passed through unchanged as `right`.
 * @returns {{type: 'node', range: object, left: object, right: object}}
 */
function characterSetNode(range, left, right) {
    const node = { type: 'node', range, left, right };
    return node;
}
41
/**
 * Lazily yields the ranges stored in a character-set tree using an in-order
 * walk: everything in the left subtree, then the node's own range, then
 * everything in the right subtree. Non-node values yield nothing.
 *
 * @param {object} set - Character-set tree (`node` or `empty`).
 * @yields {{start: number, end: number}}
 */
function* characterSetGetRanges(set) {
    if (set.type !== 'node') {
        return;
    }
    const { left, range, right } = set;
    yield* characterSetGetRanges(left);
    yield range;
    yield* characterSetGetRanges(right);
}
48
/**
 * Pulls out of `set` every node whose range is not strictly disjoint from
 * `range` (i.e. overlaps it or touches it).
 *
 * @param {object} set - Character-set tree to search.
 * @param {{start: number, end: number}} range - Range being inserted.
 * @returns {{restCharSet: object, extendedRange: {start: number, end: number}}}
 *   `restCharSet` is the tree without the extracted nodes; `extendedRange`
 *   is the least upper bound of `range` and all extracted node ranges.
 */
function characterSetExtractOverlap(set, range) {
    if (set.type === 'empty') {
        return { restCharSet: set, extendedRange: range };
    }
    const { range: nodeRange } = set;
    let { left: remainingLeft, right: remainingRight } = set;
    let mergedRange = range;
    // Only descend into a side that `range` actually reaches into.
    if (range.start < nodeRange.start) {
        const fromLeft = characterSetExtractOverlap(set.left, range);
        mergedRange = codePointRangeLeastUpperBound(mergedRange, fromLeft.extendedRange);
        remainingLeft = fromLeft.restCharSet;
    }
    if (range.end > nodeRange.end) {
        const fromRight = characterSetExtractOverlap(set.right, range);
        mergedRange = codePointRangeLeastUpperBound(mergedRange, fromRight.extendedRange);
        remainingRight = fromRight.restCharSet;
    }
    const nodeIsUntouched = codePointRangeStrictlyDisjoint(range, nodeRange);
    if (nodeIsUntouched) {
        // This node survives; only its subtrees may have lost nodes.
        return {
            extendedRange: mergedRange,
            restCharSet: characterSetNode(nodeRange, remainingLeft, remainingRight),
        };
    }
    // This node is absorbed into the extended range; what is left of its
    // subtrees is merged into a single tree.
    return {
        extendedRange: codePointRangeLeastUpperBound(nodeRange, mergedRange),
        restCharSet: characterSetUnion(remainingLeft, remainingRight),
    };
}
76
/**
 * Returns a character-set tree that additionally contains `range`.
 * The input tree is not modified; new nodes are created along the path.
 *
 * @param {object} set - Character-set tree.
 * @param {{start: number, end: number}} range - Inclusive range to add.
 * @returns {object} The resulting tree (`set` itself when `range` is empty).
 */
function characterSetInsertRange(set, range) {
    if (codePointRangeIsEmpty(range)) {
        return set;
    }
    if (set.type === 'empty') {
        return characterSetNode(range, emptyCharacterSet, emptyCharacterSet);
    }
    const { range: nodeRange, left, right } = set;
    if (codePointRangeIsStrictlyBefore(range, nodeRange)) {
        return characterSetNode(nodeRange, characterSetInsertRange(left, range), right);
    }
    if (codePointRangeIsStrictlyAfter(range, nodeRange)) {
        return characterSetNode(nodeRange, left, characterSetInsertRange(right, range));
    }
    // `range` overlaps or touches this node: absorb every overlapping or
    // touching node from both subtrees into one widened range.
    const leftExtraction = characterSetExtractOverlap(left, range);
    const rightExtraction = characterSetExtractOverlap(right, range);
    const widenedRange = codePointRangeLeastUpperBound(
        codePointRangeLeastUpperBound(nodeRange, leftExtraction.extendedRange),
        rightExtraction.extendedRange,
    );
    if (codePointRangeIsEmpty(widenedRange)) {
        return emptyCharacterSet;
    }
    return characterSetNode(widenedRange, leftExtraction.restCharSet, rightExtraction.restCharSet);
}
97
/**
 * Builds the union of two character sets by inserting every range of
 * `setB` (in in-order sequence) into `setA`.
 *
 * @param {object} setA - Base character-set tree.
 * @param {object} setB - Tree whose ranges are added.
 * @returns {object} The combined tree.
 */
function characterSetUnion(setA, setB) {
    // Materialise the ranges first, exactly as a spread would.
    const rangesToAdd = Array.from(characterSetGetRanges(setB));
    let combined = setA;
    for (const range of rangesToAdd) {
        combined = characterSetInsertRange(combined, range);
    }
    return combined;
}
100
/**
 * Cuts `range` into the part up to and including `point` and the part
 * after `point`. Either part may come out empty (start > end) when
 * `point` lies outside the range.
 *
 * @param {number} point - Last code point belonging to the first part.
 * @param {{start: number, end: number}} range - Inclusive range to split.
 * @returns {[{start: number, end: number}, {start: number, end: number}]}
 */
function codePointRangeSplitAt(point, range) {
    const upToPoint = {
        start: range.start,
        end: Math.min(range.end, point),
    };
    const afterPoint = {
        start: Math.max(range.start, point + 1),
        end: range.end,
    };
    return [upToPoint, afterPoint];
}
106
/**
 * Unions two ranges into a sorted list of zero, one or two ranges.
 *
 * Empty inputs are dropped. Two non-empty ranges separated by a gap are
 * returned in ascending order; overlapping or touching ranges are merged.
 *
 * @param {{start: number, end: number}} rangeA
 * @param {{start: number, end: number}} rangeB
 * @returns {Array<{start: number, end: number}>}
 */
function codePointRangeUnion(rangeA, rangeB) {
    const aIsEmpty = rangeA.start > rangeA.end;
    const bIsEmpty = rangeB.start > rangeB.end;
    if (aIsEmpty || bIsEmpty) {
        if (aIsEmpty && bIsEmpty) {
            return [];
        }
        return aIsEmpty ? [rangeB] : [rangeA];
    }
    if (rangeA.end + 1 < rangeB.start) {
        return [rangeA, rangeB];
    }
    if (rangeB.end + 1 < rangeA.start) {
        return [rangeB, rangeA];
    }
    const merged = {
        start: Math.min(rangeA.start, rangeB.start),
        end: Math.max(rangeA.end, rangeB.end),
    };
    return [merged];
}
122
/**
 * Removes `rangeB` from `rangeA`.
 *
 * @param {{start: number, end: number}} rangeA - Range to cut from.
 * @param {{start: number, end: number}} rangeB - Range to remove.
 * @returns {Array<{start: number, end: number}>} The surviving pieces of
 *   `rangeA` as produced by `codePointRangeUnion` (zero, one or two ranges).
 */
function codePointRangeDifference(rangeA, rangeB) {
    // Piece of A lying below B, and whatever of A starts at B or later.
    const lowerSplit = codePointRangeSplitAt(rangeB.start - 1, rangeA);
    const belowB = lowerSplit[0];
    const fromBOnwards = lowerSplit[1];
    // Of the remainder, keep only what lies above B.
    const aboveB = codePointRangeSplitAt(rangeB.end, fromBOnwards)[1];
    return codePointRangeUnion(belowB, aboveB);
}
127
/**
 * Returns a character-set tree without the code points in `range`.
 *
 * @param {object} set - Character-set tree.
 * @param {{start: number, end: number}} range - Inclusive range to remove.
 * @returns {object} The resulting tree (`set` itself when `range` is empty).
 */
function characterSetDeleteRange(set, range) {
    if (codePointRangeIsEmpty(range)) {
        return set;
    }
    if (set.type === 'empty') {
        return emptyCharacterSet;
    }
    const nodeRange = set.range;
    // Part of `range` below this node goes to the left subtree, the part
    // above it to the right subtree.
    const belowNode = codePointRangeSplitAt(nodeRange.start - 1, range)[0];
    const [upToNodeEnd, aboveNode] = codePointRangeSplitAt(nodeRange.end, range);
    const prunedLeft = characterSetDeleteRange(set.left, belowNode);
    const prunedRight = characterSetDeleteRange(set.right, aboveNode);
    const survivingPieces = codePointRangeDifference(nodeRange, upToNodeEnd);
    switch (survivingPieces.length) {
        case 0:
            // The node vanished entirely.
            return characterSetUnion(prunedLeft, prunedRight);
        case 1:
            return characterSetNode(survivingPieces[0], prunedLeft, prunedRight);
        default:
            // The node was cut in two: attach one piece to each side.
            return characterSetUnion(
                characterSetInsertRange(prunedLeft, survivingPieces[0]),
                characterSetInsertRange(prunedRight, survivingPieces[1]),
            );
    }
}
148
/**
 * Builds `setA` minus `setB` by deleting every range of `setB`
 * (in in-order sequence) from `setA`.
 *
 * @param {object} setA - Character-set tree to subtract from.
 * @param {object} setB - Character-set tree whose ranges are removed.
 * @returns {object} The resulting tree.
 */
function characterSetDifference(setA, setB) {
    // Materialise the ranges first, exactly as a spread would.
    const rangesToRemove = Array.from(characterSetGetRanges(setB));
    let remaining = setA;
    for (const range of rangesToRemove) {
        remaining = characterSetDeleteRange(remaining, range);
    }
    return remaining;
}
151
/**
 * Creates a character set holding exactly one range.
 *
 * @param {{start: number, end: number}} range - Inclusive code point range.
 * @returns {object} A single-node tree, or the shared empty set when the
 *   range is empty.
 */
function characterSetFromRange(range) {
    return codePointRangeIsEmpty(range)
        ? emptyCharacterSet
        : characterSetNode(range, emptyCharacterSet, emptyCharacterSet);
}
157
/**
 * Creates a character set containing only the first code point of `char`.
 *
 * @param {string} char - String whose first code point is used.
 * @returns {object} Character-set tree for that single code point.
 */
function characterSetSingleton(char) {
    const point = char.codePointAt(0);
    const singlePointRange = { start: point, end: point };
    return characterSetFromRange(singlePointRange);
}
161
/**
 * Creates a character set spanning from the first code point of
 * `startChar` to the first code point of `endChar`, both inclusive.
 *
 * @param {string} startChar - String supplying the lower bound.
 * @param {string} endChar - String supplying the upper bound.
 * @returns {object} Character-set tree for the range.
 */
function characterSetCharRange(startChar, endChar) {
    const [start, end] = [startChar, endChar].map(char => char.codePointAt(0));
    return characterSetFromRange({ start, end });
}
166
/**
 * Creates a character set from a list of characters by unioning the
 * singleton set of each one, in list order, starting from the empty set.
 *
 * @param {string[]} chars - Characters to include.
 * @returns {object} Character-set tree containing all of them.
 */
function characterSetFromArray(chars) {
    const singletons = chars.map(char => characterSetSingleton(char));
    let result = emptyCharacterSet;
    for (const singleton of singletons) {
        result = characterSetUnion(result, singleton);
    }
    return result;
}
169
/**
 * Complements a character set relative to the module-level `alphabet`
 * (not relative to the full Unicode range).
 *
 * @param {object} set - Character-set tree to complement.
 * @returns {object} Every code point of `alphabet` that is not in `set`.
 */
function characterSetComplement(set) {
    const universe = alphabet;
    return characterSetDifference(universe, set);
}
172
// Pre-defined character sets

// Characters excluded from both the alphabet and the wildcard.
const lineTerminatorChars = ['\r', '\n', '\u2028', '\u2029'];

// Folds a non-empty list of character sets into their union, left to right.
const unionOfCharacterSets = sets => sets.reduce((merged, set) => characterSetUnion(merged, set));

// Universe used by characterSetComplement: U+0000..U+10FFFF without the
// line terminators, so complemented sets never contain those four.
const alphabet = characterSetDifference(
    characterSetFromRange({ start: 0, end: 0x10FFFF }),
    characterSetFromArray(lineTerminatorChars),
);

// Set matched by `.`: the alphabet with the line terminators removed.
const wildcardCharacterSet = characterSetDifference(alphabet, characterSetFromArray(lineTerminatorChars));

// \d and \D
const digitChars = characterSetCharRange('0', '9');
const nonDigitChars = characterSetComplement(digitChars);

// \w and \W
const wordChars = unionOfCharacterSets([
    characterSetCharRange('a', 'z'),
    characterSetCharRange('A', 'Z'),
    characterSetCharRange('0', '9'),
    characterSetSingleton('_'),
]);
const nonWordChars = characterSetComplement(wordChars);

// \s and \S (insertion order is kept as-is because it determines tree shape)
const whiteSpaceChars = unionOfCharacterSets([
    characterSetSingleton('\f'),
    characterSetSingleton('\n'),
    characterSetSingleton('\r'),
    characterSetSingleton('\t'),
    characterSetSingleton('\v'),
    characterSetSingleton('\u0020'),
    characterSetSingleton('\u00a0'),
    characterSetSingleton('\u1680'),
    characterSetCharRange('\u2000', '\u200a'),
    characterSetSingleton('\u2028'),
    characterSetSingleton('\u2029'),
    characterSetSingleton('\u202f'),
    characterSetSingleton('\u205f'),
    characterSetSingleton('\u3000'),
    characterSetSingleton('\ufeff'),
]);
const nonWhiteSpaceChars = characterSetComplement(whiteSpaceChars);
202
+ // AST constructors
203
/** AST node for the empty expression (produced for empty sequences). */
const epsilon = { type: 'epsilon' };
204
/**
 * Builds a `literal` AST node that matches one character from `charset`.
 *
 * @param {object} charset - Character-set tree.
 * @returns {{type: 'literal', charset: object}}
 */
function literal(charset) {
    const node = { type: 'literal', charset };
    return node;
}
207
/**
 * Builds a `concat` AST node: `left` followed by `right`.
 *
 * @param {object} left - First expression.
 * @param {object} right - Second expression.
 * @returns {{type: 'concat', left: object, right: object}}
 */
function concat(left, right) {
    const node = { type: 'concat', left, right };
    return node;
}
210
/**
 * Builds a `union` AST node: either `left` or `right`.
 *
 * @param {object} left - First alternative.
 * @param {object} right - Second alternative.
 * @returns {{type: 'union', left: object, right: object}}
 */
function union(left, right) {
    const node = { type: 'union', left, right };
    return node;
}
213
/**
 * Builds a `star` AST node (the `*` quantifier applied to `inner`).
 *
 * @param {object} inner - Quantified expression.
 * @returns {{type: 'star', inner: object}}
 */
function star(inner) {
    const node = { type: 'star', inner };
    return node;
}
216
/**
 * Builds a `plus` AST node (the `+` quantifier applied to `inner`).
 *
 * @param {object} inner - Quantified expression.
 * @returns {{type: 'plus', inner: object}}
 */
function plus(inner) {
    const node = { type: 'plus', inner };
    return node;
}
219
/**
 * Builds an `optional` AST node (the `?` quantifier applied to `inner`).
 *
 * @param {object} inner - Quantified expression.
 * @returns {{type: 'optional', inner: object}}
 */
function optional(inner) {
    const node = { type: 'optional', inner };
    return node;
}
222
/**
 * Builds a `repeat` AST node (a brace quantifier applied to `inner`).
 *
 * @param {object} inner - Quantified expression.
 * @param {number|{min: number, max?: number}} bounds - Stored unchanged;
 *   the brace quantifier parser passes a bare number for `{n}`, `{min}`
 *   for `{n,}` and `{min, max}` for `{n,m}`.
 * @returns {{type: 'repeat', inner: object, bounds: *}}
 */
function repeat(inner, bounds) {
    const node = { type: 'repeat', inner, bounds };
    return node;
}
225
/**
 * Builds a `capture-group` AST node.
 *
 * @param {object} inner - Expression inside the group.
 * @param {string} [name] - Group name; when `undefined` the node has no
 *   `name` property at all.
 * @returns {{type: 'capture-group', inner: object, name?: string}}
 */
function captureGroup(inner, name) {
    const node = { type: 'capture-group', inner };
    if (name !== undefined) {
        node.name = name;
    }
    return node;
}
231
/**
 * Builds a `lookahead` AST node.
 *
 * @param {boolean} isPositive - `true` for `(?=...)`, `false` for `(?!...)`.
 * @param {object} inner - Expression inside the lookahead.
 * @param {object} right - Expression that follows the lookahead.
 * @returns {{type: 'lookahead', isPositive: boolean, inner: object, right: object}}
 */
function lookahead(isPositive, inner, right) {
    const node = { type: 'lookahead', isPositive, inner, right };
    return node;
}
234
/**
 * Builds a `start-anchor` AST node for `^`, treated as an infix operator.
 *
 * @param {object} left - Expression before the anchor.
 * @param {object} right - Expression after the anchor.
 * @returns {{type: 'start-anchor', left: object, right: object}}
 */
function startAnchor(left, right) {
    const node = { type: 'start-anchor', left, right };
    return node;
}
237
/**
 * Builds an `end-anchor` AST node for `$`, treated as an infix operator.
 *
 * @param {object} left - Expression before the anchor.
 * @param {object} right - Expression after the anchor.
 * @returns {{type: 'end-anchor', left: object, right: object}}
 */
function endAnchor(left, right) {
    const node = { type: 'end-anchor', left, right };
    return node;
}
240
+ // Parser implementation
241
// Parser that consumes a single input element (one character of the pattern).
const elementParser = createElementParser();
// Characters that may not appear as a bare literal outside a character class.
// A Set built from a string gets one entry per character, in string order.
const metaCharacters = new Set('\\^$.|?*+()[]{}');
243
// Builds a parser that recognises the exact `sequence` and yields a literal
// node whose charset is produced (on every match) by `createCharset`.
const createLiteralEscapeParser = (sequence, createCharset) => promiseCompose(
    createExactSequenceParser(sequence),
    () => literal(createCharset()),
);
// Same, for escapes that stand for exactly one character.
const createControlEscapeParser = (sequence, char) => createLiteralEscapeParser(
    sequence,
    () => characterSetSingleton(char),
);
// Escape sequences for control characters
const escapeNParser = createControlEscapeParser('\\n', '\n');
const escapeRParser = createControlEscapeParser('\\r', '\r');
const escapeTParser = createControlEscapeParser('\\t', '\t');
const escapeFParser = createControlEscapeParser('\\f', '\f');
const escapeVParser = createControlEscapeParser('\\v', '\v');
const escape0Parser = createControlEscapeParser('\\0', '\0');
// Character class escapes
const escapeDigitParser = createLiteralEscapeParser('\\d', () => digitChars);
const escapeNonDigitParser = createLiteralEscapeParser('\\D', () => nonDigitChars);
const escapeWordParser = createLiteralEscapeParser('\\w', () => wordChars);
const escapeNonWordParser = createLiteralEscapeParser('\\W', () => nonWordChars);
const escapeSpaceParser = createLiteralEscapeParser('\\s', () => whiteSpaceChars);
const escapeNonSpaceParser = createLiteralEscapeParser('\\S', () => nonWhiteSpaceChars);
257
// Hex escape \xHH
// Both characters after `\x` must be hexadecimal digits. Number.parseInt
// alone is too lenient: it yields NaN for "ZZ" (which String.fromCharCode
// turns into U+0000) and silently ignores a trailing non-hex character
// ("4G" -> 4). On invalid digits this alternative fails via the parser
// invariant instead of producing a wrong literal.
const escapeHexParser = parserCreatorCompose(() => createTupleParser([
    createExactSequenceParser('\\x'),
    createFixedLengthSequenceParser(2),
]), ([, hexCode]) => async (parserContext) => {
    parserContext.invariant(/^[\da-f]{2}$/i.test(hexCode), 'Expected two hexadecimal digits after "\\x", got "%s"', hexCode);
    return literal(characterSetSingleton(String.fromCharCode(Number.parseInt(hexCode, 16))));
})();
262
// Unicode escape \uHHHH
// All four characters after `\u` must be hexadecimal digits. Number.parseInt
// alone is too lenient: it yields NaN for non-hex input (which
// String.fromCharCode turns into U+0000) and silently ignores trailing
// non-hex characters ("00G1" -> 0). On invalid digits this alternative
// fails via the parser invariant instead of producing a wrong literal.
const escapeUnicodeParser = parserCreatorCompose(() => createTupleParser([
    createExactSequenceParser('\\u'),
    createFixedLengthSequenceParser(4),
]), ([, hexCode]) => async (parserContext) => {
    parserContext.invariant(/^[\da-f]{4}$/i.test(hexCode), 'Expected four hexadecimal digits after "\\u", got "%s"', hexCode);
    return literal(characterSetSingleton(String.fromCharCode(Number.parseInt(hexCode, 16))));
})();
267
// Escaped metacharacter (e.g., \., \*, etc.): a backslash followed by any
// single element, which is taken literally.
const escapeMetacharacterParser = promiseCompose(
    createTupleParser([
        createExactSequenceParser('\\'),
        elementParser,
    ]),
    (backslashAndChar) => {
        const escapedChar = backslashAndChar[1];
        return literal(characterSetSingleton(escapedChar));
    },
);
// All escape sequences - use createDisjunctionParser to try specific escapes first
const escapeParser = createDisjunctionParser([
    // Control characters
    escapeNParser,
    escapeRParser,
    escapeTParser,
    escapeFParser,
    escapeVParser,
    escape0Parser,
    // Character class shorthands
    escapeDigitParser,
    escapeNonDigitParser,
    escapeWordParser,
    escapeNonWordParser,
    escapeSpaceParser,
    escapeNonSpaceParser,
    // Numeric escapes
    escapeHexParser,
    escapeUnicodeParser,
    // Must be last - matches any escaped char
    escapeMetacharacterParser,
]);
290
// Dot: yields a literal over wildcardCharacterSet
const dotParser = promiseCompose(
    createExactSequenceParser('.'),
    () => literal(wildcardCharacterSet),
);
// Literal character: any single element that is not a metacharacter
const literalCharacterParser = parserCreatorCompose(
    () => elementParser,
    char => async (parserContext) => {
        const isMetaCharacter = metaCharacters.has(char);
        parserContext.invariant(!isMetaCharacter, 'Unexpected metacharacter "%s"', char);
        return literal(characterSetSingleton(char));
    },
)();
297
// Character class internals
// Characters with special meaning inside [...] (different rules than outside)
const charClassMetaCharacters = new Set('\\]^-');
// Builds a parser that recognises the exact `sequence` inside a character
// class and yields the CharacterSet produced (on every match) by `createCharset`.
const createCharClassEscapeParser = (sequence, createCharset) => promiseCompose(
    createExactSequenceParser(sequence),
    () => createCharset(),
);
// Escape sequences inside character class (each returns a CharacterSet)
const charClassEscapeNParser = createCharClassEscapeParser('\\n', () => characterSetSingleton('\n'));
const charClassEscapeRParser = createCharClassEscapeParser('\\r', () => characterSetSingleton('\r'));
const charClassEscapeTParser = createCharClassEscapeParser('\\t', () => characterSetSingleton('\t'));
const charClassEscapeFParser = createCharClassEscapeParser('\\f', () => characterSetSingleton('\f'));
const charClassEscapeVParser = createCharClassEscapeParser('\\v', () => characterSetSingleton('\v'));
const charClassEscape0Parser = createCharClassEscapeParser('\\0', () => characterSetSingleton('\0'));
const charClassEscapeDigitParser = createCharClassEscapeParser('\\d', () => digitChars);
const charClassEscapeNonDigitParser = createCharClassEscapeParser('\\D', () => nonDigitChars);
const charClassEscapeWordParser = createCharClassEscapeParser('\\w', () => wordChars);
const charClassEscapeNonWordParser = createCharClassEscapeParser('\\W', () => nonWordChars);
const charClassEscapeSpaceParser = createCharClassEscapeParser('\\s', () => whiteSpaceChars);
const charClassEscapeNonSpaceParser = createCharClassEscapeParser('\\S', () => nonWhiteSpaceChars);
313
// Hex escape \xHH inside a character class (returns a CharacterSet).
// Both characters after `\x` must be hexadecimal digits. Number.parseInt
// alone is too lenient: it yields NaN for "ZZ" (which String.fromCharCode
// turns into U+0000) and silently ignores a trailing non-hex character.
// On invalid digits this alternative fails via the parser invariant.
const charClassEscapeHexParser = parserCreatorCompose(() => createTupleParser([
    createExactSequenceParser('\\x'),
    createFixedLengthSequenceParser(2),
]), ([, hexCode]) => async (parserContext) => {
    parserContext.invariant(/^[\da-f]{2}$/i.test(hexCode), 'Expected two hexadecimal digits after "\\x", got "%s"', hexCode);
    return characterSetSingleton(String.fromCharCode(Number.parseInt(hexCode, 16)));
})();
317
// Unicode escape \uHHHH inside a character class (returns a CharacterSet).
// All four characters after `\u` must be hexadecimal digits. Number.parseInt
// alone is too lenient: it yields NaN for non-hex input (which
// String.fromCharCode turns into U+0000) and silently ignores trailing
// non-hex characters. On invalid digits this alternative fails via the
// parser invariant.
const charClassEscapeUnicodeParser = parserCreatorCompose(() => createTupleParser([
    createExactSequenceParser('\\u'),
    createFixedLengthSequenceParser(4),
]), ([, hexCode]) => async (parserContext) => {
    parserContext.invariant(/^[\da-f]{4}$/i.test(hexCode), 'Expected four hexadecimal digits after "\\u", got "%s"', hexCode);
    return characterSetSingleton(String.fromCharCode(Number.parseInt(hexCode, 16)));
})();
321
// Backslash followed by any single element inside a character class:
// the element is taken literally and returned as a singleton CharacterSet.
const charClassEscapeMetacharacterParser = promiseCompose(
    createTupleParser([
        createExactSequenceParser('\\'),
        elementParser,
    ]),
    (backslashAndChar) => {
        const escapedChar = backslashAndChar[1];
        return characterSetSingleton(escapedChar);
    },
);
// Use createDisjunctionParser to try specific escapes before generic metacharacter escape
const charClassEscapeParser = createDisjunctionParser([
    // Control characters
    charClassEscapeNParser,
    charClassEscapeRParser,
    charClassEscapeTParser,
    charClassEscapeFParser,
    charClassEscapeVParser,
    charClassEscape0Parser,
    // Character class shorthands
    charClassEscapeDigitParser,
    charClassEscapeNonDigitParser,
    charClassEscapeWordParser,
    charClassEscapeNonWordParser,
    charClassEscapeSpaceParser,
    charClassEscapeNonSpaceParser,
    // Numeric escapes
    charClassEscapeHexParser,
    charClassEscapeUnicodeParser,
    // Must be last - matches any escaped char
    charClassEscapeMetacharacterParser,
]);
343
// Single unescaped character inside a character class: any element that is
// not one of charClassMetaCharacters (backslash, ], ^, -). Returns a
// singleton CharacterSet.
const charClassLiteralParser = parserCreatorCompose(
    () => elementParser,
    char => async (parserContext) => {
        const isClassMetaCharacter = charClassMetaCharacters.has(char);
        parserContext.invariant(!isClassMetaCharacter, 'Unexpected character class metacharacter "%s"', char);
        return characterSetSingleton(char);
    },
)();
348
// Single char in character class (escape or literal) - returns the character
// string so that charClassRangeParser can use it as a range endpoint.
//
// The alternatives overlap: "\n" is matched both by the specific escape and
// by the generic "backslash + any element" alternative. They are therefore
// combined with createDisjunctionParser (ordered, specific escapes first),
// the same way charClassEscapeParser combines its overlapping alternatives.
const charClassSingleCharParser = createDisjunctionParser([
    // Escape sequences that produce single chars
    promiseCompose(createExactSequenceParser('\\n'), () => '\n'),
    promiseCompose(createExactSequenceParser('\\r'), () => '\r'),
    promiseCompose(createExactSequenceParser('\\t'), () => '\t'),
    promiseCompose(createExactSequenceParser('\\f'), () => '\f'),
    promiseCompose(createExactSequenceParser('\\v'), () => '\v'),
    promiseCompose(createExactSequenceParser('\\0'), () => '\0'),
    // \xHH - both digits must be hexadecimal; Number.parseInt alone would
    // turn "ZZ" into NaN (and then U+0000) or ignore a trailing non-hex char.
    parserCreatorCompose(() => createTupleParser([
        createExactSequenceParser('\\x'),
        createFixedLengthSequenceParser(2),
    ]), ([, hexCode]) => async (parserContext) => {
        parserContext.invariant(/^[\da-f]{2}$/i.test(hexCode), 'Expected two hexadecimal digits after "\\x", got "%s"', hexCode);
        return String.fromCharCode(Number.parseInt(hexCode, 16));
    })(),
    // \uHHHH - all four digits must be hexadecimal (see above).
    parserCreatorCompose(() => createTupleParser([
        createExactSequenceParser('\\u'),
        createFixedLengthSequenceParser(4),
    ]), ([, hexCode]) => async (parserContext) => {
        parserContext.invariant(/^[\da-f]{4}$/i.test(hexCode), 'Expected four hexadecimal digits after "\\u", got "%s"', hexCode);
        return String.fromCharCode(Number.parseInt(hexCode, 16));
    })(),
    // Any other escaped char stands for itself - except \d \D \w \W \s \S,
    // which denote whole character sets and so cannot be a range endpoint.
    // Rejecting them here lets "[\d-z]" fall through to the set escape
    // instead of being read as the range "d".."z".
    parserCreatorCompose(() => createTupleParser([
        createExactSequenceParser('\\'),
        elementParser,
    ]), ([, char]) => async (parserContext) => {
        parserContext.invariant(!'dDwWsS'.includes(char), 'Character class escape "\\%s" cannot be a range endpoint', char);
        return char;
    })(),
    // Literal char (not metacharacter, not -)
    parserCreatorCompose(() => elementParser, char => async (parserContext) => {
        parserContext.invariant(!charClassMetaCharacters.has(char) && char !== '-', 'Unexpected character "%s"', char);
        return char;
    })(),
]);
375
// Character range (a-z): single char, hyphen, single char. Yields the
// CharacterSet spanning both endpoints.
const charClassRangeParser = promiseCompose(
    createTupleParser([
        charClassSingleCharParser,
        createExactSequenceParser('-'),
        charClassSingleCharParser,
    ]),
    (rangeParts) => {
        const startChar = rangeParts[0];
        const endChar = rangeParts[2];
        return characterSetCharRange(startChar, endChar);
    },
);
381
// Character class element: range, escape (for \d, \w, etc.), or single char.
// Alternatives are tried in order, so a hyphen is only treated as a literal
// after the range alternative has failed to use it.
const charClassElementParser = createDisjunctionParser([
    charClassRangeParser,
    charClassEscapeParser,
    charClassLiteralParser,
    // Literal hyphen wherever it does not form a range, including directly
    // before the closing bracket (e.g. "[a-]" or "[-]"). Previously this
    // alternative required the hyphen NOT to be followed by "]", which
    // rejected exactly the "hyphen at end" case it was meant to cover.
    promiseCompose(createExactSequenceParser('-'), () => characterSetSingleton('-')),
]);
392
// Character class [...]: opening bracket, optional ^ negation, then
// elements up to the closing bracket. Yields a literal node over the union
// of all element sets, complemented when the class is negated.
const characterClassParser = promiseCompose(
    createTupleParser([
        createExactSequenceParser('['),
        createOptionalParser(createExactSequenceParser('^')),
        createTerminatedArrayParser(charClassElementParser, createExactSequenceParser(']')),
    ]),
    ([, negation, [elements]]) => {
        let combined = emptyCharacterSet;
        for (const elementSet of elements) {
            combined = characterSetUnion(combined, elementSet);
        }
        const isNegated = negation !== undefined;
        return literal(isNegated ? characterSetComplement(combined) : combined);
    },
);
404
// Builds a parser for a one-character quantifier symbol; it yields a marker
// object `{ type }` that quantifiedParser later applies to the atom.
const createQuantifierMarkerParser = (symbol, type) => promiseCompose(
    createExactSequenceParser(symbol),
    () => ({ type }),
);
const starQuantifierParser = createQuantifierMarkerParser('*', 'star');
const plusQuantifierParser = createQuantifierMarkerParser('+', 'plus');
const optionalQuantifierParser = createQuantifierMarkerParser('?', 'optional');
407
// Parse a number for quantifiers: one or more ASCII digits, read as a
// base-10 integer.
const quantifierDigitParser = parserCreatorCompose(
    () => elementParser,
    char => async (parserContext) => {
        const isDigit = char >= '0' && char <= '9';
        parserContext.invariant(isDigit, 'Expected digit, got "%s"', char);
        return char;
    },
)();
const numberParser = parserCreatorCompose(
    () => createArrayParser(quantifierDigitParser),
    digits => async (parserContext) => {
        // The array parser may yield no digits at all; reject that case here.
        parserContext.invariant(digits.length > 0, 'Expected at least one digit');
        const text = digits.join('');
        return Number.parseInt(text, 10);
    },
)();
415
// {n}, {n,}, {n,m}
// Yields a `repeat` marker whose `bounds` is:
//   - the bare number n  for {n}   (exactly n)
//   - { min }            for {n,}  (at least n)
//   - { min, max }       for {n,m} (between n and m)
const braceQuantifierParser = promiseCompose(
    createTupleParser([
        createExactSequenceParser('{'),
        numberParser,
        createOptionalParser(createTupleParser([
            createExactSequenceParser(','),
            createOptionalParser(numberParser),
        ])),
        createExactSequenceParser('}'),
    ]),
    ([, min, commaPart]) => {
        if (commaPart === undefined) {
            return { type: 'repeat', bounds: min };
        }
        const max = commaPart[1];
        const bounds = max === undefined ? { min } : { min, max };
        return { type: 'repeat', bounds };
    },
);
437
// Any quantifier marker: *, +, ? or a brace quantifier.
const quantifierAlternatives = [
    starQuantifierParser,
    plusQuantifierParser,
    optionalQuantifierParser,
    braceQuantifierParser,
];
const quantifierParser = createUnionParser(quantifierAlternatives);
443
+ // Groups
444
// Capture group (...): an opening parenthesis that is not followed by "?"
// (which would start one of the "(?" forms), an alternation, and ")".
const captureGroupParser = promiseCompose(
    createTupleParser([
        createExactSequenceParser('('),
        createNegativeLookaheadParser(createExactSequenceParser('?')),
        // Accessor defers the lookup: alternationParser is defined further down.
        createParserAccessorParser(() => alternationParser),
        createExactSequenceParser(')'),
    ]),
    (groupParts) => {
        const inner = groupParts[2];
        return captureGroup(inner);
    },
);
451
// Named capture group (?<name>...): the name is every character up to the
// first ">".
const captureGroupNameCharParser = parserCreatorCompose(
    () => elementParser,
    char => async (parserContext) => {
        parserContext.invariant(char !== '>', 'Unexpected ">"');
        return char;
    },
)();
const namedCaptureGroupParser = promiseCompose(
    createTupleParser([
        createExactSequenceParser('(?<'),
        createTerminatedArrayParser(captureGroupNameCharParser, createExactSequenceParser('>')),
        // Accessor defers the lookup: alternationParser is defined further down.
        createParserAccessorParser(() => alternationParser),
        createExactSequenceParser(')'),
    ]),
    ([, [nameChars], inner]) => {
        const name = nameChars.join('');
        return captureGroup(inner, name);
    },
);
461
// Non-capture group (?:...): yields the inner expression itself, with no
// wrapping node.
const nonCaptureGroupParser = promiseCompose(
    createTupleParser([
        createExactSequenceParser('(?:'),
        // Accessor defers the lookup: alternationParser is defined further down.
        createParserAccessorParser(() => alternationParser),
        createExactSequenceParser(')'),
    ]),
    groupParts => groupParts[1],
);
467
// Builds a parser for a lookahead group with the given opening sequence.
// It yields a `lookahead-marker`, which processElements later combines with
// the elements that follow it.
const createLookaheadMarkerParser = (opening, isPositive) => promiseCompose(
    createTupleParser([
        createExactSequenceParser(opening),
        // Accessor defers the lookup: alternationParser is defined further down.
        createParserAccessorParser(() => alternationParser),
        createExactSequenceParser(')'),
    ]),
    ([, inner]) => ({ type: 'lookahead-marker', isPositive, inner }),
);
// Positive lookahead (?=...)
const positiveLookaheadMarkerParser = createLookaheadMarkerParser('(?=', true);
// Negative lookahead (?!...)
const negativeLookaheadMarkerParser = createLookaheadMarkerParser('(?!', false);
479
// Any group that can act as an atom: named, non-capturing or plain capture.
const groupAlternatives = [
    namedCaptureGroupParser,
    nonCaptureGroupParser,
    captureGroupParser,
];
const groupParser = createUnionParser(groupAlternatives);
484
// Builds a parser for an anchor symbol; it yields a marker `{ type }` that
// processElements later turns into an infix anchor node.
const createAnchorMarkerParser = (symbol, type) => promiseCompose(
    createExactSequenceParser(symbol),
    () => ({ type }),
);
const startAnchorMarkerParser = createAnchorMarkerParser('^', 'start-anchor-marker');
const endAnchorMarkerParser = createAnchorMarkerParser('$', 'end-anchor-marker');
486
// Atom: the basic unit that can be quantified (excluding anchors) - a group,
// a character class, an escape, the dot, or a plain literal character.
const atomAlternatives = [
    groupParser,
    characterClassParser,
    escapeParser,
    dotParser,
    literalCharacterParser,
];
const atomParser = createUnionParser(atomAlternatives);
494
// Quantified atom: an atom optionally followed by one quantifier marker.
// Without a quantifier the atom is returned unchanged; otherwise it is
// wrapped in the AST node matching the marker's type.
const quantifiedParser = promiseCompose(
    createTupleParser([
        atomParser,
        createOptionalParser(quantifierParser),
    ]),
    ([atom, quantifier]) => {
        if (quantifier === undefined) {
            return atom;
        }
        const { type } = quantifier;
        if (type === 'star') {
            return star(atom);
        }
        if (type === 'plus') {
            return plus(atom);
        }
        if (type === 'optional') {
            return optional(atom);
        }
        if (type === 'repeat') {
            return repeat(atom, quantifier.bounds);
        }
        // Any other marker type yields undefined.
        return undefined;
    },
);
513
// Element in a sequence: either an anchor marker, a lookahead marker, or a
// (possibly quantified) atom.
const sequenceElementAlternatives = [
    startAnchorMarkerParser,
    endAnchorMarkerParser,
    positiveLookaheadMarkerParser,
    negativeLookaheadMarkerParser,
    quantifiedParser,
];
const sequenceElementParser = createUnionParser(sequenceElementAlternatives);
521
/**
 * Concatenates a list of RegularExpressions right-associatively:
 * [a, b, c] becomes concat(a, concat(b, c)).
 *
 * @param {object[]} parts - Expressions in sequence order.
 * @returns {object} `epsilon` for an empty list, the single element for a
 *   one-element list, otherwise nested `concat` nodes.
 */
function concatList(parts) {
    if (parts.length === 0) {
        return epsilon;
    }
    // Start from the last part and wrap leftwards.
    let result = parts[parts.length - 1];
    for (let index = parts.length - 2; index >= 0; index--) {
        result = concat(parts[index], result);
    }
    return result;
}
528
/**
 * Turns a flat list of sequence elements (expressions, anchor markers and
 * lookahead markers) into a proper AST.
 *
 * Anchors and lookaheads are handled as infix operators like
 * @gruhn/regex-utils. Precedence order (lowest to highest):
 * union -> start-anchor -> end-anchor -> lookahead -> concat.
 * The list is split at the first marker of the lowest-precedence kind
 * present, and both sides are processed recursively.
 *
 * @param {object[]} elements - Parsed sequence elements, in order.
 * @returns {object} AST for the sequence (`epsilon` when the list is empty).
 */
function processElements(elements) {
    if (elements.length === 0) {
        return epsilon;
    }
    const indexOfFirst = markerType => elements.findIndex(element => 'type' in element && element.type === markerType);
    const splitAround = index => [elements.slice(0, index), elements.slice(index + 1)];

    // Start anchors bind loosest among the infix operators.
    const startAnchorIndex = indexOfFirst('start-anchor-marker');
    if (startAnchorIndex !== -1) {
        const [before, after] = splitAround(startAnchorIndex);
        return startAnchor(processElements(before), processElements(after));
    }

    // Then end anchors.
    const endAnchorIndex = indexOfFirst('end-anchor-marker');
    if (endAnchorIndex !== -1) {
        const [before, after] = splitAround(endAnchorIndex);
        return endAnchor(processElements(before), processElements(after));
    }

    // Then lookaheads, which take everything after them as their `right`.
    const lookaheadIndex = indexOfFirst('lookahead-marker');
    if (lookaheadIndex !== -1) {
        const { isPositive, inner } = elements[lookaheadIndex];
        const [before, after] = splitAround(lookaheadIndex);
        const lookaheadExpression = lookahead(isPositive, inner, processElements(after));
        // Content before the lookahead, if any, is concatenated in front.
        return before.length === 0
            ? lookaheadExpression
            : concat(processElements(before), lookaheadExpression);
    }

    // No markers left: plain concatenation.
    return concatList(elements);
}
566
// Concatenation: sequence of quantified atoms, anchors and lookaheads,
// folded into an AST by processElements.
const concatParser = promiseCompose(createArrayParser(sequenceElementParser), processElements);
// One further branch of an alternation: '|' followed by a concatenation.
const alternationBranchParser = promiseCompose(
    createTupleParser([
        createExactSequenceParser('|'),
        concatParser,
    ]),
    branchParts => branchParts[1],
);
// Alternation: concat ('|' concat)*
const alternationParser = promiseCompose(
    createTupleParser([
        concatParser,
        createArrayParser(alternationBranchParser),
    ]),
    ([first, rest]) => {
        // Right-associative union like @gruhn/regex-utils:
        // a|b|c becomes union(a, union(b, c)).
        const branches = [first, ...rest];
        let result = branches[branches.length - 1];
        for (let index = branches.length - 2; index >= 0; index--) {
            result = union(branches[index], result);
        }
        return result;
    },
);
export const regularExpressionParser = alternationParser;
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,89 @@
1
+ import { testProp, fc } from '@fast-check/ava';
2
+ import { regularExpressionParser } from './regularExpressionParser.js';
3
// Optional fixed seed for the property test, taken from the SEED environment
// variable; undefined when SEED is unset or empty.
const seed = process.env.SEED
    ? Number(process.env.SEED)
    : undefined;
4
+ // Import directly from file path to bypass package exports
5
+ // eslint-disable-next-line import/no-unresolved
6
+ import { parseRegExpString } from '../node_modules/@gruhn/regex-utils/dist/regex-parser.js';
7
+ import { runParser } from './parser.js';
8
+ import { stringParserInputCompanion } from './parserInputCompanion.js';
9
+ import { arbitrarilySlicedAsyncIterator } from './arbitrarilySlicedAsyncInterator.js';
10
/**
 * Normalizes a character-set tree for comparison by rebuilding it with only
 * the `type`, `range.start`, `range.end`, `left` and `right` fields, which
 * drops any extra properties (such as hashes) carried by the input.
 *
 * @param {object} charset - Character-set tree (`node` or `empty`).
 * @returns {object} A fresh tree containing only the fields listed above.
 */
function normalizeCharacterSet(charset) {
    if (charset.type === 'empty') {
        return { type: 'empty' };
    }
    const { range, left, right } = charset;
    const { start, end } = range;
    return {
        type: 'node',
        range: { start, end },
        left: normalizeCharacterSet(left),
        right: normalizeCharacterSet(right),
    };
}
22
/**
 * Normalizes a RegularExpression AST for comparison by rebuilding every
 * node with only its known fields, recursing into child expressions and
 * character sets.
 *
 * @param {object} ast - AST node with a `type` property.
 * @returns {object|undefined} The rebuilt node, or `undefined` for a node
 *   type this function does not know.
 */
function normalizeRegularExpression(ast) {
    const { type } = ast;
    if (type === 'epsilon') {
        return { type: 'epsilon' };
    }
    if (type === 'literal') {
        return { type: 'literal', charset: normalizeCharacterSet(ast.charset) };
    }
    // Binary nodes: two child expressions.
    if (type === 'concat' || type === 'union' || type === 'start-anchor' || type === 'end-anchor') {
        return {
            type,
            left: normalizeRegularExpression(ast.left),
            right: normalizeRegularExpression(ast.right),
        };
    }
    // Simple quantifiers: one child expression.
    if (type === 'star' || type === 'plus' || type === 'optional') {
        return { type, inner: normalizeRegularExpression(ast.inner) };
    }
    if (type === 'repeat') {
        return { type: 'repeat', inner: normalizeRegularExpression(ast.inner), bounds: ast.bounds };
    }
    if (type === 'capture-group') {
        const inner = normalizeRegularExpression(ast.inner);
        // Keep the `name` key absent (not undefined) for unnamed groups.
        return ast.name !== undefined
            ? { type: 'capture-group', inner, name: ast.name }
            : { type: 'capture-group', inner };
    }
    if (type === 'lookahead') {
        return {
            type: 'lookahead',
            isPositive: ast.isPositive,
            inner: normalizeRegularExpression(ast.inner),
            right: normalizeRegularExpression(ast.right),
        };
    }
    return undefined;
}
53
// Runs `action` and reports whether it completed without throwing.
const completesWithoutThrowing = (action) => {
    try {
        action();
    }
    catch {
        return false;
    }
    return true;
};
// Quantified lookaheads - @gruhn/regex-utils has a bug where it treats
// quantifiers after lookaheads as literals instead of quantifiers.
// See: https://github.com/gruhn/regex-utils/issues/13
// JavaScript allows (?=a){2} but @gruhn/regex-utils parses {2} as literal text.
const hasQuantifiedLookahead = s => /\(\?[=!][^)]*\)[*+?]|\(\?[=!][^)]*\)\{[0-9]/.test(s);
// Generate regex patterns that are likely to be supported, then keep only
// those accepted by both JavaScript and @gruhn/regex-utils.
const supportedRegexArbitrary = fc.stringMatching(/^([a-zA-Z0-9]|\\[dDwWsS.]|\[(\^)?([a-zA-Z0-9](-[a-zA-Z0-9])?|\\[dDwWsS])*\]|\.|\((\?[:=!])?[a-zA-Z0-9]*\)|[*+?]|\{[0-9]+(,[0-9]*)?\}|\||\^|\$)*$/).filter(s => {
    // Filter out patterns that JavaScript doesn't support
    const acceptedByJavaScript = completesWithoutThrowing(() => {
        new RegExp(s);
    });
    if (!acceptedByJavaScript) {
        return false;
    }
    // Filter out patterns that @gruhn/regex-utils doesn't support
    const acceptedByRegexUtils = completesWithoutThrowing(() => {
        parseRegExpString(s);
    });
    if (!acceptedByRegexUtils) {
        return false;
    }
    return !hasQuantifiedLookahead(s);
});
78
// Property: for every generated pattern, fed to the parser in arbitrary
// chunks, the normalized AST equals the normalized AST produced by
// @gruhn/regex-utils for the same pattern.
testProp(
    'regularExpressionParser matches @gruhn/regex-utils',
    [
        arbitrarilySlicedAsyncIterator(supportedRegexArbitrary),
    ],
    async (t, [regexStr, regexStringChunkIterator]) => {
        const referenceAst = parseRegExpString(regexStr);
        const parsedAst = await runParser(regularExpressionParser, regexStringChunkIterator, stringParserInputCompanion, {
            errorJoinMode: 'none',
        });
        const expected = normalizeRegularExpression(referenceAst);
        const actual = normalizeRegularExpression(parsedAst);
        t.deepEqual(actual, expected);
    },
    {
        verbose: true,
        seed,
    },
);