@futpib/parser 1.0.3 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/bash.d.ts +84 -0
- package/build/bash.js +1 -0
- package/build/bashParser.d.ts +6 -0
- package/build/bashParser.js +294 -0
- package/build/bashParser.test.d.ts +1 -0
- package/build/bashParser.test.js +181 -0
- package/build/index.d.ts +24 -2
- package/build/index.js +22 -1
- package/build/regexpParser.d.ts +2 -0
- package/build/regexpParser.js +71 -0
- package/build/regexpParser.test.d.ts +1 -0
- package/build/regexpParser.test.js +83 -0
- package/build/regularExpression.d.ts +63 -0
- package/build/regularExpression.js +1 -0
- package/build/regularExpressionParser.d.ts +3 -0
- package/build/regularExpressionParser.js +580 -0
- package/build/regularExpressionParser.test.d.ts +1 -0
- package/build/regularExpressionParser.test.js +89 -0
- package/package.json +2 -1
- package/src/bash.ts +120 -0
- package/src/bashParser.test.ts +332 -0
- package/src/bashParser.ts +461 -0
- package/src/index.ts +113 -2
- package/src/regexpParser.test.ts +186 -0
- package/src/regexpParser.ts +94 -0
- package/src/regularExpression.ts +24 -0
- package/src/regularExpressionParser.test.ts +102 -0
- package/src/regularExpressionParser.ts +921 -0
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
import test from 'ava';
|
|
2
|
+
import * as fc from 'fast-check';
|
|
3
|
+
import { testProp } from '@fast-check/ava';
|
|
4
|
+
import { runParser, runParserWithRemainingInput } from './parser.js';
|
|
5
|
+
import { stringParserInputCompanion } from './parserInputCompanion.js';
|
|
6
|
+
import { createRegExpParser } from './regexpParser.js';
|
|
7
|
+
|
|
8
|
+
// Shared drivers: build a parser from `regexp` and run it over `input` with the string companion.
const runRegExp = (regexp: RegExp, input: string) => runParser(
	createRegExpParser(regexp),
	input,
	stringParserInputCompanion,
);

const runRegExpWithRemainingInput = (regexp: RegExp, input: string) => runParserWithRemainingInput(
	createRegExpParser(regexp),
	input,
	stringParserInputCompanion,
);

test('regexpParser matches digits', async t => {
	const match = await runRegExp(/\d+/, '123');

	t.is(match[0], '123');
});

test('regexpParser matches at start only', async t => {
	const parsed = await runRegExpWithRemainingInput(/\d+/, '123abc');

	t.is(parsed.output[0], '123');
	t.truthy(parsed.remainingInput);
});

test('regexpParser fails when no match at start', async t => {
	await t.throwsAsync(runRegExp(/\d+/, 'abc123'));
});

test('regexpParser with capture groups', async t => {
	const match = await runRegExp(/(\d+)-(\d+)/, '123-456');

	t.is(match[0], '123-456');
	t.is(match[1], '123');
	t.is(match[2], '456');
});

test('regexpParser greedy matching', async t => {
	const parsed = await runRegExpWithRemainingInput(/a+/, 'aaab');

	t.is(parsed.output[0], 'aaa');
});

test('regexpParser with anchored regexp', async t => {
	const parsed = await runRegExpWithRemainingInput(/^hello/, 'hello world');

	t.is(parsed.output[0], 'hello');
});

testProp.serial(
	'regexpParser matches word characters',
	[
		fc.tuple(
			fc.stringMatching(/^\w+$/),
			fc.stringMatching(/^\W*$/),
		),
	],
	async (t, [ word, nonWord ]) => {
		// The word part must be matched in full and nothing of the non-word tail consumed.
		const parsed = await runRegExpWithRemainingInput(/\w+/, word + nonWord);

		t.is(parsed.output[0], word);
		t.is(parsed.position, word.length);
	},
	{
		verbose: true,
	},
);

// Tests for zero-width/optional patterns at end of input

test('regexpParser with star quantifier on empty input', async t => {
	const match = await runRegExp(/a*/, '');

	t.is(match[0], '');
});

test('regexpParser with optional whitespace on empty input', async t => {
	const match = await runRegExp(/[ \t]*/, '');

	t.is(match[0], '');
});

test('regexpParser with star quantifier at end of input (no match)', async t => {
	const parsed = await runRegExpWithRemainingInput(/a*/, 'bbb');

	t.is(parsed.output[0], '');
});

test('regexpParser with optional group on empty input', async t => {
	const match = await runRegExp(/(?:foo)?/, '');

	t.is(match[0], '');
});

// Tests for negative lookahead

test('regexpParser with negative lookahead should not match when followed by same char', async t => {
	// This regex should NOT match anything in '||' - the | is followed by another |
	await t.throwsAsync(runRegExp(/\|(?!\|)/, '||'));
});

test('regexpParser with negative lookahead should match single char', async t => {
	// This regex should match single '|' when followed by something else
	const parsed = await runRegExpWithRemainingInput(/\|(?!\|)/, '| ');

	t.is(parsed.output[0], '|');
	t.is(parsed.position, 1); // Consumed 1 character
	t.truthy(parsed.remainingInput); // There's remaining input (the space)
});
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
import { type Parser, setParserName } from './parser.js';
|
|
2
|
+
|
|
3
|
+
/**
 * Create a parser that matches `regexp` at the current position of a string input.
 *
 * The parser peeks ever larger prefixes of the input (the window doubles until a peek
 * fails), re-running the regexp on the whole prefix each time, and only decides once no
 * further input can be peeked. Deciding late matters for lookahead patterns: /\|(?!\|)/
 * matches the prefix '|' but must not match once the prefix has grown to '||'.
 *
 * @param regexp Pattern to match. Only matches starting at index 0 are accepted. The `g`
 *   and `y` flags are ignored, and the passed object's `lastIndex` is never touched.
 * @returns A parser yielding the `RegExpExecArray` of the match and skipping the matched
 *   length. When nothing matches, `parserContext.invariant(false, …)` is invoked.
 */
export const createRegExpParser = (
	regexp: RegExp,
): Parser<RegExpExecArray, string, string> => {
	// `exec` on a global or sticky regexp starts at `lastIndex` and updates it, which would
	// leak state between the repeated attempts below (and between parser runs). Work on a
	// private copy without those flags instead.
	const statelessRegexp = new RegExp(regexp.source, regexp.flags.replace(/[gy]/g, ''));

	// Run the regexp on `input`, accepting only a match that begins at index 0.
	const matchAtStart = (input: string): RegExpExecArray | undefined => {
		const match = statelessRegexp.exec(input);

		return match !== null && match.index === 0 ? match : undefined;
	};

	const regexpParser: Parser<RegExpExecArray, string, string> = async parserContext => {
		let start = 0;
		let window = 1;
		let lastMatch: RegExpExecArray | undefined;
		let reachedEndOfInput = false;

		while (true) {
			const sequence = await parserContext.peekSequence(start, start + window);

			if (sequence === undefined) {
				// The requested window is not available: shrink it and retry, so the
				// remaining tail of the input is still examined piece by piece.
				reachedEndOfInput = true;
				window = Math.floor(window / 2);

				if (window > 0) {
					continue;
				}

				// Not even a single further element can be peeked: [0, start) is everything.
				const fullSequence = await parserContext.peekSequence(0, start);

				if (fullSequence !== undefined) {
					// Decide on the complete input so lookahead assertions see full context.
					const verifyMatch = matchAtStart(fullSequence);

					if (verifyMatch !== undefined) {
						parserContext.skip(verifyMatch[0].length);
						return verifyMatch;
					}
				} else if (lastMatch !== undefined) {
					// No full sequence available but we have a previous match
					parserContext.skip(lastMatch[0].length);
					return lastMatch;
				}

				if (start === 0) {
					// Nothing could be peeked at all: zero-width patterns (e.g., /a*/, /[ \t]*/)
					// still match the empty input. This must not run for non-empty input,
					// where e.g. /^$/ would otherwise report a bogus empty match.
					const emptyMatch = matchAtStart('');

					if (emptyMatch !== undefined) {
						return emptyMatch;
					}

					return parserContext.invariant(false, 'Unexpected end of input without regex match');
				}

				return parserContext.invariant(false, 'Regex did not match at start of input');
			}

			const fullSequence = await parserContext.peekSequence(0, start + window);

			if (fullSequence === undefined) {
				continue;
			}

			// A match on a shorter prefix may be invalidated by the longer one, so the latest
			// attempt always replaces the previous result. A miss is never final here: input
			// that has not been peeked yet may still complete the match (e.g. /abcdef/ after
			// only 'abcde' has been seen).
			lastMatch = matchAtStart(fullSequence);

			start += window;

			// Growing the window again after a peek has already failed would only cause
			// further failing peeks.
			if (!reachedEndOfInput) {
				window *= 2;
			}
		}
	};

	setParserName(regexpParser, regexp.toString());

	return regexpParser;
};
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/**
 * A span of code points described by its two numeric bounds.
 * NOTE(review): whether `end` is inclusive is not visible here - confirm against the parser.
 */
export type CodePointRange = { start: number; end: number };
|
|
5
|
+
|
|
6
|
+
/**
 * A set of characters represented as a binary tree: either the empty set, or a node
 * carrying one code point range plus a left and a right sub-set.
 */
export type CharacterSet =
	| {
		type: 'empty';
	}
	| {
		type: 'node';
		range: CodePointRange;
		left: CharacterSet;
		right: CharacterSet;
	};
|
|
9
|
+
|
|
10
|
+
/**
 * Repetition bounds of a `repeat` node: a plain number, or an object with at least one of
 * `min` / `max` present.
 * NOTE(review): presumably a plain number means "exactly n times" - confirm.
 */
export type RepeatBounds =
	| number
	| { min: number; max?: number }
	| { min?: number; max: number };
|
|
11
|
+
|
|
12
|
+
/**
 * Abstract syntax tree of a regular expression, discriminated by `type`.
 */
export type RegularExpression =
	// Leaves
	| { type: 'epsilon' }
	| { type: 'literal'; charset: CharacterSet }
	// Binary combinators
	| {
		type: 'concat';
		left: RegularExpression;
		right: RegularExpression;
	}
	| {
		type: 'union';
		left: RegularExpression;
		right: RegularExpression;
	}
	// Quantifiers wrapping a single inner expression
	| { type: 'star'; inner: RegularExpression }
	| { type: 'plus'; inner: RegularExpression }
	| { type: 'optional'; inner: RegularExpression }
	| {
		type: 'repeat';
		inner: RegularExpression;
		bounds: RepeatBounds;
	}
	// Capture group, optionally named
	| {
		type: 'capture-group';
		inner: RegularExpression;
		name?: string;
	}
	// Lookahead (positive or negative); presumably `right` is what follows it - confirm
	| {
		type: 'lookahead';
		isPositive: boolean;
		inner: RegularExpression;
		right: RegularExpression;
	}
	// Anchors, each carrying the expressions on either side
	| {
		type: 'start-anchor';
		left: RegularExpression;
		right: RegularExpression;
	}
	| {
		type: 'end-anchor';
		left: RegularExpression;
		right: RegularExpression;
	};
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
import { testProp, fc } from '@fast-check/ava';
|
|
2
|
+
import { regularExpressionParser } from './regularExpressionParser.js';
|
|
3
|
+
|
|
4
|
+
// Optional seed for the property test, read from the SEED environment variable;
// stays undefined when SEED is unset or empty.
const { SEED: rawSeed } = process.env;
const seed = rawSeed ? Number(rawSeed) : undefined;
|
|
5
|
+
|
|
6
|
+
// Import directly from file path to bypass package exports
|
|
7
|
+
// eslint-disable-next-line import/no-unresolved
|
|
8
|
+
import { parseRegExpString } from '../node_modules/@gruhn/regex-utils/dist/regex-parser.js';
|
|
9
|
+
import { runParser } from './parser.js';
|
|
10
|
+
import { stringParserInputCompanion } from './parserInputCompanion.js';
|
|
11
|
+
import { arbitrarilySlicedAsyncIterator } from './arbitrarilySlicedAsyncInterator.js';
|
|
12
|
+
import type { RegularExpression, CharacterSet } from './regularExpression.js';
|
|
13
|
+
|
|
14
|
+
// Normalize AST for comparison - removes hashes from CharSets and normalizes structure
|
|
15
|
+
/**
 * Rebuild a character set tree keeping only `type`, `range.start`, `range.end`, `left`
 * and `right`, so any extra properties on the input nodes are dropped before comparison.
 */
function normalizeCharacterSet(charset: CharacterSet): CharacterSet {
	if (charset.type === 'empty') {
		return { type: 'empty' };
	}

	const { range, left, right } = charset;
	const plainRange = { start: range.start, end: range.end };

	return {
		type: 'node',
		range: plainRange,
		left: normalizeCharacterSet(left),
		right: normalizeCharacterSet(right),
	};
}
|
|
26
|
+
|
|
27
|
+
/**
 * Recursively copy a regular expression AST, keeping only the properties declared for
 * each node type (character sets go through `normalizeCharacterSet`). The `name` key of a
 * capture group is only emitted when the source node actually has a name.
 */
function normalizeRegularExpression(ast: RegularExpression): RegularExpression {
	const walk = normalizeRegularExpression;

	switch (ast.type) {
		case 'epsilon': {
			return { type: 'epsilon' };
		}

		case 'literal': {
			const charset = normalizeCharacterSet(ast.charset);
			return { type: 'literal', charset };
		}

		case 'concat': {
			const left = walk(ast.left);
			const right = walk(ast.right);
			return { type: 'concat', left, right };
		}

		case 'union': {
			const left = walk(ast.left);
			const right = walk(ast.right);
			return { type: 'union', left, right };
		}

		case 'star': {
			const inner = walk(ast.inner);
			return { type: 'star', inner };
		}

		case 'plus': {
			const inner = walk(ast.inner);
			return { type: 'plus', inner };
		}

		case 'optional': {
			const inner = walk(ast.inner);
			return { type: 'optional', inner };
		}

		case 'repeat': {
			const inner = walk(ast.inner);
			return { type: 'repeat', inner, bounds: ast.bounds };
		}

		case 'capture-group': {
			const inner = walk(ast.inner);
			// Avoid emitting `name: undefined`, which would differ from an absent key.
			return ast.name === undefined
				? { type: 'capture-group', inner }
				: { type: 'capture-group', inner, name: ast.name };
		}

		case 'lookahead': {
			const inner = walk(ast.inner);
			const right = walk(ast.right);
			return { type: 'lookahead', isPositive: ast.isPositive, inner, right };
		}

		case 'start-anchor': {
			const left = walk(ast.left);
			const right = walk(ast.right);
			return { type: 'start-anchor', left, right };
		}

		case 'end-anchor': {
			const left = walk(ast.left);
			const right = walk(ast.right);
			return { type: 'end-anchor', left, right };
		}
	}
}
|
|
58
|
+
|
|
59
|
+
// Generate regex patterns that are likely to be supported
|
|
60
|
+
// True when `pattern` compiles as a JavaScript regular expression.
const compilesAsJavaScriptRegExp = (pattern: string): boolean => {
	try {
		new RegExp(pattern);
		return true;
	} catch {
		return false;
	}
};

// True when @gruhn/regex-utils parses `pattern` without throwing.
const parsesWithRegexUtils = (pattern: string): boolean => {
	try {
		parseRegExpString(pattern);
		return true;
	} catch {
		return false;
	}
};

// Quantified lookaheads - @gruhn/regex-utils has a bug where it treats
// quantifiers after lookaheads as literals instead of quantifiers.
// See: https://github.com/gruhn/regex-utils/issues/13
// JavaScript allows (?=a){2} but @gruhn/regex-utils parses {2} as literal text.
const quantifiedLookaheadPattern = /\(\?[=!][^)]*\)[*+?]|\(\?[=!][^)]*\)\{[0-9]/;

const supportedRegexArbitrary = fc.stringMatching(
	/^([a-zA-Z0-9]|\\[dDwWsS.]|\[(\^)?([a-zA-Z0-9](-[a-zA-Z0-9])?|\\[dDwWsS])*\]|\.|\((\?[:=!])?[a-zA-Z0-9]*\)|[*+?]|\{[0-9]+(,[0-9]*)?\}|\||\^|\$)*$/,
).filter(pattern => (
	// Keep only patterns both implementations accept and that avoid the known parser bug.
	compilesAsJavaScriptRegExp(pattern)
	&& parsesWithRegexUtils(pattern)
	&& !quantifiedLookaheadPattern.test(pattern)
));
|
|
84
|
+
|
|
85
|
+
// Property: for every generated pattern, our parser (fed the pattern in arbitrary chunks)
// produces the same normalized AST as the @gruhn/regex-utils parser.
testProp(
	'regularExpressionParser matches @gruhn/regex-utils',
	[
		arbitrarilySlicedAsyncIterator(supportedRegexArbitrary),
	],
	async (t, [ pattern, patternChunks ]) => {
		const referenceAst = parseRegExpString(pattern);
		const expected = normalizeRegularExpression(referenceAst);

		const parsedAst = await runParser(
			regularExpressionParser,
			patternChunks,
			stringParserInputCompanion,
			{
				errorJoinMode: 'none',
			},
		);
		const actual = normalizeRegularExpression(parsedAst);

		t.deepEqual(actual, expected);
	},
	{
		verbose: true,
		seed,
	},
);
|