porffor 0.1.1 → 0.2.0-c6c8c81
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +184 -204
- package/compiler/2c.js +377 -0
- package/compiler/builtins/base64.js +91 -91
- package/compiler/builtins.js +19 -13
- package/compiler/codeGen.js +1313 -418
- package/compiler/decompile.js +35 -9
- package/compiler/embedding.js +9 -5
- package/compiler/encoding.js +8 -2
- package/compiler/index.js +55 -17
- package/compiler/log.js +15 -0
- package/compiler/opt.js +357 -258
- package/compiler/parse.js +24 -1
- package/compiler/prototype.js +263 -56
- package/compiler/sections.js +51 -8
- package/compiler/wasmSpec.js +3 -0
- package/compiler/wrap.js +20 -6
- package/package.json +6 -1
- package/porf.cmd +1 -1
- package/rhemyn/README.md +37 -0
- package/rhemyn/compile.js +214 -0
- package/rhemyn/parse.js +321 -0
- package/rhemyn/test/parse.js +59 -0
- package/runner/index.js +54 -31
- package/runner/info.js +37 -2
- package/runner/profile.js +1 -2
- package/runner/repl.js +13 -11
- package/runner/results.json +1 -0
- package/runner/transform.js +15 -36
- package/runner/version.js +10 -0
- package/CNAME +0 -1
- package/index.html +0 -1264
- package/logo.png +0 -0
- package/sw.js +0 -26
package/compiler/wrap.js
CHANGED
@@ -4,7 +4,8 @@ import decompile from './decompile.js';
|
|
4
4
|
|
5
5
|
const bold = x => `\u001b[1m${x}\u001b[0m`;
|
6
6
|
|
7
|
-
const typeBase =
|
7
|
+
const typeBase = 0x00;
|
8
|
+
const internalTypeBase = 0x10;
|
8
9
|
const TYPES = {
|
9
10
|
[typeBase]: 'number',
|
10
11
|
[typeBase + 1]: 'boolean',
|
@@ -16,7 +17,8 @@ const TYPES = {
|
|
16
17
|
[typeBase + 7]: 'bigint',
|
17
18
|
|
18
19
|
// internal
|
19
|
-
[
|
20
|
+
[internalTypeBase]: '_array',
|
21
|
+
[internalTypeBase + 1]: '_regexp'
|
20
22
|
};
|
21
23
|
|
22
24
|
export default async (source, flags = [ 'module' ], customImports = {}, print = str => process.stdout.write(str)) => {
|
@@ -37,7 +39,6 @@ export default async (source, flags = [ 'module' ], customImports = {}, print =
|
|
37
39
|
'': {
|
38
40
|
p: valtype === 'i64' ? i => print(Number(i).toString()) : i => print(i.toString()),
|
39
41
|
c: valtype === 'i64' ? i => print(String.fromCharCode(Number(i))) : i => print(String.fromCharCode(i)),
|
40
|
-
a: c => { if (!Number(c)) throw new Error(`assert failed`); },
|
41
42
|
t: _ => performance.now(),
|
42
43
|
...customImports
|
43
44
|
}
|
@@ -64,11 +65,15 @@ export default async (source, flags = [ 'module' ], customImports = {}, print =
|
|
64
65
|
|
65
66
|
exports[func.name] = function() {
|
66
67
|
try {
|
67
|
-
const
|
68
|
+
const _ret = exp.apply(this, arguments);
|
68
69
|
|
69
|
-
if (
|
70
|
+
if (_ret == null) return undefined;
|
70
71
|
|
71
|
-
|
72
|
+
const [ ret, type ] = _ret;
|
73
|
+
|
74
|
+
// if (ret >= typeBase && ret <= typeBase + 8) return ret > (typeBase + 7) ? 'object' : TYPES[ret];
|
75
|
+
|
76
|
+
switch (TYPES[type]) {
|
72
77
|
case 'boolean': return Boolean(ret);
|
73
78
|
case 'undefined': return undefined;
|
74
79
|
case 'object': return ret === 0 ? null : {};
|
@@ -90,6 +95,15 @@ export default async (source, flags = [ 'module' ], customImports = {}, print =
|
|
90
95
|
return Array.from(new Uint16Array(memory.buffer, pointer + 4, length)).map(x => String.fromCharCode(x)).join('');
|
91
96
|
}
|
92
97
|
|
98
|
+
case 'function': {
|
99
|
+
// wasm func index, including all imports
|
100
|
+
const func = funcs.find(x => (x.originalIndex ?? x.index) === ret);
|
101
|
+
if (!func) return ret;
|
102
|
+
|
103
|
+
// make fake empty func for repl/etc
|
104
|
+
return {[func.name]() {}}[func.name];
|
105
|
+
}
|
106
|
+
|
93
107
|
default: return ret;
|
94
108
|
}
|
95
109
|
} catch (e) {
|
package/package.json
CHANGED
@@ -1,12 +1,17 @@
|
|
1
1
|
{
|
2
2
|
"name": "porffor",
|
3
3
|
"description": "a basic experimental wip aot optimizing js -> wasm engine/compiler/runtime in js",
|
4
|
-
"version": "0.
|
4
|
+
"version": "0.2.0-c6c8c81",
|
5
5
|
"author": "CanadaHonk",
|
6
6
|
"license": "MIT",
|
7
7
|
"dependencies": {
|
8
8
|
"acorn": "^8.9.0"
|
9
9
|
},
|
10
|
+
"optionalDependencies": {
|
11
|
+
"@babel/parser": "^7.23.6",
|
12
|
+
"hermes-parser": "^0.18.2",
|
13
|
+
"meriyah": "^4.3.9"
|
14
|
+
},
|
10
15
|
"bin": {
|
11
16
|
"porf": "./runner/index.js"
|
12
17
|
},
|
package/porf.cmd
CHANGED
@@ -1,2 +1,2 @@
|
|
1
|
-
@echo off
|
1
|
+
@echo off
|
2
2
|
node runner %*
|
package/rhemyn/README.md
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
# Rhemyn
|
2
|
+
A basic experimental WIP regex engine/AOT Wasm compiler in JS. Regex engine for Porffor! Uses its own regex parser, with no dependencies (excluding Porffor internals). <br>
|
3
|
+
Age: ~1 day (of work)
|
4
|
+
|
5
|
+
Made for use with Porffor but could possibly be adapted, implementation/library notes:
|
6
|
+
- Exposes functions for each regex "operation" (eg test, match)
|
7
|
+
- Given a regex pattern string (eg `a+`), it returns a "function" object
|
8
|
+
- Wasm function returned expects an i32 pointer to a UTF-16 string (can add UTF-8 option later if someone else actually wants to use this)
|
9
|
+
|
10
|
+
## syntax
|
11
|
+
🟢 supported 🟡 partial 🟠 parsed only 🔴 unsupported
|
12
|
+
|
13
|
+
- 🟢 literal characters (eg `a`)
|
14
|
+
- 🟢 escaping (eg `\.\n\cJ\x0a\u000a`)
|
15
|
+
- 🟢 character itself (eg `\.`)
|
16
|
+
- 🟢 escape sequences (eg `\n`)
|
17
|
+
- 🟢 control character (eg `\cJ`)
|
18
|
+
- 🟢 unicode code points (eg `\x00`, `\u0000`)
|
19
|
+
- 🟢 sets (eg `[ab]`)
|
20
|
+
- 🟢 ranges (eg `[a-z]`)
|
21
|
+
- 🟢 negated sets (eg `[^ab]`)
|
22
|
+
- 🟢 metacharacters
|
23
|
+
- 🟢 dot (eg `a.b`)
|
24
|
+
- 🟢 digit, not digit (eg `\d\D`)
|
25
|
+
- 🟢 word, not word (eg `\w\W`)
|
26
|
+
- 🟢 whitespace, not whitespace (eg `\s\S`)
|
27
|
+
- 🟠 quantifiers
|
28
|
+
- 🟠 star (eg `a*`)
|
29
|
+
- 🟠 plus (eg `a+`)
|
30
|
+
- 🟠 optional (eg `a?`)
|
31
|
+
- 🟠 lazy modifier (eg `a*?`)
|
32
|
+
- 🔴 n repetitions (eg `a{4}`)
|
33
|
+
- 🔴 n-m repetitions (eg `a{2,4}`)
|
34
|
+
- 🔴 assertions
|
35
|
+
- 🔴 beginning (eg `^a`)
|
36
|
+
- 🔴 end (eg `a$`)
|
37
|
+
- 🔴 word boundary assertion (eg `\b\B`)
|
@@ -0,0 +1,214 @@
|
|
1
|
+
import { Blocktype, Opcodes, Valtype, PageSize, ValtypeSize } from '../compiler/wasmSpec.js';
import { number } from '../compiler/embedding.js';
import { signedLEB128, unsignedLEB128 } from '../compiler/encoding.js';
import decompile from '../compiler/decompile.js';
import parse from './parse.js';
|
5
|
+
|
6
|
+
// local indexes
|
7
|
+
const BasePointer = 0; // base string pointer
|
8
|
+
const IterPointer = 1; // this iteration base pointer
|
9
|
+
const EndPointer = 2; // pointer for the end
|
10
|
+
const Counter = 3; // what char we are running on
|
11
|
+
const Pointer = 4; // next char BYTE pointer
|
12
|
+
const Length = 5;
|
13
|
+
const Tmp = 6;
|
14
|
+
|
15
|
+
let exprLastGet = false;
|
16
|
+
/**
 * Generates the Wasm instruction list for a regex AST node.
 *
 * For an `Expression` node this emits the whole matcher: an outer loop that
 * tries the pattern at each UTF-16 code unit of the input string, and an inner
 * block that every per-node check branches out of on mismatch.
 *
 * @param {object} node - AST node produced by the regex parser.
 * @param {boolean|number} [negated=false] - whether the check is inverted.
 * @param {boolean} [get=true] - whether the node should load the next char itself.
 * @param {string} [func='test'] - operation being compiled: 'test' or 'search'.
 * @returns {Array} list of Wasm instructions (each an array of opcode + immediates).
 */
const generate = (node, negated = false, get = true, func = 'test') => {
  let out = [];
  switch (node.type) {
    case 'Expression':
      exprLastGet = false;
      out = [
        // set length local
        [ Opcodes.local_get, BasePointer ],
        [ Opcodes.i32_load, Math.log2(ValtypeSize.i32) - 1, 0 ],
        [ Opcodes.local_set, Length ],

        // set iter pointer local as base + sizeof i32 initially
        [ Opcodes.local_get, BasePointer ],
        ...number(ValtypeSize.i32, Valtype.i32),
        [ Opcodes.i32_add ],
        [ Opcodes.local_set, IterPointer ],

        [ Opcodes.loop, Blocktype.void ],

          // reset pointer as iter pointer
          [ Opcodes.local_get, IterPointer ],
          [ Opcodes.local_set, Pointer ],

          [ Opcodes.block, Blocktype.void ],

            // generate checks
            ...node.body.flatMap((x, i) => {
              // the last non-group check does not need to advance the pointer
              exprLastGet = x.type !== 'Group' && i === (node.body.length - 1);
              return generate(x, negated);
            }),

            // reached end without branching out, successful match
            ...({
              test: number(1, Valtype.i32),
              search: [
                [ Opcodes.local_get, Counter ]
              ]
            })[func],
            [ Opcodes.return ],

          [ Opcodes.end ],

          // increment iter pointer by sizeof i16
          [ Opcodes.local_get, IterPointer ],
          ...number(ValtypeSize.i16, Valtype.i32),
          [ Opcodes.i32_add ],
          [ Opcodes.local_set, IterPointer ],

          // increment counter by 1, check if eq length, if not loop
          [ Opcodes.local_get, Counter ],
          ...number(1, Valtype.i32),
          [ Opcodes.i32_add ],
          [ Opcodes.local_tee, Counter ],

          [ Opcodes.local_get, Length ],
          [ Opcodes.i32_ne ],

          [ Opcodes.br_if, 0 ],
        [ Opcodes.end ],

        // no match, return 0 (test) or -1 (search)
        ...number(({
          test: 0,
          search: -1
        })[func], Valtype.i32)
      ];

      if (globalThis.regexLog) {
        const underline = x => `\u001b[4m\u001b[1m${x}\u001b[0m`;
        console.log(`\n${underline('ast')}`);
        console.log(node);
        console.log(`\n${underline('wasm bytecode')}\n` + decompile(out) + '\n');
      }

      break;

    case 'Character':
      out = generateChar(node, node.negated ^ negated, get);
      break;

    case 'Set':
      out = generateSet(node, node.negated, get);
      break;

    case 'Group':
      out = generateGroup(node, negated, get);
      break;

    case 'Range':
      // honour a per-node negation flag (set when a negated metachar such as
      // \D is merged into a set), exactly like the Character case does
      out = generateRange(node, node.negated ^ negated, get);
      break;
  }

  return out;
};
|
111
|
+
|
112
|
+
/**
 * Emits instructions that push the UTF-16 code unit at `Pointer` onto the
 * stack and, unless `exprLastGet` is set, advance `Pointer` by one code unit.
 *
 * @returns {Array} list of Wasm instructions.
 */
const getNextChar = () => {
  // get char from pointer
  const loadChar = [
    [ Opcodes.local_get, Pointer ],
    [ Opcodes.i32_load16_u, Math.log2(ValtypeSize.i16) - 1, ...unsignedLEB128(0) ]
  ];

  // the final check of an expression never reads another char, so skip the advance
  if (exprLastGet) return loadChar;

  // pointer += sizeof i16
  const advancePointer = [
    [ Opcodes.local_get, Pointer ],
    ...number(ValtypeSize.i16, Valtype.i32),
    [ Opcodes.i32_add ],
    [ Opcodes.local_set, Pointer ]
  ];

  return [ ...loadChar, ...advancePointer ];
};
|
125
|
+
|
126
|
+
/**
 * Emits the instruction that consumes the mismatch flag on top of the stack
 * and, when it is non-zero, branches out of the innermost enclosing block.
 *
 * An explicit `if` / `return 0` per mismatch was considered and left out;
 * branching out of the block is enough.
 *
 * @returns {Array} list of Wasm instructions.
 */
const checkFailure = () => {
  const branchOut = [ Opcodes.br_if, 0 ];
  return [ branchOut ];
};
|
135
|
+
|
136
|
+
/**
 * Emits the comparison of a char against a single literal character.
 *
 * @param {object} node - `Character` AST node; only the first code unit of `node.char` is compared.
 * @param {boolean|number} negated - truthy to fail on equality instead of inequality.
 * @param {boolean} get - when true, loads the next char first and branches on failure;
 *   when false, expects the char on the stack and leaves the flag there.
 * @returns {Array} list of Wasm instructions.
 */
const generateChar = (node, negated, get) => {
  const instrs = [];

  if (get) instrs.push(...getNextChar());

  // the flag left on the stack is non-zero when the check fails
  instrs.push(...number(node.char.charCodeAt(0), Valtype.i32));
  instrs.push(negated ? [ Opcodes.i32_eq ] : [ Opcodes.i32_ne ]);

  if (get) instrs.push(...checkFailure());

  return instrs;
};
|
144
|
+
|
145
|
+
/**
 * Emits the check for a character set (eg `[ab]`, `[^a-z]`).
 *
 * Every member produces a failure flag; the flags are combined with AND for a
 * normal set (fail only if every member fails) or OR for a negated set (fail if
 * any member matches), then the combined flag is branched on.
 *
 * @param {object} node - `Set` AST node with a `body` of Character/Range nodes.
 * @param {boolean} negated - whether the set is negated.
 * @param {boolean} get - when true, loads the next char before checking.
 * @returns {Array} list of Wasm instructions.
 */
const generateSet = (node, negated, get) => {
  const members = node.body;

  // empty set: `[]` never matches and `[^]` matches any char. Handled
  // explicitly because there are no member flags to combine.
  if (members.length === 0) {
    return [
      ...(get ? getNextChar() : []),
      [ Opcodes.local_set, Tmp ], // discard the char
      ...number(negated ? 0 : 1, Valtype.i32),
      ...checkFailure()
    ];
  }

  // for a single char we do not need a tmp, the char is compared straight off the stack
  const singleChar = members.length === 1 && members[0].type === 'Character';

  const out = [
    ...(get ? getNextChar() : []),
    ...(singleChar ? [] : [ [ Opcodes.local_set, Tmp ] ])
  ];

  for (const x of members) {
    if (!singleChar) out.push([ Opcodes.local_get, Tmp ]);
    out.push(...generate(x, negated, false));
  }

  // combine the member flags; each combining instruction is its own array
  const combine = negated ? Opcodes.i32_or : Opcodes.i32_and;
  for (let i = 1; i < members.length; i++) out.push([ combine ]);

  out.push(...checkFailure());
  return out;
};
|
169
|
+
|
170
|
+
/**
 * Emits the check for a character range (eg `a-z`).
 *
 * The flag left on the stack is non-zero on failure: outside the range for a
 * normal range, inside it for a negated one.
 *
 * @param {object} node - `Range` AST node with `from` and `to` characters.
 * @param {boolean|number} negated - truthy to invert the check.
 * @param {boolean} get - when true, loads the next char (also stored in `Tmp`)
 *   and branches on failure; when false, the char is expected on the stack and in `Tmp`.
 * @returns {Array} list of Wasm instructions.
 */
const generateRange = (node, negated, get) => {
  const instrs = [];

  if (get) {
    instrs.push(...getNextChar());
    instrs.push([ Opcodes.local_tee, Tmp ]);
  }

  // lower bound: char < from (or char >= from when negated)
  instrs.push(...number(node.from.charCodeAt(0), Valtype.i32));
  instrs.push(negated ? [ Opcodes.i32_ge_s ] : [ Opcodes.i32_lt_s ]);

  // upper bound: char > to (or char <= to when negated)
  instrs.push([ Opcodes.local_get, Tmp ]);
  instrs.push(...number(node.to.charCodeAt(0), Valtype.i32));
  instrs.push(negated ? [ Opcodes.i32_le_s ] : [ Opcodes.i32_gt_s ]);

  // merge both bound checks into one flag
  instrs.push(negated ? [ Opcodes.i32_and ] : [ Opcodes.i32_or ]);

  if (get) instrs.push(...checkFailure());

  return instrs;
};
|
188
|
+
|
189
|
+
/**
 * Placeholder for group code generation.
 *
 * TODO: not implemented yet - emits nothing and returns `undefined`.
 *
 * @param {object} node - `Group` AST node.
 * @param {boolean|number} negated - whether the check is inverted.
 * @param {boolean} get - whether the node should load the next char itself.
 * @returns {undefined}
 */
const generateGroup = (node, negated, get) => {};
|
192
|
+
|
193
|
+
/**
 * Compiles a regex pattern into a Wasm function object implementing `test`
 * (the generated code returns 1 on match, 0 otherwise).
 *
 * @param {string} regex - regex pattern source.
 * @param {number} [index=0] - function index to assign.
 * @param {string} [name] - function name, defaults to `regex_test_<pattern>`.
 * @returns {object} function object for the compiler.
 */
export const test = (regex, index = 0, name = 'regex_test_' + regex) => {
  const wasm = generate(parse(regex), false, true, 'test');
  return outputFunc(wasm, name, index);
};

/**
 * Compiles a regex pattern into a Wasm function object implementing `search`
 * (the generated code returns the match counter, or -1 when nothing matches).
 *
 * @param {string} regex - regex pattern source.
 * @param {number} [index=0] - function index to assign.
 * @param {string} [name] - function name, defaults to `regex_search_<pattern>`.
 * @returns {object} function object for the compiler.
 */
export const search = (regex, index = 0, name = 'regex_search_' + regex) => {
  const wasm = generate(parse(regex), false, true, 'search');
  return outputFunc(wasm, name, index);
};
|
195
|
+
|
196
|
+
/**
 * Wraps generated instructions in the function-object shape used by the compiler.
 *
 * @param {Array} wasm - instruction list for the function body.
 * @param {string} name - function name.
 * @param {number} index - function index.
 * @returns {object} exported function taking one i32 and returning one i32,
 *   with seven i32 locals declared.
 */
const outputFunc = (wasm, name, index) => {
  // local names in index order (idx 0..6)
  const localNames = [ 'basePointer', 'iterPointer', 'endPointer', 'counter', 'pointer', 'length', 'tmp' ];

  const locals = {};
  localNames.forEach((localName, idx) => {
    locals[localName] = { idx, type: Valtype.i32 };
  });

  return {
    name,
    index,
    wasm,

    export: true,
    params: [ Valtype.i32 ],
    returns: [ Valtype.i32 ],
    returnType: 0xffffffffffff1, // boolean - todo: do not hardcode this
    locals
  };
};
|
package/rhemyn/parse.js
ADDED
@@ -0,0 +1,321 @@
|
|
1
|
+
const State = {
|
2
|
+
none: 0,
|
3
|
+
insideSet: 1
|
4
|
+
};
|
5
|
+
|
6
|
+
const Quantifiers = {
|
7
|
+
'*': [ 0 ], // 0 -
|
8
|
+
'+': [ 1 ], // 1 -
|
9
|
+
'?': [ 0, 1 ], // 0 - 1
|
10
|
+
};
|
11
|
+
const QuantifierKeys = Object.keys(Quantifiers);
|
12
|
+
|
13
|
+
/**
 * Reads the value of a `-name=value` command-line argument.
 *
 * @param {string} name - argument name, without the leading dash.
 * @param {*} def - value returned when the argument is not present.
 * @returns {*} everything after the first `=` of the first matching argument, or `def`.
 */
const getArg = (name, def) => {
  const argv = typeof process !== 'undefined' ? process.argv : Deno.args;
  const prefix = `-${name}=`;

  const arg = argv.find(x => x.startsWith(prefix));
  if (arg === undefined) return def;

  // return the value (not the flag name); slicing keeps any further '=' intact
  return arg.slice(prefix.length);
};
|
19
|
+
|
20
|
+
// full is spec-compliant but slower. not needed most of the time. (evil)
|
21
|
+
/**
 * Characters that `.` must NOT match, chosen by the `-regex-dot=` argument
 * (defaults to 'fast').
 *
 * @returns {string[]|undefined} the excluded characters; undefined for an unknown mode.
 */
const DotChars = () => {
  const modes = {
    full: [ '\n', '\r', '\u2028', '\u2029' ],
    simple: [ '\n', '\r' ],
    fast: [ '\n' ]
  };

  const mode = getArg('regex-dot', 'fast');
  return modes[mode];
};
|
26
|
+
|
27
|
+
/**
 * Characters and ranges matched by `\w`, chosen by the `-regex-word=` argument
 * (defaults to 'full').
 *
 * @returns {Array|undefined} chars and [from, to] ranges; undefined for an unknown mode.
 */
const WordChars = () => {
  const modes = {
    full: [ [ 'a', 'z' ], [ 'A', 'Z' ], [ '0', '9' ], '_' ],
    // skip individual _ with _-z BUT it also matches '`'
    fast: [ [ '_', 'z' ], [ 'A', 'Z' ], [ '0', '9' ] ]
  };

  const mode = getArg('regex-word', 'full');
  return modes[mode];
};
|
31
|
+
|
32
|
+
/**
 * Characters matched by `\s`, chosen by the `-regex-ws=` argument
 * (defaults to 'simple').
 *
 * @returns {string[]|undefined} the whitespace characters; undefined for an unknown mode.
 */
const WhitespaceChars = () => {
  const modes = {
    full: [ ' ', '\t', '\n', '\r', '\u2028', '\u2029' ],
    simple: [ ' ', '\t', '\n', '\r' ]
  };

  const mode = getArg('regex-ws', 'simple');
  return modes[mode];
};
|
36
|
+
|
37
|
+
/**
 * Builds the metacharacter tables used by the parser.
 *
 * Each entry is a `[ matches, negated ]` pair, where `matches` is a list of
 * chars and [from, to] ranges and `negated` says whether the set is inverted.
 *
 * @returns {{ unescaped: object, escaped: object }} tables keyed by the metachar.
 */
const _Metachars = () => {
  const unescaped = {
    '.': [ DotChars(), true ] // dot
  };

  const escaped = {
    d: [ [ [ '0', '9' ] ], false ], // digit
    D: [ [ [ '0', '9' ] ], true ], // not digit
    w: [ WordChars(), false ], // word
    W: [ WordChars(), true ], // not word
    s: [ WhitespaceChars(), false ], // whitespace
    S: [ WhitespaceChars(), true ] // not whitespace
  };

  return { unescaped, escaped };
};
|
50
|
+
|
51
|
+
const EscapeSequences = {
|
52
|
+
f: '\f',
|
53
|
+
n: '\n',
|
54
|
+
r: '\r',
|
55
|
+
t: '\t',
|
56
|
+
v: '\v',
|
57
|
+
'0': '\0'
|
58
|
+
};
|
59
|
+
|
60
|
+
const HexDigit = /[0-9a-fA-F]/;
|
61
|
+
|
62
|
+
/**
 * Parses a regex pattern string into an AST.
 *
 * The root is `{ type: 'Expression', body }`. Body nodes are `Character`,
 * `Set` (with `negated` and a body of Character/Range nodes), `Range` and
 * `Group`. Quantifiers are recorded on the preceding node as `quantifier`
 * (`[min]` or `[min, max]`) plus an optional `lazy: true`.
 *
 * @param {string} str - regex pattern source (without delimiters or flags).
 * @returns {object} the `Expression` root node.
 * @throws {SyntaxError} on an unmatched parenthesis or square bracket, an
 *   out-of-order range, or a quantifier with nothing to repeat.
 */
export default str => {
  const Metachars = _Metachars();

  const out = {
    type: 'Expression',
    body: []
  };
  let node = out;
  const parents = [];

  let state = State.none;
  let setIndex = 0;
  let escape = false;

  // node constructors do not depend on the loop position
  const charNode = char => ({
    type: 'Character',
    char
  });

  const rangeNode = (from, to) => ({
    type: 'Range',
    from,
    to
  });

  for (let i = 0; i < str.length; i++) {
    const c = str[i];

    const addChar = (char = c) => {
      node.body.push(charNode(char));
    };

    const addSet = (matches, negated = false) => {
      const body = matches.map(x => Array.isArray(x) ? rangeNode(x[0], x[1]) : charNode(x));
      if (state === State.insideSet) {
        // if negated, mark each node as negated for merge
        if (negated) {
          for (const x of body) x.negated = true;
        }

        // already in set, merge bodies
        node.body.push(...body);
        return;
      }

      node.body.push({
        type: 'Set',
        body,
        negated
      });
    };

    const addMetachar = meta => {
      const [ matches, negated = false ] = meta;
      return addSet(matches, negated);
    };

    // get next char and consume it
    const seek = (allowEscaped = true) => {
      const cNext = str[++i];

      if (cNext === '\\') return !allowEscaped ? undefined : [ str[++i], true ];
      return !allowEscaped ? cNext : [ cNext, false ];
    };

    // get next char without consuming
    const peek = (allowEscaped = true, offset = 0) => {
      const cNext = str[i + 1 + offset];

      if (cNext === '\\') return !allowEscaped ? undefined : [ str[i + 2 + offset], true ];
      return !allowEscaped ? cNext : [ cNext, false ];
    };

    if (escape) {
      escape = false;
      if (EscapeSequences[c]) {
        addChar(EscapeSequences[c]);
        continue;
      }

      if (Metachars.escaped[c]) {
        addMetachar(Metachars.escaped[c]);
        continue;
      }

      if (c === 'c') {
        // \c (not [A-Za-z] ...) = literal \c... (WHY)
        const next = peek(false);
        if (next == null || /[^a-zA-Z]/.test(next)) {
          addChar('\\');
          addChar('c');
          continue;
        }

        // \c[A-Za-z]
        const code = seek(false).charCodeAt(0);
        addChar(String.fromCharCode(code % 32));
        continue;
      }

      if (c === 'x') {
        // \x = x
        // \xH = xH
        // \x[0-9a-zA-Z][0-9a-zA-Z] = \xAB
        const next1 = peek(false);
        const next2 = peek(false, 1);

        // missing a char or invalid hex digit
        if (next1 == null || next2 == null || !HexDigit.test(next1) || !HexDigit.test(next2)) {
          addChar('x');
          continue;
        }

        const code = parseInt(seek(false) + seek(false), 16);
        addChar(String.fromCodePoint(code));
        continue;
      }

      if (c === 'u') {
        // '\u' = u
        // '\uHHH' = uHHH
        // '\uABCD' = \uABCD
        const next1 = peek(false);
        const next2 = peek(false, 1);
        const next3 = peek(false, 2);
        const next4 = peek(false, 3);

        // missing a char or invalid hex digit
        if (next1 == null || next2 == null || next3 == null || next4 == null || !HexDigit.test(next1) || !HexDigit.test(next2) || !HexDigit.test(next3) || !HexDigit.test(next4)) {
          addChar('u');
          continue;
        }

        const code = parseInt(seek(false) + seek(false) + seek(false) + seek(false), 16);
        addChar(String.fromCodePoint(code));
        continue;
      }

      addChar();
      continue;
    }

    if (c === '\\') {
      escape = true;

      // an escape occupies a position in the set, so a '^' after it
      // (eg `[\n^]`) is a literal and not the negation marker
      if (state === State.insideSet) setIndex++;
      continue;
    }

    switch (state) {
      case State.none:
        if (c === '[') {
          parents.push(node);
          node = {
            type: 'Set',
            body: [],
            negated: false
          };

          parents.at(-1).body.push(node);

          state = State.insideSet;
          setIndex = 0;
          continue;
        }

        if (c === '(') {
          parents.push(node);
          node = {
            type: 'Group',
            body: []
          };

          parents.at(-1).body.push(node);
          continue;
        }

        if (c === ')') {
          if (node.type !== 'Group') throw new SyntaxError('Unmatched closing parenthesis');

          node = parents.pop();
          continue;
        }

        if (QuantifierKeys.includes(c)) {
          const target = node.body.at(-1);
          if (target == null) throw new SyntaxError('Nothing to repeat');

          target.quantifier = Quantifiers[c];

          // lazy modifier: consume the '?' so it is not parsed as a second
          // quantifier that would overwrite the one just set
          if (peek(false) === '?') {
            target.lazy = true;
            i++;
          }

          continue;
        }

        if (Metachars.unescaped[c]) {
          addMetachar(Metachars.unescaped[c]);
          continue;
        }

        addChar();
        break;

      case State.insideSet:
        setIndex++;
        if (setIndex === 1) {
          // first char in set
          if (c === '^') {
            node.negated = true;
            continue;
          }
        }

        if (c === ']') {
          state = State.none;
          node = parents.pop();

          continue;
        }

        // range
        if (c === '-') {
          // start of set (or not char), just literal -
          if (node.body.at(-1)?.char == null) {
            addChar(); // add -
            continue;
          }

          const from = node.body.pop().char;
          const [ rawTo, escaped ] = seek();

          // end of set, just literal -
          if (rawTo == null || (!escaped && rawTo === ']')) {
            addChar(from); // add from char back
            i--; // rollback seek

            addChar(); // add -
            continue;
          }

          // next char was escaped and a metachar, just literal -
          if (escaped && Metachars.escaped[rawTo] != null) {
            addChar(from); // add from char back
            i -= 2; // rollback seek

            addChar(); // add -
            continue;
          }

          // an escaped simple sequence (eg \n) ends the range at the char it
          // denotes, not at the letter after the backslash.
          // NOTE: \c, \x and \u escapes as a range end are still not decoded here.
          const to = escaped && EscapeSequences[rawTo] != null ? EscapeSequences[rawTo] : rawTo;

          if (to < from) throw new SyntaxError('Range out of order');

          node.body.push(rangeNode(from, to));
          continue;
        }

        addChar();
        break;
    }
  }

  // still in a set by the end (checked first: an open set is also a
  // non-Expression node and would otherwise be reported as a parenthesis error)
  if (state === State.insideSet) throw new SyntaxError('Unmatched opening square bracket');

  // still in a group by the end
  if (node.type !== 'Expression') throw new SyntaxError('Unmatched opening parenthesis');

  return out;
};
|