@bufbuild/re2 0.0.1-alpha.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +30 -0
- package/dist/cjs/CharClass.d.ts +30 -0
- package/dist/cjs/CharClass.js +284 -0
- package/dist/cjs/CharGroup.d.ts +8 -0
- package/dist/cjs/CharGroup.js +83 -0
- package/dist/cjs/Codepoint.d.ts +3 -0
- package/dist/cjs/Codepoint.js +62 -0
- package/dist/cjs/Compiler.d.ts +40 -0
- package/dist/cjs/Compiler.js +262 -0
- package/dist/cjs/DFA.d.ts +36 -0
- package/dist/cjs/DFA.js +350 -0
- package/dist/cjs/Inst.d.ts +26 -0
- package/dist/cjs/Inst.js +86 -0
- package/dist/cjs/MachineInput.d.ts +17 -0
- package/dist/cjs/MachineInput.js +72 -0
- package/dist/cjs/Parser.d.ts +111 -0
- package/dist/cjs/Parser.js +1538 -0
- package/dist/cjs/Prefilter.d.ts +19 -0
- package/dist/cjs/Prefilter.js +163 -0
- package/dist/cjs/Prog.d.ts +39 -0
- package/dist/cjs/Prog.js +154 -0
- package/dist/cjs/RE2.d.ts +27 -0
- package/dist/cjs/RE2.js +221 -0
- package/dist/cjs/RE2Flags.d.ts +16 -0
- package/dist/cjs/RE2Flags.js +58 -0
- package/dist/cjs/Regexp.d.ts +43 -0
- package/dist/cjs/Regexp.js +98 -0
- package/dist/cjs/Simplify.d.ts +3 -0
- package/dist/cjs/Simplify.js +230 -0
- package/dist/cjs/Unicode.d.ts +17 -0
- package/dist/cjs/Unicode.js +165 -0
- package/dist/cjs/UnicodeRangeTable.d.ts +12 -0
- package/dist/cjs/UnicodeRangeTable.js +31 -0
- package/dist/cjs/UnicodeTables.d.ts +29 -0
- package/dist/cjs/UnicodeTables.js +571 -0
- package/dist/cjs/Utils.d.ts +22 -0
- package/dist/cjs/Utils.js +119 -0
- package/dist/cjs/__fixtures__/find.d.ts +9 -0
- package/dist/cjs/__fixtures__/find.js +115 -0
- package/dist/cjs/chars.d.ts +2 -0
- package/dist/cjs/chars.js +19 -0
- package/dist/cjs/exceptions.d.ts +55 -0
- package/dist/cjs/exceptions.js +94 -0
- package/dist/cjs/index.d.ts +102 -0
- package/dist/cjs/index.js +173 -0
- package/dist/cjs/package.json +1 -0
- package/dist/cjs/testParser.d.ts +3 -0
- package/dist/cjs/testParser.js +143 -0
- package/dist/esm/CharClass.d.ts +30 -0
- package/dist/esm/CharClass.js +281 -0
- package/dist/esm/CharGroup.d.ts +8 -0
- package/dist/esm/CharGroup.js +78 -0
- package/dist/esm/Codepoint.d.ts +3 -0
- package/dist/esm/Codepoint.js +59 -0
- package/dist/esm/Compiler.d.ts +40 -0
- package/dist/esm/Compiler.js +259 -0
- package/dist/esm/DFA.d.ts +36 -0
- package/dist/esm/DFA.js +347 -0
- package/dist/esm/Inst.d.ts +26 -0
- package/dist/esm/Inst.js +83 -0
- package/dist/esm/MachineInput.d.ts +17 -0
- package/dist/esm/MachineInput.js +68 -0
- package/dist/esm/Parser.d.ts +111 -0
- package/dist/esm/Parser.js +1535 -0
- package/dist/esm/Prefilter.d.ts +19 -0
- package/dist/esm/Prefilter.js +159 -0
- package/dist/esm/Prog.d.ts +39 -0
- package/dist/esm/Prog.js +150 -0
- package/dist/esm/RE2.d.ts +27 -0
- package/dist/esm/RE2.js +218 -0
- package/dist/esm/RE2Flags.d.ts +16 -0
- package/dist/esm/RE2Flags.js +41 -0
- package/dist/esm/Regexp.d.ts +43 -0
- package/dist/esm/Regexp.js +94 -0
- package/dist/esm/Simplify.d.ts +3 -0
- package/dist/esm/Simplify.js +228 -0
- package/dist/esm/Unicode.d.ts +17 -0
- package/dist/esm/Unicode.js +150 -0
- package/dist/esm/UnicodeRangeTable.d.ts +12 -0
- package/dist/esm/UnicodeRangeTable.js +28 -0
- package/dist/esm/UnicodeTables.d.ts +29 -0
- package/dist/esm/UnicodeTables.js +568 -0
- package/dist/esm/Utils.d.ts +22 -0
- package/dist/esm/Utils.js +103 -0
- package/dist/esm/__fixtures__/find.d.ts +9 -0
- package/dist/esm/__fixtures__/find.js +112 -0
- package/dist/esm/chars.d.ts +2 -0
- package/dist/esm/chars.js +14 -0
- package/dist/esm/exceptions.d.ts +55 -0
- package/dist/esm/exceptions.js +86 -0
- package/dist/esm/index.d.ts +102 -0
- package/dist/esm/index.js +163 -0
- package/dist/esm/testParser.d.ts +3 -0
- package/dist/esm/testParser.js +138 -0
- package/package.json +49 -0
|
@@ -0,0 +1,262 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.Compiler = void 0;
|
|
4
|
+
const RE2Flags_js_1 = require("./RE2Flags.js");
|
|
5
|
+
const Unicode_js_1 = require("./Unicode.js");
|
|
6
|
+
const Utils_js_1 = require("./Utils.js");
|
|
7
|
+
const Regexp_js_1 = require("./Regexp.js");
|
|
8
|
+
const Inst_js_1 = require("./Inst.js");
|
|
9
|
+
const Prog_js_1 = require("./Prog.js");
|
|
10
|
+
const exceptions_js_1 = require("./exceptions.js");
|
|
11
|
+
/**
 * One partially assembled piece of a compiled program: where it starts, and
 * which instruction fields still have to be pointed at whatever comes next.
 *
 * @see http://swtch.com/~rsc/regexp/regexp1.html
 * @class
 */
class Frag {
    /** Address (pc) of the instruction at which this fragment is entered. */
    i;
    /** Patch list of the fragment's dangling exits; see the explanation in Prog.js. */
    out;
    /** True when the fragment is able to match the empty string. */
    nullable;
    /**
     * @param i entry instruction address; defaults to 0.
     * @param out patch list of unresolved exits; defaults to a fresh, empty PatchList.
     * @param nullable whether the fragment can match the empty string; defaults to false.
     */
    constructor(i = 0, out = new Prog_js_1.PatchList(), nullable = false) {
        this.i = i;
        this.out = out;
        this.nullable = nullable;
    }
}
|
|
27
|
+
/**
 * Turns a parsed {@code Regexp} syntax tree into a {@code Prog}, the
 * instruction list that the matching engines execute.
 *
 * Callers should only use the static {@link Compiler.compileRegexp}; every
 * instance method is a building block that appends instructions to
 * {@code this.prog} and returns a {@code Frag} describing them.
 *
 * Patch-list entries are written as {@code pc << 1} or {@code (pc << 1) | 1}.
 * Judging by how the methods below use them, the low bit selects which field
 * of the instruction is still open ({@code out} for 0, {@code arg} for 1);
 * the authoritative description lives in Prog.js.
 */
class Compiler {
    /** The program under construction. */
    prog;
    /** Rune ranges (lo/hi pairs) covering every code point except '\n'. */
    static ANY_RUNE_NOT_NL() {
        const newline = 0x0a;
        return [0, newline - 1, newline + 1, Unicode_js_1.MAX_RUNE];
    }
    /** A single rune range (lo/hi pair) covering every code point. */
    static ANY_RUNE() {
        return [0, Unicode_js_1.MAX_RUNE];
    }
    /**
     * Compiles a syntax tree into a complete program.
     *
     * @param re root of the syntax tree.
     * @returns the program, with a trailing MATCH instruction wired to the
     *     tree's exits and {@code start} set to the tree's entry point.
     */
    static compileRegexp(re) {
        const compiler = new Compiler();
        const body = compiler.compile(re);
        const matchPc = compiler.newInst(Inst_js_1.Inst.MATCH).i;
        compiler.prog.patch(body.out, matchPc);
        compiler.prog.start = body.i;
        return compiler.prog;
    }
    /**
     * Creates a fresh program and immediately emits a FAIL instruction.
     * cat() and alt() treat a fragment whose entry pc is 0 as "failure", which
     * relies on that FAIL being the first instruction of the new Prog.
     */
    constructor() {
        this.prog = new Prog_js_1.Prog();
        this.newInst(Inst_js_1.Inst.FAIL);
    }
    /**
     * Appends one instruction with opcode `op`.
     *
     * @returns a fragment entered at the new instruction, with an empty patch
     *     list and `nullable` set to true (callers adjust both as needed).
     */
    newInst(op) {
        const { prog } = this;
        prog.addInst(op);
        const pc = prog.numInst() - 1;
        return new Frag(pc, new Prog_js_1.PatchList(), true);
    }
    /** Emits a NOP instruction whose single exit is left open. Sometimes unavoidable. */
    nop() {
        const frag = this.newInst(Inst_js_1.Inst.NOP);
        const slot = frag.i << 1;
        frag.out = new Prog_js_1.PatchList(slot, slot);
        return frag;
    }
    /** Returns the failure fragment: entry pc 0, empty patch list, not nullable. */
    fail() {
        return new Frag();
    }
    /**
     * Emits one CAPTURE instruction that records into capture slot `arg`, and
     * grows {@code prog.numCap} so that it is at least {@code arg + 1}.
     * compile() calls this twice per group: once for the opening slot and once
     * for the closing slot.
     */
    cap(arg) {
        const frag = this.newInst(Inst_js_1.Inst.CAPTURE);
        const slot = frag.i << 1;
        frag.out = new Prog_js_1.PatchList(slot, slot);
        this.prog.getInst(frag.i).arg = arg;
        const needed = arg + 1;
        if (needed > this.prog.numCap) {
            this.prog.numCap = needed;
        }
        return frag;
    }
    /** Given fragments for a and b, returns the fragment for the concatenation ab. */
    cat(f1, f2) {
        // Concatenating with a failure is itself a failure.
        const eitherFails = f1.i === 0 || f2.i === 0;
        if (eitherFails) {
            return this.fail();
        }
        // eslint-disable-next-line no-warning-comments
        // TODO(rsc): elide nop
        this.prog.patch(f1.out, f2.i);
        const bothNullable = f1.nullable && f2.nullable;
        return new Frag(f1.i, f2.out, bothNullable);
    }
    /** Given fragments for a and b, returns the fragment for the alternation a|b. */
    alt(f1, f2) {
        // Alternating with a failure leaves just the other branch.
        if (f1.i === 0) {
            return f2;
        }
        if (f2.i === 0) {
            return f1;
        }
        const frag = this.newInst(Inst_js_1.Inst.ALT);
        const branch = this.prog.getInst(frag.i);
        branch.out = f1.i;
        branch.arg = f2.i;
        frag.out = this.prog.append(f1.out, f2.out);
        frag.nullable = f1.nullable || f2.nullable;
        return frag;
    }
    /**
     * Builds the ALT instruction that forms the back-edge of a plus or star.
     *
     * For plus the result is used with f1.i as the entry. For star it is used
     * directly only when f1 cannot match the empty string; otherwise f1* has
     * to be built as (f1+)? to keep the match priority order correct.
     *
     * @param f1 fragment for the loop body; its exits are patched to the new ALT.
     * @param nongreedy when true the body is the ALT's second choice.
     */
    loop(f1, nongreedy) {
        const frag = this.newInst(Inst_js_1.Inst.ALT);
        const branch = this.prog.getInst(frag.i);
        const slot = frag.i << 1;
        // The field that does not point at the body is the loop's open exit.
        const openExit = nongreedy ? slot : slot | 1;
        if (nongreedy) {
            branch.arg = f1.i;
        }
        else {
            branch.out = f1.i;
        }
        frag.out = new Prog_js_1.PatchList(openExit, openExit);
        this.prog.patch(f1.out, frag.i);
        return frag;
    }
    /** Given a fragment for a, returns the fragment for a? (or a?? when nongreedy). */
    quest(f1, nongreedy) {
        const frag = this.newInst(Inst_js_1.Inst.ALT);
        const branch = this.prog.getInst(frag.i);
        const slot = frag.i << 1;
        const openExit = nongreedy ? slot : slot | 1;
        if (nongreedy) {
            branch.arg = f1.i;
        }
        else {
            branch.out = f1.i;
        }
        // The exits are the skipped branch of the ALT plus everything f1 left open.
        const skipped = new Prog_js_1.PatchList(openExit, openExit);
        frag.out = this.prog.append(skipped, f1.out);
        return frag;
    }
    /** Given a fragment for a, returns the fragment for a* (or a*? when nongreedy). */
    star(f1, nongreedy) {
        if (!f1.nullable) {
            return this.loop(f1, nongreedy);
        }
        // A nullable body must be compiled as (a+)? — see loop().
        const oneOrMore = this.plus(f1, nongreedy);
        return this.quest(oneOrMore, nongreedy);
    }
    /** Given a fragment for a, returns the fragment for a+ (or a+? when nongreedy). */
    plus(f1, nongreedy) {
        const entry = f1.i;
        const exits = this.loop(f1, nongreedy).out;
        return new Frag(entry, exits, f1.nullable);
    }
    /**
     * Emits an EMPTY_WIDTH instruction.
     *
     * @param op bitmask of EMPTY_* flags that must hold at the current position.
     */
    empty(op) {
        const frag = this.newInst(Inst_js_1.Inst.EMPTY_WIDTH);
        this.prog.getInst(frag.i).arg = op;
        const slot = frag.i << 1;
        frag.out = new Prog_js_1.PatchList(slot, slot);
        return frag;
    }
    /**
     * Emits a rune-matching instruction and picks the most specific opcode
     * (RUNE1, RUNE_ANY, RUNE_ANY_NOT_NL, or plain RUNE) for the given ranges.
     *
     * @param runes either a single rune or a flat list of lo/hi range pairs;
     *     the array is stored on the instruction by reference.
     * @param flags parser flags; only FOLD_CASE is retained.
     */
    rune(runes, flags) {
        const frag = this.newInst(Inst_js_1.Inst.RUNE);
        frag.nullable = false;
        const inst = this.prog.getInst(frag.i);
        inst.runes = runes;
        // Case folding is kept only for a lone rune that actually has a
        // different case-folded form; in every other situation it is dropped.
        const foldable = runes.length === 1 && (0, Unicode_js_1.simpleFold)(runes[0]) !== runes[0];
        const kept = foldable ? flags & RE2Flags_js_1.FOLD_CASE : 0;
        inst.arg = kept;
        const slot = frag.i << 1;
        frag.out = new Prog_js_1.PatchList(slot, slot);
        const caseSensitive = (kept & RE2Flags_js_1.FOLD_CASE) === 0;
        const isSingleRune = (caseSensitive && runes.length === 1) ||
            (runes.length === 2 && runes[0] === runes[1]);
        if (isSingleRune) {
            inst.op = Inst_js_1.Inst.RUNE1;
        }
        else if (runes.length === 2 && runes[0] === 0 && runes[1] === Unicode_js_1.MAX_RUNE) {
            inst.op = Inst_js_1.Inst.RUNE_ANY;
        }
        else if (runes.length === 4 &&
            runes[0] === 0 &&
            runes[1] === 0x0a - 1 &&
            runes[2] === 0x0a + 1 &&
            runes[3] === Unicode_js_1.MAX_RUNE) {
            inst.op = Inst_js_1.Inst.RUNE_ANY_NOT_NL;
        }
        return frag;
    }
    /**
     * Recursively compiles one syntax-tree node into a fragment.
     *
     * @param re the node to compile.
     * @returns the fragment for `re`.
     * @throws RE2JSCompileException when `re.op` is not a handled operator.
     */
    compile(re) {
        const Op = Regexp_js_1.Regexp.Op;
        switch (re.op) {
            case Op.NO_MATCH:
                return this.fail();
            case Op.EMPTY_MATCH:
                return this.nop();
            case Op.LITERAL: {
                if (re.runes.length === 0) {
                    return this.nop();
                }
                // One rune instruction per literal rune, chained left to right.
                let chain = null;
                for (const r of re.runes) {
                    const next = this.rune([r], re.flags);
                    chain = chain === null ? next : this.cat(chain, next);
                }
                return chain;
            }
            case Op.CHAR_CLASS:
                return this.rune(re.runes, re.flags);
            case Op.ANY_CHAR_NOT_NL:
                return this.rune(Compiler.ANY_RUNE_NOT_NL(), 0);
            case Op.ANY_CHAR:
                return this.rune(Compiler.ANY_RUNE(), 0);
            case Op.BEGIN_LINE:
                return this.empty(Utils_js_1.EMPTY_BEGIN_LINE);
            case Op.END_LINE:
                return this.empty(Utils_js_1.EMPTY_END_LINE);
            case Op.BEGIN_TEXT:
                return this.empty(Utils_js_1.EMPTY_BEGIN_TEXT);
            case Op.END_TEXT:
                return this.empty(Utils_js_1.EMPTY_END_TEXT);
            case Op.WORD_BOUNDARY:
                return this.empty(Utils_js_1.EMPTY_WORD_BOUNDARY);
            case Op.NO_WORD_BOUNDARY:
                return this.empty(Utils_js_1.EMPTY_NO_WORD_BOUNDARY);
            case Op.CAPTURE: {
                // Slot 2*cap marks the group's start, slot 2*cap+1 its end.
                const open = this.cap(re.cap << 1);
                const inner = this.compile(re.subs[0]);
                const close = this.cap((re.cap << 1) | 1);
                return this.cat(this.cat(open, inner), close);
            }
            case Op.STAR: {
                const inner = this.compile(re.subs[0]);
                const nongreedy = (re.flags & RE2Flags_js_1.NON_GREEDY) !== 0;
                return this.star(inner, nongreedy);
            }
            case Op.PLUS: {
                const inner = this.compile(re.subs[0]);
                const nongreedy = (re.flags & RE2Flags_js_1.NON_GREEDY) !== 0;
                return this.plus(inner, nongreedy);
            }
            case Op.QUEST: {
                const inner = this.compile(re.subs[0]);
                const nongreedy = (re.flags & RE2Flags_js_1.NON_GREEDY) !== 0;
                return this.quest(inner, nongreedy);
            }
            case Op.CONCAT: {
                if (re.subs.length === 0) {
                    return this.nop();
                }
                let chain = null;
                for (const sub of re.subs) {
                    const next = this.compile(sub);
                    chain = chain === null ? next : this.cat(chain, next);
                }
                if (chain === null) {
                    throw new Error("invalid frag");
                }
                return chain;
            }
            case Op.ALTERNATE: {
                if (re.subs.length === 0) {
                    return this.nop();
                }
                let choice = null;
                for (const sub of re.subs) {
                    const next = this.compile(sub);
                    choice = choice === null ? next : this.alt(choice, next);
                }
                if (choice === null) {
                    throw new Error("invalid frag");
                }
                return choice;
            }
            default:
                throw new exceptions_js_1.RE2JSCompileException("regexp: unhandled case in compile");
        }
    }
}
|
|
262
|
+
exports.Compiler = Compiler;
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import type { Prog } from "./Prog.js";
|
|
2
|
+
import type { MachineUTF16Input } from "./MachineInput.js";
|
|
3
|
+
/**
 * One state of the lazily built DFA: a set of NFA program counters together
 * with the transitions discovered from it so far.
 */
declare class DFAState {
    /** Sorted instruction addresses (pcs) that make up this state. */
    nfaStates: Int32Array;
    /** True when the pc set contains a MATCH instruction. */
    isMatch: boolean;
    /** True when the pc set contains an EMPTY_WIDTH instruction. */
    hasEmptyWidth: boolean;
    /** Sorted `arg` values of the MATCH instructions in the pc set. */
    matchIDs: number[];
    /** Transition table indexed by ASCII code; `null` means "not computed yet". */
    nextAscii: Array<DFAState | null>;
    /** Cached transitions for every key that does not go through `nextAscii`. */
    nextMap: Map<number, DFAState | null>;
    constructor(nfaStates: Int32Array, isMatch: boolean, hasEmptyWidth: boolean, matchIDs?: number[]);
}
|
|
12
|
+
/** Result of following epsilon transitions from a set of pcs. */
interface DFAClosureResult {
    pcs: Int32Array;
    isMatch: boolean;
    hasEmptyWidth: boolean;
    matchIDs: number[];
}
/** Result of resolving a state's EMPTY_WIDTH instructions against a context. */
interface DFAResolvedEmptyWidth {
    resolvedPCs: Set<number>;
    isMatch: boolean;
}
/**
 * Lazily constructed DFA over a compiled program. States are created on
 * demand and cached; when the cache budget is exhausted the DFA gives up and
 * reports `null` so that the caller can use another engine.
 */
declare class DFA {
    /** The compiled program being simulated. */
    prog: Prog;
    /** Known states, bucketed by the hash of their pc set. */
    stateCache: Map<number, DFAState[]>;
    /** Number of states currently held in `stateCache`. */
    stateCount: number;
    /** Cached state for `prog.start`; `null` until first use or after a cache flush. */
    startState: DFAState | null;
    /** Maximum number of cached states before the cache is flushed. */
    stateLimit: number;
    /** How many times the cache has been flushed so far. */
    cacheClears: number;
    /** Set once `cacheClears` reaches `MAX_CACHE_CLEARS`; no new states are built afterwards. */
    failed: boolean;
    /** Number of cache flushes after which the DFA marks itself as failed. */
    static MAX_CACHE_CLEARS: number;
    constructor(prog: Prog);
    /** Follows epsilon transitions from `pcs`, stopping at EMPTY_WIDTH instructions. */
    computeClosure(pcs: number[]): DFAClosureResult;
    /** Follows the EMPTY_WIDTH instructions of `nfaStates` that `context` satisfies. */
    resolveEmptyWidth(nfaStates: Int32Array, context: number): DFAResolvedEmptyWidth;
    /** Returns the cached or newly created state for the closure of `pcs`, or `null` when none can be created. */
    getState(pcs: number[]): DFAState | null;
    /** Returns the state reached from `state` on `charCode`, or `null` when it cannot be built. */
    step(state: DFAState, charCode: number, anchor: number, context: number): DFAState | null;
    /** Runs the DFA from `pos`; `null` means the DFA gave up and no verdict is available. */
    match(input: MachineUTF16Input, pos: number, anchor: number): boolean | null;
}
|
|
36
|
+
export { DFA };
|
package/dist/cjs/DFA.js
ADDED
|
@@ -0,0 +1,350 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.DFA = void 0;
|
|
4
|
+
const Inst_js_1 = require("./Inst.js");
|
|
5
|
+
const RE2Flags_js_1 = require("./RE2Flags.js");
|
|
6
|
+
const Unicode_js_1 = require("./Unicode.js");
|
|
7
|
+
const Utils_js_1 = require("./Utils.js");
|
|
8
|
+
/**
 * FNV-1a style 32-bit hash over a sequence of integers. Each element is
 * XOR-ed into the accumulator as a whole (not byte by byte) and the result is
 * multiplied by the FNV prime with 32-bit wrap-around.
 *
 * @param pcs array or typed array of integers.
 * @returns the hash as a signed 32-bit integer; the offset basis for empty input.
 */
const hashPCs = (pcs) => {
    const offsetBasis = -2128831035; // 0x811c9dc5 reinterpreted as a signed 32-bit value
    const prime = 16777619;
    let hash = offsetBasis;
    let index = 0;
    while (index < pcs.length) {
        hash = Math.imul(hash ^ pcs[index], prime);
        index += 1;
    }
    return hash;
};
|
|
17
|
+
/**
 * Element-wise strict equality of two indexable sequences.
 *
 * @param a first array or typed array.
 * @param b second array or typed array.
 * @returns true when both have the same length and `a[k] === b[k]` for every index.
 */
const arraysEqual = (a, b) => {
    const size = a.length;
    if (size !== b.length) {
        return false;
    }
    let index = 0;
    while (index < size) {
        if (a[index] !== b[index]) {
            return false;
        }
        index += 1;
    }
    return true;
};
|
|
26
|
+
/**
 * One state of the lazily built DFA: a set of NFA program counters plus the
 * outgoing transitions that have been computed for it so far.
 */
class DFAState {
    /** Int32Array of instruction pcs that make up this state. */
    nfaStates;
    /** Whether the state is accepting. */
    isMatch;
    /** True if any pc in the state is an EMPTY_WIDTH instruction. */
    hasEmptyWidth;
    /** Match ids collected for this state. */
    matchIDs;
    /** Transition table indexed by ASCII code; null marks "not computed yet". */
    nextAscii;
    /** Cache for all transitions that are not stored in nextAscii. */
    nextMap;
    /**
     * @param nfaStates Int32Array of instruction pcs.
     * @param isMatch whether the state is accepting.
     * @param hasEmptyWidth whether any pc is an EMPTY_WIDTH instruction.
     * @param matchIDs match ids for the state; defaults to an empty array.
     */
    constructor(nfaStates, isMatch, hasEmptyWidth, matchIDs = []) {
        this.nfaStates = nfaStates;
        this.isMatch = isMatch;
        this.hasEmptyWidth = hasEmptyWidth;
        this.matchIDs = matchIDs;
        const asciiSlots = Unicode_js_1.MAX_ASCII + 1;
        this.nextAscii = Array.from({ length: asciiSlots }, () => null);
        this.nextMap = new Map();
    }
}
|
|
42
|
+
/**
 * Lazily constructed DFA that simulates a compiled program.
 *
 * States (sets of NFA pcs) are built on demand and cached. When more than
 * `stateLimit` states would be needed the cache is flushed; after
 * `MAX_CACHE_CLEARS` flushes the DFA marks itself as failed. In both
 * situations `match` returns `null`, which tells the caller that no verdict
 * is available from this engine.
 */
class DFA {
    /** The compiled program being simulated. */
    prog;
    /** Map from pc-set hash to the list of states sharing that hash. */
    stateCache;
    /** Number of states currently stored in stateCache. */
    stateCount;
    /** Cached state for prog.start; null until first use and after a flush. */
    startState;
    /** Maximum number of cached states before the cache is flushed. */
    stateLimit;
    /** Number of cache flushes performed so far. */
    cacheClears;
    /** True once the flush budget is used up; no new states are created afterwards. */
    failed;
    static MAX_CACHE_CLEARS = 5;
    /**
     * @param prog the compiled program to run.
     */
    constructor(prog) {
        this.prog = prog;
        this.stateCache = new Map();
        this.stateCount = 0;
        this.startState = null;
        this.stateLimit = 10000;
        this.cacheClears = 0;
        this.failed = false;
    }
    /**
     * Follows epsilon transitions (ALT, ALT_MATCH, NOP, CAPTURE) from `pcs` to
     * find every instruction reachable without consuming a character.
     * EMPTY_WIDTH instructions are included but not followed; they are
     * resolved later, when the surrounding context is known.
     *
     * @param pcs starting instruction addresses.
     * @returns the sorted closure, whether it contains MATCH, whether it
     *     contains EMPTY_WIDTH, and the sorted distinct MATCH args.
     */
    computeClosure(pcs) {
        const closure = new Set();
        const stack = [...pcs];
        let isMatch = false;
        let hasEmptyWidth = false;
        const matchIDs = [];
        while (stack.length > 0) {
            const pc = stack.pop();
            if (pc === undefined) {
                throw new Error("invalid state");
            }
            if (closure.has(pc))
                continue;
            closure.add(pc);
            const inst = this.prog.getInst(pc);
            switch (inst.op) {
                case Inst_js_1.Inst.MATCH:
                    isMatch = true;
                    if (!matchIDs.includes(inst.arg))
                        matchIDs.push(inst.arg);
                    break;
                case Inst_js_1.Inst.ALT:
                case Inst_js_1.Inst.ALT_MATCH:
                    stack.push(inst.out);
                    stack.push(inst.arg);
                    break;
                case Inst_js_1.Inst.NOP:
                case Inst_js_1.Inst.CAPTURE:
                    stack.push(inst.out);
                    break;
                case Inst_js_1.Inst.EMPTY_WIDTH:
                    // Include in state but don't follow through — resolved at step time with context
                    hasEmptyWidth = true;
                    break;
            }
        }
        // Typed-array sort is numeric, so equal pc sets always serialize identically.
        const sortedPCs = Int32Array.from(closure).sort();
        matchIDs.sort((a, b) => a - b);
        return { pcs: sortedPCs, isMatch, hasEmptyWidth, matchIDs };
    }
    /**
     * Resolves the EMPTY_WIDTH instructions of a state against `context`.
     *
     * An EMPTY_WIDTH instruction is passable when every flag in its `arg` is
     * also set in `context`; passable ones are followed through further
     * epsilon transitions.
     *
     * @param nfaStates pcs of the state being resolved.
     * @param context bitmask of empty-width conditions holding at the current position.
     * @returns `resolvedPCs`: the state's non-EMPTY_WIDTH pcs plus everything
     *     reached through passable EMPTY_WIDTH instructions; `isMatch`: whether
     *     a MATCH instruction is among them.
     */
    resolveEmptyWidth(nfaStates, context) {
        const resolved = new Set();
        const stack = [];
        let isMatch = false;
        // Seed: keep ordinary pcs as they are, queue the targets of passable EMPTY_WIDTH pcs.
        for (let i = 0; i < nfaStates.length; i++) {
            const pc = nfaStates[i];
            const inst = this.prog.getInst(pc);
            if (inst.op === Inst_js_1.Inst.EMPTY_WIDTH) {
                if ((inst.arg & ~context) === 0) {
                    stack.push(inst.out);
                }
            }
            else {
                resolved.add(pc);
                if (inst.op === Inst_js_1.Inst.MATCH) {
                    isMatch = true;
                }
            }
        }
        // Follow through from resolved EMPTY_WIDTH transitions
        while (stack.length > 0) {
            const pc = stack.pop();
            if (pc === undefined) {
                throw new Error("invalid state");
            }
            if (resolved.has(pc))
                continue;
            resolved.add(pc);
            const inst = this.prog.getInst(pc);
            switch (inst.op) {
                case Inst_js_1.Inst.MATCH:
                    isMatch = true;
                    break;
                case Inst_js_1.Inst.ALT:
                case Inst_js_1.Inst.ALT_MATCH:
                    stack.push(inst.out);
                    stack.push(inst.arg);
                    break;
                case Inst_js_1.Inst.NOP:
                case Inst_js_1.Inst.CAPTURE:
                    stack.push(inst.out);
                    break;
                case Inst_js_1.Inst.EMPTY_WIDTH:
                    if ((inst.arg & ~context) === 0) {
                        stack.push(inst.out);
                    }
                    break;
            }
        }
        return { resolvedPCs: resolved, isMatch };
    }
    /**
     * Returns the DFA state for the closure of `pcs`, creating and caching it
     * when it does not exist yet.
     *
     * @param pcs instruction addresses to build the state from.
     * @returns the state, or `null` when the DFA has failed or the cache had
     *     to be flushed because `stateLimit` was reached.
     */
    getState(pcs) {
        const closureResult = this.computeClosure(pcs);
        const sortedPCs = closureResult.pcs;
        const hash = hashPCs(sortedPCs);
        let bucket = this.stateCache.get(hash);
        if (bucket) {
            for (const state of bucket) {
                if (arraysEqual(state.nfaStates, sortedPCs)) {
                    return state;
                }
            }
        }
        if (this.failed)
            return null;
        if (this.stateCount >= this.stateLimit) {
            // Out of budget: drop everything and let the caller fall back.
            this.stateCache.clear();
            this.stateCount = 0;
            this.startState = null;
            this.cacheClears++;
            if (this.cacheClears >= DFA.MAX_CACHE_CLEARS) {
                this.failed = true;
            }
            return null;
        }
        // Register the bucket only now, so the bail-outs above never leave an
        // empty bucket behind in the cache.
        if (!bucket) {
            bucket = [];
            this.stateCache.set(hash, bucket);
        }
        const state = new DFAState(sortedPCs, closureResult.isMatch, closureResult.hasEmptyWidth, closureResult.matchIDs);
        bucket.push(state);
        this.stateCount++;
        return state;
    }
    /**
     * Computes (or fetches from cache) the state reached from `state` after
     * consuming `charCode`.
     *
     * @param state current DFA state.
     * @param charCode the rune being consumed.
     * @param anchor anchoring mode; when UNANCHORED the program's start pc is
     *     added to every successor so a match may begin at any position.
     * @param context empty-width context at the current position; only used
     *     when the state has EMPTY_WIDTH pcs.
     * @returns the successor state, or `null` when it could not be created.
     */
    step(state, charCode, anchor, context) {
        const unanchored = anchor === RE2Flags_js_1.UNANCHORED;
        // Context-free, unanchored ASCII transitions live in the flat table;
        // everything else is keyed in nextMap.
        const useAsciiTable = !state.hasEmptyWidth && unanchored && charCode <= Unicode_js_1.MAX_ASCII;
        let cacheKey = 0;
        if (useAsciiTable) {
            const next = state.nextAscii[charCode];
            if (next !== null) {
                return next;
            }
        }
        else {
            cacheKey = state.hasEmptyWidth
                ? // Context-dependent: the low 7 bits carry context and anchoring.
                    charCode * 128 + (context & 0x3f) * 2 + (unanchored ? 0 : 1)
                : charCode + (unanchored ? 0 : Unicode_js_1.MAX_RUNE + 1);
            if (state.nextMap.has(cacheKey)) {
                return state.nextMap.get(cacheKey) ?? null;
            }
        }
        // Determine which PCs to check for RUNE matches
        const activePCs = state.hasEmptyWidth
            ? this.resolveEmptyWidth(state.nfaStates, context).resolvedPCs
            : state.nfaStates;
        // Collect next PCs from RUNE matches
        const nextPCs = [];
        for (const pc of activePCs) {
            const inst = this.prog.getInst(pc);
            if (Inst_js_1.Inst.isRuneOp(inst.op) && inst.matchRune(charCode)) {
                nextPCs.push(inst.out);
            }
        }
        if (unanchored) {
            nextPCs.push(this.prog.start);
        }
        const nextState = this.getState(nextPCs);
        // Cache the result
        if (useAsciiTable) {
            state.nextAscii[charCode] = nextState;
        }
        else {
            state.nextMap.set(cacheKey, nextState);
        }
        return nextState;
    }
    /**
     * The hot loop: executes the lazy DFA over `input` starting at `pos`.
     *
     * @param input the text to scan.
     * @param pos offset at which scanning starts.
     * @param anchor anchoring mode; with ANCHOR_BOTH a match only counts when
     *     it ends exactly at the end of the input.
     * @returns true on a match, false when no match is possible, and `null`
     *     when the DFA could not build a required state.
     */
    match(input, pos, anchor) {
        if (!this.startState) {
            this.startState = this.getState([this.prog.start]);
            if (!this.startState)
                return null;
        }
        const endPos = input.endPos();
        const mustReachEnd = anchor === RE2Flags_js_1.ANCHOR_BOTH;
        const unanchored = anchor === RE2Flags_js_1.UNANCHORED;
        let currentState = this.startState;
        // prevRune: the rune immediately before position `pos`. For pos=0 this is
        // -1 (beginning-of-text sentinel). For pos>0 we query the input so that
        // ^, \A, and \b anchors use the correct context when matching begins
        // from a mid-text offset.
        let prevRune = -1;
        if (pos > 0) {
            const r = input.step(pos - 1) >> 3;
            if (r >= 0)
                prevRune = r;
        }
        // Check if start state matches directly (e.g., empty pattern)
        if (currentState.isMatch && (!mustReachEnd || pos === endPos)) {
            return true;
        }
        let i = pos;
        while (i < endPos) {
            // input.step packs the rune in the high bits and its width in the low 3 bits.
            const r = input.step(i);
            const rune = r >> 3;
            const width = r & 7;
            if (width === 0)
                break;
            // Compute context at position i (between prevRune and rune)
            const context = (0, Utils_js_1.emptyOpContext)(prevRune, rune);
            // Before consuming: check if EMPTY_WIDTH in current state resolves to MATCH.
            // With ANCHOR_BOTH a match before the end of input never counts and
            // i < endPos holds here, so the resolution is skipped entirely.
            if (!mustReachEnd && currentState.hasEmptyWidth) {
                if (this.resolveEmptyWidth(currentState.nfaStates, context).isMatch) {
                    return true;
                }
            }
            // Consume rune and transition to next state
            if (!currentState.hasEmptyWidth && unanchored && rune <= Unicode_js_1.MAX_ASCII) {
                currentState =
                    currentState.nextAscii[rune] ||
                        this.step(currentState, rune, anchor, context);
            }
            else {
                currentState = this.step(currentState, rune, anchor, context);
            }
            if (currentState === null)
                return null;
            // After consuming: check if new state is a match
            if (currentState.isMatch && (!mustReachEnd || i + width === endPos)) {
                return true;
            }
            // An anchored search with no live pcs left can never recover.
            if (currentState.nfaStates.length === 0 && !unanchored) {
                return false;
            }
            prevRune = rune;
            i += width;
        }
        // After the loop: check EMPTY_WIDTH at end of text.
        // For all anchor modes, a resolved MATCH here means the pattern succeeded:
        // UNANCHORED/ANCHOR_START accept any match; ANCHOR_BOTH accepts it because
        // we have consumed the entire input up to endPos.
        if (currentState.hasEmptyWidth) {
            const endContext = (0, Utils_js_1.emptyOpContext)(prevRune, -1);
            if (this.resolveEmptyWidth(currentState.nfaStates, endContext).isMatch) {
                return true;
            }
        }
        return false;
    }
}
|
|
350
|
+
exports.DFA = DFA;
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/**
 * One instruction of the regular expression virtual machine.
 *
 * The static members are the opcode constants; an instance carries its opcode
 * together with up to two operands and, for rune instructions, the runes to
 * match against.
 *
 * @see http://swtch.com/~rsc/regexp/regexp2.html
 */
declare class Inst {
    static ALT: number;
    static ALT_MATCH: number;
    static CAPTURE: number;
    static EMPTY_WIDTH: number;
    static FAIL: number;
    static MATCH: number;
    static NOP: number;
    static RUNE: number;
    static RUNE1: number;
    static RUNE_ANY: number;
    static RUNE_ANY_NOT_NL: number;
    /** Opcode; one of the static constants above. */
    op: number;
    /** Address of the next instruction (for ALT: the first alternative). */
    out: number;
    /**
     * Opcode-specific operand. NOTE(review): the compiler stores the second
     * alternative (ALT), the capture slot (CAPTURE), the EMPTY_* mask
     * (EMPTY_WIDTH) or the retained fold flag (rune ops) here — confirm
     * against Inst.js.
     */
    arg: number;
    /** Runes matched by a rune instruction. */
    runes: number[];
    /** Presumably reports whether `op` is one of the rune-matching opcodes — implementation not shown here. */
    static isRuneOp(op: number): boolean;
    constructor(op: number);
    /** Presumably reports whether this instruction matches rune `r` — implementation not shown here. */
    matchRune(r: number): boolean;
}
|
|
26
|
+
export { Inst };
|