@bufbuild/re2 0.0.1-alpha.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +30 -0
- package/dist/cjs/CharClass.d.ts +30 -0
- package/dist/cjs/CharClass.js +284 -0
- package/dist/cjs/CharGroup.d.ts +8 -0
- package/dist/cjs/CharGroup.js +83 -0
- package/dist/cjs/Codepoint.d.ts +3 -0
- package/dist/cjs/Codepoint.js +62 -0
- package/dist/cjs/Compiler.d.ts +40 -0
- package/dist/cjs/Compiler.js +262 -0
- package/dist/cjs/DFA.d.ts +36 -0
- package/dist/cjs/DFA.js +350 -0
- package/dist/cjs/Inst.d.ts +26 -0
- package/dist/cjs/Inst.js +86 -0
- package/dist/cjs/MachineInput.d.ts +17 -0
- package/dist/cjs/MachineInput.js +72 -0
- package/dist/cjs/Parser.d.ts +111 -0
- package/dist/cjs/Parser.js +1538 -0
- package/dist/cjs/Prefilter.d.ts +19 -0
- package/dist/cjs/Prefilter.js +163 -0
- package/dist/cjs/Prog.d.ts +39 -0
- package/dist/cjs/Prog.js +154 -0
- package/dist/cjs/RE2.d.ts +27 -0
- package/dist/cjs/RE2.js +221 -0
- package/dist/cjs/RE2Flags.d.ts +16 -0
- package/dist/cjs/RE2Flags.js +58 -0
- package/dist/cjs/Regexp.d.ts +43 -0
- package/dist/cjs/Regexp.js +98 -0
- package/dist/cjs/Simplify.d.ts +3 -0
- package/dist/cjs/Simplify.js +230 -0
- package/dist/cjs/Unicode.d.ts +17 -0
- package/dist/cjs/Unicode.js +165 -0
- package/dist/cjs/UnicodeRangeTable.d.ts +12 -0
- package/dist/cjs/UnicodeRangeTable.js +31 -0
- package/dist/cjs/UnicodeTables.d.ts +29 -0
- package/dist/cjs/UnicodeTables.js +571 -0
- package/dist/cjs/Utils.d.ts +22 -0
- package/dist/cjs/Utils.js +119 -0
- package/dist/cjs/__fixtures__/find.d.ts +9 -0
- package/dist/cjs/__fixtures__/find.js +115 -0
- package/dist/cjs/chars.d.ts +2 -0
- package/dist/cjs/chars.js +19 -0
- package/dist/cjs/exceptions.d.ts +55 -0
- package/dist/cjs/exceptions.js +94 -0
- package/dist/cjs/index.d.ts +102 -0
- package/dist/cjs/index.js +173 -0
- package/dist/cjs/package.json +1 -0
- package/dist/cjs/testParser.d.ts +3 -0
- package/dist/cjs/testParser.js +143 -0
- package/dist/esm/CharClass.d.ts +30 -0
- package/dist/esm/CharClass.js +281 -0
- package/dist/esm/CharGroup.d.ts +8 -0
- package/dist/esm/CharGroup.js +78 -0
- package/dist/esm/Codepoint.d.ts +3 -0
- package/dist/esm/Codepoint.js +59 -0
- package/dist/esm/Compiler.d.ts +40 -0
- package/dist/esm/Compiler.js +259 -0
- package/dist/esm/DFA.d.ts +36 -0
- package/dist/esm/DFA.js +347 -0
- package/dist/esm/Inst.d.ts +26 -0
- package/dist/esm/Inst.js +83 -0
- package/dist/esm/MachineInput.d.ts +17 -0
- package/dist/esm/MachineInput.js +68 -0
- package/dist/esm/Parser.d.ts +111 -0
- package/dist/esm/Parser.js +1535 -0
- package/dist/esm/Prefilter.d.ts +19 -0
- package/dist/esm/Prefilter.js +159 -0
- package/dist/esm/Prog.d.ts +39 -0
- package/dist/esm/Prog.js +150 -0
- package/dist/esm/RE2.d.ts +27 -0
- package/dist/esm/RE2.js +218 -0
- package/dist/esm/RE2Flags.d.ts +16 -0
- package/dist/esm/RE2Flags.js +41 -0
- package/dist/esm/Regexp.d.ts +43 -0
- package/dist/esm/Regexp.js +94 -0
- package/dist/esm/Simplify.d.ts +3 -0
- package/dist/esm/Simplify.js +228 -0
- package/dist/esm/Unicode.d.ts +17 -0
- package/dist/esm/Unicode.js +150 -0
- package/dist/esm/UnicodeRangeTable.d.ts +12 -0
- package/dist/esm/UnicodeRangeTable.js +28 -0
- package/dist/esm/UnicodeTables.d.ts +29 -0
- package/dist/esm/UnicodeTables.js +568 -0
- package/dist/esm/Utils.d.ts +22 -0
- package/dist/esm/Utils.js +103 -0
- package/dist/esm/__fixtures__/find.d.ts +9 -0
- package/dist/esm/__fixtures__/find.js +112 -0
- package/dist/esm/chars.d.ts +2 -0
- package/dist/esm/chars.js +14 -0
- package/dist/esm/exceptions.d.ts +55 -0
- package/dist/esm/exceptions.js +86 -0
- package/dist/esm/index.d.ts +102 -0
- package/dist/esm/index.js +163 -0
- package/dist/esm/testParser.d.ts +3 -0
- package/dist/esm/testParser.js +138 -0
- package/package.json +49 -0
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import { Regexp } from "./Regexp.js";
|
|
2
|
+
import type { MachineUTF16Input } from "./MachineInput.js";
|
|
3
|
+
/**
 * One node of a prefilter tree: a cheap check evaluated against the input
 * before the matching engines are run.
 */
declare class Prefilter {
    /** Kind of this node; one of the values in `Prefilter.Type`. */
    type: number;
    /** Child nodes, combined by `AND` / `OR` nodes. */
    subs: Prefilter[];
    /** Literal text carried by `EXACT` nodes (empty string otherwise). */
    str: string;
    /** Numeric tags for the supported node kinds. */
    static Type: {
        NONE: number;
        EXACT: number;
        AND: number;
        OR: number;
    };
    /** Creates a node of the given kind with no children and an empty literal. */
    constructor(type: number);
    /**
     * Evaluates this node against `input` from position `pos`.
     * Returns `false` only when the node rules a match out.
     */
    eval(input: MachineUTF16Input, pos: number): boolean;
}
|
|
16
|
+
/** Factory that derives a simplified prefilter tree from a parsed regexp. */
declare const PrefilterTree: {
    /** Builds the prefilter for `re`. */
    build: (re: Regexp) => Prefilter;
};
|
|
19
|
+
export { Prefilter, PrefilterTree };
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
import { Regexp } from "./Regexp.js";
|
|
2
|
+
import { FOLD_CASE } from "./RE2Flags.js";
|
|
3
|
+
/**
 * One node of a prefilter tree.
 *
 * - `NONE`  imposes no requirement (always passes).
 * - `EXACT` delegates to `input.hasString(node, pos)`.
 * - `AND`   passes when every child passes.
 * - `OR`    passes when at least one child passes.
 */
class Prefilter {
    type;
    subs;
    str;
    static Type = { NONE: 0, EXACT: 1, AND: 2, OR: 3 };
    /**
     * @param type One of the `Prefilter.Type` tags.
     */
    constructor(type) {
        this.type = type;
        this.subs = [];
        this.str = "";
    }
    /**
     * Evaluates this node against `input`, starting at `pos`.
     *
     * @param input Object exposing `hasString(prefilter, pos)`.
     * @param pos Position handed through to `hasString`.
     * @returns `false` only if the node rules the input out.
     */
    eval(input, pos) {
        const kind = this.type;
        if (kind === Prefilter.Type.EXACT) {
            return input.hasString(this, pos);
        }
        if (kind === Prefilter.Type.AND) {
            // Every child must pass; an empty AND passes.
            for (const child of this.subs) {
                if (!child.eval(input, pos)) {
                    return false;
                }
            }
            return true;
        }
        if (kind === Prefilter.Type.OR) {
            // One passing child suffices; an empty OR fails.
            for (const child of this.subs) {
                if (child.eval(input, pos)) {
                    return true;
                }
            }
            return false;
        }
        // NONE and any unrecognised tag impose no requirement.
        return true;
    }
}
|
|
36
|
+
/**
 * Translates a parsed regexp node into an (unsimplified) prefilter tree.
 *
 * Only constructs that force literal text to appear produce a requirement;
 * everything else yields a `NONE` node.
 *
 * @param re Regexp node (may be null/undefined, which yields `NONE`).
 * @returns The prefilter describing what any match of `re` requires.
 */
const fromRegexp = (re) => {
    const unconstrained = () => new Prefilter(Prefilter.Type.NONE);
    if (!re) {
        return unconstrained();
    }
    // Builds an AND/OR node whose children mirror the regexp's sub-expressions.
    const combineSubs = (type) => {
        const node = new Prefilter(type);
        for (const child of re.subs) {
            node.subs.push(fromRegexp(child));
        }
        return node;
    };
    switch (re.op) {
        case Regexp.Op.LITERAL: {
            const caseFolded = (re.flags & FOLD_CASE) !== 0;
            if (re.runes.length === 0 || caseFolded) {
                // Skip case-folded literals for simplicity
                return unconstrained();
            }
            const node = new Prefilter(Prefilter.Type.EXACT);
            node.str = Array.from(re.runes, (rune) => String.fromCodePoint(rune)).join("");
            return node;
        }
        case Regexp.Op.CAPTURE:
        case Regexp.Op.PLUS:
            // The sub-expression occurs at least once, so its needs carry over.
            return fromRegexp(re.subs[0]);
        case Regexp.Op.REPEAT:
            // With a minimum of zero the sub-expression may be absent entirely.
            return re.min >= 1 ? fromRegexp(re.subs[0]) : unconstrained();
        case Regexp.Op.CONCAT:
            return combineSubs(Prefilter.Type.AND);
        case Regexp.Op.ALTERNATE:
            return combineSubs(Prefilter.Type.OR);
        default:
            // NO_MATCH, EMPTY_MATCH, the line/text/word-boundary assertions,
            // CHAR_CLASS, ANY_CHAR(_NOT_NL) and every other op: no requirement.
            return unconstrained();
    }
};
|
|
94
|
+
/**
 * Simplifies a prefilter tree, reusing (and mutating) `pf` where possible.
 *
 * - AND: children without requirements (`NONE`) are dropped and nested ANDs
 *   are flattened.
 * - OR: a single `NONE` branch makes the whole OR `NONE`; nested ORs are
 *   flattened and duplicate `EXACT` branches are removed.
 * - A combinator left with no children becomes `NONE`; one left with a single
 *   child is replaced by that child. For OR this is decided after
 *   de-duplication, so `a|a` reduces to the plain `EXACT` node.
 *
 * @param pf Root of the tree to simplify.
 * @returns The simplified root (possibly a different node than `pf`).
 */
const simplify = (pf) => {
    const { NONE, EXACT, AND, OR } = Prefilter.Type;
    if (pf.type === EXACT || pf.type === NONE) {
        return pf;
    }
    if (pf.type === AND) {
        const required = [];
        for (const child of pf.subs) {
            const reduced = simplify(child);
            if (reduced.type === NONE) {
                // "true AND x" is just x.
                continue;
            }
            if (reduced.type === AND) {
                required.push(...reduced.subs);
            }
            else {
                required.push(reduced);
            }
        }
        if (required.length === 0) {
            return new Prefilter(NONE);
        }
        if (required.length === 1) {
            return required[0];
        }
        pf.subs = required;
        return pf;
    }
    if (pf.type === OR) {
        const branches = [];
        for (const child of pf.subs) {
            const reduced = simplify(child);
            if (reduced.type === NONE) {
                // If any branch of an OR has no requirements, the whole OR has no requirements
                return new Prefilter(NONE);
            }
            if (reduced.type === OR) {
                branches.push(...reduced.subs);
            }
            else {
                branches.push(reduced);
            }
        }
        if (branches.length === 0) {
            return new Prefilter(NONE);
        }
        // De-duplicate EXACT branches, keeping the first occurrence of each literal.
        const seenLiterals = new Set();
        const unique = [];
        for (const branch of branches) {
            if (branch.type === EXACT) {
                if (seenLiterals.has(branch.str)) {
                    continue;
                }
                seenLiterals.add(branch.str);
            }
            unique.push(branch);
        }
        // Collapse only after de-duplication so duplicates cannot leave a one-child OR.
        if (unique.length === 1) {
            return unique[0];
        }
        pf.subs = unique;
        return pf;
    }
    return pf;
};
|
|
156
|
+
/** Builds simplified prefilter trees from parsed regexps. */
const PrefilterTree = {
    /**
     * @param re Parsed regexp.
     * @returns The prefilter for `re`: the raw translation, then simplified.
     */
    build: (re) => {
        const rawTree = fromRegexp(re);
        return simplify(rawTree);
    },
};
|
|
159
|
+
export { Prefilter, PrefilterTree };
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import { Inst } from "./Inst.js";
|
|
2
|
+
/**
 * A list of instruction pointers waiting to be patched.
 * Tracks both `head` and `tail` to allow O(1) appending during compilation.
 *
 * Values are encoded integers, not standard memory pointers:
 * - Program instruction index: `l >> 1`
 * - Patch `.out` field if: `(l & 1) === 0`
 * - Patch `.arg` field if: `(l & 1) === 1`
 * - `0` denotes an empty list.
 *
 * @see https://swtch.com/~rsc/regexp/regexp1.html
 */
declare class PatchList {
    /** Encoded pointer to the first entry of the list. */
    head: number;
    /** Encoded pointer to the last entry of the list. */
    tail: number;
    /**
     * @param head - Encoded pointer to the start of the patch list.
     * @param tail - Encoded pointer to the end of the patch list.
     */
    constructor(head?: number, tail?: number);
}
|
|
21
|
+
/**
 * A Prog is a compiled regular expression program.
 */
declare class Prog {
    /** Instructions of the program, indexed by pc. */
    inst: Inst[];
    /** Index of the start instruction. */
    start: number;
    /** Number of capture slots; a new program starts at 2 (implicit whole-match group). */
    numCap: number;
    constructor();
    /** Returns the instruction at the specified pc. */
    getInst(pc: number): Inst;
    /** Returns the number of instructions in this program. */
    numInst(): number;
    /** Appends a new instruction with operator `op`; its pc is the previous `numInst()`. */
    addInst(op: number): void;
    /** Follows any no-op or capturing instructions from `pc` and returns the resulting instruction. */
    skipNop(pc: number): Inst;
    /**
     * Returns `[complete, prefix]`: the literal string all matches must start
     * with, and whether that prefix is the entire match.
     */
    prefix(): [boolean, string];
    /**
     * Returns the leading empty-width conditions that must hold in any match,
     * or -1 (all bits set) if no match is possible.
     */
    startCond(): number;
    /** Returns the encoded pointer stored in the patch-list slot `l`. */
    next(l: number): number;
    /** Writes `val` into every slot on patch list `l`. */
    patch(l: PatchList, val: number): void;
    /** Concatenates two patch lists and returns the combined list. */
    append(l1: PatchList, l2: PatchList): PatchList;
}
|
|
39
|
+
export { Prog, PatchList };
|
package/dist/esm/Prog.js
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
import { FOLD_CASE } from "./RE2Flags.js";
|
|
2
|
+
import { Inst } from "./Inst.js";
|
|
3
|
+
/**
 * A list of instruction pointers waiting to be patched.
 * Tracks both `head` and `tail` to allow O(1) appending during compilation.
 *
 * Values are encoded integers, not standard memory pointers:
 * - Program instruction index: `l >> 1`
 * - Patch `.out` field if: `(l & 1) === 0`
 * - Patch `.arg` field if: `(l & 1) === 1`
 * - `0` denotes an empty list.
 *
 * @see https://swtch.com/~rsc/regexp/regexp1.html
 */
class PatchList {
    head;
    tail;
    /**
     * Both pointers default to `0`, i.e. the empty list.
     *
     * @param {number} head - Encoded pointer to the start of the patch list.
     * @param {number} tail - Encoded pointer to the end of the patch list.
     */
    constructor(head = 0, tail = 0) {
        [this.head, this.tail] = [head, tail];
    }
}
|
|
25
|
+
/**
 * A Prog is a compiled regular expression program.
 */
class Prog {
    inst;
    start;
    numCap;
    constructor() {
        this.inst = [];
        // Index of the start instruction.
        this.start = 0;
        // Number of CAPTURE slots in the regexp; the initial 2 account for the
        // implicit ( and ) around the whole match ($0).
        this.numCap = 2;
    }
    /**
     * Returns the instruction at the specified pc.
     * Precondition: pc > 0 && pc < numInst().
     */
    getInst(pc) {
        return this.inst[pc];
    }
    /** Returns the number of instructions in this program. */
    numInst() {
        return this.inst.length;
    }
    /**
     * Adds a new instruction to this program, with operator |op| and |pc|
     * equal to the previous |numInst()|.
     */
    addInst(op) {
        this.inst.push(new Inst(op));
    }
    /**
     * Follows any no-op or capturing instructions starting at |pc| and returns
     * the first instruction that is neither.
     */
    skipNop(pc) {
        let inst = this.inst[pc];
        while (inst.op === Inst.NOP || inst.op === Inst.CAPTURE) {
            inst = this.inst[inst.out];
        }
        return inst;
    }
    /**
     * Returns `[complete, prefix]`: the literal string that every match must
     * start with, and whether that prefix is the entire match.
     *
     * Only single-rune, non-case-folded rune instructions extend the prefix.
     */
    prefix() {
        const pieces = [];
        let inst = this.skipNop(this.start);
        while (Inst.isRuneOp(inst.op) &&
            inst.runes.length === 1 &&
            (inst.arg & FOLD_CASE) === 0) {
            pieces.push(String.fromCodePoint(inst.runes[0]));
            inst = this.skipNop(inst.out);
        }
        // The prefix is the whole match only if it is followed directly by MATCH.
        return [inst.op === Inst.MATCH, pieces.join("")];
    }
    /**
     * Returns the leading empty-width conditions that must be true in any
     * match. It returns -1 (all bits set) if no matches are possible.
     */
    startCond() {
        let conditions = 0;
        let pc = this.start;
        for (;;) {
            const inst = this.inst[pc];
            if (inst.op === Inst.EMPTY_WIDTH) {
                conditions |= inst.arg;
            }
            else if (inst.op === Inst.FAIL) {
                return -1;
            }
            else if (inst.op !== Inst.CAPTURE && inst.op !== Inst.NOP) {
                // First instruction that does real work: stop collecting.
                return conditions;
            }
            pc = inst.out;
        }
    }
    // --- Patch list ---
    // A patchlist is a list of instruction pointers that need to be filled in
    // (patched). Because the pointers haven't been filled in yet, their storage
    // is reused to hold the list itself. See
    // http://swtch.com/~rsc/regexp/regexp1.html for inspiration.
    // A value l denotes inst[l >> 1].out when (l & 1) === 0, or .arg when
    // (l & 1) === 1. l === 0 denotes the empty list, which is fine because every
    // program starts with a fail instruction whose output link is never
    // pointed at.
    /** Returns the encoded pointer stored in the slot that `l` denotes. */
    next(l) {
        const inst = this.inst[l >> 1];
        return (l & 1) === 0 ? inst.out : inst.arg;
    }
    /** Walks patch list `l` and overwrites every slot on it with `val`. */
    patch(l, val) {
        for (let cursor = l.head; cursor !== 0;) {
            const inst = this.inst[cursor >> 1];
            const slot = (cursor & 1) === 0 ? "out" : "arg";
            // Read the link to the following entry before it is overwritten.
            cursor = inst[slot];
            inst[slot] = val;
        }
    }
    /**
     * Links `l2` behind `l1` and returns the combined list. If either list is
     * empty the other one is returned as-is.
     */
    append(l1, l2) {
        if (l1.head === 0) {
            return l2;
        }
        if (l2.head === 0) {
            return l1;
        }
        // We know exactly where the tail is
        const last = this.inst[l1.tail >> 1];
        if ((l1.tail & 1) !== 0) {
            last.arg = l2.head;
        }
        else {
            last.out = l2.head;
        }
        return new PatchList(l1.head, l2.tail);
    }
}
|
|
150
|
+
export { Prog, PatchList };
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
import { type MachineUTF16Input } from "./MachineInput.js";
|
|
2
|
+
import { DFA } from "./DFA.js";
|
|
3
|
+
import { Prefilter } from "./Prefilter.js";
|
|
4
|
+
import type { Prog } from "./Prog.js";
|
|
5
|
+
/**
 * A compiled regular expression together with the data its matching engines
 * need (program, DFA, literal prefix and optional prefilter).
 */
declare class RE2 {
    /** Pattern source as given to the constructor; returned by `toString()`. */
    expr: string;
    /** Compiled program. */
    prog: Prog;
    /** Number of capturing groups; returned by `numberOfCapturingGroups()`. */
    numSubexp: number;
    /** Result of `prog.startCond()`. */
    cond: number;
    /** Literal prefix reported by `prog.prefix()`. */
    prefix: string;
    /** Whether `prefix` is the entire match, as reported by `prog.prefix()`. */
    prefixComplete: boolean;
    /** First code point of `prefix`, or 0 when the prefix is empty. */
    prefixRune: number;
    /** DFA built over `prog`. */
    dfa: DFA;
    /** Prefilter tree, or `null` when the pattern imposes no literal requirement. */
    prefilter: Prefilter | null;
    /** Named groups taken from the parsed regexp. */
    namedGroups: Map<string, number>;
    /** Compiles `expr` with the `PERL` flag set. */
    static compile(expr: string): RE2;
    /** Compiles `expr` with the given parser flags. */
    static compileImpl(expr: string, mode: number): RE2;
    constructor(expr: string, mode: number);
    /**
     * Matches using only the literal prefix. Returns `null` on no match, the
     * capture array when `ncap > 0`, and an empty array otherwise.
     */
    matchPrefixComplete(input: MachineUTF16Input, pos: number, anchor: number, ncap: number): number[] | null;
    /** Runs the matching pipeline; `null` means no match. */
    executeEngine(input: MachineUTF16Input, pos: number, anchor: number, ncap: number): number[] | null;
    /** Boolean-only NFA simulation used when the DFA gives no answer. */
    _nfaFallback(input: MachineUTF16Input, pos: number, anchor: number): boolean;
    numberOfCapturingGroups(): number;
    /** No-op. */
    reset(): void;
    toString(): string;
    /** Reports whether `s` contains a match (unanchored search from position 0). */
    match(s: string): boolean;
}
|
|
27
|
+
export { RE2 };
|
package/dist/esm/RE2.js
ADDED
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
import { ANCHOR_BOTH, ANCHOR_START, PERL, UNANCHORED } from "./RE2Flags.js";
|
|
2
|
+
import { fromUTF16 } from "./MachineInput.js";
|
|
3
|
+
import { DFA } from "./DFA.js";
|
|
4
|
+
import { Inst } from "./Inst.js";
|
|
5
|
+
import { Prefilter, PrefilterTree } from "./Prefilter.js";
|
|
6
|
+
import { Compiler } from "./Compiler.js";
|
|
7
|
+
import { simplify } from "./Simplify.js";
|
|
8
|
+
import { Parser } from "./Parser.js";
|
|
9
|
+
import { emptyOpContext } from "./Utils.js";
|
|
10
|
+
/**
 * A compiled regular expression.
 *
 * Matching goes through `executeEngine`, which tries in order: a pure
 * literal-prefix search, the prefilter (to reject early), the DFA, and
 * finally a minimal boolean NFA when the DFA gives no answer.
 */
class RE2 {
    expr;
    prog;
    numSubexp;
    cond;
    prefix;
    prefixComplete;
    prefixRune;
    dfa;
    prefilter;
    namedGroups;
    /** Compiles `expr` using the `PERL` flag set. */
    static compile(expr) {
        return RE2.compileImpl(expr, PERL);
    }
    /** Compiles `expr` using the given parser flags. */
    static compileImpl(expr, mode) {
        return new RE2(expr, mode);
    }
    /**
     * Parses, simplifies and compiles `expr`, then derives the prefilter,
     * literal prefix, start conditions and DFA from the result.
     *
     * @param expr Pattern source.
     * @param mode Parser flags.
     */
    constructor(expr, mode) {
        let re = Parser.parse(expr, mode);
        re = simplify(re);
        const prefilter = PrefilterTree.build(re);
        const prog = Compiler.compileRegexp(re);
        // A NONE prefilter can never reject anything, so do not keep it around.
        this.prefilter = prefilter.type === Prefilter.Type.NONE ? null : prefilter;
        const [prefixCompl, prefixStr] = prog.prefix();
        this.prefixComplete = prefixCompl;
        this.prefix = prefixStr;
        this.prefixRune = 0;
        if (this.prefix.length > 0) {
            const cp = this.prefix.codePointAt(0);
            if (cp === undefined) {
                throw new Error("RE2: prefix has no code point");
            }
            this.prefixRune = cp;
        }
        this.namedGroups = re.namedGroups;
        this.expr = expr;
        this.prog = prog;
        this.numSubexp = re.maxCap();
        this.cond = prog.startCond();
        this.dfa = new DFA(this.prog);
    }
    /**
     * Matches when the whole pattern is the literal `prefix`.
     *
     * @param input Machine input; must provide `prefixLength`, `index` and `endPos`.
     * @param pos Position to search from. Anchored modes only match from 0.
     * @param anchor `UNANCHORED`, `ANCHOR_START` or `ANCHOR_BOTH`.
     * @param ncap Number of capture slots requested.
     * @returns `null` on no match; otherwise an array of `ncap` slots with the
     *   match bounds in slots 0 and 1 (remaining slots -1), or `[]` when
     *   `ncap` is 0.
     */
    matchPrefixComplete(input, pos, anchor, ncap) {
        const anchoredAtStart = anchor === ANCHOR_START || anchor === ANCHOR_BOTH;
        if (anchoredAtStart && pos !== 0) {
            return null;
        }
        const pLen = input.prefixLength(this);
        let matchStart = -1;
        if (anchor === UNANCHORED) {
            // `index` reports the offset of the prefix relative to `pos`.
            const idx = input.index(this, pos);
            if (idx < 0) {
                return null;
            }
            matchStart = pos + idx;
        }
        else if (anchoredAtStart) {
            // ANCHOR_BOTH additionally requires the prefix to span the whole input.
            if (anchor === ANCHOR_BOTH && input.endPos() !== pLen) {
                return null;
            }
            if (input.index(this, 0) !== 0) {
                return null;
            }
            matchStart = 0;
        }
        if (matchStart < 0) {
            return null;
        }
        const matchEnd = matchStart + pLen;
        // If captures are requested (e.g. findSubmatch instead of test), populate bounds
        if (ncap > 0) {
            const matchcap = new Int32Array(ncap).fill(-1);
            matchcap[0] = matchStart;
            matchcap[1] = matchEnd;
            return Array.from(matchcap);
        }
        return []; // Matched successfully, but no capture data requested
    }
    /**
     * Runs the matching pipeline.
     *
     * @returns `null` when there is no match. On a match: the result of
     *   `matchPrefixComplete` when the literal-prefix shortcut applies,
     *   otherwise an empty array (the DFA/NFA paths produce no capture data).
     */
    executeEngine(input, pos, anchor, ncap) {
        if (this.prefixComplete && (ncap === 0 || this.numSubexp === 0)) {
            return this.matchPrefixComplete(input, pos, anchor, ncap);
        }
        // The prefilter is only consulted for unanchored searches.
        if (this.prefilter !== null &&
            anchor === UNANCHORED &&
            !this.prefilter.eval(input, pos)) {
            return null;
        }
        const dfaResult = this.dfa.match(input, pos, anchor);
        // A null DFA result means "no answer": use the minimal NFA fallback.
        const matched = dfaResult !== null ? dfaResult : this._nfaFallback(input, pos, anchor);
        return matched ? [] : null;
    }
    /**
     * Minimal boolean-only NFA for when the DFA bails due to state explosion.
     * No captures, no thread pools: just two sets of NFA states swapped each step.
     *
     * `input.step(i)` is read as `(rune << 3) | width`. Each position is
     * decoded once, and the empty-width context of the following position is
     * computed at most once per step and shared by all threads.
     *
     * @returns Whether the program matches under the given anchor mode.
     */
    _nfaFallback(input, pos, anchor) {
        const prog = this.prog;
        const endPos = input.endPos();
        const unanchored = anchor === UNANCHORED;
        // Rune at `at`, or -1 at/after the end of input.
        const runeAt = (at) => (at < endPos ? input.step(at) >> 3 : -1);
        // Adds `pc` to `set`, following ALT/NOP/CAPTURE edges and those
        // EMPTY_WIDTH edges whose conditions are all satisfied by `context`.
        const addState = (set, visited, pc, context) => {
            if (pc < 0 || pc >= prog.numInst() || visited.has(pc))
                return;
            visited.add(pc);
            const inst = prog.getInst(pc);
            switch (inst.op) {
                case Inst.ALT:
                case Inst.ALT_MATCH:
                    addState(set, visited, inst.out, context);
                    addState(set, visited, inst.arg, context);
                    break;
                case Inst.NOP:
                case Inst.CAPTURE:
                    addState(set, visited, inst.out, context);
                    break;
                case Inst.EMPTY_WIDTH:
                    if ((inst.arg & ~context) === 0) {
                        addState(set, visited, inst.out, context);
                    }
                    break;
                default:
                    set.add(pc);
                    break;
            }
        };
        const containsMatch = (set) => {
            for (const pc of set) {
                if (prog.getInst(pc).op === Inst.MATCH) {
                    return true;
                }
            }
            return false;
        };
        let current = new Set();
        let next = new Set();
        // prevRune: the rune immediately before `pos`. See DFA.match for rationale.
        let prevRune = -1;
        if (pos > 0) {
            const r = input.step(pos - 1) >> 3;
            if (r >= 0)
                prevRune = r;
        }
        for (let i = pos; i <= endPos; i++) {
            // Decode the current position once.
            let rune = -1;
            let width = 0;
            if (i < endPos) {
                const packed = input.step(i);
                rune = packed >> 3;
                width = packed & 7;
            }
            // Add start state at each position for unanchored search
            if (unanchored || i === pos) {
                addState(current, new Set(), prog.start, emptyOpContext(prevRune, rune));
            }
            // Check for matches before consuming.
            // For UNANCHORED/ANCHOR_START, a MATCH at any position succeeds.
            // For ANCHOR_BOTH, we must consume the entire input: intermediate
            // matches are skipped; only the final post-loop check accepts MATCH.
            if (anchor !== ANCHOR_BOTH && containsMatch(current)) {
                return true;
            }
            if (i >= endPos || width === 0)
                break;
            // Step: consume current character
            next.clear();
            // Every thread advances to the same position, so one context and one
            // visited set serve the whole step.
            const visited = new Set();
            let nextContext;
            for (const pc of current) {
                const inst = prog.getInst(pc);
                if (Inst.isRuneOp(inst.op) && inst.matchRune(rune)) {
                    if (nextContext === undefined) {
                        nextContext = emptyOpContext(rune, runeAt(i + width));
                    }
                    addState(next, visited, inst.out, nextContext);
                }
            }
            // For unanchored, add start state at next position too
            if (unanchored) {
                if (nextContext === undefined) {
                    nextContext = emptyOpContext(rune, runeAt(i + width));
                }
                addState(next, visited, prog.start, nextContext);
            }
            prevRune = rune;
            [current, next] = [next, current];
            i += width - 1; // loop increments by 1, but we advanced by width
        }
        // Final check for match after processing all input
        const endContext = emptyOpContext(prevRune, -1);
        const visited = new Set();
        const finalSet = new Set();
        for (const pc of current) {
            addState(finalSet, visited, pc, endContext);
        }
        return containsMatch(finalSet);
    }
    /** Returns the number of capturing groups in the pattern. */
    numberOfCapturingGroups() {
        return this.numSubexp;
    }
    reset() {
        // No-op: machine pool removed
    }
    /** Returns the pattern source this object was compiled from. */
    toString() {
        return this.expr;
    }
    /** Reports whether `s` contains a match (unanchored, from position 0). */
    match(s) {
        return this.executeEngine(fromUTF16(s), 0, UNANCHORED, 0) !== null;
    }
}
|
|
218
|
+
export { RE2 };
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
// Parser flags.

/** Fold case during matching (case-insensitive). */
declare const FOLD_CASE = 1;
/** Treat pattern as a literal string instead of a regexp. */
declare const LITERAL = 2;
/** Allow character classes like [^a-z] and [[:space:]] to match newline. */
declare const CLASS_NL = 4;
/** Allow '.' to match newline. */
declare const DOT_NL = 8;
/** Treat ^ and $ as only matching at beginning and end of text, not around embedded newlines. */
declare const ONE_LINE = 16;
/** Make repetition operators default to non-greedy. */
declare const NON_GREEDY = 32;
/** Allow Perl extensions. */
declare const PERL_X = 64;
/** Allow \p{Han}, \P{Han} for Unicode group and negation. */
declare const UNICODE_GROUPS = 128;
/** Regexp END_TEXT was $, not \z. Internal use only. */
declare const WAS_DOLLAR = 256;
/** `CLASS_NL | DOT_NL`. */
declare const MATCH_NL: number;
/** As close to Perl as possible: `CLASS_NL | ONE_LINE | PERL_X | UNICODE_GROUPS`. */
declare const PERL: number;
/** POSIX syntax. */
declare const POSIX = 0;

// Anchors.

/** Anchor mode: unanchored search. */
declare const UNANCHORED = 0;
/** Anchor mode: anchored at the start. */
declare const ANCHOR_START = 1;
/** Anchor mode: anchored at both ends. */
declare const ANCHOR_BOTH = 2;
|
|
16
|
+
export { UNANCHORED, ANCHOR_BOTH, NON_GREEDY, FOLD_CASE, LITERAL, ONE_LINE, WAS_DOLLAR, DOT_NL, UNICODE_GROUPS, CLASS_NL, PERL, ANCHOR_START, POSIX, MATCH_NL, PERL_X, };
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
//// Parser flags (one bit each).

// Fold case during matching (case-insensitive).
const FOLD_CASE = 1 << 0;
// Treat pattern as a literal string instead of a regexp.
const LITERAL = 1 << 1;
// Allow character classes like [^a-z] and [[:space:]] to match newline.
const CLASS_NL = 1 << 2;
// Allow '.' to match newline.
const DOT_NL = 1 << 3;
// Treat ^ and $ as only matching at beginning and end of text, not
// around embedded newlines. (Perl's default).
const ONE_LINE = 1 << 4;
// Make repetition operators default to non-greedy.
const NON_GREEDY = 1 << 5;
// Allow Perl extensions:
//   non-capturing parens - (?: )
//   non-greedy operators - *? +? ?? {}?
//   flag edits - (?i) (?-i) (?i: )
//     i - FoldCase
//     m - !OneLine
//     s - DotNL
//     U - NonGreedy
//   line ends: \A \z
//   \Q and \E to disable/enable metacharacters
//   (?P<name>expr) for named captures
// \C (any byte) is not supported.
const PERL_X = 1 << 6;
// Allow \p{Han}, \P{Han} for Unicode group and negation.
const UNICODE_GROUPS = 1 << 7;
// Regexp END_TEXT was $, not \z. Internal use only.
const WAS_DOLLAR = 1 << 8;

//// Flag combinations.

// Both "match newline" flags together.
const MATCH_NL = CLASS_NL | DOT_NL;
// As close to Perl as possible.
const PERL = CLASS_NL | ONE_LINE | PERL_X | UNICODE_GROUPS;
// POSIX syntax.
const POSIX = 0;

//// Anchors
const UNANCHORED = 0;
const ANCHOR_START = 1;
const ANCHOR_BOTH = 2;
|
|
41
|
+
export { UNANCHORED, ANCHOR_BOTH, NON_GREEDY, FOLD_CASE, LITERAL, ONE_LINE, WAS_DOLLAR, DOT_NL, UNICODE_GROUPS, CLASS_NL, PERL, ANCHOR_START, POSIX, MATCH_NL, PERL_X, };
|