@bufbuild/re2 0.0.1-alpha.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +30 -0
- package/dist/cjs/CharClass.d.ts +30 -0
- package/dist/cjs/CharClass.js +284 -0
- package/dist/cjs/CharGroup.d.ts +8 -0
- package/dist/cjs/CharGroup.js +83 -0
- package/dist/cjs/Codepoint.d.ts +3 -0
- package/dist/cjs/Codepoint.js +62 -0
- package/dist/cjs/Compiler.d.ts +40 -0
- package/dist/cjs/Compiler.js +262 -0
- package/dist/cjs/DFA.d.ts +36 -0
- package/dist/cjs/DFA.js +350 -0
- package/dist/cjs/Inst.d.ts +26 -0
- package/dist/cjs/Inst.js +86 -0
- package/dist/cjs/MachineInput.d.ts +17 -0
- package/dist/cjs/MachineInput.js +72 -0
- package/dist/cjs/Parser.d.ts +111 -0
- package/dist/cjs/Parser.js +1538 -0
- package/dist/cjs/Prefilter.d.ts +19 -0
- package/dist/cjs/Prefilter.js +163 -0
- package/dist/cjs/Prog.d.ts +39 -0
- package/dist/cjs/Prog.js +154 -0
- package/dist/cjs/RE2.d.ts +27 -0
- package/dist/cjs/RE2.js +221 -0
- package/dist/cjs/RE2Flags.d.ts +16 -0
- package/dist/cjs/RE2Flags.js +58 -0
- package/dist/cjs/Regexp.d.ts +43 -0
- package/dist/cjs/Regexp.js +98 -0
- package/dist/cjs/Simplify.d.ts +3 -0
- package/dist/cjs/Simplify.js +230 -0
- package/dist/cjs/Unicode.d.ts +17 -0
- package/dist/cjs/Unicode.js +165 -0
- package/dist/cjs/UnicodeRangeTable.d.ts +12 -0
- package/dist/cjs/UnicodeRangeTable.js +31 -0
- package/dist/cjs/UnicodeTables.d.ts +29 -0
- package/dist/cjs/UnicodeTables.js +571 -0
- package/dist/cjs/Utils.d.ts +22 -0
- package/dist/cjs/Utils.js +119 -0
- package/dist/cjs/__fixtures__/find.d.ts +9 -0
- package/dist/cjs/__fixtures__/find.js +115 -0
- package/dist/cjs/chars.d.ts +2 -0
- package/dist/cjs/chars.js +19 -0
- package/dist/cjs/exceptions.d.ts +55 -0
- package/dist/cjs/exceptions.js +94 -0
- package/dist/cjs/index.d.ts +102 -0
- package/dist/cjs/index.js +173 -0
- package/dist/cjs/package.json +1 -0
- package/dist/cjs/testParser.d.ts +3 -0
- package/dist/cjs/testParser.js +143 -0
- package/dist/esm/CharClass.d.ts +30 -0
- package/dist/esm/CharClass.js +281 -0
- package/dist/esm/CharGroup.d.ts +8 -0
- package/dist/esm/CharGroup.js +78 -0
- package/dist/esm/Codepoint.d.ts +3 -0
- package/dist/esm/Codepoint.js +59 -0
- package/dist/esm/Compiler.d.ts +40 -0
- package/dist/esm/Compiler.js +259 -0
- package/dist/esm/DFA.d.ts +36 -0
- package/dist/esm/DFA.js +347 -0
- package/dist/esm/Inst.d.ts +26 -0
- package/dist/esm/Inst.js +83 -0
- package/dist/esm/MachineInput.d.ts +17 -0
- package/dist/esm/MachineInput.js +68 -0
- package/dist/esm/Parser.d.ts +111 -0
- package/dist/esm/Parser.js +1535 -0
- package/dist/esm/Prefilter.d.ts +19 -0
- package/dist/esm/Prefilter.js +159 -0
- package/dist/esm/Prog.d.ts +39 -0
- package/dist/esm/Prog.js +150 -0
- package/dist/esm/RE2.d.ts +27 -0
- package/dist/esm/RE2.js +218 -0
- package/dist/esm/RE2Flags.d.ts +16 -0
- package/dist/esm/RE2Flags.js +41 -0
- package/dist/esm/Regexp.d.ts +43 -0
- package/dist/esm/Regexp.js +94 -0
- package/dist/esm/Simplify.d.ts +3 -0
- package/dist/esm/Simplify.js +228 -0
- package/dist/esm/Unicode.d.ts +17 -0
- package/dist/esm/Unicode.js +150 -0
- package/dist/esm/UnicodeRangeTable.d.ts +12 -0
- package/dist/esm/UnicodeRangeTable.js +28 -0
- package/dist/esm/UnicodeTables.d.ts +29 -0
- package/dist/esm/UnicodeTables.js +568 -0
- package/dist/esm/Utils.d.ts +22 -0
- package/dist/esm/Utils.js +103 -0
- package/dist/esm/__fixtures__/find.d.ts +9 -0
- package/dist/esm/__fixtures__/find.js +112 -0
- package/dist/esm/chars.d.ts +2 -0
- package/dist/esm/chars.js +14 -0
- package/dist/esm/exceptions.d.ts +55 -0
- package/dist/esm/exceptions.js +86 -0
- package/dist/esm/index.d.ts +102 -0
- package/dist/esm/index.js +163 -0
- package/dist/esm/testParser.d.ts +3 -0
- package/dist/esm/testParser.js +138 -0
- package/package.json +49 -0
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.PERL_X = exports.MATCH_NL = exports.POSIX = exports.ANCHOR_START = exports.PERL = exports.CLASS_NL = exports.UNICODE_GROUPS = exports.DOT_NL = exports.WAS_DOLLAR = exports.ONE_LINE = exports.LITERAL = exports.FOLD_CASE = exports.NON_GREEDY = exports.ANCHOR_BOTH = exports.UNANCHORED = void 0;
|
|
4
|
+
//// Parser flags.
// Every parser flag below occupies its own bit, so flags combine with `|`.

// Case-insensitive matching (fold case while matching).
const FOLD_CASE = 1 << 0;
exports.FOLD_CASE = FOLD_CASE;
// The pattern is a literal string rather than a regular expression.
const LITERAL = 1 << 1;
exports.LITERAL = LITERAL;
// Character classes such as [^a-z] and [[:space:]] may match newline.
const CLASS_NL = 1 << 2;
exports.CLASS_NL = CLASS_NL;
// '.' may match newline.
const DOT_NL = 1 << 3;
exports.DOT_NL = DOT_NL;
// ^ and $ match only at the beginning and end of the text, never around
// embedded newlines (this is Perl's default).
const ONE_LINE = 1 << 4;
exports.ONE_LINE = ONE_LINE;
// Repetition operators are non-greedy by default.
const NON_GREEDY = 1 << 5;
exports.NON_GREEDY = NON_GREEDY;
// Enable the Perl extensions:
//   - non-capturing parentheses: (?: )
//   - non-greedy operators: *? +? ?? {}?
//   - flag edits: (?i) (?-i) (?i: ), where
//       i = FoldCase
//       m = !OneLine
//       s = DotNL
//       U = NonGreedy
//   - text anchors: \A \z
//   - \Q and \E to disable/enable metacharacters
//   - (?P<name>expr) for named captures
// \C (any byte) is not supported.
const PERL_X = 1 << 6;
exports.PERL_X = PERL_X;
// Accept \p{Han} and \P{Han} for a Unicode group and its negation.
const UNICODE_GROUPS = 1 << 7;
exports.UNICODE_GROUPS = UNICODE_GROUPS;
// Internal use only: the END_TEXT regexp was written as $, not \z.
const WAS_DOLLAR = 1 << 8;
exports.WAS_DOLLAR = WAS_DOLLAR;
// Both "newline may match" flags combined.
const MATCH_NL = CLASS_NL | DOT_NL;
exports.MATCH_NL = MATCH_NL;
// The flag set that is as close to Perl as possible.
const PERL = CLASS_NL | ONE_LINE | PERL_X | UNICODE_GROUPS;
exports.PERL = PERL;
// POSIX syntax: no flags set.
const POSIX = 0;
exports.POSIX = POSIX;
//// Anchors
const UNANCHORED = 0;
exports.UNANCHORED = UNANCHORED;
const ANCHOR_START = 1;
exports.ANCHOR_START = ANCHOR_START;
const ANCHOR_BOTH = 2;
exports.ANCHOR_BOTH = ANCHOR_BOTH;
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
/**
 * A node of the regular-expression abstract syntax tree. The parser produces
 * these nodes and the compiler consumes them.
 */
export declare class Regexp {
    /** Numeric operator codes stored in {@link Regexp.op}. */
    static Op: {
        readonly NO_MATCH: 0;
        readonly EMPTY_MATCH: 1;
        readonly LITERAL: 2;
        readonly CHAR_CLASS: 3;
        readonly ANY_CHAR_NOT_NL: 4;
        readonly ANY_CHAR: 5;
        readonly BEGIN_LINE: 6;
        readonly END_LINE: 7;
        readonly BEGIN_TEXT: 8;
        readonly END_TEXT: 9;
        readonly WORD_BOUNDARY: 10;
        readonly NO_WORD_BOUNDARY: 11;
        readonly CAPTURE: 12;
        readonly STAR: 13;
        readonly PLUS: 14;
        readonly QUEST: 15;
        readonly REPEAT: 16;
        readonly CONCAT: 17;
        readonly ALTERNATE: 18;
        readonly LEFT_PAREN: 19;
        readonly VERTICAL_BAR: 20;
    };
    /** Reports whether `op` is a pseudo-operator, i.e. `op >= Op.LEFT_PAREN`. */
    static isPseudoOp(op: number): boolean;
    /** Returns a new, empty list of subexpressions. */
    static emptySubs(): Regexp[];
    /**
     * Creates a new node carrying the same field values as `re`. The copy is
     * shallow: `subs`, `runes` and `namedGroups` are shared with `re`.
     */
    static fromRegexp(re: Regexp): Regexp;
    /** Operator code; one of the values of {@link Regexp.Op}. */
    op: number;
    /** Bitmap of parse flags. */
    flags: number;
    /** Subexpressions, if any. Never null. */
    subs: Regexp[];
    /** Matched runes, for LITERAL and CHAR_CLASS nodes. */
    runes: number[];
    /** Minimum repetition count, for REPEAT nodes. */
    min: number;
    /** Maximum repetition count, for REPEAT nodes. */
    max: number;
    /** Capturing index, for CAPTURE nodes. */
    cap: number;
    /** Capturing name, for CAPTURE nodes; `null` when there is none. */
    name: string | null;
    /** Map from a group name to a number. */
    namedGroups: Map<string, number>;
    /** Creates a node with operator `op` and every other field at its default. */
    constructor(op: number);
    /** Resets every field except `op` to its default value. */
    reinit(): void;
    /** Walks this node and its subexpressions for the largest capture index. */
    maxCap(): number;
}
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.Regexp = void 0;
|
|
4
|
+
/**
 * A node of the regular-expression abstract syntax tree. The parser produces
 * these nodes and the compiler consumes them.
 */
class Regexp {
    // Operator codes. isPseudoOp() treats LEFT_PAREN and everything numbered
    // above it as a pseudo-operator.
    static Op = {
        NO_MATCH: 0,
        EMPTY_MATCH: 1,
        LITERAL: 2,
        CHAR_CLASS: 3,
        ANY_CHAR_NOT_NL: 4,
        ANY_CHAR: 5,
        BEGIN_LINE: 6,
        END_LINE: 7,
        BEGIN_TEXT: 8,
        END_TEXT: 9,
        WORD_BOUNDARY: 10,
        NO_WORD_BOUNDARY: 11,
        CAPTURE: 12,
        STAR: 13,
        PLUS: 14,
        QUEST: 15,
        REPEAT: 16,
        CONCAT: 17,
        ALTERNATE: 18,
        LEFT_PAREN: 19,
        VERTICAL_BAR: 20,
    };
    // Reports whether op is a pseudo-operator (LEFT_PAREN or above).
    static isPseudoOp(op) {
        return op >= Regexp.Op.LEFT_PAREN;
    }
    // Returns a fresh, empty list of subexpressions.
    static emptySubs() {
        return [];
    }
    // Builds a new node with the same field values as re. The copy is shallow:
    // subs, runes and namedGroups are the very same objects re holds.
    static fromRegexp(re) {
        const { flags, subs, runes, cap, min, max, name, namedGroups } = re;
        const copy = new Regexp(re.op);
        return Object.assign(copy, { flags, subs, runes, cap, min, max, name, namedGroups });
    }
    op;
    flags;
    subs;
    runes;
    min;
    max;
    cap;
    name;
    namedGroups;
    constructor(op) {
        // Operator code.
        this.op = op;
        // Bitmap of parse flags.
        this.flags = 0;
        // Subexpressions, if any. Never null.
        // subs[0] is used as the freelist.
        this.subs = Regexp.emptySubs();
        // Matched runes, for LITERAL and CHAR_CLASS.
        this.runes = [];
        // Repetition bounds, for REPEAT.
        this.min = 0;
        this.max = 0;
        // Capturing index and name, for CAPTURE.
        this.cap = 0;
        this.name = null;
        this.namedGroups = new Map();
    }
    // Resets every field except op back to its freshly-constructed value.
    reinit() {
        Object.assign(this, {
            flags: 0,
            subs: Regexp.emptySubs(),
            runes: [],
            cap: 0,
            min: 0,
            max: 0,
            name: null,
            namedGroups: new Map(),
        });
    }
    // maxCap() walks the regexp to find the maximum capture index.
    maxCap() {
        let highest = this.op === Regexp.Op.CAPTURE ? this.cap : 0;
        if (this.subs === null) {
            return highest;
        }
        for (const child of this.subs) {
            const childMax = child.maxCap();
            if (childMax > highest) {
                highest = childMax;
            }
        }
        return highest;
    }
}
|
|
98
|
+
exports.Regexp = Regexp;
|
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.simplify = simplify;
|
|
4
|
+
const RE2Flags_js_1 = require("./RE2Flags.js");
|
|
5
|
+
const Regexp_js_1 = require("./Regexp.js");
|
|
6
|
+
const Unicode_js_1 = require("./Unicode.js");
|
|
7
|
+
// simplify produces a regexp that matches the same strings as re but
// contains no counted repetitions, and applies a handful of other
// simplifications along the way (for example /(?:a+)+/ becomes /a+/).
// The result executes correctly, but printing it and re-parsing will not
// give back the same parse tree because capturing parentheses may have
// been duplicated or removed: /(x){1,2}/ simplifies to /(x)(x)?/ with both
// groups capturing as $1. The result may be re itself or share structure
// with it.
function simplify(re) {
    const { Regexp } = Regexp_js_1;
    const { Op } = Regexp;
    switch (re.op) {
        case Op.CAPTURE: {
            const inner = simplify(re.subs[0]);
            if (inner === re.subs[0]) {
                return re;
            }
            const copy = Regexp.fromRegexp(re);
            copy.runes = [];
            copy.subs = [inner];
            return copy;
        }
        case Op.CONCAT:
        case Op.ALTERNATE: {
            const isConcat = re.op === Op.CONCAT;
            // The child that contributes nothing to this kind of node: an
            // empty match inside a sequence, an impossible branch inside an
            // alternation.
            const neutralOp = isConcat ? Op.EMPTY_MATCH : Op.NO_MATCH;
            const kept = [];
            let rewritten = false;
            for (const child of re.subs) {
                const simple = simplify(child);
                if (simple !== child) {
                    rewritten = true;
                }
                // One impossible element makes the whole sequence impossible.
                if (isConcat && simple.op === Op.NO_MATCH) {
                    return new Regexp(Op.NO_MATCH);
                }
                if (simple.op === neutralOp) {
                    rewritten = true;
                    continue;
                }
                // Flatten a nested node of the same kind into this one.
                if (simple.op === re.op) {
                    rewritten = true;
                    kept.push(...simple.subs);
                    continue;
                }
                kept.push(simple);
            }
            if (!rewritten) {
                return re;
            }
            // Everything was filtered out: an empty sequence matches the
            // empty string, an empty alternation matches nothing.
            if (kept.length === 0) {
                return new Regexp(isConcat ? Op.EMPTY_MATCH : Op.NO_MATCH);
            }
            // A single survivor needs no CONCAT/ALTERNATE wrapper.
            if (kept.length === 1) {
                return kept[0];
            }
            const copy = Regexp.fromRegexp(re);
            copy.runes = [];
            copy.subs = kept;
            return copy;
        }
        case Op.CHAR_CLASS: {
            const { runes } = re;
            if (runes === null) {
                return re;
            }
            // An empty class matches nothing.
            if (runes.length === 0) {
                return new Regexp(Op.NO_MATCH);
            }
            const maxRune = Unicode_js_1.MAX_RUNE;
            // A class covering [0, MAX_RUNE] matches any character.
            if (runes.length === 2 && runes[0] === 0 && runes[1] === maxRune) {
                return new Regexp(Op.ANY_CHAR);
            }
            // A class covering everything except newline.
            const newline = 0x0a;
            if (runes.length === 4 &&
                runes[0] === 0 &&
                runes[1] === newline - 1 &&
                runes[2] === newline + 1 &&
                runes[3] === maxRune) {
                return new Regexp(Op.ANY_CHAR_NOT_NL);
            }
            return re;
        }
        case Op.STAR:
        case Op.PLUS:
        case Op.QUEST: {
            const inner = simplify(re.subs[0]);
            return simplify1(re.op, re.flags, inner, re);
        }
        case Op.REPEAT: {
            const { min, max, flags } = re;
            // x{0} matches the empty string; x itself need not be examined.
            if (min === 0 && max === 0) {
                return new Regexp(Op.EMPTY_MATCH);
            }
            const body = simplify(re.subs[0]);
            // x{n,}: at least n matches of x.
            if (max === -1) {
                // x{0,} is x*.
                if (min === 0) {
                    return simplify1(Op.STAR, flags, body, null);
                }
                // x{1,} is x+.
                if (min === 1) {
                    return simplify1(Op.PLUS, flags, body, null);
                }
                // In general x{4,} is xxxx+: min-1 plain copies, then x+.
                const parts = [];
                for (let k = 1; k < min; k++) {
                    parts.push(body);
                }
                parts.push(simplify1(Op.PLUS, flags, body, null));
                const seq = new Regexp(Op.CONCAT);
                seq.subs = parts;
                // Run the fresh CONCAT through simplify so it gets flattened.
                return simplify(seq);
            }
            // x{1} is just x (x{0} was handled above).
            if (min === 1 && max === 1) {
                return body;
            }
            // General case: x{n,m} is n copies of x followed by m-n optional
            // copies. Nesting the optional copies gives the machine less
            // work, so x{2,5} = xx(x(x(x)?)?)?
            // Mandatory prefix: xx.
            let lead = null;
            if (min > 0) {
                lead = [];
                for (let k = 0; k < min; k++) {
                    lead.push(body);
                }
            }
            // Optional suffix: (x(x(x)?)?)?
            if (max > min) {
                let tail = simplify1(Op.QUEST, flags, body, null);
                for (let k = min + 1; k < max; k++) {
                    const pair = new Regexp(Op.CONCAT);
                    pair.subs = [body, tail];
                    tail = simplify1(Op.QUEST, flags, pair, null);
                }
                if (lead === null) {
                    return tail;
                }
                lead.push(tail);
            }
            if (lead !== null) {
                const seq = new Regexp(Op.CONCAT);
                seq.subs = lead;
                // Run the fresh CONCAT through simplify so it gets flattened.
                return simplify(seq);
            }
            // A degenerate case such as min > max or min < max < 0:
            // treat it as an impossible match.
            return new Regexp(Op.NO_MATCH);
        }
    }
    return re;
}
|
|
188
|
+
// simplify1 is the part of simplify that deals with the unary STAR, PLUS
// and QUEST operators. Given that sub is already simple, it returns the
// simple regexp equivalent to
//
//     Regexp{Op: op, Flags: flags, Sub: {sub}}
//
// without allocating that structure first. When the answer turns out to be
// equivalent to re, re itself is returned; pass null for re when there is
// no existing node to reuse.
//
// It is a separate function because simplify generates such unary
// expressions for other operators too; routing them through simplify1
// keeps what they generate simple.
function simplify1(op, flags, sub, re) {
    const { Regexp } = Regexp_js_1;
    const { Op } = Regexp;
    const nonGreedy = RE2Flags_js_1.NON_GREEDY;
    // Repeating the empty string any number of times is still the empty
    // string.
    if (sub.op === Op.EMPTY_MATCH) {
        return sub;
    }
    // An impossible operand: one-or-more repetitions stay impossible,
    // whereas zero-or-more / zero-or-one collapse to the empty match.
    if (sub.op === Op.NO_MATCH) {
        return op === Op.PLUS ? sub : new Regexp(Op.EMPTY_MATCH);
    }
    const wantedGreediness = flags & nonGreedy;
    // Applying the same operator twice with matching greediness is
    // idempotent.
    if (op === sub.op && wantedGreediness === (sub.flags & nonGreedy)) {
        return sub;
    }
    // Reuse the caller's node when it already is exactly this expression.
    const reusable = re !== null &&
        re.op === op &&
        (re.flags & nonGreedy) === wantedGreediness &&
        sub === re.subs[0];
    if (reusable) {
        return re;
    }
    const wrapped = new Regexp(op);
    wrapped.flags = flags;
    wrapped.subs = [sub];
    return wrapped;
}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
 * Helpers that paper over the gaps in JavaScript's native Unicode handling.
 */
/** The highest legal rune value. */
export declare const MAX_RUNE = 0x10ffff;
/** The highest legal ASCII value. */
export declare const MAX_ASCII = 0x7f;
/** The highest legal Basic Multilingual Plane (BMP) value. */
export declare const MAX_BMP = 0xffff;
/** The smallest rune involved in case folding. */
export declare const MIN_FOLD = 0x41;
/** The largest rune involved in case folding. */
export declare const MAX_FOLD = 0x1e943;
export declare const MIN_HIGH_SURROGATE = 0xd800;
export declare const MAX_HIGH_SURROGATE = 0xdbff;
export declare const MIN_LOW_SURROGATE = 0xdc00;
export declare const MAX_LOW_SURROGATE = 0xdfff;
export declare const MIN_SUPPLEMENTARY_CODE_POINT = 0x10000;
/** Reports whether the rune `r` is an upper case letter. */
export declare function isUpper(r: number): boolean;
/**
 * Steps to the next code point that is equivalent to `r` under Unicode simple
 * case folding, e.g. 'A' -> 'a' and 'a' -> 'A'.
 */
export declare function simpleFold(r: number): number;
/**
 * Case-insensitive equality of two runes. A negative rune is the end-of-file
 * mark and never matches.
 */
export declare function equalsIgnoreCase(r1: number, r2: number): boolean;
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.MAX_BMP = exports.MAX_HIGH_SURROGATE = exports.MAX_LOW_SURROGATE = exports.MIN_HIGH_SURROGATE = exports.MIN_LOW_SURROGATE = exports.MIN_SUPPLEMENTARY_CODE_POINT = exports.MAX_ASCII = exports.MAX_FOLD = exports.MIN_FOLD = exports.MAX_RUNE = void 0;
|
|
4
|
+
exports.simpleFold = simpleFold;
|
|
5
|
+
exports.equalsIgnoreCase = equalsIgnoreCase;
|
|
6
|
+
exports.isUpper = isUpper;
|
|
7
|
+
const UnicodeTables_js_1 = require("./UnicodeTables.js");
|
|
8
|
+
/**
 * Helpers that paper over the gaps in JavaScript's native Unicode handling.
 */
// Upper bounds of the code point ranges this module distinguishes.
// Largest legal ASCII value.
const MAX_ASCII = 0x7f;
exports.MAX_ASCII = MAX_ASCII;
// Largest legal Latin-1 value (module-private).
const MAX_LATIN1 = 0xff;
// Largest legal Basic Multilingual Plane (BMP) value.
const MAX_BMP = 0xffff;
exports.MAX_BMP = MAX_BMP;
// Largest legal rune value.
const MAX_RUNE = 0x10ffff;
exports.MAX_RUNE = MAX_RUNE;
// Smallest and largest runes involved in case folding.
// Checked during test.
const MIN_FOLD = 0x0041;
exports.MIN_FOLD = MIN_FOLD;
const MAX_FOLD = 0x1e943;
exports.MAX_FOLD = MAX_FOLD;
// Bounds of the high-surrogate and low-surrogate ranges.
const MIN_HIGH_SURROGATE = 0xd800;
exports.MIN_HIGH_SURROGATE = MIN_HIGH_SURROGATE;
const MAX_HIGH_SURROGATE = 0xdbff;
exports.MAX_HIGH_SURROGATE = MAX_HIGH_SURROGATE;
const MIN_LOW_SURROGATE = 0xdc00;
exports.MIN_LOW_SURROGATE = MIN_LOW_SURROGATE;
const MAX_LOW_SURROGATE = 0xdfff;
exports.MAX_LOW_SURROGATE = MAX_LOW_SURROGATE;
// First code point beyond the BMP.
const MIN_SUPPLEMENTARY_CODE_POINT = 0x10000;
exports.MIN_SUPPLEMENTARY_CODE_POINT = MIN_SUPPLEMENTARY_CODE_POINT;
|
|
38
|
+
// is32 uses binary search to test whether rune is in the specified
|
|
39
|
+
// slice of 32-bit ranges.
|
|
40
|
+
function is32(ranges, r) {
|
|
41
|
+
// binary search over ranges
|
|
42
|
+
let lo = 0;
|
|
43
|
+
let hi = ranges.length;
|
|
44
|
+
while (lo < hi) {
|
|
45
|
+
const m = lo + Math.floor((hi - lo) / 2);
|
|
46
|
+
const rlo = ranges.getLo(m);
|
|
47
|
+
const rhi = ranges.getHi(m);
|
|
48
|
+
if (rlo <= r && r <= rhi) {
|
|
49
|
+
const stride = ranges.getStride(m);
|
|
50
|
+
return (r - rlo) % stride === 0;
|
|
51
|
+
}
|
|
52
|
+
if (r < rlo) {
|
|
53
|
+
hi = m;
|
|
54
|
+
}
|
|
55
|
+
else {
|
|
56
|
+
lo = m + 1;
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
return false;
|
|
60
|
+
}
|
|
61
|
+
// is reports whether rune r belongs to the given range table.
function is(ranges, r) {
    if (r <= MAX_LATIN1) {
        // Latin-1 fast path: scan linearly for the first entry whose upper
        // bound is not below r; only that entry can contain r.
        const count = ranges.length;
        let idx = 0;
        while (idx < count && r > ranges.getHi(idx)) {
            idx++;
        }
        if (idx >= count) {
            return false;
        }
        const first = ranges.getLo(idx);
        if (r < first) {
            return false;
        }
        return (r - first) % ranges.getStride(idx) === 0;
    }
    // Anything beyond Latin-1 goes through the binary search, after ruling
    // out an empty table and runes below the first entry.
    if (!(ranges.length > 0)) {
        return false;
    }
    return r >= ranges.getLo(0) && is32(ranges, r);
}
|
|
82
|
+
// isUpper reports whether rune r is an upper case letter.
function isUpper(r) {
    if (r <= MAX_LATIN1) {
        // Latin-1: the rune is upper case when upper-casing leaves it
        // unchanged while lower-casing changes it.
        const ch = String.fromCodePoint(r);
        if (ch.toUpperCase() !== ch) {
            return false;
        }
        return ch.toLowerCase() !== ch;
    }
    // Everything else is looked up in the Upper range table.
    return is(UnicodeTables_js_1.UnicodeTables.Upper, r);
}
|
|
90
|
+
// simpleFold iterates over Unicode code points equivalent under
// the Unicode-defined simple case folding. Among the code points
// equivalent to rune (including rune itself), SimpleFold returns the
// smallest r >= rune if one exists, or else the smallest r >= 0.
//
// For example:
//      SimpleFold('A') = 'a'
//      SimpleFold('a') = 'A'
//
//      SimpleFold('K') = 'k'
//      SimpleFold('k') = '\u212A' (Kelvin symbol, K)
//      SimpleFold('\u212A') = 'K'
//
//      SimpleFold('1') = '1'
//
// A rune outside [0, MAX_RUNE] (including the -1 end-of-file mark) has no
// case equivalents and is returned unchanged, as Go's unicode.SimpleFold
// does. Without this guard such a value would reach String.fromCodePoint,
// which throws a RangeError.
//
// Derived from Go's unicode.SimpleFold.
//
function simpleFold(r) {
    // Written as a negated range test so that NaN is rejected as well.
    if (!(r >= 0 && r <= MAX_RUNE)) {
        return r;
    }
    // Consult caseOrbit table for special cases (3+ element cycles, lossy
    // mappings like ſ→S, and Turkic-specific self-loops).
    const caseOrbit = UnicodeTables_js_1.UnicodeTables.CASE_ORBIT;
    const folded = caseOrbit.get(r);
    if (folded !== undefined) {
        return folded;
    }
    // Fallback for 2-element orbits: use raw native case conversion.
    // The length check rejects multi-char results (e.g., ß→SS) which
    // would otherwise be truncated to a non-equivalent codepoint.
    const s = String.fromCodePoint(r);
    const lower = s.toLowerCase();
    if (lower.length === s.length) {
        const lowerCp = lower.codePointAt(0);
        if (lowerCp !== undefined && lowerCp !== r) {
            return lowerCp;
        }
    }
    const upper = s.toUpperCase();
    if (upper.length === s.length) {
        const upperCp = upper.codePointAt(0);
        if (upperCp !== undefined && upperCp !== r) {
            return upperCp;
        }
    }
    // No distinct case equivalent: r folds to itself.
    return r;
}
|
|
133
|
+
// equalsIgnoreCase performs case-insensitive equality comparison
// on the given runes |r1| and |r2|, with special consideration
// for the likely scenario where both runes are ASCII characters.
// If non-ASCII, Unicode case folding will be performed on |r1|
// to compare it to |r2|.
// -1 is interpreted as the end-of-file mark and never matches.
function equalsIgnoreCase(r1, r2) {
    if (r1 < 0 || r2 < 0) {
        return false;
    }
    if (r1 === r2) {
        return true;
    }
    // Fast path for the common case where both runes are ASCII characters.
    // Coerces both runes to lowercase if applicable.
    if (r1 <= MAX_ASCII && r2 <= MAX_ASCII) {
        if (0x41 <= r1 && r1 <= 0x5a) {
            r1 |= 0x20;
        }
        if (0x41 <= r2 && r2 <= 0x5a) {
            r2 |= 0x20;
        }
        return r1 === r2;
    }
    // Fall back to full Unicode case folding otherwise: walk the fold orbit
    // of r1 looking for r2.
    // Invariant: r1 must be non-negative
    //
    // The walk normally ends when simpleFold cycles back to r1. That relies
    // on the fold data forming a closed cycle through r1; if it ever does
    // not, the walk would never terminate. Two guards make termination
    // unconditional without changing the result for well-formed orbits:
    //   - stop at a fixed point (simpleFold(r) === r), and
    //   - cap the number of steps.
    // NOTE(review): the cap assumes fold orbits are only a handful of code
    // points long — confirm against the CASE_ORBIT table.
    const maxOrbitSteps = 8;
    let r = simpleFold(r1);
    for (let steps = 0; r !== r1 && steps < maxOrbitSteps; steps++) {
        if (r === r2) {
            return true;
        }
        const next = simpleFold(r);
        if (next === r) {
            return false;
        }
        r = next;
    }
    return false;
}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
/**
 * A table of code point ranges stored flat in a `Uint32Array`, `SIZE`
 * numbers per entry.
 */
export declare class UnicodeRangeTable {
    /** Flat backing storage holding every entry back to back. */
    data: Uint32Array;
    /** When true, no stride is stored and every entry has stride 1. */
    isStride1: boolean;
    /** Numbers stored per entry: 2 when `isStride1` is true, otherwise 3. */
    SIZE: number;
    /**
     * @param data flat backing storage for the entries
     * @param isStride1 whether entries omit the stride; defaults to false
     */
    constructor(data: Uint32Array, isStride1?: boolean);
    /** First stored number of entry `index` (its low bound). */
    getLo(index: number): number;
    /** Second stored number of entry `index` (its high bound). */
    getHi(index: number): number;
    /** Stride of entry `index`; always 1 when `isStride1` is true. */
    getStride(index: number): number;
    /** Entry `index` as a freshly allocated `[lo, hi, stride]` array. */
    get(index: number): number[];
    /** Number of entries, computed as `data.length / SIZE`. */
    get length(): number;
}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.UnicodeRangeTable = void 0;
|
|
4
|
+
// A table of code point ranges stored flat in a Uint32Array, SIZE numbers
// per entry: low bound, high bound and, unless isStride1 is set, a stride.
class UnicodeRangeTable {
    data;
    isStride1;
    SIZE;
    constructor(data, isStride1 = false) {
        // data is a Uint32Array holding every entry back to back.
        this.data = data;
        this.isStride1 = isStride1;
        // Stride-1 tables omit the stride, so their entries are two numbers
        // wide instead of three.
        this.SIZE = isStride1 ? 2 : 3;
    }
    // The accessors below read straight from the backing array and do NOT
    // allocate memory.
    getLo(index) {
        const base = index * this.SIZE;
        return this.data[base];
    }
    getHi(index) {
        const base = index * this.SIZE;
        return this.data[base + 1];
    }
    getStride(index) {
        if (this.isStride1) {
            return 1;
        }
        return this.data[index * this.SIZE + 2];
    }
    // Returns entry `index` as a freshly allocated [lo, hi, stride] array.
    get(index) {
        const base = index * this.SIZE;
        const lo = this.data[base];
        const hi = this.data[base + 1];
        return [lo, hi, this.getStride(index)];
    }
    // Number of entries in the table.
    get length() {
        const { data, SIZE } = this;
        return data.length / SIZE;
    }
}
|
|
31
|
+
exports.UnicodeRangeTable = UnicodeRangeTable;
|