@isopodlabs/utilities 1.5.5 → 1.5.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/algorithm.d.ts +2 -1
- package/dist/algorithm.js +5 -1
- package/dist/array.d.ts +6 -0
- package/dist/array.js +57 -0
- package/dist/async.d.ts +4 -0
- package/dist/async.js +65 -0
- package/dist/bits.d.ts +82 -0
- package/dist/bits.js +605 -0
- package/dist/glob.d.ts +4 -0
- package/dist/glob.js +112 -0
- package/dist/index.d.ts +6 -0
- package/dist/index.js +17 -4
- package/dist/insensitive.d.ts +1 -1
- package/dist/insensitive.js +2 -10
- package/dist/iterator.d.ts +1 -12
- package/dist/iterator.js +37 -94
- package/dist/object.d.ts +5 -2
- package/dist/object.js +18 -5
- package/dist/regex.d.ts +103 -0
- package/dist/regex.js +1044 -0
- package/dist/regexp.d.ts +90 -0
- package/dist/regexp.js +659 -0
- package/dist/string.d.ts +0 -4
- package/dist/string.js +0 -107
- package/package.json +2 -3
package/dist/regex.js
ADDED
|
@@ -0,0 +1,1044 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.endAnchor = exports.startAnchor = exports.nonWordBoundary = exports.wordBoundary = exports.octal = exports.hex = exports.whitespace = exports.word = exports.alnum = exports.alpha = exports.upper = exports.lower = exports.digit = exports.any = exports.characterClass = void 0;
|
|
4
|
+
exports.range = range;
|
|
5
|
+
exports.chars = chars;
|
|
6
|
+
exports.union = union;
|
|
7
|
+
exports.text = text;
|
|
8
|
+
exports.concatenation = concatenation;
|
|
9
|
+
exports.alternation = alternation;
|
|
10
|
+
exports.noncapture = noncapture;
|
|
11
|
+
exports.lookAhead = lookAhead;
|
|
12
|
+
exports.negLookAhead = negLookAhead;
|
|
13
|
+
exports.lookBehind = lookBehind;
|
|
14
|
+
exports.negLookBehind = negLookBehind;
|
|
15
|
+
exports.capture = capture;
|
|
16
|
+
exports.repeatFrom = repeatFrom;
|
|
17
|
+
exports.repeat = repeat;
|
|
18
|
+
exports.zeroOrMore = zeroOrMore;
|
|
19
|
+
exports.oneOrMore = oneOrMore;
|
|
20
|
+
exports.optional = optional;
|
|
21
|
+
exports.boundary = boundary;
|
|
22
|
+
exports.reference = reference;
|
|
23
|
+
exports.anchored = anchored;
|
|
24
|
+
exports.parse = parse;
|
|
25
|
+
exports.toRegExpString = toRegExpString;
|
|
26
|
+
exports.toRegExp = toRegExp;
|
|
27
|
+
exports.optimize = optimize;
|
|
28
|
+
exports.runDFA = runDFA;
|
|
29
|
+
exports.regexToDFA = regexToDFA;
|
|
30
|
+
exports.parseGlob = parseGlob;
|
|
31
|
+
exports.anchoredRe = anchoredRe;
|
|
32
|
+
exports.globToRe = globToRe;
|
|
33
|
+
exports.globToReMulti = globToReMulti;
|
|
34
|
+
const bits_1 = require("./bits");
|
|
35
|
+
/*
|
|
36
|
+
Characters
|
|
37
|
+
[xyz],[a-c] Character class
|
|
38
|
+
[^xyz],[^a-c] Negated character class
|
|
39
|
+
. Wildcard: Matches any single character except line terminators: \n, \r, \u2028 or \u2029
|
|
40
|
+
\d Digit character class escape: Matches any digit (Arabic numeral). Equivalent to [0-9]
|
|
41
|
+
\D Non-digit character class escape: Matches any character that is not a digit (Arabic numeral)
|
|
42
|
+
\w Word character class escape: Matches any alphanumeric character from the basic Latin alphabet, including the underscore. Equivalent to [A-Za-z0-9_]
|
|
43
|
+
\W Non-word character class escape: Matches any character that is not a word character from the basic Latin alphabet. Equivalent to [^A-Za-z0-9_]
|
|
44
|
+
\s White space character class escape: Matches a single white space character, including space, tab, form feed, line feed, and other Unicode spaces. Equivalent to [\f\n\r\t\v\u0020\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff]
|
|
45
|
+
\S Non-white space character class escape
|
|
46
|
+
\t Matches a horizontal tab.
|
|
47
|
+
\r Matches a carriage return.
|
|
48
|
+
\n Matches a linefeed.
|
|
49
|
+
\v Matches a vertical tab.
|
|
50
|
+
\f Matches a form-feed.
|
|
51
|
+
[\b] Matches a backspace.
|
|
52
|
+
\0 Matches a NUL character.
|
|
53
|
+
\cX Matches a control character using caret notation, where "X" is a letter from A–Z
|
|
54
|
+
\xhh Matches the character with the code hh (two hexadecimal digits).
|
|
55
|
+
\uhhhh Matches a UTF-16 code-unit with the value hhhh (four hexadecimal digits).
|
|
56
|
+
\u{hhhh} or \u{hhhhh} (Only when the u flag is set.) Matches the character with the Unicode value U+hhhh or U+hhhhh (hexadecimal digits).
|
|
57
|
+
\p{UnicodeProperty}, \P{UnicodeProperty} Unicode character class escape: Matches a character based on its Unicode character properties
|
|
58
|
+
\ Indicates that the following character should be treated specially, or "escaped"
|
|
59
|
+
|
|
60
|
+
x|y Alternation: Matches either "x" or "y"
|
|
61
|
+
|
|
62
|
+
Boundary-type assertions
|
|
63
|
+
^ Input boundary beginning assertion
|
|
64
|
+
$ Input boundary end assertion
|
|
65
|
+
\b Word boundary assertion
|
|
66
|
+
\B Non-word-boundary assertion
|
|
67
|
+
|
|
68
|
+
Other Assertions
|
|
69
|
+
x(?=y) Lookahead assertion
|
|
70
|
+
x(?!y) Negative lookahead assertion
|
|
71
|
+
(?<=y)x Lookbehind assertion
|
|
72
|
+
(?<!y)x Negative lookbehind assertion
|
|
73
|
+
|
|
74
|
+
Groups and backreferences
|
|
75
|
+
(x) Capturing group
|
|
76
|
+
(?<Name>x) Named capturing group
|
|
77
|
+
(?:x) Non-capturing group
|
|
78
|
+
(?flags:x), (?:flags-flags:x) Modifier (flags can be i, m, s)
|
|
79
|
+
\<int> Backreference
|
|
80
|
+
\k<Name> Named backreference
|
|
81
|
+
|
|
82
|
+
Quantifiers
|
|
83
|
+
x* Matches the preceding item "x" 0 or more times
|
|
84
|
+
x+ Matches the preceding item "x" 1 or more times. Equivalent to {1,}
|
|
85
|
+
x? Matches the preceding item "x" 0 or 1 times. For example, /e?le?/ matches the "el" in "angel" and the "le" in "angle."
|
|
86
|
+
x{<int>} Matches exactly "n" occurrences of the preceding item "x"
|
|
87
|
+
x{<int>,} Matches at least "n" occurrences of the preceding item "x"
|
|
88
|
+
x{n,m} Matches at least "n" and at most "m" occurrences of the preceding item "x"
|
|
89
|
+
|
|
90
|
+
Non greedy quantifiers
|
|
91
|
+
x*?
|
|
92
|
+
x+?
|
|
93
|
+
x??
|
|
94
|
+
x{n}?
|
|
95
|
+
x{n,}?
|
|
96
|
+
x{n,m}?
|
|
97
|
+
|
|
98
|
+
*/
|
|
99
|
+
/**
 * Unicode-aware bodies for the POSIX named character classes.
 *
 * Each value is the text that goes between `[` and `]` of a unicode-mode
 * regular expression; a leading `^` negates the whole body (see `graph`).
 * NOTE(review): the consumer of this table is not visible here - confirm it
 * places the value at the very start of the bracket expression.
 */
const posixClasses = {
    alnum: '\\p{L}\\p{Nl}\\p{Nd}',
    alpha: '\\p{L}\\p{Nl}',
    ascii: '\\x00-\\x7f',
    blank: '\\p{Zs}\\t',
    cntrl: '\\p{Cc}',
    digit: '\\p{Nd}',
    graph: '^\\p{Z}\\p{C}',
    lower: '\\p{Ll}',
    // printable = everything that is NOT in the "Other" (control, format,
    // unassigned, ...) category; the un-negated form selected exactly the
    // non-printable characters.
    print: '^\\p{C}',
    punct: '\\p{P}',
    space: '\\p{Z}\\t\\r\\n\\v\\f',
    upper: '\\p{Lu}',
    word: '\\p{L}\\p{Nl}\\p{Nd}\\p{Pc}',
    xdigit: 'A-Fa-f0-9',
};
|
|
115
|
+
/**
 * A set of character codes used as the 'class' node of the regex AST.
 * Storage and set operations come from the SparseBits base class; this class
 * only adds the node tag and regex-oriented helpers.
 */
class characterClass extends bits_1.SparseBits {
    type = 'class';
    /** Returns whether the first UTF-16 code unit of `char` is a member. */
    test(char) {
        return this.has(char.charCodeAt(0));
    }
    /** Returns a MutablecharacterClass built by intersecting a full set with this one. */
    mutable() {
        return new MutablecharacterClass(true).selfIntersect(this);
    }
    /** True when the inherited `undef` value is truthy. */
    isNegated() {
        return !!this.undef;
    }
    /**
     * Serializes the class as the body of a bracket expression (without the
     * surrounding `[` `]`), prefixed with `^` when negated. Runs of
     * consecutive codes inside one 32-bit word are written as `a-z` ranges.
     */
    toString() {
        // `^` must be escaped as well: an unescaped `^` that ends up first in
        // the body would turn the class into a negation. fromCodePoint keeps
        // codes above 0xFFFF intact (fromCharCode would truncate them).
        const escape = code => String.fromCodePoint(code).replace(/[-\\\]^]/g, '\\$&');
        let s = this.undef ? '^' : '';
        for (const i in this.bits) {
            const b = this.bits[i] ^ this.undef;
            const c0 = +i * 32;
            for (let j = 0; j < 32; j++) {
                if (b & (1 << j)) {
                    const c1 = c0 + j;
                    // advance to the end of this run of set bits
                    while (j < 32 && (b & (1 << j)))
                        j++;
                    const c2 = c0 + j - 1;
                    s += escape(c1);
                    if (c1 !== c2)
                        s += '-' + escape(c2);
                }
            }
        }
        return s;
    }
}
|
|
149
|
+
exports.characterClass = characterClass;
|
|
150
|
+
;
|
|
151
|
+
/**
 * A characterClass that can be modified in place, one character or one
 * string at a time. All mutators return `this` so calls can be chained.
 */
class MutablecharacterClass extends characterClass {
    /** Adds the first UTF-16 code unit of `char`; returns `this` like the other mutators. */
    setChar(char) {
        this.set(char.charCodeAt(0));
        return this;
    }
    /** Adds every UTF-16 code unit of `c`. */
    setString(c) {
        for (let i = 0; i < c.length; i++)
            this.set(c.charCodeAt(i));
        return this;
    }
    /** Removes every UTF-16 code unit of `c`. */
    clearString(c) {
        for (let i = 0; i < c.length; i++)
            this.clear(c.charCodeAt(i));
        return this;
    }
}
|
|
166
|
+
// characterClass helpers
|
|
167
|
+
/**
 * Builds a class holding every code from the first code unit of `from` up to
 * and including the first code unit of `to`.
 */
function range(from, to) {
    const cls = new MutablecharacterClass(false);
    const first = from.charCodeAt(0);
    const pastLast = to.charCodeAt(0) + 1; // setRange is given an exclusive end
    return cls.setRange(first, pastLast);
}
|
|
170
|
+
/** Builds a class holding each UTF-16 code unit of the given string. */
function chars(chars) {
    const cls = new MutablecharacterClass(false);
    return cls.setString(chars);
}
|
|
173
|
+
/** Returns a new mutable class that is the union of all the given classes. */
function union(...classes) {
    return classes.reduce((acc, cls) => {
        acc.selfUnion(cls);
        return acc;
    }, new MutablecharacterClass(false));
}
|
|
179
|
+
// Common character class constants and ranges
|
|
180
|
+
// Ready-made character classes. Built inside a block so the temporaries do
// not leak into module scope.
{
    // every character; the step that would clear line terminators is disabled in the original
    exports.any = new characterClass(true);
    const digits = range('0', '9');
    exports.digit = digits;
    const lowerCase = range('a', 'z');
    exports.lower = lowerCase;
    const upperCase = range('A', 'Z');
    exports.upper = upperCase;
    const letters = lowerCase.union(upperCase);
    exports.alpha = letters;
    const lettersAndDigits = letters.union(digits);
    exports.alnum = lettersAndDigits;
    exports.word = lettersAndDigits.union(chars('_'));
    exports.whitespace = chars(' \t\r\n\f\v');
    exports.hex = digits.union(chars('abcdefABCDEF'));
    exports.octal = range('0', '7');
}
|
|
190
|
+
/** Literal text node: in this AST a plain string is its own node, so it is returned as is. */
function text(c) {
    return c;
}
|
|
193
|
+
/**
 * Concatenation node: the array of parts itself, except that a one-element
 * array collapses to its only element.
 */
function concatenation(parts) {
    if (parts.length === 1)
        return parts[0];
    return parts;
}
|
|
196
|
+
/**
 * Alternation node. A single alternative is returned unchanged, and
 * "x or nothing" (two parts with a falsy second one) becomes optional(x).
 */
function alternation(parts) {
    if (parts.length === 1)
        return parts[0];
    if (parts.length === 2 && !parts[1])
        return optional(parts[0]);
    return { type: 'alt', parts };
}
|
|
201
|
+
;
|
|
202
|
+
/**
 * Non-capturing group node. `options` is either omitted, one of the string
 * tags used by the look-around helpers, or a flags object.
 */
function noncapture(part, options) {
    const node = { type: 'noncapture', part, options };
    return node;
}
|
|
205
|
+
/** Lookahead assertion: a non-capturing group tagged 'ahead'. */
function lookAhead(part) {
    return noncapture(part, 'ahead');
}
|
|
206
|
+
/** Negative lookahead assertion: a non-capturing group tagged 'neg_ahead'. */
function negLookAhead(part) {
    return noncapture(part, 'neg_ahead');
}
|
|
207
|
+
/** Lookbehind assertion: a non-capturing group tagged 'behind'. */
function lookBehind(part) {
    return noncapture(part, 'behind');
}
|
|
208
|
+
/** Negative lookbehind assertion: a non-capturing group tagged 'neg_behind'. */
function negLookBehind(part) {
    return noncapture(part, 'neg_behind');
}
|
|
209
|
+
/** Capturing group node; `name` is the group name (the parser passes '' for unnamed groups). */
function capture(part, name) {
    const node = { type: 'capture', part, name };
    return node;
}
|
|
212
|
+
/**
 * Quantifier node matching `part` between `min` and `max` times.
 * `max` of -1 stands for "no upper bound"; `mod` is 'greedy', 'lazy' or
 * 'possessive'.
 */
function repeatFrom(part, min, max = -1, mod = 'greedy') {
    const node = { type: 'quantified', part, min, max, mod };
    return node;
}
|
|
215
|
+
/** Quantifier node matching `part` exactly `n` times. */
function repeat(part, n, mod = 'greedy') {
    const min = n;
    const max = n;
    return { type: 'quantified', part, min, max, mod };
}
|
|
218
|
+
/** `part*`: zero or more repetitions. */
function zeroOrMore(part, mod = 'greedy') {
    return repeatFrom(part, 0, -1, mod);
}
|
|
219
|
+
/** `part+`: one or more repetitions. */
function oneOrMore(part, mod = 'greedy') {
    return repeatFrom(part, 1, -1, mod);
}
|
|
220
|
+
/** `part?`: zero or one occurrence. */
function optional(part, mod = 'greedy') {
    return repeatFrom(part, 0, 1, mod);
}
|
|
221
|
+
/** Zero-width assertion node; the node consists of nothing but its type tag. */
function boundary(type) {
    const node = { type };
    return node;
}
|
|
224
|
+
// Shared assertion nodes: \b, \B, ^ and $.
for (const [exportName, kind] of [
    ['wordBoundary', 'wordbound'],
    ['nonWordBoundary', 'nowordbound'],
    ['startAnchor', 'inputboundstart'],
    ['endAnchor', 'inputboundend'],
]) {
    exports[exportName] = boundary(kind);
}
|
|
228
|
+
/** Back-reference node; `value` is a group number or a group name. */
function reference(value) {
    const node = { type: 'reference', value };
    return node;
}
|
|
231
|
+
/** Wraps `part` between the start-of-input and end-of-input anchors. */
function anchored(part) {
    const { startAnchor, endAnchor } = exports;
    return [startAnchor, part, endAnchor];
}
|
|
234
|
+
/*
|
|
235
|
+
function is0<T extends part0['type']>(part: part, type: T): part is Extract<part0, { type: T }> {
|
|
236
|
+
return typeof part !== 'string' && !Array.isArray(part) && part.type === type;
|
|
237
|
+
}
|
|
238
|
+
*/
|
|
239
|
+
/**
 * Returns the type tag of an AST node: 'text' for strings, 'concat' for
 * arrays, otherwise the node's own `type` property.
 */
function type(part) {
    if (typeof part === 'string')
        return 'text';
    if (Array.isArray(part))
        return 'concat';
    return part.type;
}
|
|
244
|
+
/** True when `part` is a node of kind `istype` (as reported by type()). */
function is(part, istype) {
    const actual = type(part);
    return actual === istype;
}
|
|
250
|
+
/**
 * Gives every node an object shape: strings and arrays are wrapped as
 * `{ type: 'text' | 'concat', part }`, object nodes are returned unchanged.
 */
function typed(part) {
    if (typeof part === 'string')
        return { type: 'text', part };
    if (Array.isArray(part))
        return { type: 'concat', part };
    return part;
}
|
|
255
|
+
/**
 * Parses regular-expression source text into the AST used by this module:
 * strings are literal text, arrays are concatenations, everything else is an
 * object with a `type` tag (see the builder functions above).
 *
 * @param {string} re - pattern source, without delimiters or flags.
 * @param {boolean} [unicode=true] - accept `\u{...}` and `\p{...}`/`\P{...}`,
 *   read characters inside classes as code points, and reject malformed
 *   `{...}` quantifiers instead of treating `{` literally.
 * @param {boolean} [extended=false] - accept possessive quantifiers (`x*+`)
 *   and atomic groups (`(?>...)`).
 * @returns the root AST node.
 * @throws {Error} on unbalanced groups, unterminated classes, dangling
 *   quantifiers and malformed escapes.
 */
function parse(re, unicode = true, extended = false) {
    // Enclosing constructs still open: `{ type: 'alt', parts }` for an
    // alternation being collected, `{ type: 'group', group, tos }` for a group
    // whose `tos` is the sequence that was current when it opened.
    const stack = [];
    let curr = []; // sequence currently being collected
    let i = 0; // read position in `re`
    // Returns the text up to (not including) the next `c` and steps past `c`.
    function skipTo(c) {
        const start = i;
        while (i < re.length && re[i] !== c)
            i++;
        if (re[i] !== c)
            throw new Error(`Missing '${c}'`);
        return re.substring(start, i++);
    }
    // Reads a run of decimal digits; callers make sure at least one is present.
    function int() {
        const start = i;
        while (re[i] >= '0' && re[i] <= '9')
            i++;
        return parseInt(re.substring(start, i), 10);
    }
    // Converts the digits of a \x or \u escape, rejecting text with no leading hex digit.
    function hexValue(digits, kind) {
        const value = parseInt(digits, 16);
        if (Number.isNaN(value))
            throw new Error(`bad \\${kind} escape`);
        return value;
    }
    // Handles the character after a backslash. Returns a character code, a
    // character class, or a unicode-property node.
    function backslashed() {
        if (i >= re.length)
            throw new Error('\\ at end of pattern');
        const c = re[i++];
        switch (c) {
            default: return c.charCodeAt(0);
            case 'd': return exports.digit; //digit
            case 'D': return exports.digit.not(); //non-digit
            case 'w': return exports.word; //word
            case 'W': return exports.word.not(); //non-word
            case 's': return exports.whitespace; //whitespace
            case 'S': return exports.whitespace.not(); //non-whitespace
            case 'b': return 8; //backspace
            case 't': return 9; //tab
            case 'n': return 10; //newline
            case 'v': return 11; //vertical tab
            case 'f': return 12; //form feed
            case 'r': return 13; //carriage return
            case 'c': return re.charCodeAt(i++) & 31; //control character
            case '0': {
                // octal escape: the leading 0 plus up to three more octal digits
                const start = i - 1;
                while (re[i] >= '0' && re[i] <= '7' && i - start < 4)
                    i++;
                return parseInt(re.substring(start, i), 8);
            }
            case 'x':
                if (i + 2 > re.length)
                    throw new Error('bad \\x escape');
                i += 2;
                return hexValue(re.substring(i - 2, i), 'x');
            case 'u':
                if (unicode && re[i] === '{') {
                    i++; // skip '{'
                    return hexValue(skipTo('}'), 'u');
                }
                if (i + 4 > re.length)
                    throw new Error('bad \\u escape');
                i += 4;
                return hexValue(re.substring(i - 4, i), 'u');
            case 'p':
            case 'P':
                if (!unicode || re[i] !== '{')
                    throw new Error('\\p and \\P can only be used with unicode enabled, and must be followed by {property}');
                i++; // skip '{'
                return { type: c === 'P' ? 'notunicode' : 'unicode', property: skipTo('}') };
        }
    }
    // Reads one (possibly escaped) character inside a character class.
    function character() {
        const code = unicode ? re.codePointAt(i++) : re.charCodeAt(i++);
        if (code > 0xffff) {
            ++i; // an astral code point occupies two UTF-16 units
            return code;
        }
        return code === 92 ? backslashed() : code;
    }
    // Wraps the most recent item in a quantifier, after reading an optional
    // lazy (`?`) or, in extended mode, possessive (`+`) suffix.
    function addQuantified(min, max) {
        let mod = 'greedy';
        if (re[i] === '?') {
            mod = 'lazy';
            i++;
        }
        else if (extended && re[i] === '+') {
            mod = 'possessive';
            i++;
        }
        const top = curr.pop();
        if (!top)
            throw new Error('nothing to quantify');
        if (typeof top === 'string' && top.length > 1) {
            // Only the last character of a text run is quantified. In unicode
            // mode that character may be a surrogate pair, which must not be
            // split in two.
            const last = top.charCodeAt(top.length - 1);
            const prev = top.charCodeAt(top.length - 2);
            const isPair = unicode && last >= 0xdc00 && last <= 0xdfff && prev >= 0xd800 && prev <= 0xdbff;
            const n = isPair ? 2 : 1;
            if (top.length > n)
                curr.push(top.slice(0, -n));
            curr.push(repeatFrom(top.slice(-n), min, max, mod));
        }
        else {
            curr.push(repeatFrom(top, min, max, mod));
        }
    }
    // Appends literal text, merging it into a preceding text run.
    function addText(c) {
        if (typeof curr.at(-1) === 'string') {
            curr[curr.length - 1] += c;
        }
        else {
            curr.push(c);
        }
    }
    // Finishes a pending alternation (if one is on top of the stack) and
    // returns the stack entry beneath it.
    function closeAlt() {
        let top = stack.pop();
        if (top?.type === 'alt') {
            top.parts.push(concatenation(curr));
            curr = [top];
            top = stack.pop();
        }
        return top;
    }
    const specialChars = /[\\^$*+?{()|[.]/;
    while (i < re.length) {
        // copy literal text up to the next special character in one go
        const remaining = re.substring(i);
        const next = remaining.search(specialChars);
        if (next === -1) {
            addText(remaining);
            break;
        }
        if (next > 0)
            addText(remaining.substring(0, next));
        i += next;
        switch (re[i++]) {
            case '\\':
                if (re[i] === 'b') {
                    i++;
                    curr.push(exports.wordBoundary);
                }
                else if (re[i] === 'B') {
                    i++;
                    curr.push(exports.nonWordBoundary);
                }
                else if (re[i] >= '1' && re[i] <= '9') {
                    const n = int();
                    curr.push({ type: 'reference', value: n });
                }
                else if (re[i] === 'k' && re[i + 1] === '<') {
                    i += 2;
                    const name = skipTo('>');
                    curr.push({ type: 'reference', value: name });
                }
                else {
                    const b = backslashed();
                    if (typeof b === 'number')
                        addText(String.fromCodePoint(b));
                    else
                        curr.push(b);
                }
                break;
            case '.':
                curr.push(exports.any);
                break;
            //Boundary-type assertions
            case '^':
                curr.push(exports.startAnchor);
                break;
            case '$':
                curr.push(exports.endAnchor);
                break;
            //Quantifiers
            case '*':
                addQuantified(0, -1);
                break;
            case '+':
                addQuantified(1, -1);
                break;
            case '?':
                addQuantified(0, 1);
                break;
            case '{': {
                // {n}, {n,} or {n,m}
                const m = /^(\d+)(?:(,)(\d*))?\}/.exec(re.substring(i));
                if (!m) {
                    // Not a quantifier. JavaScript rejects this in unicode
                    // mode and otherwise treats the brace as a literal.
                    if (unicode)
                        throw new Error('bad quantifier');
                    addText('{');
                    break;
                }
                i += m[0].length;
                const min = parseInt(m[1], 10);
                const max = !m[2] ? min : m[3] ? parseInt(m[3], 10) : -1;
                if (max !== -1 && max < min)
                    throw new Error('bad quantifier');
                addQuantified(min, max);
                break;
            }
            //Alternation
            case '|': {
                const top = stack.at(-1);
                if (top?.type === 'alt') {
                    top.parts.push(concatenation(curr));
                }
                else {
                    stack.push({ type: 'alt', parts: [concatenation(curr)] });
                }
                curr = [];
                break;
            }
            //Groups
            case '(': {
                let group;
                const dummy = ''; // placeholder, replaced when the group closes
                if (re[i] === '?') {
                    i++;
                    switch (re[i++]) {
                        case ':':
                            group = noncapture(dummy);
                            break;
                        case '=':
                            group = noncapture(dummy, 'ahead');
                            break;
                        case '!':
                            group = noncapture(dummy, 'neg_ahead');
                            break;
                        case '<':
                            if (re[i] === '=') {
                                i++;
                                group = noncapture(dummy, 'behind');
                            }
                            else if (re[i] === '!') {
                                i++;
                                group = noncapture(dummy, 'neg_behind');
                            }
                            else {
                                group = capture(dummy, skipTo('>'));
                            }
                            break;
                        case '>':
                            if (extended) {
                                group = noncapture(dummy, 'atomic');
                                break;
                            }
                        // without `extended`, '>' falls through and is read as a modifier group
                        default: {
                            // modifier group: (?flags:x) or (?flags-flags:x)
                            let set = true;
                            const flags = {};
                            --i; // go back to first flag character
                            while (i < re.length) {
                                const f = re[i++];
                                if (f === ':')
                                    break;
                                if (f === '-')
                                    set = false;
                                else if (f === 'i' || f === 'm' || f === 's')
                                    flags[f] = set;
                            }
                            group = noncapture(dummy, flags);
                            break;
                        }
                    }
                }
                else {
                    group = capture(dummy, '');
                }
                stack.push({ type: 'group', group: group, tos: curr });
                curr = [];
                break;
            }
            case ')': {
                const top = closeAlt();
                if (top?.type !== 'group')
                    throw new Error('unmatched )');
                top.group.part = concatenation(curr);
                curr = [...top.tos, top.group];
                break;
            }
            //Character classes
            case '[': {
                const neg = re[i] === '^';
                if (neg)
                    i++;
                const cs = new characterClass(false);
                // a leading ']' or '-' is an ordinary member
                if (re[i] === ']' || re[i] === '-')
                    cs.set(re.charCodeAt(i++));
                while (i < re.length && re[i] !== ']') {
                    const from = character();
                    if (typeof from === 'number') {
                        if (re[i] === '-' && i + 1 < re.length && re[i + 1] !== ']') {
                            ++i;
                            const to = character();
                            if (typeof to !== 'number' || from > to)
                                throw new Error('bad character class');
                            cs.setRange(from, to + 1);
                        }
                        else {
                            cs.set(from);
                        }
                    }
                    else if (is(from, 'class')) {
                        cs.selfUnion(from);
                    }
                    // NOTE(review): \p{...} / \P{...} inside a class yields a
                    // property node, which is dropped here without an error.
                }
                if (re[i] !== ']')
                    throw new Error(`Missing ']'`);
                i++; // skip ']'
                curr.push(neg ? cs.selfNot() : cs);
                break;
            }
        }
    }
    const top = closeAlt();
    if (top)
        throw new Error('unmatched (');
    return concatenation(curr);
}
|
|
548
|
+
//-----------------------------------------------------------------------------
|
|
549
|
+
// Regex to string
|
|
550
|
+
//-----------------------------------------------------------------------------
|
|
551
|
+
/**
 * Renders a quantifier suffix: `*`, `+`, `?`, `{n}`, `{n,}` or `{n,m}`,
 * followed by `?` for lazy or `+` for possessive. `max` of -1 means unbounded.
 */
function printQuantified(min, max, mod) {
    let count;
    if (max === -1)
        count = min === 0 ? '*' : min === 1 ? '+' : `{${min},}`;
    else if (min === 0 && max === 1)
        count = '?';
    else if (min === max)
        count = `{${min}}`;
    else
        count = `{${min},${max}}`;
    let suffix = '';
    if (mod === 'lazy')
        suffix = '?';
    else if (mod === 'possessive')
        suffix = '+';
    return count + suffix;
}
|
|
559
|
+
/**
 * Serializes each part and joins the results with `join` ('' for a
 * concatenation, '|' for an alternation).
 *
 * Alternations nested in the list are wrapped in `(?:...)`. Quantified text
 * that directly follows literal text is wrapped as well.
 * NOTE(review): presumably so the quantifier visibly binds to its own
 * fragment rather than to the preceding literal - confirm.
 */
function list(parts, join) {
    return parts.map((p, i) => {
        if (is(p, 'quantified') && is(p.part, 'text') && i > 0 && is(parts[i - 1], 'text')) {
            // the text must be escaped like any other literal; inserting it
            // raw turned e.g. a literal '.' into a wildcard
            return `(?:${toRegExpString(p.part)})` + printQuantified(p.min, p.max, p.mod);
        }
        const s = toRegExpString(p);
        return is(p, 'alt') ? `(?:${s})` : s;
    }).join(join);
}
|
|
567
|
+
/**
 * Serializes an AST node back to regular-expression source text.
 *
 * Strings are escaped as literals, arrays are concatenated, and object nodes
 * are rendered according to their `type`. Returns undefined for an object
 * whose type is not handled below.
 */
function toRegExpString(part) {
    if (typeof part === 'string')
        return part.replace(/[\\^$*+?.()|[\]{}]/g, '\\$&');
    if (Array.isArray(part))
        return list(part, '');
    switch (part.type) {
        case 'alt':
            return list(part.parts, '|');
        case 'quantified': {
            const inner = part.part;
            const body = toRegExpString(inner);
            // A quantifier binds to the single atom before it, so anything
            // that is not one atom needs its own group: text that is not
            // exactly one UTF-16 unit, a concatenation, an alternation, or
            // another quantifier.
            const needsGroup = typeof inner === 'string'
                ? inner.length !== 1
                : Array.isArray(inner) || inner.type === 'alt' || inner.type === 'quantified';
            return (needsGroup ? `(?:${body})` : body) + printQuantified(part.min, part.max, part.mod);
        }
        case 'noncapture': {
            // a plain group is `(?:x)`; the ':' is replaced for look-arounds
            // and atomic groups, and prefixed with flags for modifier groups
            let header = ':';
            const opts = part.options;
            if (typeof opts === 'string') {
                header = {
                    ahead: '=',
                    behind: '<=',
                    neg_ahead: '!',
                    neg_behind: '<!',
                    atomic: '>'
                }[opts] ?? ':';
            }
            else if (opts && (opts.i ?? opts.m ?? opts.s) !== undefined) {
                const posflags = (opts.i ? 'i' : '') + (opts.m ? 'm' : '') + (opts.s ? 's' : '');
                const negflags = (opts.i === false ? 'i' : '') + (opts.m === false ? 'm' : '') + (opts.s === false ? 's' : '');
                header = `${posflags}${negflags ? '-' : ''}${negflags}:`;
            }
            return `(?${header}${toRegExpString(part.part)})`;
        }
        case 'capture':
            return `(${part.name ? `?<${part.name}>` : ''}${toRegExpString(part.part)})`;
        case 'class': {
            if (part.contains(exports.any))
                return '.';
            // Work on the positive form of the class and pull the \w, \d and
            // \s shorthands out of it; what is left is listed explicitly.
            const neg = part.isNegated();
            const temp = neg ? part.not() : part.mutable();
            const has_w = temp.contains(exports.word);
            if (has_w) {
                if (exports.word.contains(temp))
                    return neg ? '\\W' : '\\w';
                temp.selfIntersect(exports.word.not());
            }
            const has_d = !has_w && temp.contains(exports.digit);
            if (has_d) {
                if (exports.digit.contains(temp))
                    return neg ? '\\D' : '\\d';
                temp.selfIntersect(exports.digit.not());
            }
            const has_s = temp.contains(exports.whitespace);
            if (has_s) {
                if (!has_w && !has_d && exports.whitespace.contains(temp))
                    return neg ? '\\S' : '\\s';
                temp.selfIntersect(exports.whitespace.not());
            }
            return `[${neg ? '^' : ''}${has_w ? '\\w' : has_d ? '\\d' : ''}${has_s ? '\\s' : ''}${temp.toString()}]`;
        }
        case 'unicode':
            return `\\p{${part.property}}`;
        case 'notunicode':
            return `\\P{${part.property}}`;
        case 'wordbound':
            return '\\b';
        case 'nowordbound':
            return '\\B';
        case 'inputboundstart':
            return '^';
        case 'inputboundend':
            return '$';
        case 'reference':
            return typeof part.value === 'number' ? `\\${part.value}` : `\\k<${part.value}>`;
    }
}
|
|
642
|
+
/** Compiles an AST node into a RegExp, passing `flags` through to the constructor. */
function toRegExp(part, flags) {
    const source = toRegExpString(part);
    return new RegExp(source, flags);
}
|
|
645
|
+
//-----------------------------------------------------------------------------
|
|
646
|
+
// Regex AST manipulation
|
|
647
|
+
//-----------------------------------------------------------------------------
|
|
648
|
+
/**
 * Rebuilds an AST bottom-up.
 *
 * `previsit`, when given, is applied to a node before its children are
 * walked; `visitor` is applied after the children have been rebuilt. Either
 * callback may return a replacement node, or null/undefined to keep the node
 * it was given.
 */
function visit(part, visitor, previsit) {
    const node = previsit ? (previsit(part) ?? part) : part;
    const walk = child => visit(child, visitor, previsit);
    let rebuilt = node;
    if (typeof node === 'string') {
        // text has no children
    }
    else if (Array.isArray(node)) {
        rebuilt = concatenation(node.map(walk));
    }
    else if (node.type === 'alt') {
        rebuilt = alternation(node.parts.map(walk));
    }
    else if (node.type === 'quantified') {
        rebuilt = repeatFrom(walk(node.part), node.min, node.max, node.mod);
    }
    else if (node.type === 'noncapture') {
        rebuilt = noncapture(walk(node.part), node.options);
    }
    else if (node.type === 'capture') {
        rebuilt = capture(walk(node.part), node.name);
    }
    return visitor(rebuilt) ?? rebuilt;
}
|
|
679
|
+
/**
 * Returns a prefix shared by all the given strings, or '' when there are
 * fewer than two strings or nothing is shared.
 *
 * The result is the longest common run of UTF-16 units, shortened by one
 * unit if it would otherwise end in the middle of a surrogate pair.
 */
function commonPrefix(strings) {
    if (strings.length < 2)
        return '';
    let prefix = strings[0];
    for (let i = 1; prefix && i < strings.length; i++) {
        const other = strings[i];
        let j = 0;
        while (j < prefix.length && j < other.length && prefix[j] === other[j])
            j++;
        prefix = prefix.slice(0, j);
    }
    // If the prefix ends on a high surrogate and any string continues with a
    // low surrogate, the cut falls inside one character; back off one unit.
    const last = prefix.charCodeAt(prefix.length - 1);
    if (last >= 0xd800 && last <= 0xdbff) {
        const splitsPair = strings.some(s => {
            const following = s.charCodeAt(prefix.length);
            return following >= 0xdc00 && following <= 0xdfff;
        });
        if (splitsPair)
            prefix = prefix.slice(0, -1);
    }
    return prefix;
}
|
|
691
|
+
/**
 * Simplifies an AST without changing what it matches:
 *  - concatenations: nested concatenations are flattened and adjacent text
 *    runs are merged;
 *  - alternations: duplicate alternatives are dropped, a common text prefix
 *    is factored out when every alternative is text, and adjacent
 *    single-character / character-class alternatives are merged into one
 *    class;
 *  - classes with exactly one member become literal text.
 */
function optimize(part) {
    return visit(part, p => {
        if (Array.isArray(p)) {
            const result = [];
            let current = '';
            for (const item of p) {
                if (typeof item === 'string') {
                    current += item;
                    continue;
                }
                if (current) {
                    result.push(current);
                    current = '';
                }
                if (is(item, 'concat'))
                    result.push(...item);
                else
                    result.push(item);
            }
            if (current)
                result.push(current);
            return concatenation(result);
        }
        if (is(p, 'alt')) {
            // Drop duplicates by comparing serialized forms, but keep the
            // AST nodes themselves (first occurrence wins, order preserved).
            const seen = new Set();
            const unique = p.parts.filter(alt => {
                const key = toRegExpString(alt);
                if (seen.has(key))
                    return false;
                seen.add(key);
                return true;
            });
            // A common prefix may only be factored out when it is common to
            // every alternative, i.e. when all of them are text.
            if (unique.every(alt => typeof alt === 'string')) {
                const prefix = commonPrefix(unique);
                if (prefix) {
                    return concatenation([prefix, alternation(unique.map(s => s.slice(prefix.length)))]);
                }
            }
            // Merge runs of adjacent single characters and classes into one
            // class; only adjacent ones, so the order of alternatives is kept.
            const result = [];
            let cs;
            for (const alt of unique) {
                if (typeof alt === 'string' && alt.length === 1) {
                    if (!cs)
                        cs = new MutablecharacterClass(false);
                    cs.setChar(alt);
                }
                else if (is(alt, 'class')) {
                    if (!cs)
                        cs = alt.mutable();
                    else
                        cs.selfUnion(alt);
                }
                else {
                    if (cs) {
                        result.push(cs);
                        cs = undefined;
                    }
                    result.push(alt);
                }
            }
            if (cs)
                result.push(cs);
            return alternation(result);
        }
        // A negated class is never a single literal.
        // NOTE(review): what next() reports for a negated class is not
        // visible here, hence the explicit guard.
        if (is(p, 'class') && !p.isNegated()) {
            const first = p.next(-1);
            if (first >= 0 && p.next(first) === -1)
                return String.fromCodePoint(first); // members may be code points above 0xFFFF
        }
        return p;
    });
}
|
|
768
|
+
// Step 1: Build Thompson NFA from regex AST
|
|
769
|
+
/**
 * Build a Thompson-style NFA fragment from a regex AST node.
 *
 * Node shapes handled:
 *  - string: literal text, matched one code point per transition
 *  - array: concatenation of its elements (an empty array matches the empty string)
 *  - `{ type: 'alt', parts }`: alternation
 *  - `{ type: 'quantified', part, min, max }`: repetition; `max === -1` means unbounded
 *  - `{ type: 'class', next(i) }`: character class, enumerated with `next(-1)`, `next(i)`, ... until -1
 *  - `{ type: 'wordbound' | 'nowordbound' | 'inputboundstart' | 'inputboundend' }`: anchors
 *  - `{ type: 'noncapture' | 'capture', part }`: groups, treated as transparent
 *
 * @param part Root AST node.
 * @returns `{ start, accept }` where each state is
 *          `{ id, transitions: Map<string, state[]>, epsilonTransitions: state[], isAccepting }`;
 *          only the returned `accept` state has `isAccepting` set to true.
 * @throws {Error} `Unsupported: <type>` for any other node type.
 */
function buildNFA(part) {
    let stateId = 0;

    function newState() {
        return { id: stateId++, transitions: new Map(), epsilonTransitions: [], isAccepting: false };
    }

    // Fragment that matches the empty string: start --ε--> accept
    function emptyFragment() {
        const start = newState();
        const accept = newState();
        start.epsilonTransitions.push(accept);
        return { start, accept };
    }

    function build(p) {
        if (typeof p === 'string') {
            if (p.length === 0)
                return emptyFragment();
            // Literal text: start --c1--> s1 --c2--> ... --> accept.
            // One transition per code point, because runDFA consumes its input
            // one code point at a time; a single transition keyed by the whole
            // string could never be taken for multi-character literals.
            const start = newState();
            let tail = start;
            for (const ch of p) {
                const next = newState();
                tail.transitions.set(ch, [next]);
                tail = next;
            }
            return { start, accept: tail };
        }
        if (Array.isArray(p)) {
            // Concatenation: chain the fragments with epsilon transitions
            if (p.length === 0)
                return emptyFragment();
            const current = build(p[0]);
            for (let i = 1; i < p.length; i++) {
                const next = build(p[i]);
                current.accept.epsilonTransitions.push(next.start);
                current.accept = next.accept;
            }
            return current;
        }
        switch (p.type) {
            case 'alt': {
                // Alternation: start --ε--> frag1.start, frag2.start, ... --ε--> accept
                const start = newState();
                const accept = newState();
                for (const alt of p.parts) {
                    const frag = build(alt);
                    start.epsilonTransitions.push(frag.start);
                    frag.accept.epsilonTransitions.push(accept);
                }
                return { start, accept };
            }
            case 'quantified': {
                const start = newState();
                const accept = newState();
                // Mandatory part: `min` copies in sequence
                let current = start;
                for (let i = 0; i < p.min; i++) {
                    const copy = build(p.part);
                    current.epsilonTransitions.push(copy.start);
                    current = copy.accept;
                }
                if (p.max === -1) {
                    // Unbounded: one extra copy that may be skipped or repeated.
                    // Built only here so finite quantifiers do not allocate unused states.
                    const frag = build(p.part);
                    current.epsilonTransitions.push(frag.start);
                    frag.accept.epsilonTransitions.push(frag.start, accept);
                    current.epsilonTransitions.push(accept);
                }
                else {
                    // Bounded: (max - min) further copies, each of which may be skipped
                    for (let i = p.min; i < p.max; i++) {
                        current.epsilonTransitions.push(accept); // can skip
                        const copy = build(p.part);
                        current.epsilonTransitions.push(copy.start);
                        current = copy.accept;
                    }
                    current.epsilonTransitions.push(accept);
                }
                return { start, accept };
            }
            case 'class': {
                // Character class: one transition per member, all leading to the same accept state
                const start = newState();
                const accept = newState();
                for (let i = p.next(-1); i !== -1; i = p.next(i)) {
                    const char = String.fromCodePoint(i);
                    const existing = start.transitions.get(char) || [];
                    existing.push(accept);
                    start.transitions.set(char, existing);
                }
                return { start, accept };
            }
            case 'wordbound':
            case 'nowordbound':
            case 'inputboundstart':
            case 'inputboundend': {
                // Anchors are encoded as an ordinary transition labelled `__<type>__`;
                // nothing in this function gives that label zero-width semantics, so
                // any consumer has to treat it specially.
                const start = newState();
                const accept = newState();
                start.transitions.set(`__${p.type}__`, [accept]);
                return { start, accept };
            }
            case 'noncapture':
            case 'capture':
                // Groups: just pass through to the wrapped part
                return build(p.part);
            default:
                throw new Error(`Unsupported: ${p.type}`);
        }
    }

    const result = build(part);
    result.accept.isAccepting = true;
    return result;
}
|
|
871
|
+
// Step 3: Subset Construction - convert NFA to DFA
|
|
872
|
+
/**
 * Convert an NFA into a DFA using subset construction.
 *
 * @param nfaStart Start state of the NFA. Every reachable state must have a unique
 *                 numeric `id`, a `transitions` Map (symbol -> array of states),
 *                 an `epsilonTransitions` array and an `isAccepting` flag.
 * @returns The DFA start state: `{ nfaStates: Set<number>, transitions: Map<string, dfaState>, isAccepting }`.
 *          The remaining DFA states are reachable through `transitions`.
 */
function NFAtoDFA(nfaStart) {
    const nfa = [];             // NFA states indexed by id
    const alphabet = new Set(); // every symbol that labels some transition

    // Collect all reachable NFA states with an explicit stack rather than
    // recursion, so that long NFAs (e.g. from long literals) cannot overflow
    // the call stack.
    const pending = [nfaStart];
    while (pending.length > 0) {
        const state = pending.pop();
        if (nfa[state.id])
            continue;
        nfa[state.id] = state;
        for (const symbol of state.transitions.keys())
            alphabet.add(symbol);
        const successors = [];
        for (const targets of state.transitions.values()) {
            for (const target of targets)
                successors.push(target);
        }
        for (const target of state.epsilonTransitions)
            successors.push(target);
        // Push in reverse so states are visited in the same order as a recursive walk
        for (let i = successors.length - 1; i >= 0; i--)
            pending.push(successors[i]);
    }

    const dfaStates = new Map(); // canonical state-set key -> DFA state

    // Canonical key for a set of NFA state ids (numeric sort, independent of insertion order)
    function stateSetKey(states) {
        return [...states].sort((a, b) => a - b).join(',');
    }

    // Epsilon closure - all states reachable from `states` via ε-transitions only
    function epsilonClosure(states) {
        const closure = new Set(states);
        const stack = [...states];
        while (stack.length > 0) {
            const id = stack.pop();
            for (const next of nfa[id].epsilonTransitions) {
                if (!closure.has(next.id)) {
                    closure.add(next.id);
                    stack.push(next.id);
                }
            }
        }
        return closure;
    }

    // A DFA state accepts when any of its member NFA states accepts
    function createDFAState(states) {
        const isAccepting = [...states].some(id => nfa[id]?.isAccepting);
        return { nfaStates: states, transitions: new Map(), isAccepting };
    }

    // Start with epsilon closure of initial state
    const startClosure = epsilonClosure(new Set([nfaStart.id]));
    const startDFA = createDFAState(startClosure);
    dfaStates.set(stateSetKey(startClosure), startDFA);

    const worklist = [startDFA];
    while (worklist.length > 0) {
        const currentDFA = worklist.pop();
        for (const char of alphabet) {
            // Collect all NFA states reachable by this symbol
            const next = new Set();
            for (const id of currentDFA.nfaStates) {
                const targets = nfa[id]?.transitions.get(char) || [];
                targets.forEach(target => next.add(target.id));
            }
            if (next.size > 0) {
                const nextClosure = epsilonClosure(next);
                // Reuse the DFA state for this set of NFA states if one already exists
                const key = stateSetKey(nextClosure);
                let nextDFA = dfaStates.get(key);
                if (!nextDFA) {
                    nextDFA = createDFAState(nextClosure);
                    dfaStates.set(key, nextDFA);
                    worklist.push(nextDFA);
                }
                currentDFA.transitions.set(char, nextDFA);
            }
        }
    }
    return startDFA;
}
|
|
943
|
+
/**
 * Run a DFA over an input string.
 *
 * @param dfa Start state; each state has a `transitions` Map (symbol -> state)
 *            and an `isAccepting` flag.
 * @param str Input, consumed one code point at a time.
 * @returns `false` as soon as a symbol has no transition; otherwise whether
 *          the state reached after the last symbol is accepting.
 */
function runDFA(dfa, str) {
    let state = dfa;
    for (const symbol of str) {
        const target = state.transitions.get(symbol);
        if (!target)
            return false; // dead end: no transition for this symbol
        state = target;
    }
    return state.isAccepting;
}
|
|
952
|
+
// Usage:
|
|
953
|
+
/**
 * Compile a regex AST into a DFA: build the NFA with buildNFA, then hand its
 * start state to NFAtoDFA.
 *
 * @param part Root AST node, as accepted by buildNFA.
 * @returns The DFA start state produced by NFAtoDFA.
 */
function regexToDFA(part) {
    const { start } = buildNFA(part);
    return NFAtoDFA(start);
}
|
|
957
|
+
///
|
|
958
|
+
/**
 * Translate a glob pattern into (unanchored) regular-expression source text.
 *
 * Translation rules:
 *  - `**` becomes `.*`; a single `*` becomes `[^/]*`
 *  - `?` becomes `.`
 *  - `+ . ^ $ ( ) |` are backslash-escaped
 *  - `[...]` is copied as a character class; a leading `!` or `^` negates it;
 *    `[:name:]` is looked up in `posixClasses` (a warning is logged for unknown names);
 *    a `[` with no closing `]` is escaped
 *  - `{a,b}` becomes `(a|b)`; unmatched `{` are closed at the end with a logged warning
 *  - `\x` keeps the backslash when `x` is a character that is special in a regex,
 *    otherwise emits `x` on its own; a trailing `\` matches a literal backslash
 *
 * @param {string} glob Glob pattern.
 * @returns {string} Regular-expression source.
 */
function parseGlob(glob) {
    let result = '';
    let depth = 0; // current `{` nesting level
    for (let i = 0; i < glob.length; ++i) {
        let c = glob[i];
        switch (c) {
            case '\\':
                c = glob[++i];
                if (c === undefined) {
                    // Trailing backslash: there is nothing to escape, so match a
                    // literal backslash instead of appending the text "undefined".
                    c = '\\\\';
                }
                else if ('*?+.,^$()|[]a-zA-Z{}\\'.includes(c)) {
                    // Keep the escape. `{`, `}` and `\` are included so that an
                    // escaped brace cannot turn into a quantifier and an escaped
                    // backslash does not escape whatever follows it.
                    // NOTE(review): `a-zA-Z` is matched literally here (the characters
                    // a, -, z, A, Z), not as a range - confirm the intent.
                    result += '\\';
                }
                break;
            case '*':
                if (glob[i + 1] === '*') {
                    result += '.*';
                    ++i;
                }
                else {
                    result += '[^/]*';
                }
                continue;
            case '?':
                c = '.';
                break;
            case '+':
            case '.':
            case '^':
            case '$':
            case '(':
            case ')':
            case '|':
                result += `\\`;
                break;
            case '[': {
                const end = glob.indexOf(']', i + 1);
                if (end > i) {
                    const next = glob[i + 1];
                    if (next === ':' && glob[end - 1] === ':') {
                        const p = posixClasses[glob.slice(i + 2, end - 1)];
                        if (p) {
                            result += `[${p}]`;
                            i = end;
                            continue;
                        }
                        else {
                            console.log(`Warning: Unknown POSIX class ${glob.slice(i + 2, end - 1)} in glob pattern ${glob}`);
                        }
                    }
                    const neg = next === '!' || next === '^';
                    result += `[${neg ? '^' : ''}${glob.slice(neg ? i + 2 : i + 1, end)}]`;
                    i = end;
                    continue;
                }
                // No closing bracket: treat `[` as a literal
                result += '\\';
                break;
            }
            case '{':
                ++depth;
                c = '(';
                break;
            case '}':
                if (depth > 0) {
                    --depth;
                    c = ')';
                }
                break;
            case ',':
                // Only a separator inside braces
                if (depth > 0)
                    c = '|';
                break;
        }
        result += c;
    }
    if (depth > 0) {
        console.log(`Warning: Unmatched { in glob pattern ${glob}`);
        result += ')'.repeat(depth);
    }
    return result;
}
|
|
1036
|
+
/**
 * Build a RegExp that must match the whole input.
 *
 * The source is wrapped in a non-capturing group before anchoring, because
 * `|` binds more loosely than `^` and `$`: without the group, `^a|b$` anchors
 * only the first and last alternatives and would match "ab".
 *
 * @param {string} re Regular-expression source.
 * @returns {RegExp} `re` anchored at both ends.
 * @throws {SyntaxError} If `re` is not valid regular-expression source.
 */
function anchoredRe(re) {
    return new RegExp(`^(?:${re})$`);
}
|
|
1039
|
+
/**
 * Compile a single glob pattern to a RegExp by translating it with parseGlob
 * and passing the resulting source to anchoredRe.
 *
 * @param {string} glob Glob pattern.
 * @returns {RegExp} The RegExp returned by anchoredRe.
 */
function globToRe(glob) {
    const source = parseGlob(glob);
    return anchoredRe(source);
}
|
|
1042
|
+
/**
 * Compile several glob patterns into one RegExp: each glob is translated with
 * parseGlob, the sources are joined with `|`, and the result is passed to
 * anchoredRe.
 *
 * @param {string[]} globs Glob patterns.
 * @returns {RegExp} The RegExp returned by anchoredRe for the joined source.
 */
function globToReMulti(globs) {
    const alternatives = globs.map(parseGlob);
    const source = alternatives.join('|');
    return anchoredRe(source);
}
|