@isopodlabs/utilities 1.5.5 → 1.5.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/regex.js ADDED
@@ -0,0 +1,1044 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.endAnchor = exports.startAnchor = exports.nonWordBoundary = exports.wordBoundary = exports.octal = exports.hex = exports.whitespace = exports.word = exports.alnum = exports.alpha = exports.upper = exports.lower = exports.digit = exports.any = exports.characterClass = void 0;
4
+ exports.range = range;
5
+ exports.chars = chars;
6
+ exports.union = union;
7
+ exports.text = text;
8
+ exports.concatenation = concatenation;
9
+ exports.alternation = alternation;
10
+ exports.noncapture = noncapture;
11
+ exports.lookAhead = lookAhead;
12
+ exports.negLookAhead = negLookAhead;
13
+ exports.lookBehind = lookBehind;
14
+ exports.negLookBehind = negLookBehind;
15
+ exports.capture = capture;
16
+ exports.repeatFrom = repeatFrom;
17
+ exports.repeat = repeat;
18
+ exports.zeroOrMore = zeroOrMore;
19
+ exports.oneOrMore = oneOrMore;
20
+ exports.optional = optional;
21
+ exports.boundary = boundary;
22
+ exports.reference = reference;
23
+ exports.anchored = anchored;
24
+ exports.parse = parse;
25
+ exports.toRegExpString = toRegExpString;
26
+ exports.toRegExp = toRegExp;
27
+ exports.optimize = optimize;
28
+ exports.runDFA = runDFA;
29
+ exports.regexToDFA = regexToDFA;
30
+ exports.parseGlob = parseGlob;
31
+ exports.anchoredRe = anchoredRe;
32
+ exports.globToRe = globToRe;
33
+ exports.globToReMulti = globToReMulti;
34
+ const bits_1 = require("./bits");
35
+ /*
36
+ Characters
37
+ [xyz],[a-c] Character class
38
+ [^xyz],[^a-c] Negated character class
39
+ . Wildcard: Matches any single character except line terminators: \n, \r, \u2028 or \u2029
40
+ \d Digit character class escape: Matches any digit (Arabic numeral). Equivalent to [0-9]
41
+ \D Non-digit character class escape: Matches any character that is not a digit (Arabic numeral)
42
+ \w Word character class escape: Matches any alphanumeric character from the basic Latin alphabet, including the underscore. Equivalent to [A-Za-z0-9_]
43
+ \W Non-word character class escape: Matches any character that is not a word character from the basic Latin alphabet. Equivalent to [^A-Za-z0-9_]
44
+ \s White space character class escape: Matches a single white space character, including space, tab, form feed, line feed, and other Unicode spaces. Equivalent to [\f\n\r\t\v\u0020\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff]
45
+ \S Non-white space character class escape
46
+ \t Matches a horizontal tab.
47
+ \r Matches a carriage return.
48
+ \n Matches a linefeed.
49
+ \v Matches a vertical tab.
50
+ \f Matches a form-feed.
51
+ [\b] Matches a backspace.
52
+ \0 Matches a NUL character.
53
+ \cX Matches a control character using caret notation, where "X" is a letter from A–Z
54
+ \xhh Matches the character with the code hh (two hexadecimal digits).
55
+ \uhhhh Matches a UTF-16 code-unit with the value hhhh (four hexadecimal digits).
56
+ \u{hhhh} or \u{hhhhh} (Only when the u flag is set.) Matches the character with the Unicode value U+hhhh or U+hhhhh (hexadecimal digits).
57
+ \p{UnicodeProperty}, \P{UnicodeProperty} Unicode character class escape: Matches a character based on its Unicode character properties
58
+ \ Indicates that the following character should be treated specially, or "escaped"
59
+
60
+ x|y Alternation: Matches either "x" or "y"
61
+
62
+ Boundary-type assertions
63
+ ^ Input boundary beginning assertion
64
+ $ Input boundary end assertion
65
+ \b Word boundary assertion
66
+ \B Non-word-boundary assertion
67
+
68
+ Other Assertions
69
+ x(?=y) Lookahead assertion
70
+ x(?!y) Negative lookahead assertion
71
+ (?<=y)x Lookbehind assertion
72
+ (?<!y)x Negative lookbehind assertion
73
+
74
+ Groups and backreferences
75
+ (x) Capturing group
76
+ (?<Name>x) Named capturing group
77
+ (?:x) Non-capturing group
78
+ (?flags:x), (?:flags-flags:x) Modifier (flags can be i, m, s)
79
+ \<int> Backreference
80
+ \k<Name> Named backreference
81
+
82
+ Quantifiers
83
+ x* Matches the preceding item "x" 0 or more times
84
+ x+ Matches the preceding item "x" 1 or more times. Equivalent to {1,}
85
+ x? Matches the preceding item "x" 0 or 1 times. For example, /e?le?/ matches the "el" in "angel" and the "le" in "angle."
86
+ x{<int>} Matches exactly "n" occurrences of the preceding item "x"
87
+ x{<int>,} Matches at least "n" occurrences of the preceding item "x"
88
+ x{n,m} Matches at least "n" and at most "m" occurrences of the preceding item "x"
89
+
90
+ Non greedy quantifiers
91
+ x*?
92
+ x+?
93
+ x??
94
+ x{n}?
95
+ x{n,}?
96
+ x{n,m}?
97
+
98
+ */
99
/**
 * Bodies of regex character classes that parseGlob substitutes for POSIX
 * class names (`[:name:]`). Each value is inserted between `[` and `]`, so a
 * leading `^` negates the whole class.
 *
 * The `\p{…}` escapes are Unicode property escapes and only have that meaning
 * when the resulting RegExp is compiled with the `u` (or `v`) flag.
 */
const posixClasses = {
    alnum: '\\p{L}\\p{Nl}\\p{Nd}',
    alpha: '\\p{L}\\p{Nl}',
    ascii: '\\x00-\\x7f',
    blank: '\\p{Zs}\\t',
    cntrl: '\\p{Cc}',
    digit: '\\p{Nd}',
    // visible characters: anything that is neither a separator nor "other"
    graph: '^\\p{Z}\\p{C}',
    lower: '\\p{Ll}',
    // printable characters: everything except the "other" (control, format,
    // unassigned...) category. This was previously the un-negated `\p{C}`,
    // which selected exactly the characters that are NOT printable.
    print: '^\\p{C}',
    punct: '\\p{P}',
    space: '\\p{Z}\\t\\r\\n\\v\\f',
    upper: '\\p{Lu}',
    word: '\\p{L}\\p{Nl}\\p{Nd}\\p{Pc}',
    xdigit: 'A-Fa-f0-9',
};
115
/**
 * A set of character codes used as the `class` node of the regex AST.
 *
 * Storage and set algebra come from `SparseBits` (defined in ./bits, not
 * visible here); this class only adds the AST tag and regex-oriented helpers.
 * NOTE(review): `this.bits` is presumably a sparse table of 32-bit words and
 * `this.undef` the value of words that are not stored - confirm in ./bits.
 */
class characterClass extends bits_1.SparseBits {
    type = 'class';

    /** Returns whether the first UTF-16 code unit of `char` is in the set. */
    test(char) {
        return this.has(char.charCodeAt(0));
    }

    /** Returns an editable copy of this class. */
    mutable() {
        return new MutablecharacterClass(true).selfIntersect(this);
    }

    /** True when `undef` is truthy, i.e. the class is stored in negated form. */
    isNegated() {
        return !!this.undef;
    }

    /**
     * Renders the class as the text that goes between `[` and `]`: an optional
     * leading `^` followed by single characters and `a-b` ranges.
     *
     * `-`, `\`, `]` and `^` are escaped. Escaping `^` matters because a
     * non-negated class whose lowest member is `^` would otherwise start with
     * a bare caret and be read back as a negated class.
     */
    toString() {
        const escaped = (code) => String.fromCharCode(code).replace(/[-\\\]^]/g, '\\$&');
        let s = this.undef ? '^' : '';
        for (const i in this.bits) {
            // xor with `undef` so that set bits are always the listed characters
            const b = this.bits[i] ^ this.undef;
            const c0 = +i * 32;
            for (let j = 0; j < 32; j++) {
                if (b & (1 << j)) {
                    const c1 = c0 + j;
                    // extend over the run of consecutive set bits in this word
                    while (j < 32 && (b & (1 << j)))
                        j++;
                    const c2 = c0 + j - 1;
                    s += escaped(c1);
                    if (c1 !== c2)
                        s += '-' + escaped(c2);
                }
            }
        }
        return s;
    }
}
exports.characterClass = characterClass;
151
/**
 * Character class with in-place editing helpers. All helpers work on UTF-16
 * code units (`charCodeAt`), not on code points.
 */
class MutablecharacterClass extends characterClass {
    /** Adds the first code unit of `char` to the set. */
    setChar(char) {
        const code = char.charCodeAt(0);
        this.set(code);
    }

    /** Adds every code unit of `c`; returns `this` for chaining. */
    setString(c) {
        let index = 0;
        while (index < c.length) {
            this.set(c.charCodeAt(index));
            index += 1;
        }
        return this;
    }

    /** Removes every code unit of `c`; returns `this` for chaining. */
    clearString(c) {
        let index = 0;
        while (index < c.length) {
            this.clear(c.charCodeAt(index));
            index += 1;
        }
        return this;
    }
}
166
+ // characterClass helpers
167
/**
 * Builds a class holding every code unit from the first character of `from`
 * through the first character of `to`, inclusive.
 */
function range(from, to) {
    const cls = new MutablecharacterClass(false);
    const first = from.charCodeAt(0);
    // setRange receives `last + 1` as its upper bound
    const pastLast = to.charCodeAt(0) + 1;
    return cls.setRange(first, pastLast);
}
170
/** Builds a class containing each UTF-16 code unit of the given string. */
function chars(chars) {
    const cls = new MutablecharacterClass(false);
    return cls.setString(chars);
}
173
/** Returns a new class that is the union of all the classes passed in. */
function union(...classes) {
    return classes.reduce((merged, cls) => {
        merged.selfUnion(cls);
        return merged;
    }, new MutablecharacterClass(false));
}
179
+ // Common character class constants and ranges
180
+ exports.any = new characterClass(true); // class printed as '.' by toRegExpString; line terminators are NOT removed (the .clearString('\n\r\u2028\u2029') call is disabled)
181
+ exports.digit = range('0', '9'); // \d in parse(): ASCII digits 0-9
182
+ exports.lower = range('a', 'z'); // ASCII lowercase letters
183
+ exports.upper = range('A', 'Z'); // ASCII uppercase letters
184
+ exports.alpha = exports.lower.union(exports.upper); // a-z plus A-Z
185
+ exports.alnum = exports.alpha.union(exports.digit); // letters plus digits
186
+ exports.word = exports.alnum.union(chars('_')); // \w in parse(): alnum plus underscore
187
+ exports.whitespace = chars(' \t\r\n\f\v'); // \s in parse(): only these six ASCII characters, no other Unicode spaces
188
+ exports.hex = exports.digit.union(chars('abcdefABCDEF')); // hexadecimal digits, both cases
189
+ exports.octal = range('0', '7'); // octal digits
190
/**
 * Builder for a literal-text node. Text nodes are plain strings in this AST,
 * so the argument is handed back unchanged.
 */
function text(c) {
    const literal = c;
    return literal;
}
193
+ function concatenation(parts) {
194
+ return parts.length === 1 ? parts[0] : parts;
195
+ }
196
/**
 * Builder for an alternation (`a|b|…`).
 *
 * - one alternative: returned as is;
 * - exactly two alternatives with a falsy second one (e.g. the empty string):
 *   expressed as `optional(first)`;
 * - otherwise: an `{ type: 'alt', parts }` node.
 */
function alternation(parts) {
    if (parts.length === 1) {
        return parts[0];
    }
    if (parts.length === 2 && !parts[1]) {
        return optional(parts[0]);
    }
    return { type: 'alt', parts };
}
201
+ ;
202
/**
 * Builder for a non-capturing group. `options` is either omitted, one of the
 * assertion names used by toRegExpString ('ahead', 'behind', 'neg_ahead',
 * 'neg_behind', 'atomic'), or a flag object with optional `i`/`m`/`s` booleans.
 */
function noncapture(part, options) {
    const node = { type: 'noncapture', part, options };
    return node;
}
205
/** Positive lookahead `(?=part)`. */
function lookAhead(part) {
    return noncapture(part, 'ahead');
}
/** Negative lookahead `(?!part)`. */
function negLookAhead(part) {
    return noncapture(part, 'neg_ahead');
}
/** Positive lookbehind `(?<=part)`. */
function lookBehind(part) {
    return noncapture(part, 'behind');
}
/** Negative lookbehind `(?<!part)`. */
function negLookBehind(part) {
    return noncapture(part, 'neg_behind');
}
209
/**
 * Builder for a capturing group. A falsy `name` (parse() uses '') produces a
 * plain `(…)` group when printed; otherwise a named group `(?<name>…)`.
 */
function capture(part, name) {
    const node = { type: 'capture', part, name };
    return node;
}
212
/**
 * Builder for a quantified node repeating `part` between `min` and `max`
 * times. `max === -1` means "no upper bound". `mod` is 'greedy' (default),
 * 'lazy' or 'possessive'.
 */
function repeatFrom(part, min, max = -1, mod = 'greedy') {
    const node = { type: 'quantified', part, min, max, mod };
    return node;
}
215
/** Builder for a quantified node repeating `part` exactly `n` times. */
function repeat(part, n, mod = 'greedy') {
    const min = n;
    const max = n;
    return { type: 'quantified', part, min, max, mod };
}
218
/** `part*` - zero or more repetitions. */
function zeroOrMore(part, mod = 'greedy') {
    return repeatFrom(part, 0, -1, mod);
}
/** `part+` - one or more repetitions. */
function oneOrMore(part, mod = 'greedy') {
    return repeatFrom(part, 1, -1, mod);
}
/** `part?` - zero or one occurrence. */
function optional(part, mod = 'greedy') {
    return repeatFrom(part, 0, 1, mod);
}
221
/**
 * Builder for a zero-width assertion node. `type` is the node tag itself
 * ('wordbound', 'nowordbound', 'inputboundstart' or 'inputboundend').
 */
function boundary(type) {
    const node = { type };
    return node;
}
224
+ exports.wordBoundary = boundary('wordbound'); // printed as \b by toRegExpString
225
+ exports.nonWordBoundary = boundary('nowordbound'); // printed as \B by toRegExpString
226
+ exports.startAnchor = boundary('inputboundstart'); // printed as ^ by toRegExpString
227
+ exports.endAnchor = boundary('inputboundend'); // printed as $ by toRegExpString
228
/**
 * Builder for a backreference node. A numeric `value` is printed as `\N`,
 * any other value as the named form `\k<value>`.
 */
function reference(value) {
    const node = { type: 'reference', value };
    return node;
}
231
/** Wraps `part` between the start-of-input and end-of-input anchors. */
function anchored(part) {
    const sequence = [exports.startAnchor, part, exports.endAnchor];
    return sequence;
}
234
+ /*
235
+ function is0<T extends part0['type']>(part: part, type: T): part is Extract<part0, { type: T }> {
236
+ return typeof part !== 'string' && !Array.isArray(part) && part.type === type;
237
+ }
238
+ */
239
/**
 * Returns the tag of an AST node: 'text' for strings, 'concat' for arrays,
 * and the node's own `type` property for everything else.
 */
function type(part) {
    if (typeof part === 'string') {
        return 'text';
    }
    if (Array.isArray(part)) {
        return 'concat';
    }
    return part.type;
}
244
/**
 * Tests whether `part` has the tag `istype`, where strings count as 'text'
 * and arrays as 'concat'.
 */
function is(part, istype) {
    let actual;
    if (typeof part === 'string') {
        actual = 'text';
    }
    else if (Array.isArray(part)) {
        actual = 'concat';
    }
    else {
        actual = part.type;
    }
    return actual === istype;
}
250
/**
 * Normalises a node so it always has a `type` property: strings become
 * `{ type: 'text', part }`, arrays become `{ type: 'concat', part }`, and
 * object nodes are returned unchanged.
 */
function typed(part) {
    if (typeof part === 'string') {
        return { type: 'text', part };
    }
    if (Array.isArray(part)) {
        return { type: 'concat', part };
    }
    return part;
}
255
/**
 * Parses regular-expression source text into this module's AST.
 *
 * Node forms produced: strings (literal text), arrays (concatenation),
 * `{type:'alt'}`, `{type:'quantified'}`, `{type:'capture'}`,
 * `{type:'noncapture'}`, `{type:'reference'}`, character classes,
 * `{type:'unicode'|'notunicode'}` property escapes and the shared
 * boundary/anchor constants.
 *
 * @param re        pattern source (without delimiters or flags)
 * @param unicode   read astral characters as one code point and allow
 *                  `\u{…}`, `\p{…}`, `\P{…}`
 * @param extended  additionally accept possessive quantifiers (`x*+`) and
 *                  atomic groups (`(?>…)`)
 * @throws Error for unbalanced parentheses, a quantifier with nothing to
 *         quantify, a bad escape, a trailing backslash, or an unterminated
 *         `[` class / `{`-delimited escape
 *
 * A `{` that does not start a well-formed `{n}`, `{n,}` or `{n,m}` quantifier
 * is kept as literal text.
 */
function parse(re, unicode = true, extended = false) {
    const stack = []; // open groups ({type:'group'}) and open alternations ({type:'alt'})
    let curr = []; // parts of the concatenation currently being built
    let i = 0; // read position in `re`

    // Returns the text before the next `c` and steps past that `c`.
    function skipTo(c) {
        const start = i;
        while (i < re.length && re[i] !== c)
            i++;
        if (re[i] !== c)
            throw new Error(`Missing '${c}'`);
        return re.substring(start, i++);
    }
    // Reads a run of decimal digits; NaN when there is none.
    function int() {
        const start = i;
        while (re[i] >= '0' && re[i] <= '9')
            i++;
        return parseInt(re.substring(start, i));
    }
    // Reads exactly `count` hexadecimal digits for a \x or \u escape.
    function hexDigits(count, letter) {
        const digits = re.substring(i, i + count);
        if (digits.length !== count || !/^[0-9a-fA-F]+$/.test(digits))
            throw new Error(`bad \\${letter} escape`);
        i += count;
        return parseInt(digits, 16);
    }
    // Decodes the escape whose backslash has just been consumed. Returns a
    // character code, a character class, or a unicode-property node.
    function backslashed() {
        if (i >= re.length)
            throw new Error('trailing backslash');
        const c = re[i++];
        switch (c) {
            default: return c.charCodeAt(0);
            case 'd': return exports.digit; //digit
            case 'D': return exports.digit.not(); //non-digit
            case 'w': return exports.word; //word
            case 'W': return exports.word.not(); //non-word
            case 's': return exports.whitespace; //whitespace
            case 'S': return exports.whitespace.not(); //non-whitespace
            case 'b': return 8; //backspace (only reached inside [...])
            case 't': return 9; //tab
            case 'n': return 10; //newline
            case 'v': return 11; //vertical tab
            case 'f': return 12; //form feed
            case 'r': return 13; //carriage return
            case 'c': return re.charCodeAt(i++) & 31; //control character
            case '0': {
                // octal escape: the leading 0 plus up to three more octal digits
                const start = i - 1;
                while (re[i] >= '0' && re[i] <= '7' && i - start < 4)
                    i++;
                return parseInt(re.substring(start, i), 8);
            }
            case 'x':
                return hexDigits(2, 'x');
            case 'u':
                if (unicode && re[i] === '{') {
                    i++; // skip '{'
                    const code = parseInt(skipTo('}'), 16);
                    if (Number.isNaN(code))
                        throw new Error('bad \\u escape');
                    return code;
                }
                return hexDigits(4, 'u');
            case 'p':
            case 'P':
                if (!unicode || re[i] !== '{')
                    throw new Error('\\p and \\P can only be used with unicode enabled, and must be followed by {property}');
                i++; // skip '{'
                return { type: c === 'P' ? 'notunicode' : 'unicode', property: skipTo('}') };
        }
    }
    // Reads one member of a [...] class: a code point/unit or an escape.
    function character() {
        const code = unicode ? re.codePointAt(i++) : re.charCodeAt(i++);
        if (code > 0xffff) {
            ++i; // astral code point occupies two UTF-16 units
            return code;
        }
        return code === 92 ? backslashed() : code;
    }
    // Applies a quantifier to the most recent part, after reading an optional
    // lazy (`?`) or, in extended mode, possessive (`+`) suffix.
    function addQuantified(min, max) {
        let mod = 'greedy';
        if (re[i] === '?') {
            mod = 'lazy';
            i++;
        }
        else if (extended && re[i] === '+') {
            mod = 'possessive';
            i++;
        }
        const top = curr.pop();
        if (!top)
            throw new Error('nothing to quantify');
        if (typeof top === 'string' && top.length > 1) {
            // Only the last character of a text run is quantified. In unicode
            // mode a trailing surrogate pair is one character and stays whole.
            const last = top.charCodeAt(top.length - 1);
            const prev = top.charCodeAt(top.length - 2);
            const isPair = unicode
                && last >= 0xdc00 && last <= 0xdfff
                && prev >= 0xd800 && prev <= 0xdbff;
            const n = isPair ? 2 : 1;
            if (top.length > n)
                curr.push(top.slice(0, -n));
            curr.push(repeatFrom(top.slice(-n), min, max, mod));
        }
        else {
            curr.push(repeatFrom(top, min, max, mod));
        }
    }
    // Appends literal text, merging it into a preceding text part.
    function addText(c) {
        if (typeof curr.at(-1) === 'string') {
            curr[curr.length - 1] += c;
        }
        else {
            curr.push(c);
        }
    }
    // Pops the stack; if the entry is an open alternation, finishes it with
    // the current concatenation, makes it the current content, and pops the
    // entry below it. Returns the (possibly undefined) remaining entry.
    function closeAlt() {
        let top = stack.pop();
        if (top?.type === 'alt') {
            top.parts.push(concatenation(curr));
            curr = [top];
            top = stack.pop();
        }
        return top;
    }
    // Global flag so the search can resume at `i` without re-slicing `re`.
    const specialChars = /[\\^$*+?{()|[.]/g;
    while (i < re.length) {
        specialChars.lastIndex = i;
        const found = specialChars.exec(re);
        if (!found) {
            addText(re.substring(i));
            break;
        }
        if (found.index > i)
            addText(re.substring(i, found.index));
        i = found.index;
        switch (re[i++]) {
            case '\\':
                if (re[i] === 'b') {
                    i++;
                    curr.push(exports.wordBoundary);
                }
                else if (re[i] === 'B') {
                    i++;
                    curr.push(exports.nonWordBoundary);
                }
                else if (re[i] >= '1' && re[i] <= '9') {
                    const n = int();
                    curr.push({ type: 'reference', value: n });
                }
                else if (re[i] === 'k' && re[i + 1] === '<') {
                    i += 2;
                    const name = skipTo('>');
                    curr.push({ type: 'reference', value: name });
                }
                else {
                    const b = backslashed();
                    if (typeof b === 'number')
                        addText(String.fromCodePoint(b));
                    else
                        curr.push(b);
                }
                break;
            case '.':
                curr.push(exports.any);
                break;
            //Boundary-type assertions
            case '^':
                curr.push(exports.startAnchor);
                break;
            case '$':
                curr.push(exports.endAnchor);
                break;
            //Quantifiers
            case '*':
                addQuantified(0, -1);
                break;
            case '+':
                addQuantified(1, -1);
                break;
            case '?':
                addQuantified(0, 1);
                break;
            case '{': {
                const afterBrace = i;
                const min = int();
                let max = min;
                if (re[i] === ',') {
                    ++i; // skip ','
                    max = re[i] !== '}' ? int() : -1;
                }
                if (Number.isNaN(min) || Number.isNaN(max) || re[i] !== '}') {
                    // not {n}, {n,} or {n,m}: rewind and keep the brace as text
                    i = afterBrace;
                    addText('{');
                    break;
                }
                ++i; // skip '}'
                addQuantified(min, max);
                break;
            }
            //Alternation
            case '|': {
                const top = stack.at(-1);
                if (top?.type === 'alt') {
                    top.parts.push(concatenation(curr));
                }
                else {
                    stack.push({ type: 'alt', parts: [concatenation(curr)] });
                }
                curr = [];
                break;
            }
            //Groups
            case '(': {
                let group;
                const dummy = ''; // placeholder body, replaced when ')' is reached
                if (re[i] === '?') {
                    i++;
                    switch (re[i++]) {
                        case ':':
                            group = noncapture(dummy);
                            break;
                        case '=':
                            group = noncapture(dummy, 'ahead');
                            break;
                        case '!':
                            group = noncapture(dummy, 'neg_ahead');
                            break;
                        case '<':
                            if (re[i] === '=') {
                                i++;
                                group = noncapture(dummy, 'behind');
                            }
                            else if (re[i] === '!') {
                                i++;
                                group = noncapture(dummy, 'neg_behind');
                            }
                            else {
                                group = capture(dummy, skipTo('>'));
                            }
                            break;
                        case '>':
                            if (extended) {
                                group = noncapture(dummy, 'atomic');
                                break;
                            }
                        // outside extended mode '>' falls through to flag parsing
                        default: {
                            // (?flags:...) / (?flags-flags:...): flags after '-' are switched off
                            let set = true;
                            const flags = {};
                            --i; // go back to first flag character
                            while (i < re.length) {
                                const f = re[i++];
                                if (f === ':')
                                    break;
                                if (f === '-')
                                    set = false;
                                else if (f === 'i' || f === 'm' || f === 's')
                                    flags[f] = set;
                            }
                            group = noncapture(dummy, flags);
                            break;
                        }
                    }
                }
                else {
                    group = capture(dummy, '');
                }
                // remember what preceded the group so ')' can restore it
                stack.push({ type: 'group', group: group, tos: curr });
                curr = [];
                break;
            }
            case ')': {
                const top = closeAlt();
                if (top?.type !== 'group')
                    throw new Error('unmatched )');
                top.group.part = concatenation(curr);
                curr = [...top.tos, top.group];
                break;
            }
            //Character classes
            case '[': {
                const neg = re[i] === '^';
                if (neg)
                    i++;
                const cs = new characterClass(false);
                // a leading ']' or '-' is a literal member
                if (re[i] === ']' || re[i] === '-')
                    cs.set(re.charCodeAt(i++));
                while (i < re.length && re[i] !== ']') {
                    const from = character();
                    if (typeof from === 'number') {
                        if (re[i] === '-' && i + 1 < re.length && re[i + 1] !== ']') {
                            ++i;
                            const to = character();
                            if (typeof to !== 'number' || from > to)
                                throw new Error('bad character class');
                            cs.setRange(from, to + 1);
                        }
                        else {
                            cs.set(from);
                        }
                    }
                    else if (is(from, 'class')) {
                        cs.selfUnion(from);
                    }
                    // NOTE: \p{...}/\P{...} members are neither numbers nor
                    // classes and are currently ignored inside [...].
                }
                if (re[i] !== ']')
                    throw new Error(`Missing ']'`);
                i++; // skip ']'
                curr.push(neg ? cs.selfNot() : cs);
                break;
            }
        }
    }
    const top = closeAlt();
    if (top)
        throw new Error('unmatched (');
    return concatenation(curr);
}
548
+ //-----------------------------------------------------------------------------
549
+ // Regex to string
550
+ //-----------------------------------------------------------------------------
551
/**
 * Renders a quantifier: `*`, `+`, `?`, `{n,}`, `{n}` or `{n,m}` (with
 * `max === -1` meaning unbounded), followed by `?` for lazy or `+` for
 * possessive matching.
 */
function printQuantified(min, max, mod) {
    let base;
    if (max === -1) {
        if (min === 0)
            base = '*';
        else if (min === 1)
            base = '+';
        else
            base = `{${min},}`;
    }
    else if (min === 0 && max === 1) {
        base = '?';
    }
    else if (min === max) {
        base = `{${min}}`;
    }
    else {
        base = `{${min},${max}}`;
    }
    let suffix = '';
    if (mod === 'lazy')
        suffix = '?';
    else if (mod === 'possessive')
        suffix = '+';
    return base + suffix;
}
559
/**
 * Prints each element of `parts` and joins the results with `join`
 * ('' for a sequence, '|' for an alternation).
 *
 * - A quantified text part that directly follows another text part is
 *   wrapped in `(?:…)` before its quantifier. The text is escaped through
 *   toRegExpString; it used to be interpolated raw, so a literal such as `.`
 *   or `(` leaked into the output as regex syntax.
 * - A nested alternation is wrapped in `(?:…)` so its `|` cannot merge with
 *   the surrounding context.
 */
function list(parts, join) {
    return parts.map((p, i) => {
        if (is(p, 'quantified') && is(p.part, 'text') && i > 0 && is(parts[i - 1], 'text'))
            return `(?:${toRegExpString(p.part)})` + printQuantified(p.min, p.max, p.mod);
        const s = toRegExpString(p);
        return is(p, 'alt') ? `(?:${s})` : s;
    }).join(join);
}
567
/**
 * Converts an AST node back into regular-expression source text.
 *
 * Fixes relative to the previous version:
 * - a non-capturing group without options prints as `(?:…)`; the `:` was
 *   missing, giving the invalid `(?…)`;
 * - a negated class equal to `\w`, `\d` or `\s` prints as `\W`, `\D`, `\S`
 *   instead of the un-negated shorthand;
 * - the operand of a quantifier is wrapped in `(?:…)` when it is not a single
 *   unit (multi-character text, sequence, alternation, another quantifier),
 *   so the quantifier applies to the whole operand.
 *
 * Returns undefined for a node type that is not listed in the switch.
 */
function toRegExpString(part) {
    if (typeof part === 'string')
        return part.replace(/[\\^$*+?.()|[\]{}]/g, '\\$&');
    if (Array.isArray(part))
        return list(part, '');
    switch (part.type) {
        case 'alt':
            return list(part.parts, '|');
        case 'quantified': {
            const operand = part.part;
            const body = toRegExpString(operand);
            // A one-code-unit string or a self-delimiting node can take the
            // quantifier directly; anything else needs its own group.
            const isUnit = typeof operand === 'string'
                ? operand.length === 1
                : !Array.isArray(operand) && operand.type !== 'alt' && operand.type !== 'quantified';
            return (isUnit ? body : `(?:${body})`) + printQuantified(part.min, part.max, part.mod);
        }
        case 'noncapture': {
            let header = ':'; // plain (?:...) unless options say otherwise
            const opts = part.options;
            if (opts) {
                if (typeof opts === 'string') {
                    header = {
                        ahead: '=',
                        behind: '<=',
                        neg_ahead: '!',
                        neg_behind: '<!',
                        atomic: '>'
                    }[opts];
                }
                else if ((opts.i ?? opts.m ?? opts.s) !== undefined) {
                    // true => flag switched on, false => listed after '-'
                    const posflags = (opts.i ? 'i' : '') + (opts.m ? 'm' : '') + (opts.s ? 's' : '');
                    const negflags = (opts.i === false ? 'i' : '') + (opts.m === false ? 'm' : '') + (opts.s === false ? 's' : '');
                    header = `${posflags}${negflags ? '-' : ''}${negflags}:`;
                }
            }
            return `(?${header}${toRegExpString(part.part)})`;
        }
        case 'capture':
            return `(${part.name ? `?<${part.name}>` : ''}${toRegExpString(part.part)})`;
        case 'class': {
            if (part.contains(exports.any))
                return '.';
            // Work on the positive form and remember whether to negate it.
            const neg = part.isNegated();
            const temp = neg ? part.not() : part.mutable();
            const has_w = temp.contains(exports.word);
            if (has_w) {
                if (exports.word.contains(temp))
                    return neg ? '\\W' : '\\w';
                temp.selfIntersect(exports.word.not());
            }
            // \d is a subset of \w, so only look for it when \w was not found
            const has_d = !has_w && temp.contains(exports.digit);
            if (has_d) {
                if (exports.digit.contains(temp))
                    return neg ? '\\D' : '\\d';
                temp.selfIntersect(exports.digit.not());
            }
            const has_s = temp.contains(exports.whitespace);
            if (has_s) {
                if (!has_w && !has_d && exports.whitespace.contains(temp))
                    return neg ? '\\S' : '\\s';
                temp.selfIntersect(exports.whitespace.not());
            }
            // shorthands first, then whatever members are left over
            return `[${neg ? '^' : ''}${has_w ? '\\w' : has_d ? '\\d' : ''}${has_s ? '\\s' : ''}${temp.toString()}]`;
        }
        case 'unicode':
            return `\\p{${part.property}}`;
        case 'notunicode':
            return `\\P{${part.property}}`;
        case 'wordbound':
            return '\\b';
        case 'nowordbound':
            return '\\B';
        case 'inputboundstart':
            return '^';
        case 'inputboundend':
            return '$';
        case 'reference':
            return typeof part.value === 'number' ? `\\${part.value}` : `\\k<${part.value}>`;
    }
}
642
/** Compiles an AST node into a RegExp, passing `flags` to the constructor. */
function toRegExp(part, flags) {
    const source = toRegExpString(part);
    return new RegExp(source, flags);
}
645
+ //-----------------------------------------------------------------------------
646
+ // Regex AST manipulation
647
+ //-----------------------------------------------------------------------------
648
/**
 * Rebuilds an AST bottom-up.
 *
 * `previsit`, when given, may replace a node before its children are visited
 * (returning null/undefined keeps the node). Children of sequences,
 * alternations, quantifiers and groups are visited recursively and the parent
 * is rebuilt through the corresponding builder. Finally `visitor` may replace
 * the rebuilt node in the same way.
 */
function visit(part, visitor, previsit) {
    if (previsit)
        part = previsit(part) ?? part;
    const descend = (child) => visit(child, visitor, previsit);
    const node = typed(part);
    if (node.type === 'concat') {
        part = concatenation(part.map(descend));
    }
    else if (node.type === 'alt') {
        part = alternation(node.parts.map(descend));
    }
    else if (node.type === 'quantified') {
        part = repeatFrom(descend(node.part), node.min, node.max, node.mod);
    }
    else if (node.type === 'noncapture') {
        part = noncapture(descend(node.part), node.options);
    }
    else if (node.type === 'capture') {
        part = capture(descend(node.part), node.name);
    }
    return visitor(part) ?? part;
}
679
/**
 * Returns the longest prefix (in UTF-16 code units) shared by all strings.
 * With fewer than two strings there is nothing to compare and the result
 * is ''.
 */
function commonPrefix(strings) {
    if (strings.length < 2)
        return '';
    return strings.slice(1).reduce((prefix, candidate) => {
        let shared = 0;
        while (shared < prefix.length && shared < candidate.length && prefix[shared] === candidate[shared])
            shared += 1;
        return prefix.slice(0, shared);
    }, strings[0]);
}
691
/**
 * Simplifies an AST without changing what it matches.
 *
 * - Sequences: nested sequences are flattened and adjacent text is merged.
 * - Alternations: duplicate alternatives are dropped (compared by printed
 *   regex text, first occurrence kept); a shared literal prefix is factored
 *   out when every alternative is text; runs of adjacent single characters
 *   and classes are merged into one class.
 * - Classes with exactly one member become plain text.
 *
 * Fixes relative to the previous version: de-duplication replaced every
 * alternative by its printed string, which turned nodes into literal text
 * and got escaped a second time on output; prefix factoring is now limited
 * to all-text alternations, because a factored prefix must not be applied
 * to alternatives that do not begin with it.
 */
function optimize(part) {
    return visit(part, p => {
        if (Array.isArray(p)) {
            const result = [];
            let current = "";
            for (const i of p) {
                if (typeof i === 'string') {
                    current += i;
                }
                else {
                    if (current) {
                        result.push(current);
                        current = "";
                    }
                    if (is(i, 'concat'))
                        result.push(...i);
                    else
                        result.push(i);
                }
            }
            if (current)
                result.push(current);
            return concatenation(result);
        }
        else if (is(p, 'alt')) {
            // de-duplicate by printed form, but keep the original nodes
            const seen = new Set();
            const unique = p.parts.filter(alt => {
                const key = toRegExpString(alt);
                if (seen.has(key))
                    return false;
                seen.add(key);
                return true;
            });
            const strings = unique.filter(alt => typeof alt === 'string');
            const prefix = strings.length === unique.length ? commonPrefix(strings) : '';
            if (prefix) {
                return concatenation([prefix, alternation(strings.map(s => s.slice(prefix.length)))]);
            }
            // Merge only neighbouring single characters/classes, so the
            // relative order of the alternatives is preserved.
            const result = [];
            let cs;
            for (const alt of unique) {
                if (typeof alt === 'string' && alt.length === 1) {
                    if (!cs)
                        cs = new MutablecharacterClass(false);
                    cs.setChar(alt);
                }
                else if (is(alt, 'class')) {
                    if (!cs)
                        cs = alt.mutable();
                    else
                        cs.selfUnion(alt);
                }
                else {
                    if (cs) {
                        result.push(cs);
                        cs = undefined;
                    }
                    result.push(alt);
                }
            }
            if (cs)
                result.push(cs);
            return alternation(result);
        }
        else if (is(p, 'class')) {
            // a class with exactly one member is just that character
            const i = p.next(-1);
            if (i >= 0 && p.next(i) === -1)
                return String.fromCharCode(i);
        }
        return p;
    });
}
768
+ // Step 1: Build Thompson NFA from regex AST
769
+ function buildNFA(part) {
770
+ let stateId = 0;
771
+ function newState() {
772
+ return { id: stateId++, transitions: new Map(), epsilonTransitions: [], isAccepting: false };
773
+ }
774
+ function build(p) {
775
+ if (typeof p === 'string') {
776
+ // Character literal: start --'c'--> accept
777
+ const start = newState();
778
+ const accept = newState();
779
+ start.transitions.set(p, [accept]);
780
+ return { start, accept };
781
+ }
782
+ if (Array.isArray(p)) {
783
+ let current = build(p[0]);
784
+ for (let i = 1; i < p.length; i++) {
785
+ const next = build(p[i]);
786
+ current.accept.epsilonTransitions.push(next.start);
787
+ current.accept = next.accept;
788
+ }
789
+ return current;
790
+ }
791
+ switch (p.type) {
792
+ case 'alt': {
793
+ // Alternation: start --ε--> frag1.start, frag2.start, ... --ε--> accept
794
+ const start = newState();
795
+ const accept = newState();
796
+ for (const alt of p.parts) {
797
+ const frag = build(alt);
798
+ start.epsilonTransitions.push(frag.start);
799
+ frag.accept.epsilonTransitions.push(accept);
800
+ }
801
+ return { start, accept };
802
+ }
803
+ case 'quantified': {
804
+ const frag = build(p.part);
805
+ const start = newState();
806
+ const accept = newState();
807
+ // Handle min repetitions
808
+ let current = start;
809
+ for (let i = 0; i < p.min; i++) {
810
+ const copy = build(p.part);
811
+ current.epsilonTransitions.push(copy.start);
812
+ current = copy.accept;
813
+ }
814
+ // Handle optional repetitions or infinite
815
+ if (p.max === -1) {
816
+ // Infinite: can loop back
817
+ current.epsilonTransitions.push(frag.start);
818
+ frag.accept.epsilonTransitions.push(frag.start, accept);
819
+ current.epsilonTransitions.push(accept);
820
+ }
821
+ else {
822
+ // Finite: add optional copies
823
+ for (let i = p.min; i < p.max; i++) {
824
+ current.epsilonTransitions.push(accept); // can skip
825
+ const copy = build(p.part);
826
+ current.epsilonTransitions.push(copy.start);
827
+ current = copy.accept;
828
+ }
829
+ current.epsilonTransitions.push(accept);
830
+ }
831
+ return { start, accept };
832
+ }
833
+ case 'class': {
834
+ // Character class: single transition with multiple chars
835
+ const start = newState();
836
+ const accept = newState();
837
+ //if (p.contains(any))
838
+ // return '.';
839
+ // Add transition for each character in the class
840
+ for (let i = p.next(-1); i !== -1; i = p.next(i)) {
841
+ const char = String.fromCodePoint(i);
842
+ const existing = start.transitions.get(char) || [];
843
+ existing.push(accept);
844
+ start.transitions.set(char, existing);
845
+ }
846
+ return { start, accept };
847
+ }
848
+ case 'wordbound':
849
+ case 'nowordbound':
850
+ case 'inputboundstart':
851
+ case 'inputboundend': {
852
+ // Anchors: epsilon transition with special handling
853
+ const start = newState();
854
+ const accept = newState();
855
+ // Mark transition with anchor type for special processing
856
+ start.transitions.set(`__${p.type}__`, [accept]);
857
+ return { start, accept };
858
+ }
859
+ case 'noncapture':
860
+ case 'capture':
861
+ // Groups: just pass through (captures handled at higher level)
862
+ return build(p.part);
863
+ default:
864
+ throw new Error(`Unsupported: ${p.type}`);
865
+ }
866
+ }
867
+ const result = build(part);
868
+ result.accept.isAccepting = true;
869
+ return result;
870
+ }
871
+ // Step 3: Subset Construction - convert NFA to DFA
872
+ function NFAtoDFA(nfaStart) {
873
+ const nfa = [];
874
+ const alphabet = new Set();
875
+ function collectStates(state) {
876
+ if (!nfa[state.id]) {
877
+ nfa[state.id] = state;
878
+ for (const char of state.transitions.keys())
879
+ alphabet.add(char);
880
+ for (const targets of state.transitions.values())
881
+ targets.forEach(collectStates);
882
+ state.epsilonTransitions.forEach(collectStates);
883
+ }
884
+ }
885
+ collectStates(nfaStart);
886
+ const dfaStates = new Map();
887
+ let dfaStateId = 0;
888
+ function stateSetKey(states) {
889
+ return [...states].sort().join(',');
890
+ }
891
+ // Epsilon closure - find all states reachable via ε-transitions
892
+ function epsilonClosure(states) {
893
+ const closure = new Set(states);
894
+ const stack = [...states];
895
+ while (stack.length > 0) {
896
+ const id = stack.pop();
897
+ for (const next of nfa[id].epsilonTransitions) {
898
+ if (!closure.has(next.id)) {
899
+ closure.add(next.id);
900
+ stack.push(next.id);
901
+ }
902
+ }
903
+ }
904
+ return closure;
905
+ }
906
+ function createDFAState(states) {
907
+ const isAccepting = [...states].some(id => nfa[id]?.isAccepting);
908
+ return { /*id: dfaStateId++, */ nfaStates: states, transitions: new Map(), isAccepting };
909
+ }
910
+ // Start with epsilon closure of initial state
911
+ const start = new Set([nfaStart.id]);
912
+ const startClosure = epsilonClosure(start);
913
+ const startDFA = createDFAState(startClosure);
914
+ dfaStates.set(stateSetKey(startClosure), startDFA);
915
+ const worklist = [startDFA];
916
+ while (worklist.length > 0) {
917
+ const currentDFA = worklist.pop();
918
+ // For each character in alphabet
919
+ for (const char of alphabet) {
920
+ // Collect all NFA states reachable by this character
921
+ const next = new Set();
922
+ for (const id of currentDFA.nfaStates) {
923
+ const targets = nfa[id]?.transitions.get(char) || [];
924
+ targets.forEach(target => next.add(target.id));
925
+ }
926
+ if (next.size > 0) {
927
+ // Take epsilon closure of the result
928
+ const nextClosure = epsilonClosure(next);
929
+ // Check if this set of NFA states already has a corresponding DFA state
930
+ const key = stateSetKey(nextClosure);
931
+ let nextDFA = dfaStates.get(key);
932
+ if (!nextDFA) {
933
+ nextDFA = createDFAState(nextClosure);
934
+ dfaStates.set(key, nextDFA);
935
+ worklist.push(nextDFA);
936
+ }
937
+ currentDFA.transitions.set(char, nextDFA);
938
+ }
939
+ }
940
+ }
941
+ return startDFA;
942
+ }
943
+ function runDFA(dfa, str) {
944
+ let currentState = dfa;
945
+ for (const char of str) {
946
+ currentState = currentState.transitions.get(char);
947
+ if (!currentState)
948
+ return false;
949
+ }
950
+ return currentState.isAccepting;
951
+ }
952
+ // Usage:
953
/**
 * Convenience wrapper: builds the NFA for an AST node and converts it into a
 * DFA, returning the DFA start state (usable with runDFA).
 */
function regexToDFA(part) {
    const { start } = buildNFA(part);
    return NFAtoDFA(start);
}
957
+ ///
958
/**
 * Translates a glob pattern into regular-expression source text (without
 * anchors).
 *
 * - `*` matches within one path segment (`[^/]*`), `**` matches anything (`.*`)
 * - `?` matches one character (`.`)
 * - `[...]` is copied as a character class; a leading `!` or `^` negates it
 * - `[:name:]` is replaced by the matching entry of posixClasses
 * - `{a,b}` becomes the group `(a|b)`; unclosed braces are closed at the end
 *   with a console warning
 * - `\x` matches `x` literally
 *
 * Fixes relative to the previous version: a trailing backslash appended the
 * text "undefined" to the result; `\\`, `\{` and `\}` were emitted unescaped
 * (yielding a dangling regex escape or a quantifier). Escaped characters that
 * are not regex syntax are now emitted without a backslash, which matches the
 * same text as before.
 */
function parseGlob(glob) {
    let result = '';
    let depth = 0; // number of currently open '{'
    for (let i = 0; i < glob.length; ++i) {
        let c = glob[i];
        switch (c) {
            case '\\':
                if (i + 1 >= glob.length) {
                    // nothing left to escape: match the backslash itself
                    c = '\\\\';
                    break;
                }
                c = glob[++i];
                // keep the backslash only where the regex needs it
                if ('\\^$.*+?()[]{}|'.includes(c))
                    result += '\\';
                break;
            case '*':
                if (glob[i + 1] === '*') {
                    result += '.*';
                    ++i;
                }
                else {
                    result += '[^/]*';
                }
                continue;
            case '?':
                c = '.';
                break;
            case '+':
            case '.':
            case '^':
            case '$':
            case '(':
            case ')':
            case '|':
                // regex syntax with no glob meaning: escape it
                result += `\\`;
                break;
            case '[': {
                const end = glob.indexOf(']', i + 1);
                if (end > i) {
                    const next = glob[i + 1];
                    if (next === ':' && glob[end - 1] === ':') {
                        const p = posixClasses[glob.slice(i + 2, end - 1)];
                        if (p) {
                            result += `[${p}]`;
                            i = end;
                            continue;
                        }
                        else {
                            console.log(`Warning: Unknown POSIX class ${glob.slice(i + 2, end - 1)} in glob pattern ${glob}`);
                        }
                    }
                    const neg = next === '!' || next === '^';
                    result += `[${neg ? '^' : ''}${glob.slice(neg ? i + 2 : i + 1, end)}]`;
                    i = end;
                    continue;
                }
                // no closing ']': treat '[' as a literal character
                result += '\\';
                break;
            }
            case '{':
                ++depth;
                c = '(';
                break;
            case '}':
                if (depth > 0) {
                    --depth;
                    c = ')';
                }
                break;
            case ',':
                // a comma separates alternatives only inside braces
                if (depth > 0)
                    c = '|';
                break;
        }
        result += c;
    }
    if (depth > 0) {
        console.log(`Warning: Unmatched { in glob pattern ${glob}`);
        result += ')'.repeat(depth);
    }
    return result;
}
1036
/**
 * Compiles regex source so that it must match the whole input.
 *
 * The source is wrapped in a non-capturing group before the anchors are
 * added: `|` binds more loosely than `^`/`$`, so `^a|b$` would accept any
 * input starting with "a" or ending with "b".
 *
 * When the source contains a `\p{…}`/`\P{…}` property escape (as produced
 * for POSIX classes), the `u` flag is used, because without it those escapes
 * are not property escapes at all. If the source is not valid in unicode
 * mode, compilation falls back to the flag-less form used previously.
 */
function anchoredRe(re) {
    const source = `^(?:${re})$`;
    if (/\\[pP]\{/.test(source)) {
        try {
            return new RegExp(source, 'u');
        }
        catch {
            // not valid under 'u': fall through to the legacy compilation
        }
    }
    return new RegExp(source);
}
1039
/** Converts one glob pattern into a RegExp that must match the whole input. */
function globToRe(glob) {
    const source = parseGlob(glob);
    return anchoredRe(source);
}
1042
/**
 * Converts several glob patterns into one RegExp that matches an input when
 * any of the globs matches it completely.
 *
 * The alternatives are grouped before anchoring. Joined with a bare `|`, the
 * start anchor applied only to the first glob and the end anchor only to the
 * last one, so the other globs could match a mere prefix or suffix.
 */
function globToReMulti(globs) {
    const alternatives = globs.map(parseGlob).join('|');
    return anchoredRe(`(?:${alternatives})`);
}