wikipeg 4.0.2 → 6.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. package/HISTORY.md +556 -0
  2. package/README.md +230 -12
  3. package/VERSION +1 -1
  4. package/bin/wikipeg +8 -4
  5. package/examples/css.pegphp +9 -8
  6. package/lib/compiler/asts.js +30 -10
  7. package/lib/compiler/charsets.js +306 -0
  8. package/lib/compiler/language/javascript.js +107 -33
  9. package/lib/compiler/language/php.js +193 -55
  10. package/lib/compiler/passes/analyze-always-match.js +141 -0
  11. package/lib/compiler/passes/analyze-first.js +245 -0
  12. package/lib/compiler/passes/ast-to-code.js +316 -100
  13. package/lib/compiler/passes/inline-simple-rules.js +96 -0
  14. package/lib/compiler/passes/optimize-character-class.js +147 -0
  15. package/lib/compiler/passes/optimize-failure-reporting.js +65 -0
  16. package/lib/compiler/passes/remove-proxy-rules.js +7 -5
  17. package/lib/compiler/passes/report-infinite-loops.js +4 -1
  18. package/lib/compiler/passes/report-left-recursion.js +3 -4
  19. package/lib/compiler/passes/report-unknown-attributes.js +39 -0
  20. package/lib/compiler/passes/transform-common-lang.js +1 -1
  21. package/lib/compiler/traverser.js +1 -2
  22. package/lib/compiler/visitor.js +5 -7
  23. package/lib/compiler.js +24 -10
  24. package/lib/parser.js +2784 -3088
  25. package/lib/peg.js +7 -15
  26. package/lib/runtime/template.js +9 -1
  27. package/lib/utils/CaseFolding.txt +1654 -0
  28. package/lib/utils/arrays.js +0 -72
  29. package/lib/utils/casefold.js +697 -0
  30. package/lib/utils/objects.js +9 -39
  31. package/lib/utils/unicode.js +34 -0
  32. package/package.json +6 -4
  33. package/src/DefaultTracer.php +18 -18
  34. package/src/PEGParserBase.php +53 -28
  35. package/src/SyntaxError.php +4 -4
  36. package/src/Tracer.php +1 -1
  37. package/lib/compiler/opcodes.js +0 -54
@@ -43,6 +43,7 @@ let php = {
43
43
  maxFailPos: '$this->maxFailPos',
44
44
  assertionSuccess: 'false',
45
45
  inputLength: '$this->inputLength',
46
+ advanceInputChar: 'self::advanceChar($this->input, $this->currPos);',
46
47
  consumeInputChar: 'self::consumeChar($this->input, $this->currPos);',
47
48
  result: '$result',
48
49
  actionArgPrefix: '$',
@@ -233,7 +234,20 @@ let php = {
233
234
  return escapedChars.join('');
234
235
  },
235
236
 
236
- matchLiteral(node, reg, result) {
237
+ classToRegexp(node) {
238
+ return '['
239
+ + (node.inverted ? '^' : '')
240
+ + node.parts.map(function(part) {
241
+ return part instanceof Array
242
+ ? php.regexpClassEscape(part[0])
243
+ + '-'
244
+ + php.regexpClassEscape(part[1])
245
+ : php.regexpClassEscape(part);
246
+ }).join('')
247
+ + ']';
248
+ },
249
+
250
+ matchLiteral(node, reg, result, discard, discardPos) {
237
251
  let literalLength = getUtf8Length(node.value);
238
252
  let escapedValue = php.stringify(node.value);
239
253
 
@@ -241,9 +255,11 @@ let php = {
241
255
  if (literalLength === 1 && !node.ignoreCase) {
242
256
  result.condition = `($this->input[$this->currPos] ?? null) === ${escapedValue}`;
243
257
  result.onSuccess([
244
- `$this->currPos++;`,
245
- `${reg} = ${php.stringify(node.value)};`
258
+ `${reg} = ${discard ? 'true' : php.stringify(node.value)};`
246
259
  ]);
260
+ if (!discardPos) {
261
+ result.onSuccess([`$this->currPos++;`]);
262
+ }
247
263
  return;
248
264
  }
249
265
 
@@ -266,15 +282,18 @@ let php = {
266
282
  ].join(', ') + ') === 0';
267
283
  if (node.ignoreCase) {
268
284
  result.onSuccess([
269
- `${reg} = substr($this->input, $this->currPos, ${literalLength});`,
270
- `$this->currPos += ${literalLength};`
285
+ discard ?
286
+ `${reg} = true;` :
287
+ `${reg} = substr($this->input, $this->currPos, ${literalLength});`
271
288
  ]);
272
289
  } else {
273
290
  result.onSuccess([
274
- `${reg} = ${php.stringify(node.value)};`,
275
- `$this->currPos += ${literalLength};`
291
+ `${reg} = ${discard ? 'true' : php.stringify(node.value)};`,
276
292
  ]);
277
293
  }
294
+ if (!discardPos) {
295
+ result.onSuccess([`$this->currPos += ${literalLength};`]);
296
+ }
278
297
  return;
279
298
  }
280
299
 
@@ -290,25 +309,13 @@ let php = {
290
309
  result.block.push(`${reg} = self::charAt($this->input, $this->currPos);`);
291
310
  }
292
311
  result.condition = `mb_strtolower(${reg}) === ${php.stringify(node.value.toLowerCase())}`;
293
- result.onSuccess([`$this->currPos += strlen(${reg});`]);
312
+ if (!discardPos) {
313
+ result.onSuccess([`$this->currPos += strlen(${reg});`]);
314
+ }
294
315
  },
295
316
 
296
- matchClass(node, reg, result) {
317
+ analyzeClass(node) {
297
318
  let parts = node.parts;
298
-
299
- // Empty class
300
- if (node.parts.length === 0) {
301
- if (node.inverted) {
302
- // Same as .
303
- result.condition = '$this->currPos < $this->inputLength';
304
- result.onSuccess([`${reg} = self::consumeChar($this->input, $this->currPos);`]);
305
- } else {
306
- // Always fail
307
- result.condition = 'false';
308
- }
309
- return;
310
- }
311
-
312
319
  // Analyze for the potential special case of a class composed of individual
313
320
  // characters
314
321
  let hasRanges = false;
@@ -338,10 +345,119 @@ let php = {
338
345
  }
339
346
  }
340
347
  }
348
+ return { hasRanges: hasRanges, hasNonAscii: hasNonAscii, chars: chars };
349
+ },
350
+
351
+ matchRepeatedClass(node, reg, result, atLeastOne, discard, discardPos) {
352
+ if (node.parts.length === 0) {
353
+ if (node.inverted) {
354
+ // Same as .* / .+
355
+ result.condition = atLeastOne ? '$this->currPos < $this->inputLength' : 'true';
356
+ if (!discard) {
357
+ result.onSuccess([`${reg} = mb_str_split(substr($this->input, $this->currPos), 1, 'utf-8');`]);
358
+ }
359
+ if (!discardPos) {
360
+ result.onSuccess([`$this->currPos = $this->inputLength;`]);
361
+ }
362
+ } else if (atLeastOne) {
363
+ // Always fail
364
+ result.condition = 'false';
365
+ } else {
366
+ // Zero length match
367
+ result.condition = 'true';
368
+ result.onSuccess([`${reg} = [];`]);
369
+ }
370
+ return;
371
+ }
372
+ let {hasRanges,hasNonAscii,chars} = php.analyzeClass(node);
373
+
374
+ // ASCII character lists can be done with strspn/strcspn
375
+ if (!hasRanges && !hasNonAscii) {
376
+ if (node.inverted) {
377
+ result.block.push(`${reg} = strcspn($this->input, ${php.stringify(chars.join(''))}, $this->currPos);`);
378
+ } else {
379
+ result.block.push(`${reg} = strspn($this->input, ${php.stringify(chars.join(''))}, $this->currPos);`);
380
+ }
381
+ result.condition = atLeastOne ? `${reg} > 0` : "true";
382
+ if (discard) {
383
+ if (!discardPos) {
384
+ result.onSuccess([`$this->currPos += ${reg};`]);
385
+ }
386
+ } else {
387
+ // Note that on PHP <= 8.1, str_split('') returns [''] not [], so only
388
+ // use it if we're guaranteed at least one match.
389
+ if (!discardPos) {
390
+ result.onSuccess([
391
+ `$this->currPos += ${reg};`,
392
+ `${reg} = substr($this->input, $this->currPos - ${reg}, ${reg});`,
393
+ ]);
394
+ } else {
395
+ result.onSuccess([
396
+ `${reg} = substr($this->input, $this->currPos, ${reg});`
397
+ ]);
398
+ }
399
+ result.onSuccess([
400
+ hasNonAscii || node.inverted || (!atLeastOne) ?
401
+ `${reg} = mb_str_split(${reg}, 1, "utf-8");` :
402
+ `${reg} = str_split(${reg});`,
403
+ ]);
404
+ }
405
+ return;
406
+ }
407
+
408
+ // Otherwise we shall construct a regex
409
+ let regexp = '/'
410
+ + php.classToRegexp(node)
411
+ + (atLeastOne ? '+' : '*')+'/A'
412
+ + (node.ignoreCase ? 'i' : '')
413
+ + (hasNonAscii ? 'u' : '');
414
+ result.block.push(`${reg} = null;`);
415
+ result.condition = `preg_match(${php.stringify(regexp)}, $this->input, ${reg}, 0, $this->currPos)`;
416
+ if (!discardPos) {
417
+ result.onSuccess([`$this->currPos += strlen(${reg}[0]);`]);
418
+ }
419
+ if (discard) {
420
+ // free the match result array
421
+ result.onSuccess([`${reg} = true;`]);
422
+ } else {
423
+ // See above: str_split() is only safe to use if at least one match.
424
+ if (hasNonAscii || node.inverted || (!atLeastOne)) {
425
+ result.onSuccess([`${reg} = mb_str_split(${reg}[0], 1, "utf-8");`]);
426
+ } else {
427
+ result.onSuccess([`${reg} = str_split(${reg}[0]);`]);
428
+ }
429
+ }
430
+ },
431
+
432
+ matchClass(node, reg, result, discard, discardPos) {
433
+
434
+ // Empty class
435
+ if (node.parts.length === 0) {
436
+ if (node.inverted) {
437
+ // Same as .
438
+ result.condition = '$this->currPos < $this->inputLength';
439
+ if (discard) {
440
+ result.onSuccess([`${reg} = true;`]);
441
+ if (!discardPos) {
442
+ result.onSuccess([
443
+ `self::advanceChar($this->input, $this->currPos);`,
444
+ ]);
445
+ }
446
+ } else {
447
+ result.onSuccess([`${reg} = self::consumeChar($this->input, $this->currPos);`]);
448
+ }
449
+ } else {
450
+ // Always fail
451
+ result.condition = 'false';
452
+ }
453
+ return;
454
+ }
455
+
456
+ let {hasRanges,hasNonAscii,chars} = php.analyzeClass(node);
341
457
 
342
458
  // Character lists can be done by getting the next character and comparing
343
459
  // it sequentially or looking up in a hashtable
344
- if (!hasRanges && (hasNonAscii || parts.length <= 2 || php.config.preferClassHashtable)) {
460
+ if (!hasRanges && (node.parts.length <= 2 || php.config.preferClassHashtable)) {
345
461
  if (hasNonAscii || node.inverted) {
346
462
  result.block = [`${reg} = self::charAt($this->input, $this->currPos);`];
347
463
  } else {
@@ -363,52 +479,69 @@ let php = {
363
479
  if (node.inverted) {
364
480
  result.condition = `${reg} !== '' && !(${result.condition})`;
365
481
  }
366
- if (hasNonAscii || node.inverted) {
367
- result.onSuccess([`$this->currPos += strlen(${reg});`]);
368
- } else {
369
- result.onSuccess([`$this->currPos++;`]);
482
+ if (!discardPos) {
483
+ if (hasNonAscii || node.inverted) {
484
+ result.onSuccess([`$this->currPos += strlen(${reg});`]);
485
+ } else {
486
+ result.onSuccess([`$this->currPos++;`]);
487
+ }
370
488
  }
371
489
  return;
372
490
  }
373
491
 
374
492
  // ASCII character lists can be done with strspn/strcspn
375
- if (!hasRanges) {
493
+ if (!(hasRanges || hasNonAscii)) {
376
494
  if (node.inverted) {
377
495
  result.condition = `strcspn($this->input, ${php.stringify(chars.join(''))}, `
378
496
  + '$this->currPos, 1) !== 0';
379
- result.onSuccess([`${reg} = self::consumeChar($this->input, $this->currPos);`]);
497
+ if (discard) {
498
+ result.onSuccess([`${reg} = true;`]);
499
+ if (!discardPos) {
500
+ result.onSuccess([`self::advanceChar($this->input, $this->currPos);`]);
501
+ }
502
+ } else {
503
+ result.onSuccess([`${reg} = self::consumeChar($this->input, $this->currPos);`]);
504
+ }
380
505
  } else {
381
506
  result.condition = `strspn($this->input, ${php.stringify(chars.join(''))}, `
382
507
  + '$this->currPos, 1) !== 0';
383
- result.onSuccess([`${reg} = $this->input[$this->currPos++];`]);
508
+ if (discard) {
509
+ result.onSuccess([ `${reg} = true;` ]);
510
+ } else {
511
+ result.onSuccess([`${reg} = $this->input[$this->currPos];`]);
512
+ }
513
+ if (!discardPos) {
514
+ result.onSuccess([`$this->currPos++;`]);
515
+ }
384
516
  }
385
517
  return;
386
518
  }
387
519
 
388
520
  // Otherwise we shall construct a regex
389
- if (node.inverted || hasNonAscii) {
390
- result.block = [`${reg} = self::charAt($this->input, $this->currPos);`];
391
- } else {
392
- result.block = [`${reg} = $this->input[$this->currPos] ?? '';`];
393
- }
394
- let regexp = '/^['
395
- + (node.inverted ? '^' : '')
396
- + node.parts.map(function(part) {
397
- return part instanceof Array
398
- ? php.regexpClassEscape(part[0])
399
- + '-'
400
- + php.regexpClassEscape(part[1])
401
- : php.regexpClassEscape(part);
402
- }).join('')
403
- + ']/'
521
+ let regexp = '/'
522
+ + php.classToRegexp(node)
523
+ + '/A'
404
524
  + (node.ignoreCase ? 'i' : '')
405
525
  + (hasNonAscii ? 'u' : '');
406
526
 
407
- result.condition = `preg_match(${php.stringify(regexp)}, ${reg})`;
408
527
  if (node.inverted || hasNonAscii) {
409
- result.onSuccess([`$this->currPos += strlen(${reg});`]);
528
+ // A multibyte result is possible, and the exact length isn't known
529
+ // unless/until the match succeeds. By using preg_match with an offset,
530
+ // we can avoid creating the substring in the case where the match fails.
531
+ result.condition = `preg_match(${php.stringify(regexp)}, $this->input, ${reg}, 0, $this->currPos)`;
532
+ result.onSuccess([`${reg} = ${reg}[0];`]);
533
+ if (!discardPos) {
534
+ result.onSuccess([`$this->currPos += strlen(${reg});`]);
535
+ }
410
536
  } else {
411
- result.onSuccess(['$this->currPos++;']);
537
+ // Creating the matches array is expensive, and it's always done if we
538
+ // pass an offset to preg_match. So it's cheaper to do a substring
539
+ // first, even if we're in 'discard' mode.
540
+ result.block = [`${reg} = $this->input[$this->currPos] ?? '';`];
541
+ result.condition = `preg_match(${php.stringify(regexp)}, ${reg})`;
542
+ if (!discardPos) {
543
+ result.onSuccess(['$this->currPos++;']);
544
+ }
412
545
  }
413
546
  },
414
547
 
@@ -452,18 +585,23 @@ let php = {
452
585
  return `if ($cached->${name} !== self::$UNDEFINED) { $param_${name} = $cached->${name}; }`;
453
586
  },
454
587
 
455
- cacheStoreRef(name, store) {
456
- return store ?
457
- `$saved_${name} !== $param_${name} ? $param_${name} : self::$UNDEFINED` :
588
+ cacheStoreRef(reg, name) {
589
+ return reg ?
590
+ `${reg} !== $param_${name} ? $param_${name} : self::$UNDEFINED` :
458
591
  'self::$UNDEFINED';
459
592
  },
460
593
 
594
+
595
+ cacheRestoreRef(reg, name) {
596
+ return `$param_${name} = ${reg};`;
597
+ },
598
+
461
599
  /**
462
600
  * Get a block which saves ref values to a temporary variable for later
463
- * comparison in getCacheStoreRefs().
601
+ * comparison in getCacheStoreRefs() / getCacheRestoreRefs().
464
602
  */
465
- cacheSaveRef(name) {
466
- return `$saved_${name}=$param_${name};`;
603
+ cacheSaveRef(reg, name) {
604
+ return `${reg} = $param_${name};`;
467
605
  }
468
606
  };
469
607
 
@@ -0,0 +1,141 @@
1
+ "use strict";
2
+
3
+ var GrammarError = require("../../grammar-error"),
4
+ visitor = require("../visitor"),
5
+ asts = require("../asts");
6
+
7
+ // Find rules that always match/succeed:
8
+ // A rule always matches if it only contains expressions that always match/succeed, either:
9
+ // * an optional (?) expression, or
10
+ // * a zero_or_more (*) expression, or
11
+ // * a rule reference to a rule that always matches/succeeds, or
12
+ // * a sequence containing only the aforementioned expressions, or
13
+ // * a choice containing at least one expression that always matches
14
+
15
+ function analyzeAlwaysMatch(ast, options) {
16
+ options = options || {};
17
+ if (options.noAlwaysMatch) {
18
+ return;
19
+ }
20
+
21
+ // Look for rules which always match/succeed
22
+ const alwaysMatch = function(node, result) {
23
+ result.alwaysMatch = true;
24
+ };
25
+
26
+ const maybeMatch = function(node, result) {
27
+ result.alwaysMatch = false;
28
+ };
29
+
30
+ const childMatch = function(node, result) {
31
+ checkAlwaysMatch(node.expression, result);
32
+ };
33
+
34
+ const ruleMatch = function(node, result) {
35
+ // To break cycles, mark this rule (conservatively) as *not*
36
+ // always matching, before recursing.
37
+ if (node.hasOwnProperty('alwaysMatch')) {
38
+ result.alwaysMatch = node.alwaysMatch;
39
+ return;
40
+ }
41
+ node.alwaysMatch = false;
42
+ checkAlwaysMatch(node.expression, result);
43
+ node.alwaysMatch = result.alwaysMatch;
44
+ };
45
+
46
+ const checkAlwaysMatch = visitor.build ({
47
+ rule: ruleMatch,
48
+
49
+ rule_ref: function(node, result) {
50
+ const rule = asts.findRule( ast, node.name );
51
+ checkAlwaysMatch(rule, result);
52
+ },
53
+
54
+ choice: function(node, result) {
55
+ let alwaysMatch = false;
56
+ node.alternatives.forEach( (child) => {
57
+ // Don't recurse if we've already found a choice which always matches
58
+ if (alwaysMatch) {
59
+ if (child.type === 'rule_ref' &&
60
+ asts.getRuleAttributeValue(asts.findRule(ast, child.name), "unreachable", false)) {
61
+ // This is okay, the rule is flagged as known unreachable
62
+ } else if (!options.allowUselessChoice) {
63
+ throw new GrammarError(
64
+ "Unreachable alternative.", child.location
65
+ );
66
+ }
67
+ } else {
68
+ let subresult = {};
69
+ checkAlwaysMatch(child, subresult);
70
+ alwaysMatch = subresult.alwaysMatch;
71
+ }
72
+ });
73
+
74
+ result.alwaysMatch = alwaysMatch;
75
+ },
76
+
77
+ sequence: function(node, result) {
78
+ if (node.hasOwnProperty('alwaysMatch')) {
79
+ result.alwaysMatch = node.alwaysMatch;
80
+ return;
81
+ }
82
+ let alwaysMatch = true;
83
+ node.elements.forEach( (child) => {
84
+ let subresult = {};
85
+ checkAlwaysMatch(child, subresult);
86
+ child.alwaysMatch = subresult.alwaysMatch;
87
+ alwaysMatch = alwaysMatch && child.alwaysMatch;
88
+ });
89
+ result.alwaysMatch = alwaysMatch;
90
+ node.alwaysMatch = alwaysMatch;
91
+ },
92
+
93
+ labeled: childMatch,
94
+ text: childMatch,
95
+ simple_and: childMatch,
96
+ simple_not: maybeMatch,
97
+ action: function(node, result) {
98
+ if (node.hasOwnProperty('alwaysMatch')) {
99
+ result.alwaysMatch = node.alwaysMatch;
100
+ return;
101
+ }
102
+ checkAlwaysMatch(node.expression, result);
103
+ node.alwaysMatch = result.alwaysMatch;
104
+ },
105
+
106
+ optional: alwaysMatch,
107
+ zero_or_more: alwaysMatch,
108
+ // "any" can fail to match if we're at the end of file
109
+ any: maybeMatch,
110
+ // Same for 'class': even [^] will fail to match at end of file
111
+ class: maybeMatch,
112
+
113
+ one_or_more: maybeMatch,
114
+ literal: function(node, result) {
115
+ // An empty literal always matches on any input
116
+ result.alwaysMatch = node.value.length === 0 ? true : false;
117
+ },
118
+
119
+ semantic_and: maybeMatch,
120
+ semantic_not: maybeMatch,
121
+ parameter_and: maybeMatch,
122
+ parameter_not: maybeMatch,
123
+ labeled_param: maybeMatch,
124
+ });
125
+
126
+ checkAlwaysMatch(ast, {});
127
+ // Specifically label sequence and action nodes
128
+ const checkSequencesAndActions = visitor.build ({
129
+ sequence: function(node) {
130
+ node.elements.forEach( (child) => checkSequencesAndActions(child, {}) );
131
+ checkAlwaysMatch(node, {});
132
+ },
133
+ action: function(node) {
134
+ checkSequencesAndActions(node.expression, {});
135
+ checkAlwaysMatch(node, {});
136
+ },
137
+ });
138
+ checkSequencesAndActions(ast, {});
139
+ }
140
+
141
+ module.exports = analyzeAlwaysMatch;