npm - wikipeg - Versions diffs - 4.0.2 → 6.0.0 - Mend

wikipeg 4.0.2 → 6.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

package/HISTORY.md +556 -0
package/README.md +230 -12
package/VERSION +1 -1
package/bin/wikipeg +8 -4
package/examples/css.pegphp +9 -8
package/lib/compiler/asts.js +30 -10
package/lib/compiler/charsets.js +306 -0
package/lib/compiler/language/javascript.js +107 -33
package/lib/compiler/language/php.js +193 -55
package/lib/compiler/passes/analyze-always-match.js +141 -0
package/lib/compiler/passes/analyze-first.js +245 -0
package/lib/compiler/passes/ast-to-code.js +316 -100
package/lib/compiler/passes/inline-simple-rules.js +96 -0
package/lib/compiler/passes/optimize-character-class.js +147 -0
package/lib/compiler/passes/optimize-failure-reporting.js +65 -0
package/lib/compiler/passes/remove-proxy-rules.js +7 -5
package/lib/compiler/passes/report-infinite-loops.js +4 -1
package/lib/compiler/passes/report-left-recursion.js +3 -4
package/lib/compiler/passes/report-unknown-attributes.js +39 -0
package/lib/compiler/passes/transform-common-lang.js +1 -1
package/lib/compiler/traverser.js +1 -2
package/lib/compiler/visitor.js +5 -7
package/lib/compiler.js +24 -10
package/lib/parser.js +2784 -3088
package/lib/peg.js +7 -15
package/lib/runtime/template.js +9 -1
package/lib/utils/CaseFolding.txt +1654 -0
package/lib/utils/arrays.js +0 -72
package/lib/utils/casefold.js +697 -0
package/lib/utils/objects.js +9 -39
package/lib/utils/unicode.js +34 -0
package/package.json +6 -4
package/src/DefaultTracer.php +18 -18
package/src/PEGParserBase.php +53 -28
package/src/SyntaxError.php +4 -4
package/src/Tracer.php +1 -1
package/lib/compiler/opcodes.js +0 -54

package/lib/compiler/language/php.js CHANGED Viewed

@@ -43,6 +43,7 @@ let php = {
   maxFailPos: '$this->maxFailPos',
   assertionSuccess: 'false',
   inputLength: '$this->inputLength',
+  advanceInputChar: 'self::advanceChar($this->input, $this->currPos);',
   consumeInputChar: 'self::consumeChar($this->input, $this->currPos);',
   result: '$result',
   actionArgPrefix: '$',
@@ -233,7 +234,20 @@ let php = {
     return escapedChars.join('');
   },
-  matchLiteral(node, reg, result) {
+  classToRegexp(node) {
+    return '['
+      + (node.inverted ? '^' : '')
+      + node.parts.map(function(part) {
+        return part instanceof Array
+          ? php.regexpClassEscape(part[0])
+          + '-'
+          + php.regexpClassEscape(part[1])
+          : php.regexpClassEscape(part);
+      }).join('')
+      + ']';
+  },
+  matchLiteral(node, reg, result, discard, discardPos) {
     let literalLength = getUtf8Length(node.value);
     let escapedValue = php.stringify(node.value);
@@ -241,9 +255,11 @@ let php = {
     if (literalLength === 1 && !node.ignoreCase) {
       result.condition = `($this->input[$this->currPos] ?? null) === ${escapedValue}`;
       result.onSuccess([
-        `$this->currPos++;`,
-        `${reg} = ${php.stringify(node.value)};`
+        `${reg} = ${discard ? 'true' : php.stringify(node.value)};`
       ]);
+      if (!discardPos) {
+        result.onSuccess([`$this->currPos++;`]);
+      }
       return;
     }
@@ -266,15 +282,18 @@ let php = {
       ].join(', ') + ') === 0';
       if (node.ignoreCase) {
         result.onSuccess([
-          `${reg} = substr($this->input, $this->currPos, ${literalLength});`,
-          `$this->currPos += ${literalLength};`
+          discard ?
+            `${reg} = true;` :
+            `${reg} = substr($this->input, $this->currPos, ${literalLength});`
         ]);
       } else {
         result.onSuccess([
-          `${reg} = ${php.stringify(node.value)};`,
-          `$this->currPos += ${literalLength};`
+          `${reg} = ${discard ? 'true' : php.stringify(node.value)};`,
         ]);
       }
+      if (!discardPos) {
+        result.onSuccess([`$this->currPos += ${literalLength};`]);
+      }
       return;
     }
@@ -290,25 +309,13 @@ let php = {
       result.block.push(`${reg} = self::charAt($this->input, $this->currPos);`);
     }
     result.condition = `mb_strtolower(${reg}) === ${php.stringify(node.value.toLowerCase())}`;
-    result.onSuccess([`$this->currPos += strlen(${reg});`]);
+    if (!discardPos) {
+      result.onSuccess([`$this->currPos += strlen(${reg});`]);
+    }
   },
-  matchClass(node, reg, result) {
+  analyzeClass(node) {
     let parts = node.parts;
-    // Empty class
-    if (node.parts.length === 0) {
-      if (node.inverted) {
-        // Same as .
-        result.condition = '$this->currPos < $this->inputLength';
-        result.onSuccess([`${reg} = self::consumeChar($this->input, $this->currPos);`]);
-      } else {
-        // Always fail
-        result.condition = 'false';
-      }
-      return;
-    }
     // Analyze for the potential special case of a class composed of individual
     // characters
     let hasRanges = false;
@@ -338,10 +345,119 @@ let php = {
         }
       }
     }
+    return { hasRanges: hasRanges, hasNonAscii: hasNonAscii, chars: chars };
+  },
+  matchRepeatedClass(node, reg, result, atLeastOne, discard, discardPos) {
+    if (node.parts.length === 0) {
+      if (node.inverted) {
+        // Same as .* / .+
+        result.condition = atLeastOne ? '$this->currPos < $this->inputLength' : 'true';
+        if (!discard) {
+          result.onSuccess([`${reg} = mb_str_split(substr($this->input, $this->currPos), 1, 'utf-8');`]);
+        }
+        if (!discardPos) {
+          result.onSuccess([`$this->currPos = $this->inputLength;`]);
+        }
+      } else if (atLeastOne) {
+        // Always fail
+        result.condition = 'false';
+      } else {
+        // Zero length match
+        result.condition = 'true';
+        result.onSuccess([`${reg} = [];`]);
+      }
+      return;
+    }
+    let {hasRanges,hasNonAscii,chars} = php.analyzeClass(node);
+    // ASCII character lists can be done with strspn/strcspn
+    if (!hasRanges && !hasNonAscii) {
+      if (node.inverted) {
+        result.block.push(`${reg} = strcspn($this->input, ${php.stringify(chars.join(''))}, $this->currPos);`);
+      } else {
+        result.block.push(`${reg} = strspn($this->input, ${php.stringify(chars.join(''))}, $this->currPos);`);
+      }
+      result.condition = atLeastOne ? `${reg} > 0` : "true";
+      if (discard) {
+        if (!discardPos) {
+          result.onSuccess([`$this->currPos += ${reg};`]);
+        }
+      } else {
+        // Note that on PHP <= 8.1, str_split('') returns [''] not [], so only
+        // use it if we're guaranteed at least one match.
+        if (!discardPos) {
+          result.onSuccess([
+            `$this->currPos += ${reg};`,
+            `${reg} = substr($this->input, $this->currPos - ${reg}, ${reg});`,
+          ]);
+        } else {
+          result.onSuccess([
+            `${reg} = substr($this->input, $this->currPos, ${reg});`
+          ]);
+        }
+        result.onSuccess([
+          hasNonAscii || node.inverted || (!atLeastOne) ?
+            `${reg} = mb_str_split(${reg}, 1, "utf-8");` :
+            `${reg} = str_split(${reg});`,
+        ]);
+      }
+      return;
+    }
+    // Otherwise we shall construct a regex
+    let regexp = '/'
+      + php.classToRegexp(node)
+      + (atLeastOne ? '+' : '*')+'/A'
+      + (node.ignoreCase ? 'i' : '')
+      + (hasNonAscii ? 'u' : '');
+    result.block.push(`${reg} = null;`);
+    result.condition = `preg_match(${php.stringify(regexp)}, $this->input, ${reg}, 0, $this->currPos)`;
+    if (!discardPos) {
+      result.onSuccess([`$this->currPos += strlen(${reg}[0]);`]);
+    }
+    if (discard) {
+      // free the match result array
+      result.onSuccess([`${reg} = true;`]);
+    } else {
+      // See above: str_split() is only safe to use if there is at least one match.
+      if (hasNonAscii || node.inverted || (!atLeastOne)) {
+        result.onSuccess([`${reg} = mb_str_split(${reg}[0], 1, "utf-8");`]);
+      } else {
+        result.onSuccess([`${reg} = str_split(${reg}[0]);`]);
+      }
+    }
+  },
+  matchClass(node, reg, result, discard, discardPos) {
+    // Empty class
+    if (node.parts.length === 0) {
+      if (node.inverted) {
+        // Same as .
+        result.condition = '$this->currPos < $this->inputLength';
+        if (discard) {
+          result.onSuccess([`${reg} = true;`]);
+          if (!discardPos) {
+            result.onSuccess([
+              `self::advanceChar($this->input, $this->currPos);`,
+            ]);
+          }
+        } else {
+          result.onSuccess([`${reg} = self::consumeChar($this->input, $this->currPos);`]);
+        }
+      } else {
+        // Always fail
+        result.condition = 'false';
+      }
+      return;
+    }
+    let {hasRanges,hasNonAscii,chars} = php.analyzeClass(node);
     // Character lists can be done by getting the next character and comparing
     // it sequentially or looking up in a hashtable
-    if (!hasRanges && (hasNonAscii || parts.length <= 2 || php.config.preferClassHashtable)) {
+    if (!hasRanges && (node.parts.length <= 2 || php.config.preferClassHashtable)) {
       if (hasNonAscii || node.inverted) {
         result.block = [`${reg} = self::charAt($this->input, $this->currPos);`];
       } else {
@@ -363,52 +479,69 @@ let php = {
       if (node.inverted) {
         result.condition = `${reg} !== '' && !(${result.condition})`;
       }
-      if (hasNonAscii || node.inverted) {
-        result.onSuccess([`$this->currPos += strlen(${reg});`]);
-      } else {
-        result.onSuccess([`$this->currPos++;`]);
+      if (!discardPos) {
+        if (hasNonAscii || node.inverted) {
+          result.onSuccess([`$this->currPos += strlen(${reg});`]);
+        } else {
+          result.onSuccess([`$this->currPos++;`]);
+        }
       }
       return;
     }
     // ASCII character lists can be done with strspn/strcspn
-    if (!hasRanges) {
+    if (!(hasRanges || hasNonAscii)) {
       if (node.inverted) {
         result.condition = `strcspn($this->input, ${php.stringify(chars.join(''))}, `
           + '$this->currPos, 1) !== 0';
-        result.onSuccess([`${reg} = self::consumeChar($this->input, $this->currPos);`]);
+        if (discard) {
+          result.onSuccess([`${reg} = true;`]);
+          if (!discardPos) {
+            result.onSuccess([`self::advanceChar($this->input, $this->currPos);`]);
+          }
+        } else {
+          result.onSuccess([`${reg} = self::consumeChar($this->input, $this->currPos);`]);
+        }
       } else {
         result.condition = `strspn($this->input, ${php.stringify(chars.join(''))}, `
           + '$this->currPos, 1) !== 0';
-        result.onSuccess([`${reg} = $this->input[$this->currPos++];`]);
+        if (discard) {
+          result.onSuccess([ `${reg} = true;` ]);
+        } else {
+          result.onSuccess([`${reg} = $this->input[$this->currPos];`]);
+        }
+        if (!discardPos) {
+          result.onSuccess([`$this->currPos++;`]);
+        }
       }
       return;
     }
     // Otherwise we shall construct a regex
-    if (node.inverted || hasNonAscii) {
-      result.block = [`${reg} = self::charAt($this->input, $this->currPos);`];
-    } else {
-      result.block = [`${reg} = $this->input[$this->currPos] ?? '';`];
-    }
-    let regexp = '/^['
-      + (node.inverted ? '^' : '')
-      + node.parts.map(function(part) {
-        return part instanceof Array
-          ? php.regexpClassEscape(part[0])
-          + '-'
-          + php.regexpClassEscape(part[1])
-          : php.regexpClassEscape(part);
-      }).join('')
-      + ']/'
+    let regexp = '/'
+      + php.classToRegexp(node)
+      + '/A'
       + (node.ignoreCase ? 'i' : '')
       + (hasNonAscii ? 'u' : '');
-    result.condition = `preg_match(${php.stringify(regexp)}, ${reg})`;
     if (node.inverted || hasNonAscii) {
-      result.onSuccess([`$this->currPos += strlen(${reg});`]);
+      // A multibyte result is possible, and the exact length isn't known
+      // unless/until the match succeeds.  By using preg_match with an offset,
+      // we can avoid creating the substring in the case where the match fails.
+      result.condition = `preg_match(${php.stringify(regexp)}, $this->input, ${reg}, 0, $this->currPos)`;
+      result.onSuccess([`${reg} = ${reg}[0];`]);
+      if (!discardPos) {
+        result.onSuccess([`$this->currPos += strlen(${reg});`]);
+      }
     } else {
-      result.onSuccess(['$this->currPos++;']);
+      // Creating the matches array is expensive, and it's always done if we
+      // pass an offset to preg_match.  So it's cheaper to do a substring
+      // first, even if we're in 'discard' mode.
+      result.block = [`${reg} = $this->input[$this->currPos] ?? '';`];
+      result.condition = `preg_match(${php.stringify(regexp)}, ${reg})`;
+      if (!discardPos) {
+        result.onSuccess(['$this->currPos++;']);
+      }
     }
   },
@@ -452,18 +585,23 @@ let php = {
     return `if ($cached->${name} !== self::$UNDEFINED) { $param_${name} = $cached->${name}; }`;
   },
-  cacheStoreRef(name, store) {
-    return store ?
-      `$saved_${name} !== $param_${name} ? $param_${name} : self::$UNDEFINED` :
+  cacheStoreRef(reg, name) {
+    return reg ?
+      `${reg} !== $param_${name} ? $param_${name} : self::$UNDEFINED` :
       'self::$UNDEFINED';
   },
+  cacheRestoreRef(reg, name) {
+    return `$param_${name} = ${reg};`;
+  },
   /**
    * Get a block which saves ref values to a temporary variable for later
-   * comparison in getCacheStoreRefs().
+   * comparison in getCacheStoreRefs() / getCacheRestoreRefs().
    */
-  cacheSaveRef(name) {
-    return `$saved_${name}=$param_${name};`;
+  cacheSaveRef(reg, name) {
+    return `${reg} = $param_${name};`;
   }
 };

package/lib/compiler/passes/analyze-always-match.js ADDED Viewed

@@ -0,0 +1,141 @@
+"use strict";
+var GrammarError = require("../../grammar-error"),
+    visitor        = require("../visitor"),
+    asts           = require("../asts");
+// Find rules that always match/succeed.
+// Such a rule contains only expressions that always match/succeed, either:
+// * an optional (?) expression, or
+// * a zero_or_more (*) expression, or
+// * a rule reference to a rule that always matches/succeeds, or
+// * a sequence containing only the aforementioned expressions, or
+// * a choice containing at least one expression that always matches
+function analyzeAlwaysMatch(ast, options) {
+  options = options || {};
+  if (options.noAlwaysMatch) {
+    return;
+  }
+  // Look for rules which always match/succeed
+  const alwaysMatch = function(node, result) {
+    result.alwaysMatch = true;
+  };
+  const maybeMatch = function(node, result) {
+    result.alwaysMatch = false;
+  };
+  const childMatch = function(node, result) {
+    checkAlwaysMatch(node.expression, result);
+  };
+  const ruleMatch = function(node, result) {
+    // To break cycles, mark this rule (conservatively) as *not*
+    // always matching, before recursing.
+    if (node.hasOwnProperty('alwaysMatch')) {
+      result.alwaysMatch = node.alwaysMatch;
+      return;
+    }
+    node.alwaysMatch = false;
+    checkAlwaysMatch(node.expression, result);
+    node.alwaysMatch = result.alwaysMatch;
+  };
+  const checkAlwaysMatch = visitor.build ({
+    rule: ruleMatch,
+    rule_ref: function(node, result) {
+      const rule = asts.findRule( ast, node.name );
+      checkAlwaysMatch(rule, result);
+    },
+    choice: function(node, result) {
+      let alwaysMatch = false;
+      node.alternatives.forEach( (child) => {
+        // Don't recurse if we've already found an alternative which always matches
+        if (alwaysMatch) {
+          if (child.type === 'rule_ref' &&
+              asts.getRuleAttributeValue(asts.findRule(ast, child.name), "unreachable", false)) {
+            // This is okay, the rule is flagged as known unreachable
+          } else if (!options.allowUselessChoice) {
+            throw new GrammarError(
+              "Unreachable alternative.", child.location
+            );
+          }
+        } else {
+          let subresult = {};
+          checkAlwaysMatch(child, subresult);
+          alwaysMatch = subresult.alwaysMatch;
+        }
+      });
+      result.alwaysMatch = alwaysMatch;
+    },
+    sequence: function(node, result) {
+      if (node.hasOwnProperty('alwaysMatch')) {
+        result.alwaysMatch = node.alwaysMatch;
+        return;
+      }
+      let alwaysMatch = true;
+      node.elements.forEach( (child) => {
+        let subresult = {};
+        checkAlwaysMatch(child, subresult);
+        child.alwaysMatch = subresult.alwaysMatch;
+        alwaysMatch = alwaysMatch && child.alwaysMatch;
+      });
+      result.alwaysMatch = alwaysMatch;
+      node.alwaysMatch = alwaysMatch;
+    },
+    labeled: childMatch,
+    text: childMatch,
+    simple_and: childMatch,
+    simple_not: maybeMatch,
+    action: function(node, result) {
+      if (node.hasOwnProperty('alwaysMatch')) {
+        result.alwaysMatch = node.alwaysMatch;
+        return;
+      }
+      checkAlwaysMatch(node.expression, result);
+      node.alwaysMatch = result.alwaysMatch;
+    },
+    optional: alwaysMatch,
+    zero_or_more: alwaysMatch,
+    // "any" can fail to match if we're at the end of file
+    any:  maybeMatch,
+    // Same for 'class': even [^] will fail to match at end of file
+    class: maybeMatch,
+    one_or_more: maybeMatch,
+    literal: function(node, result) {
+      // An empty literal always matches on any input
+      result.alwaysMatch = node.value.length === 0 ? true : false;
+    },
+    semantic_and:  maybeMatch,
+    semantic_not:  maybeMatch,
+    parameter_and: maybeMatch,
+    parameter_not: maybeMatch,
+    labeled_param: maybeMatch,
+  });
+  checkAlwaysMatch(ast, {});
+  // Specifically label sequence and action nodes
+  const checkSequencesAndActions = visitor.build ({
+    sequence: function(node) {
+      node.elements.forEach( (child) => checkSequencesAndActions(child, {}) );
+      checkAlwaysMatch(node, {});
+    },
+    action: function(node) {
+      checkSequencesAndActions(node.expression, {});
+      checkAlwaysMatch(node, {});
+    },
+  });
+  checkSequencesAndActions(ast, {});
+}
+module.exports = analyzeAlwaysMatch;