npm - wikipeg - Versions diffs - 4.0.2 → 6.0.0 - Mend

wikipeg 4.0.2 → 6.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

package/HISTORY.md +556 -0
package/README.md +230 -12
package/VERSION +1 -1
package/bin/wikipeg +8 -4
package/examples/css.pegphp +9 -8
package/lib/compiler/asts.js +30 -10
package/lib/compiler/charsets.js +306 -0
package/lib/compiler/language/javascript.js +107 -33
package/lib/compiler/language/php.js +193 -55
package/lib/compiler/passes/analyze-always-match.js +141 -0
package/lib/compiler/passes/analyze-first.js +245 -0
package/lib/compiler/passes/ast-to-code.js +316 -100
package/lib/compiler/passes/inline-simple-rules.js +96 -0
package/lib/compiler/passes/optimize-character-class.js +147 -0
package/lib/compiler/passes/optimize-failure-reporting.js +65 -0
package/lib/compiler/passes/remove-proxy-rules.js +7 -5
package/lib/compiler/passes/report-infinite-loops.js +4 -1
package/lib/compiler/passes/report-left-recursion.js +3 -4
package/lib/compiler/passes/report-unknown-attributes.js +39 -0
package/lib/compiler/passes/transform-common-lang.js +1 -1
package/lib/compiler/traverser.js +1 -2
package/lib/compiler/visitor.js +5 -7
package/lib/compiler.js +24 -10
package/lib/parser.js +2784 -3088
package/lib/peg.js +7 -15
package/lib/runtime/template.js +9 -1
package/lib/utils/CaseFolding.txt +1654 -0
package/lib/utils/arrays.js +0 -72
package/lib/utils/casefold.js +697 -0
package/lib/utils/objects.js +9 -39
package/lib/utils/unicode.js +34 -0
package/package.json +6 -4
package/src/DefaultTracer.php +18 -18
package/src/PEGParserBase.php +53 -28
package/src/SyntaxError.php +4 -4
package/src/Tracer.php +1 -1
package/lib/compiler/opcodes.js +0 -54

package/README.md CHANGED Viewed

@@ -57,7 +57,9 @@ want to use the parser in browser environment.
 You can tweak the generated parser with several options:
   * `--cache` — makes the parser cache results, avoiding exponential parsing
-    time in pathological cases but making the parser slower
+    time in pathological cases but making the parser slower. See the
+    `cache` option to `PEG.buildParser` and the [Caching](#caching)
+    section below.
   * `--allowed-start-rules` — comma-separated list of rules the parser will be
     allowed to start parsing from (default: the first rule in the grammar)
   * `--plugin` — makes WikiPEG use a specified plugin (can be specified multiple
@@ -92,16 +94,49 @@ property with more details about the error.
 You can tweak the generated parser by passing a second parameter with an options
 object to `PEG.buildParser`. The following options are supported:
+  * `language` — if set to `"javascript"`, the method will generate parser
+     code in JavaScript; if set to `"php"`, it will generate parser code in PHP
+     (default: `"javascript"`)
   * `cache` — if `true`, makes the parser cache results, avoiding exponential
     parsing time in pathological cases but making the parser slower (default:
-    `false`)
+    `false`). See the [Caching](#caching) section below.
+  * `allowLoops` — if `true`, disables "infinite loop checking", which
+    looks for rules like `""*` which can match an infinite number of
+    times. Disabling this check can be helpful if it uncovers false
+    positives -- matches which can not be empty for reasons outside
+    its analysis.
+  * `allowUselessChoice` — if `true`, disables the check for rules
+    which "always match" as other than the last element in a choice.
+  * `caselessRestrict` — by default, WikiPEG uses the Unicode "Simple
+    Case Folding" algorithm to implement case-insensitive matching.
+    If `caselessRestrict` is true, the algorithm is modified to
+    prohibit case-insensitive matches between ASCII and non-ASCII
+    characters, in the same way that the PCRE CASELESS_RESTRICT
+    feature does.
+  * `commonLang` — if `true`, performs some simple modifications to
+    action clauses to make it possible to write test cases that work
+    in both javascript and PHP.
+  * `noAlwaysMatch` — if `true`, disables optimization of rules which
+     always match.
+  * `noInlining` — if `true`, disables inlining of simple character
+    classes and repeated character classes. This can be useful if you
+    are tracing execution or testing the parser and wish to see every
+    rule entry/exit, or need to explicitly manage caching. See
+    the [Caching](#caching) section below.
+  * `noOptimizeFirstSet` — if `true`, disables an optimization which
+    fails early if looking at the first character is sufficient to
+    determine that a rule can not match.  This can affect failure
+    reporting, since we might be able to fail on a parent rule before
+    actually recursing into the child responsible.
+  * `cacheInitHook` and `cacheRuleHook` — functions to generate custom cache
+    control code
   * `allowedStartRules` — rules the parser will be allowed to start parsing from
     (default: the first rule in the grammar)
+  * `allowedStreamRules` — rules the parser will be allowed to start parsing from
+     in asynchronous mode
   * `output` — if set to `"parser"`, the method will return generated parser
     object; if set to `"source"`, it will return parser source code as a string
     (default: `"parser"`)
-  * `optimize`— selects between optimizing the generated parser for parsing
-    speed (`"speed"`) or code size (`"size"`) (default: `"speed"`)
   * `plugins` — plugins to use
 Using the Parser
@@ -148,7 +183,7 @@ Let's look at example grammar that recognizes simple arithmetic expressions like
     primary
       = integer
-      / "(" additive:additive ")" { return additive; }
+      / "(" @additive ")"
     integer "integer"
       = digits:[0-9]+ { return parseInt(digits.join(""), 10); }
@@ -163,9 +198,12 @@ happens when the pattern matches successfully. A rule can also contain
 `integer` rule has a human-readable name). The parsing starts at the first rule,
 which is also called the *start rule*.
-A rule name must be a JavaScript identifier. It is followed by an equality sign
-(“=”) and a parsing expression. If the rule has a human-readable name, it is
-written as a JavaScript string between the name and separating equality sign.
+A rule name must be a JavaScript identifier. It is followed by an
+equals sign (“=”) and a parsing expression. If the rule has additional
+attributes, they are written between square brackets (“[” and “]”)
+between the rule name and the equals sign; see the “Rule attribute
+syntax” section below for more details.
 Rules need to be separated only by whitespace (their beginning is easily
 recognizable), but a semicolon (“;”) after the parsing expression is allowed.
@@ -197,7 +235,7 @@ using a simple initializer.
     primary
       = integer
-      / "(" additive:additive ")" { return additive; }
+      / "(" @additive ")"
     integer "integer"
       = digits:[0-9]+ { return makeInteger(digits); }
@@ -215,20 +253,27 @@ example:
     containing matched part of the input.
   * An expression matching repeated occurrence of some subexpression produces a
     JavaScript array with all the matches.
+  * An expression matching a sequence of expressions produces a
+    JavaScript array with all the picked elements.
+    * If no elements are picked, all elements of the sequence will be
+      present in the array.
+    * If the pick operator (`@`) is used, only those elements which
+      are picked will be present.  If only one element is picked, it
+      will be returned directly (not wrapped in a 1-element array).
 The match results propagate through the rules when the rule names are used in
 expressions, up to the start rule. The generated parser returns start rule's
 match result when parsing is successful.
 One special case of parser expression is a *parser action* — a piece of
-JavaScript code inside curly braces (“{” and “}”) that takes match results of
-some of the the preceding expressions and returns a JavaScript value. This value
+JavaScript code inside curly braces (`{` and `}`) that takes match results of
+some of the preceding expressions and returns a JavaScript value. This value
 is considered match result of the preceding expression (in other words, the
 parser action is a match result transformer).
 In our arithmetics example, there are many parser actions. Consider the action
 in expression `digits:[0-9]+ { return parseInt(digits.join(""), 10); }`. It
-takes the match result of the expression [0-9]+, which is an array of strings
+takes the match result of the expression `[0-9]+`, which is an array of strings
 containing digits, as its parameter. It joins the digits together to form a
 number and converts it to a JavaScript `number` object.
@@ -366,6 +411,21 @@ can be accessed by action's JavaScript code.
 #### *expression<sub>1</sub>* *expression<sub>2</sub>* ...  *expression<sub>n</sub>*
 Match a sequence of expressions and return their match results in an array.
+Elements of the sequence can be picked by preceding them with the pick
+operator (`@`), and only those elements will be returned in the array.
+If only one element is picked, it is returned directly (not wrapped in
+an array).
+#### @ *expression*
+Pick the specified expression in a sequence to return.  See the
+description of a sequence expression above.
+Note that sequences with pick operators can be nested, for example:
+    foo = @"a" @("b" @"c" "d") "e"
+will return `["a", "c"]` if it matches.
 #### *expression* { *action* }
@@ -419,6 +479,88 @@ Try to match the first expression, if it does not succeed, try the second one,
 etc. Return the match result of the first successfully matched expression. If no
 expression matches, consider the match failed.
+Rule attribute syntax
+---------------------
+WikiPEG supports attaching attributes to rules which can affect their
+behavior.  The syntax is:
+    rule1 [attr1, attr2=false, attr3="string", ...] = nonterminal1 ... ;
+That is, attributes are comma-separated between square brackets
+between the rule name and the equals sign.  Attributes can have
+boolean, string, or integer values.  An attribute without a value
+is treated as shorthand for setting it to boolean `true`.
+The following attributes affect parsing:
+#### [name="*rule name*"]
+Provide a human-readable *rule name* for this rule.  For example, this
+production:
+    integer [name="simple number"] = [0-9]+
+will produce an error message like:
+    Expected simple number but "a" found.
+when parsing a non-number, referencing the human-readable name "simple
+number".  Without the human-readable name, WikiPEG uses a description
+of the character class that failed to match:
+    Expected [0-9] but "a" found.
+Aside from the content of error messages, providing a `name` attribute
+also affects *where* errors are reported, preferring to report failure
+at the named rule instead of inside it.
+#### [inline] *or* [inline=true]
+Forces inlining of the given rule, regardless of the status of the
+`noInlining` option.
+#### [inline=false]
+Prevents inlining of the given rule.
+#### [cache] *or* [cache=true]
+Turns on caching for the given rule, regardless of the status of the
+top-level `cache` option. This can be useful for enabling caching
+only on a few rules while leaving it mostly disabled.
+If caching is disabled in the top-level WikiPEG options but any rule
+has this attribute set to `true`, then caching will be enabled but all
+rules will default to `[cache=false]`.
+If caching is enabled in the WikiPEG options, then `[cache]` is
+effectively a no-op, since the default is to cache all rules.
+#### [cache=false]
+Turns off caching for the given rule, regardless of the status of the
+top-level `cache` option. This can be useful for selectively disabling
+caching on a few rules while leaving it mostly enabled.
+If caching is disabled in the top-level WikiPEG options, this is
+effectively a no-op.
+If caching is enabled in the top-level WikiPEG options, this will
+prevent the given rule from being cached.
+#### [empty=false]
+Marks a node as non-nullable; that is, asserts that it cannot match
+the empty string -- usually because of some predicate expression in
+the rule which is beyond WikiPEG's ability to analyze.  This can
+prevent false positives when WikiPEG checks for infinite loops.
+#### [unreachable]
+Marks a rule as unreachable. If the `allowUselessChoice` option is
+false, this attribute permits a reference to the rule in a choice even
+if a previous option in the choice appears to always match.
 Rule parameter syntax
 ---------------------
@@ -493,6 +635,82 @@ In JS this will expose the reference parameter "r" as an object with r.set(),
 r.get(). In PHP it will be a native reference such that {$r = 1;} will set
 the value of the reference in the declaration scope.
+Caching
+-------
+Note that caching makes PEG grammars behave somewhat differently from
+recursive descent parsers.  Consider the grammar:
+    start = "a" long_complicated_thing b
+          / "a" long_complicated_thing c
+          / "a" long_complicated_thing
+    // this could be any costly rule, but this is the simplest example
+    // which will take time proportional to the file length
+    long_complicated_thing = $[^]*
+    b = "b"
+    c = "c"
+Without caching, the generated parser will match `"a"`, then scan the
+entire length of the string matching `long_complicated_thing`, then
+match the end-of-file to `"b"` and fail, return to the start of the
+string and do it again (scanning the entire length of the string),
+fail to match `"c"` and so on.
+When caching is enabled, the second time we try to match
+`long_complicated_thing` at position 2 in the string it will recognize
+that it has tried exactly this parse before and return the previous
+match from the cache.  This takes constant time instead of time
+proportional to the input string length.  This can be quite
+significant in a grammar that involves a lot of backtracking.
+There are some caveats, however!
+First, caching is relatively expensive, so it is only done at rule
+boundaries, like `long_complicated_thing`, `b`, and `c` above.  This
+is a departure from a "theoretical" packrat parser.
+Second, the memoization cache stores an entry for every nonterminal at
+every position at which it is attempted, *whether the result is success or
+failure*.  In our example we allocate memory for cache entries for "b"
+and "c" even though they do not match. Writing rules which match
+single characters can easily result in excessive memory use if care is
+not taken.
+Consider two alterations to our example above.  First, consider inlining the
+`long_complicated_thing` rule like so:
+    start = "a" $[^]* "b"
+          / "a" $[^]* "c"
+          / "a" $[^]*
+The grammar would then match exactly the same strings as before, but
+we would do no caching and each of the choice branches would scan to
+the end of the string.
+Alternatively, if we just moved the zero-or-more repetition operator
+like so:
+    start = "a" $long_complicated_thing* b
+          / "a" $long_complicated_thing* c
+          / "a" $long_complicated_thing*
+    long_complicated_thing = [^]
+    b = "b"
+    c = "c"
+Now not only have we broken caching (each choice will scan to the
+end of the input string, matching `long_complicated_thing` as it goes),
+we're also going to allocate a cache entry for every character in the
+input string.  This can cause ballooning memory requirements for what
+look like simple inputs.
+By default, WikiPEG inlines "simple expressions", which are rules that
+match simple literals, character classes, or repeated character
+classes, possibly prefixed with the `$` operator.  This is primarily
+done to manage the memory cost of excessive caching of simple matches.
+For more predictable caching, you may wish to use the `noInlining`
+option.
 Requirements
 -------------

package/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 4.0.2
1	+ 6.0.0

package/bin/wikipeg CHANGED Viewed

@@ -2,7 +2,6 @@
 "use strict";
-var util = require("util");
 var fs   = require("fs");
 var path = require("path");
 var PEG  = require("../lib/peg");
@@ -10,11 +9,11 @@ var PEG  = require("../lib/peg");
 /* Helpers */
 function printVersion() {
-  util.puts("WikiPEG " + PEG.VERSION);
+  console.log("WikiPEG " + PEG.VERSION);
 }
 function printHelp() {
-  util.puts(`Usage: wikipeg [options] [--] [<input_file>] [<output_file>]
+  console.log(`Usage: wikipeg [options] [--] [<input_file>] [<output_file>]
 Generates a parser from the PEG grammar specified in the <input_file> and writes
 it to the <output_file>.
@@ -66,7 +65,7 @@ function exitFailure() {
 }
 function abort(message) {
-  util.error(message);
+  console.error(message);
   exitFailure();
 }
@@ -159,6 +158,11 @@ while (args.length > 0 && isOption(args[0])) {
       options.cache = true;
       break;
+    case "--precise-failure":
+      options.noInlining = true;
+      options.noOptimizeFirstSet = true;
+      break;
     case '--allow-loops':
       options.allowLoops = true;
       break;

package/examples/css.pegphp CHANGED Viewed

@@ -288,13 +288,16 @@ nmchar
   / nonascii
   / escape
+nmchars
+  = $[_a-z0-9-]i+ / $[\x80-\uFFFF]+ / escape
 string1
-  = '"' chars:([^\n\r\f\\"] / "\\" nl:nl { return ""; } / escape)* '"' {
+  = '"' chars:($[^\n\r\f\\"]+ / "\\" nl:nl { return ""; } / escape)* '"' {
       return implode("", $chars);
     }
 string2
-  = "'" chars:([^\n\r\f\\'] / "\\" nl:nl { return ""; } / escape)* "'" {
+  = "'" chars:($[^\n\r\f\\']+ / "\\" nl:nl { return ""; } / escape)* "'" {
       return implode("", $chars);
     }
@@ -302,12 +305,10 @@ comment
   = "/*" [^*]* "*"+ ([^/*] [^*]* "*"+)* "/"
 ident
-  = prefix:$"-"? start:nmstart chars:nmchar* {
-      return $prefix . $start . implode("", $chars);
-    }
+  = $( "-"? nmstart nmchars* )
 name
-  = chars:nmchar+ { return implode("", $chars); }
+  = $( nmchars+ )
 num
   = [+-]? ([0-9]+ / [0-9]* "." [0-9]+) ("e" [+-]? [0-9]+)? {
@@ -319,13 +320,13 @@ string
   / string2
 url
-  = chars:([!#$%&*-\[\]-~] / nonascii / escape)* { return implode("", $chars); }
+  = chars:($[!#$%&*-\[\]-~]+ / nonascii / escape)* { return implode("", $chars); }
 plain_ws
   = [ \t\r\n\f]+
 w
-  = plain_ws?
+  = [ \t\r\n\f]*
 nl
   = "\n"

package/lib/compiler/asts.js CHANGED Viewed

@@ -1,7 +1,6 @@
 "use strict";
-var arrays  = require("../utils/arrays"),
-    visitor = require("./visitor");
+var visitor = require("./visitor");
 /* AST utilities. */
 var asts = {
@@ -16,10 +15,19 @@ var asts = {
   },
   indexOfRule: function(ast, name) {
-    return arrays.indexOf(ast.rules, function(r) { return r.name === name; });
+    return ast.rules.findIndex((r) => r.name === name);
   },
-  matchesEmpty: function(ast, node) {
+  findRuleAttribute: function(rule, name) {
+    return (rule.attributes || []).find((attr) => attr.name === name);
+  },
+  getRuleAttributeValue: function(rule, name, defaultValue) {
+    let attr = asts.findRuleAttribute(rule, name);
+    return attr === undefined ? defaultValue : attr.value;
+  },
+  matchesEmpty: function(ast, node, wrapper) {
     function matchesTrue()  { return true;  }
     function matchesFalse() { return false; }
@@ -27,17 +35,25 @@ var asts = {
       return matches(node.expression);
     }
-    var matches = visitor.build({
-      rule: matchesExpression,
+    wrapper = wrapper || ( (f) => f );
+    var matches = wrapper(visitor.build({
+      rule: function(rule) {
+        // Allow explicit override
+        let empty = asts.getRuleAttributeValue(rule, 'empty');
+        if (empty === undefined) {
+          empty = matches(rule.expression);
+        }
+        return empty;
+      },
       choice: function(node) {
-        return arrays.some(node.alternatives, matches);
+        return node.alternatives.some(matches);
       },
       action: matchesExpression,
       sequence: function(node) {
-        return arrays.every(node.elements, matches);
+        return node.elements.every(matches);
       },
       labeled:      matchesExpression,
@@ -50,6 +66,10 @@ var asts = {
       semantic_and: matchesTrue,
       semantic_not: matchesTrue,
+      parameter_and: matchesTrue,
+      parameter_not: matchesTrue,
+      labeled_param: matchesTrue,
       rule_ref: function(node) {
         return matches(asts.findRule(ast, node.name));
       },
@@ -58,9 +78,9 @@ var asts = {
         return node.value === "";
       },
-      "class": matchesFalse,
+      class:   matchesFalse,
       any:     matchesFalse
-    });
+    }));
     return matches(node);
   }