wikipeg 4.0.2 → 6.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. package/HISTORY.md +556 -0
  2. package/README.md +230 -12
  3. package/VERSION +1 -1
  4. package/bin/wikipeg +8 -4
  5. package/examples/css.pegphp +9 -8
  6. package/lib/compiler/asts.js +30 -10
  7. package/lib/compiler/charsets.js +306 -0
  8. package/lib/compiler/language/javascript.js +107 -33
  9. package/lib/compiler/language/php.js +193 -55
  10. package/lib/compiler/passes/analyze-always-match.js +141 -0
  11. package/lib/compiler/passes/analyze-first.js +245 -0
  12. package/lib/compiler/passes/ast-to-code.js +316 -100
  13. package/lib/compiler/passes/inline-simple-rules.js +96 -0
  14. package/lib/compiler/passes/optimize-character-class.js +147 -0
  15. package/lib/compiler/passes/optimize-failure-reporting.js +65 -0
  16. package/lib/compiler/passes/remove-proxy-rules.js +7 -5
  17. package/lib/compiler/passes/report-infinite-loops.js +4 -1
  18. package/lib/compiler/passes/report-left-recursion.js +3 -4
  19. package/lib/compiler/passes/report-unknown-attributes.js +39 -0
  20. package/lib/compiler/passes/transform-common-lang.js +1 -1
  21. package/lib/compiler/traverser.js +1 -2
  22. package/lib/compiler/visitor.js +5 -7
  23. package/lib/compiler.js +24 -10
  24. package/lib/parser.js +2784 -3088
  25. package/lib/peg.js +7 -15
  26. package/lib/runtime/template.js +9 -1
  27. package/lib/utils/CaseFolding.txt +1654 -0
  28. package/lib/utils/arrays.js +0 -72
  29. package/lib/utils/casefold.js +697 -0
  30. package/lib/utils/objects.js +9 -39
  31. package/lib/utils/unicode.js +34 -0
  32. package/package.json +6 -4
  33. package/src/DefaultTracer.php +18 -18
  34. package/src/PEGParserBase.php +53 -28
  35. package/src/SyntaxError.php +4 -4
  36. package/src/Tracer.php +1 -1
  37. package/lib/compiler/opcodes.js +0 -54
package/README.md CHANGED
@@ -57,7 +57,9 @@ want to use the parser in browser environment.
57
57
  You can tweak the generated parser with several options:
58
58
 
59
59
  * `--cache` — makes the parser cache results, avoiding exponential parsing
60
- time in pathological cases but making the parser slower
60
+ time in pathological cases but making the parser slower. See the
61
+ `cache` option to `PEG.buildParser` and the [Caching](#caching)
62
+ section below.
61
63
  * `--allowed-start-rules` — comma-separated list of rules the parser will be
62
64
  allowed to start parsing from (default: the first rule in the grammar)
63
65
  * `--plugin` — makes WikiPEG use a specified plugin (can be specified multiple
@@ -92,16 +94,49 @@ property with more details about the error.
92
94
  You can tweak the generated parser by passing a second parameter with an options
93
95
  object to `PEG.buildParser`. The following options are supported:
94
96
 
97
+ * `language` — if set to `"javascript"`, the method will generate parser
98
+ code in JavaScript; if set to `"php"`, it will generate parser code in PHP
99
+ (default: `"javascript"`)
95
100
  * `cache` — if `true`, makes the parser cache results, avoiding exponential
96
101
  parsing time in pathological cases but making the parser slower (default:
97
- `false`)
102
+ `false`). See the [Caching](#caching) section below.
103
+ * `allowLoops` — if `true`, disables "infinite loop checking", which
104
+ looks for rules like `""*` which can match an infinite number of
105
+ times. Disabling this check can be helpful if it uncovers false
106
+ positives -- matches which can not be empty for reasons outside
107
+ its analysis.
108
+ * `allowUselessChoice` — if `true`, disables the check for rules
109
+ which "always match" as other than the last element in a choice.
110
+ * `caselessRestrict` — by default, WikiPEG uses the Unicode "Simple
111
+ Case Folding" algorithm to implement case-insensitive matching.
112
+ If `caselessRestrict` is true, the algorithm is modified to
113
+ prohibit case-insensitive matches between ASCII and non-ASCII
114
+ characters, in the same way that the PCRE CASELESS_RESTRICT
115
+ feature does.
116
+ * `commonLang` — if `true`, performs some simple modifications to
117
+ action clauses to make it possible to write test cases that work
118
+ in both JavaScript and PHP.
119
+ * `noAlwaysMatch` — if `true`, disables optimization of rules which
120
+ always match.
121
+ * `noInlining` — if `true`, disables inlining of simple character
122
+ classes and repeated character classes. This can be useful if you
123
+ are tracing execution or testing the parser and wish to see every
124
+ rule entry/exit, or need to explicitly manage caching. See
125
+ the [Caching](#caching) section below.
126
+ * `noOptimizeFirstSet` — if `true`, disables an optimization which
127
+ fails early if looking at the first character is sufficient to
128
+ determine that a rule can not match. This can affect failure
129
+ reporting, since we might be able to fail on a parent rule before
130
+ actually recursing into the child responsible.
131
+ * `cacheInitHook` and `cacheRuleHook` — functions to generate custom cache
132
+ control code
98
133
  * `allowedStartRules` — rules the parser will be allowed to start parsing from
99
134
  (default: the first rule in the grammar)
135
+ * `allowedStreamRules` — rules the parser will be allowed to start parsing from
136
+ in asynchronous mode
100
137
  * `output` — if set to `"parser"`, the method will return generated parser
101
138
  object; if set to `"source"`, it will return parser source code as a string
102
139
  (default: `"parser"`)
103
- * `optimize`— selects between optimizing the generated parser for parsing
104
- speed (`"speed"`) or code size (`"size"`) (default: `"speed"`)
105
140
  * `plugins` — plugins to use
106
141
 
107
142
  Using the Parser
@@ -148,7 +183,7 @@ Let's look at example grammar that recognizes simple arithmetic expressions like
148
183
 
149
184
  primary
150
185
  = integer
151
- / "(" additive:additive ")" { return additive; }
186
+ / "(" @additive ")"
152
187
 
153
188
  integer "integer"
154
189
  = digits:[0-9]+ { return parseInt(digits.join(""), 10); }
@@ -163,9 +198,12 @@ happens when the pattern matches successfully. A rule can also contain
163
198
  `integer` rule has a human-readable name). The parsing starts at the first rule,
164
199
  which is also called the *start rule*.
165
200
 
166
- A rule name must be a JavaScript identifier. It is followed by an equality sign
167
- (“=”) and a parsing expression. If the rule has a human-readable name, it is
168
- written as a JavaScript string between the name and separating equality sign.
201
+ A rule name must be a JavaScript identifier. It is followed by an
202
+ equals sign (“=”) and a parsing expression. If the rule has additional
203
+ attributes, they are written between square brackets (“[” and “]”)
204
+ between the rule name and the equals sign; see the “Rule attribute
205
+ syntax” section below for more details.
206
+
169
207
  Rules need to be separated only by whitespace (their beginning is easily
170
208
  recognizable), but a semicolon (“;”) after the parsing expression is allowed.
171
209
 
@@ -197,7 +235,7 @@ using a simple initializer.
197
235
 
198
236
  primary
199
237
  = integer
200
- / "(" additive:additive ")" { return additive; }
238
+ / "(" @additive ")"
201
239
 
202
240
  integer "integer"
203
241
  = digits:[0-9]+ { return makeInteger(digits); }
@@ -215,20 +253,27 @@ example:
215
253
  containing matched part of the input.
216
254
  * An expression matching repeated occurrence of some subexpression produces a
217
255
  JavaScript array with all the matches.
256
+ * An expression matching a sequence of expressions produces a
257
+ JavaScript array with all the picked elements.
258
+ * If no matches are picked, all elements of the sequence will be
259
+ present in the array.
260
+ * If the pick operator (`@`) is used, only those elements which
261
+ are picked will be present. If only one element is picked, it
262
+ will be returned directly (not wrapped in a 1-element array).
218
263
 
219
264
  The match results propagate through the rules when the rule names are used in
220
265
  expressions, up to the start rule. The generated parser returns start rule's
221
266
  match result when parsing is successful.
222
267
 
223
268
  One special case of parser expression is a *parser action* — a piece of
224
- JavaScript code inside curly braces ({ and }) that takes match results of
225
- some of the the preceding expressions and returns a JavaScript value. This value
269
+ JavaScript code inside curly braces (`{` and `}`) that takes match results of
270
+ some of the preceding expressions and returns a JavaScript value. This value
226
271
  is considered match result of the preceding expression (in other words, the
227
272
  parser action is a match result transformer).
228
273
 
229
274
  In our arithmetics example, there are many parser actions. Consider the action
230
275
  in expression `digits:[0-9]+ { return parseInt(digits.join(""), 10); }`. It
231
- takes the match result of the expression [0-9]+, which is an array of strings
276
+ takes the match result of the expression `[0-9]+`, which is an array of strings
232
277
  containing digits, as its parameter. It joins the digits together to form a
233
278
  number and converts it to a JavaScript `number` object.
234
279
 
@@ -366,6 +411,21 @@ can be accessed by action's JavaScript code.
366
411
  #### *expression<sub>1</sub>* *expression<sub>2</sub>* ... *expression<sub>n</sub>*
367
412
 
368
413
  Match a sequence of expressions and return their match results in an array.
414
+ Elements of the sequence can be picked by preceding them with the pick
415
+ operator (`@`), and only those elements will be returned in the array.
416
+ If only one element is picked, it is returned directly (not wrapped in
417
+ an array).
418
+
419
+ #### @ *expression*
420
+
421
+ Pick the specified expression in a sequence to return. See the
422
+ description of a sequence expression above.
423
+
424
+ Note that sequences with pick operators can be nested, for example:
425
+
426
+ foo = @"a" @("b" @"c" "d") "e"
427
+
428
+ will return `["a", "c"]` if it matches.
369
429
 
370
430
  #### *expression* { *action* }
371
431
 
@@ -419,6 +479,88 @@ Try to match the first expression, if it does not succeed, try the second one,
419
479
  etc. Return the match result of the first successfully matched expression. If no
420
480
  expression matches, consider the match failed.
421
481
 
482
+ Rule attribute syntax
483
+ ---------------------
484
+ WikiPEG supports attaching attributes to rules which can affect their
485
+ behavior. The syntax is:
486
+
487
+ rule1 [attr1, attr2=false, attr3="string", ...] = nonterminal1 ... ;
488
+
489
+ That is, attributes are comma-separated between square brackets
490
+ between the rule name and the equals sign. Attributes can have
491
+ boolean, string, or integer values. An attribute without a value
492
+ is treated as shorthand for setting it to boolean `true`.
493
+
494
+ The following attributes affect parsing:
495
+
496
+ #### [name="*rule name*"]
497
+
498
+ Provide a human-readable *rule name* for this rule. For example, this
499
+ production:
500
+
501
+ integer [name="simple number"] = [0-9]+
502
+
503
+ will produce an error message like:
504
+
505
+ Expected simple number but "a" found.
506
+
507
+ when parsing a non-number, referencing the human-readable name "simple
508
+ number". Without the human-readable name, WikiPEG uses a description
509
+ of the character class that failed to match:
510
+
511
+ Expected [0-9] but "a" found.
512
+
513
+ Aside from the content of error messages, providing a `name` attribute
514
+ also affects *where* errors are reported, preferring to report failure
515
+ at the named rule instead of inside it.
516
+
517
+ #### [inline] *or* [inline=true]
518
+
519
+ Forces inlining of the given rule, regardless of the status of the
520
+ `noInlining` option.
521
+
522
+ #### [inline=false]
523
+
524
+ Prevents inlining of the given rule.
525
+
526
+ #### [cache] *or* [cache=true]
527
+
528
+ Turns on caching for the given rule, regardless of the status of the
529
+ top-level `cache` option. This can be useful for enabling caching
530
+ only on a few rules while leaving it mostly disabled.
531
+
532
+ If caching is disabled in the top-level WikiPEG options but any rule
533
+ has this attribute set to `true`, then caching will be enabled but all
534
+ rules will default to `[cache=false]`.
535
+
536
+ If caching is enabled in the WikiPEG options, then `[cache]` is
537
+ effectively a no-op, since the default is to cache all rules.
538
+
539
+ #### [cache=false]
540
+
541
+ Turns off caching for the given rule, regardless of the status of the
542
+ top-level `cache` option. This can be useful for selectively disabling
543
+ caching on a few rules while leaving it mostly enabled.
544
+
545
+ If caching is disabled in the top-level WikiPEG options, this is
546
+ effectively a no-op.
547
+
548
+ If caching is enabled in the top-level WikiPEG options, this will
549
+ prevent the given rule from being cached.
550
+
551
+ #### [empty=false]
552
+
553
+ Marks a node as non-nullable; that is, asserts that it cannot match
554
+ the empty string -- usually because of some predicate expression in
555
+ the rule which is beyond WikiPEG's ability to analyze. This can
556
+ prevent false positives when WikiPEG checks for infinite loops.
557
+
558
+ #### [unreachable]
559
+
560
+ Marks a rule as unreachable. If the `allowUselessChoice` option is
561
+ false, this attribute permits a reference to the rule in a choice even
562
+ if a previous option in the choice appears to always match.
563
+
422
564
  Rule parameter syntax
423
565
  ---------------------
424
566
 
@@ -493,6 +635,82 @@ In JS this will expose the reference parameter "r" as an object with r.set(),
493
635
  r.get(). In PHP it will be a native reference such that {$r = 1;} will set
494
636
  the value of the reference in the declaration scope.
495
637
 
638
+ Caching
639
+ -------
640
+ Note that caching makes PEG grammars behave somewhat differently from
641
+ recursive descent parsers. Consider the grammar:
642
+
643
+ start = "a" long_complicated_thing b
644
+ / "a" long_complicated_thing c
645
+ / "a" long_complicated_thing
646
+
647
+ // this could be any costly rule, but this is the simplest example
648
+ // which will take time proportional to the file length
649
+ long_complicated_thing = $[^]*
650
+ b = "b"
651
+ c = "c"
652
+
653
+ Without caching, the generated parser will match `"a"`, then scan the
654
+ entire length of the string matching `long_complicated_thing`, then
655
+ match the end-of-file to `"b"` and fail, return to the start of the
656
+ string and do it again (scanning the entire length of the string),
657
+ fail to match `"c"` and so on.
658
+
659
+ When caching is enabled, the second time we try to match
660
+ `long_complicated_thing` at position 2 in the string it will recognize
661
+ that it has tried exactly this parse before and return the previous
662
+ match from the cache. This takes constant time instead of time
663
+ proportional to the input string length. This can be quite
664
+ significant in a grammar that involves a lot of backtracking.
665
+
666
+ There are some caveats, however!
667
+
668
+ First, caching is relatively expensive, so it is only done at rule
669
+ boundaries, like `long_complicated_thing`, `b`, and `c` above. This
670
+ is a departure from a "theoretical" packrat parser.
671
+
672
+ Second, the memoization cache stores an entry for every nonterminal at
673
+ every position it is attempted *whether the result is success or
674
+ failure*. In our example we allocate memory for cache entries for "b"
675
+ and "c" even though they do not match. Writing rules which match
676
+ single characters can easily result in excessive memory use if care is
677
+ not taken.
678
+
679
+ Consider two alterations to our example above. First, consider inlining the
680
+ `long_complicated_thing` rule like so:
681
+
682
+ start = "a" $[^]* "b"
683
+ / "a" $[^]* "c"
684
+ / "a" $[^]*
685
+
686
+ The grammar would then match exactly the same strings as before, but
687
+ we would do no caching and each of the choice branches would scan to
688
+ the end of the string.
689
+
690
+ Alternatively, if we just moved the zero-or-more repetition operator
691
+ like so:
692
+
693
+ start = "a" $long_complicated_thing* b
694
+ / "a" $long_complicated_thing* c
695
+ / "a" $long_complicated_thing*
696
+
697
+ long_complicated_thing = [^]
698
+ b = "b"
699
+ c = "c"
700
+
701
+ Now not only have we broken caching (each choice will scan to the
702
+ end of the input string, matching `long_complicated_thing` as it goes),
703
+ we're also going to allocate a cache entry for every character in the
704
+ input string. This can cause ballooning memory requirements for what
705
+ look like simple inputs.
706
+
707
+ By default wikipeg inlines "simple expressions", which are rules that
708
+ match simple literals, character classes, or repeated character
709
+ classes, possibly prefixed with the `$` operator. This is primarily
710
+ done to manage the memory cost of excessive caching of simple matches.
711
+ For more predictable caching, you may wish to use the `noInlining`
712
+ option.
713
+
496
714
  Requirements
497
715
  -------------
498
716
 
package/VERSION CHANGED
@@ -1 +1 @@
1
- 4.0.2
1
+ 6.0.0
package/bin/wikipeg CHANGED
@@ -2,7 +2,6 @@
2
2
 
3
3
  "use strict";
4
4
 
5
- var util = require("util");
6
5
  var fs = require("fs");
7
6
  var path = require("path");
8
7
  var PEG = require("../lib/peg");
@@ -10,11 +9,11 @@ var PEG = require("../lib/peg");
10
9
  /* Helpers */
11
10
 
12
11
  function printVersion() {
13
- util.puts("WikiPEG " + PEG.VERSION);
12
+ console.log("WikiPEG " + PEG.VERSION);
14
13
  }
15
14
 
16
15
  function printHelp() {
17
- util.puts(`Usage: wikipeg [options] [--] [<input_file>] [<output_file>]
16
+ console.log(`Usage: wikipeg [options] [--] [<input_file>] [<output_file>]
18
17
 
19
18
  Generates a parser from the PEG grammar specified in the <input_file> and writes
20
19
  it to the <output_file>.
@@ -66,7 +65,7 @@ function exitFailure() {
66
65
  }
67
66
 
68
67
  function abort(message) {
69
- util.error(message);
68
+ console.error(message);
70
69
  exitFailure();
71
70
  }
72
71
 
@@ -159,6 +158,11 @@ while (args.length > 0 && isOption(args[0])) {
159
158
  options.cache = true;
160
159
  break;
161
160
 
161
+ case "--precise-failure":
162
+ options.noInlining = true;
163
+ options.noOptimizeFirstSet = true;
164
+ break;
165
+
162
166
  case '--allow-loops':
163
167
  options.allowLoops = true;
164
168
  break;
@@ -288,13 +288,16 @@ nmchar
288
288
  / nonascii
289
289
  / escape
290
290
 
291
+ nmchars
292
+ = $[_a-z0-9-]i+ / $[\x80-\uFFFF]+ / escape
293
+
291
294
  string1
292
- = '"' chars:([^\n\r\f\\"] / "\\" nl:nl { return ""; } / escape)* '"' {
295
+ = '"' chars:($[^\n\r\f\\"]+ / "\\" nl:nl { return ""; } / escape)* '"' {
293
296
  return implode("", $chars);
294
297
  }
295
298
 
296
299
  string2
297
- = "'" chars:([^\n\r\f\\'] / "\\" nl:nl { return ""; } / escape)* "'" {
300
+ = "'" chars:($[^\n\r\f\\']+ / "\\" nl:nl { return ""; } / escape)* "'" {
298
301
  return implode("", $chars);
299
302
  }
300
303
 
@@ -302,12 +305,10 @@ comment
302
305
  = "/*" [^*]* "*"+ ([^/*] [^*]* "*"+)* "/"
303
306
 
304
307
  ident
305
- = prefix:$"-"? start:nmstart chars:nmchar* {
306
- return $prefix . $start . implode("", $chars);
307
- }
308
+ = $( "-"? nmstart nmchars* )
308
309
 
309
310
  name
310
- = chars:nmchar+ { return implode("", $chars); }
311
+ = $( nmchars+ )
311
312
 
312
313
  num
313
314
  = [+-]? ([0-9]+ / [0-9]* "." [0-9]+) ("e" [+-]? [0-9]+)? {
@@ -319,13 +320,13 @@ string
319
320
  / string2
320
321
 
321
322
  url
322
- = chars:([!#$%&*-\[\]-~] / nonascii / escape)* { return implode("", $chars); }
323
+ = chars:($[!#$%&*-\[\]-~]+ / nonascii / escape)* { return implode("", $chars); }
323
324
 
324
325
  plain_ws
325
326
  = [ \t\r\n\f]+
326
327
 
327
328
  w
328
- = plain_ws?
329
+ = [ \t\r\n\f]*
329
330
 
330
331
  nl
331
332
  = "\n"
@@ -1,7 +1,6 @@
1
1
  "use strict";
2
2
 
3
- var arrays = require("../utils/arrays"),
4
- visitor = require("./visitor");
3
+ var visitor = require("./visitor");
5
4
 
6
5
  /* AST utilities. */
7
6
  var asts = {
@@ -16,10 +15,19 @@ var asts = {
16
15
  },
17
16
 
18
17
  indexOfRule: function(ast, name) {
19
- return arrays.indexOf(ast.rules, function(r) { return r.name === name; });
18
+ return ast.rules.findIndex((r) => r.name === name);
20
19
  },
21
20
 
22
- matchesEmpty: function(ast, node) {
21
+ findRuleAttribute: function(rule, name) {
22
+ return (rule.attributes || []).find((attr) => attr.name === name);
23
+ },
24
+
25
+ getRuleAttributeValue: function(rule, name, defaultValue) {
26
+ let attr = asts.findRuleAttribute(rule, name);
27
+ return attr === undefined ? defaultValue : attr.value;
28
+ },
29
+
30
+ matchesEmpty: function(ast, node, wrapper) {
23
31
  function matchesTrue() { return true; }
24
32
  function matchesFalse() { return false; }
25
33
 
@@ -27,17 +35,25 @@ var asts = {
27
35
  return matches(node.expression);
28
36
  }
29
37
 
30
- var matches = visitor.build({
31
- rule: matchesExpression,
38
+ wrapper = wrapper || ( (f) => f );
39
+ var matches = wrapper(visitor.build({
40
+ rule: function(rule) {
41
+ // Allow explicit override
42
+ let empty = asts.getRuleAttributeValue(rule, 'empty');
43
+ if (empty === undefined) {
44
+ empty = matches(rule.expression);
45
+ }
46
+ return empty;
47
+ },
32
48
 
33
49
  choice: function(node) {
34
- return arrays.some(node.alternatives, matches);
50
+ return node.alternatives.some(matches);
35
51
  },
36
52
 
37
53
  action: matchesExpression,
38
54
 
39
55
  sequence: function(node) {
40
- return arrays.every(node.elements, matches);
56
+ return node.elements.every(matches);
41
57
  },
42
58
 
43
59
  labeled: matchesExpression,
@@ -50,6 +66,10 @@ var asts = {
50
66
  semantic_and: matchesTrue,
51
67
  semantic_not: matchesTrue,
52
68
 
69
+ parameter_and: matchesTrue,
70
+ parameter_not: matchesTrue,
71
+ labeled_param: matchesTrue,
72
+
53
73
  rule_ref: function(node) {
54
74
  return matches(asts.findRule(ast, node.name));
55
75
  },
@@ -58,9 +78,9 @@ var asts = {
58
78
  return node.value === "";
59
79
  },
60
80
 
61
- "class": matchesFalse,
81
+ class: matchesFalse,
62
82
  any: matchesFalse
63
- });
83
+ }));
64
84
 
65
85
  return matches(node);
66
86
  }