wikipeg 4.0.2 → 6.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. package/HISTORY.md +556 -0
  2. package/README.md +230 -12
  3. package/VERSION +1 -1
  4. package/bin/wikipeg +8 -4
  5. package/examples/css.pegphp +9 -8
  6. package/lib/compiler/asts.js +30 -10
  7. package/lib/compiler/charsets.js +306 -0
  8. package/lib/compiler/language/javascript.js +107 -33
  9. package/lib/compiler/language/php.js +193 -55
  10. package/lib/compiler/passes/analyze-always-match.js +141 -0
  11. package/lib/compiler/passes/analyze-first.js +245 -0
  12. package/lib/compiler/passes/ast-to-code.js +316 -100
  13. package/lib/compiler/passes/inline-simple-rules.js +96 -0
  14. package/lib/compiler/passes/optimize-character-class.js +147 -0
  15. package/lib/compiler/passes/optimize-failure-reporting.js +65 -0
  16. package/lib/compiler/passes/remove-proxy-rules.js +7 -5
  17. package/lib/compiler/passes/report-infinite-loops.js +4 -1
  18. package/lib/compiler/passes/report-left-recursion.js +3 -4
  19. package/lib/compiler/passes/report-unknown-attributes.js +39 -0
  20. package/lib/compiler/passes/transform-common-lang.js +1 -1
  21. package/lib/compiler/traverser.js +1 -2
  22. package/lib/compiler/visitor.js +5 -7
  23. package/lib/compiler.js +24 -10
  24. package/lib/parser.js +2784 -3088
  25. package/lib/peg.js +7 -15
  26. package/lib/runtime/template.js +9 -1
  27. package/lib/utils/CaseFolding.txt +1654 -0
  28. package/lib/utils/arrays.js +0 -72
  29. package/lib/utils/casefold.js +697 -0
  30. package/lib/utils/objects.js +9 -39
  31. package/lib/utils/unicode.js +34 -0
  32. package/package.json +6 -4
  33. package/src/DefaultTracer.php +18 -18
  34. package/src/PEGParserBase.php +53 -28
  35. package/src/SyntaxError.php +4 -4
  36. package/src/Tracer.php +1 -1
  37. package/lib/compiler/opcodes.js +0 -54
@@ -0,0 +1,306 @@
1
+ "use strict";
2
+ const unicode = require("../utils/unicode");
3
+
4
+ /* Character set utilities. */
5
+
// Inverting ranges requires picking MIN/MAX characters: these are the
// bounds of the code point space used by range.invert().
const CHAR_MIN = 0;
const CHAR_MAX = 0x10FFFF;

// "CASELESS_RESTRICT" support "suppresses case-insensitive matches
// between ASCII and non-ASCII characters"; in practice this just means
// excluding unicode matches for 's' and 'k'.
// Module-level option, changed through classNode.setCaselessRestrict().
let caselessRestrict = false;

// Operations on {start,end} range objects, using unicode code point values.
// Both bounds are inclusive.
const range = {

  /**
   * Convert a {start,end} range object to a character class "part".
   *
   * @param {{start: number, end: number}} r
   * @returns {string|string[]} A single character when the range covers one
   *   code point, otherwise a 2-element [first, last] array of characters.
   */
  toPart(r) {
    return (r.start === r.end) ?
      String.fromCodePoint(r.start) :
      [String.fromCodePoint(r.start), String.fromCodePoint(r.end)];
  },

  /**
   * Invert a list of {start,end} range objects within CHAR_MIN..CHAR_MAX.
   * The gaps are computed in a single forward pass, so `ranges` is expected
   * to be in ascending order and non-overlapping.
   *
   * @param {Array<{start: number, end: number}>} ranges
   * @returns {Array<{start: number, end: number}>} Newly created ranges.
   */
  invert(ranges) {
    let start = CHAR_MIN;
    const result = [];
    for (const r of ranges) {
      const gapEnd = r.start - 1;
      // Skip empty gaps (adjacent ranges, or a range starting at CHAR_MIN).
      if (start <= gapEnd) {
        result.push({ start, end: gapEnd });
      }
      start = r.end + 1;
    }
    // Trailing gap up to the last code point, if any.
    if (start <= CHAR_MAX) {
      result.push({ start, end: CHAR_MAX });
    }
    return result;
  },

  /**
   * Merge the given left and right range lists into a single ascending
   * list, coalescing ranges that overlap or are directly adjacent.
   * Each input list must already be in ascending order of `start`.
   * The input range objects are not modified.
   *
   * @param {Array<{start: number, end: number}>} left
   * @param {Array<{start: number, end: number}>} right
   * @returns {Array<{start: number, end: number}>}
   */
  merge(left, right) {
    let leftIdx = 0;
    let rightIdx = 0;
    const newRanges = [];
    while (leftIdx < left.length || rightIdx < right.length) {
      // Pick whichever list holds the next lowest range.
      const takeRight = leftIdx >= left.length ||
        (rightIdx < right.length &&
          right[rightIdx].start < left[leftIdx].start);
      const next = takeRight ? right[rightIdx++] : left[leftIdx++];
      // Work on a copy so that widening `end` never leaks into the inputs.
      const current = { start: next.start, end: next.end };
      for (;;) {
        if (leftIdx < left.length && left[leftIdx].start <= current.end + 1) {
          // Merge a range from the left into the current range
          current.end = Math.max(current.end, left[leftIdx++].end);
        } else if (rightIdx < right.length && right[rightIdx].start <= current.end + 1) {
          // Merge a range from the right into the current range
          current.end = Math.max(current.end, right[rightIdx++].end);
        } else {
          // No more mergeable ranges
          break;
        }
      }
      newRanges.push(current);
    }
    return newRanges;
  },
};

// Operations on character class "parts", which could be a single character
// or a 2-element array of characters indicating a range.
const part = {

  /**
   * Convert a character class "part", which could be a single character
   * or a range, to a {start,end} range object of code points.
   *
   * @param {string|string[]} el
   * @returns {{start: number, end: number}}
   */
  toRange(el) {
    return Array.isArray(el) ?
      { start: el[0].codePointAt(0), end: el[1].codePointAt(0) } :
      { start: el.codePointAt(0), end: el.codePointAt(0) };
  },

  /**
   * The main union/intersection method: merge two lists of sorted parts,
   * optionally inverting either/both of the inputs or the output.
   *
   * @param {Array<string|string[]>} left Parts in ascending code point order.
   * @param {boolean} leftInvert Invert `left` before merging.
   * @param {Array<string|string[]>} right Parts in ascending code point order.
   * @param {boolean} rightInvert Invert `right` before merging.
   * @param {boolean} invertResult Invert the merged result.
   * @returns {Array<string|string[]>} Sorted, coalesced parts.
   */
  merge(left, leftInvert, right, rightInvert, invertResult) {
    let leftRanges = left.map(part.toRange);
    let rightRanges = right.map(part.toRange);
    if (leftInvert) {
      leftRanges = range.invert(leftRanges);
    }
    if (rightInvert) {
      rightRanges = range.invert(rightRanges);
    }
    let newRanges = range.merge(leftRanges, rightRanges);
    if (invertResult) {
      newRanges = range.invert(newRanges);
    }
    return newRanges.map(range.toPart);
  },
};

// Public api -------

// Operations on "class" nodes: objects of the form
// { type: "class", parts, inverted, ignoreCase, sorted }.
const classNode = {
  /**
   * Option setting: enable/disable the CASELESS_RESTRICT behaviour used by
   * caseSensitive().
   *
   * @param {boolean} value
   */
  setCaselessRestrict(value) {
    caselessRestrict = value;
  },

  /** Return the `.` class, ie any single character. */
  any() {
    return {
      type: "class",
      parts: [],
      inverted: true,
      ignoreCase: false,
      sorted: true,
    };
  },

  /** Is this the `.` class, ie any single character? */
  isAny(node) {
    return node.parts.length === 0 && node.inverted;
  },

  /** Return the empty class, ie will match no character. */
  empty() {
    return {
      type: "class",
      parts: [],
      inverted: false,
      ignoreCase: false,
      sorted: true,
    };
  },

  /** Is this the empty class, ie will match no character? */
  isEmpty(node) {
    return node.parts.length === 0 && !node.inverted;
  },

  /**
   * Return a case-sensitive node from the input, which may be
   * case-insensitive. A node that is already case-sensitive is returned
   * as-is; otherwise a new sorted node is built.
   */
  caseSensitive(node) {
    if (!node.ignoreCase) {
      return node;
    }
    const newParts = [];
    for (const r of node.parts.map(part.toRange)) {
      for (let cp = r.start; cp <= r.end; cp++) {
        // As per https://tc39.es/ecma262/multipage/text-processing.html
        // use the unicode "Simple Case Folding" procedure to find all
        // code points which case fold to the same 'mapped' character that
        // cp does.
        const mapped = unicode.simpleCaseFolding[cp] || cp;
        const expand = unicode.reverseSimpleCaseFolding[mapped] || [mapped];
        for (const expandedPoint of expand) {
          if (caselessRestrict && cp <= 0x7F && expandedPoint > 0x7F) {
            // if caselessRestrict is true, suppress non-ASCII matches when
            // the original code point was ASCII.
            continue;
          }
          newParts.push(String.fromCodePoint(expandedPoint));
        }
      }
    }
    // sort and merge newParts
    return classNode.sort({
      type: "class",
      parts: newParts,
      inverted: node.inverted,
      ignoreCase: false,
    });
  },

  /** Compute the union of two character classes. */
  union(left, right) {
    if (classNode.isAny(left) || classNode.isEmpty(right)) {
      return left;
    }
    if (classNode.isAny(right) || classNode.isEmpty(left)) {
      return right;
    }
    left = classNode.caseSensitive(classNode.sort(left));
    right = classNode.caseSensitive(classNode.sort(right));
    if (!left.inverted && right.inverted) {
      return classNode.union(right, left);
    }
    // Either both left and right are inverted, or right is not inverted
    const resultInverted = left.inverted;
    const newParts = part.merge(
      left.parts, left.inverted,
      right.parts, right.inverted,
      resultInverted
    );
    return {
      type: "class",
      parts: newParts,
      inverted: resultInverted,
      ignoreCase: false,
      sorted: true,
    };
  },

  /**
   * Compute the intersection of two character classes, evaluated as the
   * complement of the union of the complements.
   */
  intersection(left, right) {
    if (classNode.isAny(right) || classNode.isEmpty(left)) {
      return left;
    }
    if (classNode.isAny(left) || classNode.isEmpty(right)) {
      return right;
    }
    left = classNode.caseSensitive(classNode.sort(left));
    right = classNode.caseSensitive(classNode.sort(right));
    if (!left.inverted && right.inverted) {
      return classNode.intersection(right, left);
    }
    const resultInverted = right.inverted;
    const newParts = part.merge(
      left.parts, !left.inverted,
      right.parts, !right.inverted,
      !resultInverted
    );
    return {
      type: "class",
      parts: newParts,
      inverted: resultInverted,
      ignoreCase: false,
      sorted: true,
    };
  },

  /**
   * Compute the characters matched by `left` but not by `right`, evaluated
   * as the complement of (complement of left) union right.
   */
  subtract(left, right) {
    if (classNode.isAny(right) || classNode.isEmpty(left)) {
      return classNode.empty();
    }
    left = classNode.caseSensitive(classNode.sort(left));
    right = classNode.caseSensitive(classNode.sort(right));
    const resultInverted = left.inverted && !right.inverted;
    const newParts = part.merge(
      left.parts, !left.inverted,
      right.parts, right.inverted,
      !resultInverted
    );
    return {
      type: "class",
      parts: newParts,
      inverted: resultInverted,
      ignoreCase: false,
      sorted: true,
    };
  },

  /**
   * Expand ranges in a character class into individual characters, up to
   * the specified limit of character entries.
   *
   * @param {object} input Class node; never modified.
   * @param {number} limit Maximum number of entries in the expanded class.
   * @returns {object} A node whose parts are all single characters, or the
   *   sorted (but otherwise unexpanded) class when the limit is exceeded.
   */
  expand(input, limit) {
    const sortedInput = classNode.sort(input);
    const newParts = [];
    for (const r of sortedInput.parts.map(part.toRange)) {
      for (let cp = r.start; cp <= r.end; cp++) {
        newParts.push(String.fromCodePoint(cp));
        if (newParts.length > limit) {
          // Exceeded the limit: return the original class (sorted, but
          // ranges otherwise unmodified).
          return sortedInput;
        }
      }
    }
    // sort() hands back the caller's own object when it is already sorted,
    // so build a copy instead of overwriting its `parts`.
    return Object.assign({}, sortedInput, { parts: newParts });
  },

  /**
   * Sort a character class to make it suitable for input to union, etc.
   * A node already flagged `sorted` is returned unchanged; otherwise a
   * clone is returned with its parts in ascending code point order and
   * overlapping/adjacent ranges merged.
   */
  sort(first) {
    if (first.sorted) {
      return first;
    }
    const startOf = (el) => (Array.isArray(el) ? el[0] : el).codePointAt(0);
    const result = Object.assign({}, first); // shallow clone
    // Order by code point, not by string comparison: strings compare by
    // UTF-16 code unit, which places supplementary-plane characters
    // (surrogate pairs) before U+E000..U+FFFF and breaks the ascending
    // order that range.merge() relies on.
    result.parts = first.parts.slice().sort((a, b) => startOf(a) - startOf(b));
    result.parts = part.merge(result.parts, false, [], false, false);
    result.sorted = true;
    return result;
  },
};
303
+
304
+ module.exports = {
305
+ classNode,
306
+ };
@@ -10,6 +10,7 @@ let javascript = {
10
10
  maxFailPos: 'peg$maxFailPos',
11
11
  inputLength: 'input.length',
12
12
  assertionSuccess: 'void 0',
13
+ advanceInputChar: 'peg$currPos++',
13
14
  consumeInputChar: 'input.charAt(peg$currPos++)',
14
15
  result: 'peg$result',
15
16
  actionArgPrefix: '',
@@ -163,54 +164,123 @@ let javascript = {
163
164
  .replace(/[\u1000-\uFFFF]/g, function(ch) { return '\\u' + hex(ch); });
164
165
  },
165
166
 
166
- matchClass(node, reg, result) {
167
- let regexp;
167
+ classToRegexp(node) {
168
+ return '['
169
+ + (node.inverted ? '^' : '')
170
+ + node.parts.map(function(part) {
171
+ return part instanceof Array
172
+ ? javascript.regexpClassEscape(part[0])
173
+ + '-'
174
+ + javascript.regexpClassEscape(part[1])
175
+ : javascript.regexpClassEscape(part);
176
+ }).join('')
177
+ + ']';
178
+ },
179
+
180
+ matchClass(node, reg, result, discard, discardPos) {
181
+ let regexp, expr;
168
182
  if (node.parts.length === 0) {
169
183
  if (node.inverted) {
170
184
  // Same as .
171
185
  result.condition = 'peg$currPos < input.length';
172
- result.onSuccess([`${reg} = input.charAt(peg$currPos++);`]);
186
+ result.onSuccess([discard ? `${reg} = true;` : `${reg} = input.charAt(peg$currPos++);`]);
173
187
  } else {
174
188
  // Always fail
175
189
  result.condition = 'false';
176
190
  }
177
191
  return;
178
192
  }
179
- regexp = '/^['
180
- + (node.inverted ? '^' : '')
181
- + node.parts.map(function(part) {
182
- return part instanceof Array
183
- ? javascript.regexpClassEscape(part[0])
184
- + '-'
185
- + javascript.regexpClassEscape(part[1])
186
- : javascript.regexpClassEscape(part);
187
- }).join('')
188
- + ']/' + (node.ignoreCase ? 'i' : '');
189
- result.block = [`${reg} = input.charAt(peg$currPos);`];
190
- result.condition = `${regexp}.test(${reg})`;
191
- result.onSuccess(['peg$currPos++;']);
192
- },
193
-
194
- matchLiteral(node, reg, result) {
193
+ regexp = '/^'
194
+ + javascript.classToRegexp(node)
195
+ + '/' + (node.ignoreCase ? 'i' : '');
196
+ expr = `input.charAt(peg$currPos)`;
197
+ if (discard) {
198
+ result.condition = `${regexp}.test(${expr})`;
199
+ result.onSuccess([`${reg} = true;`]);
200
+ } else {
201
+ result.block = [`${reg} = ${expr};`];
202
+ result.condition = `${regexp}.test(${reg})`;
203
+ }
204
+ if (!discardPos) {
205
+ result.onSuccess(['peg$currPos++;']);
206
+ }
207
+ },
208
+
209
+ matchRepeatedClass(node, reg, result, atLeastOne, discard, discardPos) {
210
+ let regexp;
211
+ if (node.parts.length === 0) {
212
+ if (node.inverted) {
213
+ // Same as .* / .+
214
+ result.condition = atLeastOne ? 'peg$currPos < input.length' : 'true';
215
+ if (!discard) {
216
+ result.onSuccess([`${reg} = Array.from(input.substring(peg$currPos));`]);
217
+ }
218
+ if (!discardPos) {
219
+ result.onSuccess([`peg$currPos = input.length;`]);
220
+ }
221
+ } else if (atLeastOne) {
222
+ // Always fail
223
+ result.condition = 'false';
224
+ } else {
225
+ // Zero length match
226
+ result.condition = 'true';
227
+ result.onSuccess([`${reg} = [];`]);
228
+ }
229
+ return;
230
+ }
231
+ regexp = '/'
232
+ + javascript.classToRegexp(node)
233
+ + (atLeastOne ? '+' : '*') + '/y' + (node.ignoreCase ? 'i' : '');
234
+ result.block.push(`${reg} = ${regexp};`);
235
+ result.block.push(`${reg}.lastIndex = peg$currPos;`);
236
+ if (discard) {
237
+ result.condition = `${reg}.exec(input) !== null`;
238
+ if (!discardPos) {
239
+ result.onSuccess([`peg$currPos = ${reg}.lastIndex;`]);
240
+ }
241
+ } else {
242
+ result.block.push(`${reg} = ${reg}.exec(input);`);
243
+ result.condition = `${reg} !== null`;
244
+ if (!discardPos) {
245
+ result.onSuccess([`peg$currPos += ${reg}[0].length;`]);
246
+ }
247
+ result.onSuccess([`${reg} = Array.from(${reg}[0]);`]);
248
+ }
249
+ },
250
+
251
+ matchLiteral(node, reg, result, discard, discardPos) {
252
+ let expr;
195
253
  if (node.value.length === 1 && !node.ignoreCase) {
196
254
  result.condition = 'input.charCodeAt(peg$currPos) === ' + node.value.charCodeAt(0);
197
- result.onSuccess([[reg, ' = ', javascript.stringify(node.value), ';'].join('')]);
255
+ if (discard) {
256
+ result.onSuccess([`${reg} = true;`]);
257
+ } else {
258
+ result.onSuccess([[reg, ' = ', javascript.stringify(node.value), ';'].join('')]);
259
+ }
198
260
  } else {
199
261
  if (node.value.length === 1) {
200
- result.block.push([reg, ' = input.charAt(peg$currPos);'].join(''));
262
+ expr = `input.charAt(peg$currPos)`;
201
263
  } else {
202
- result.block.push([reg, ' = ',
203
- 'input.substr(peg$currPos,', node.value.length, ');'].join(''));
264
+ expr = `input.substr(peg$currPos,${node.value.length})`;
265
+ }
266
+ if (!discard) {
267
+ result.block.push([`${reg} = ${expr};`]);
268
+ expr = reg;
204
269
  }
205
270
  if (node.ignoreCase) {
206
- result.condition = [reg, '.toLowerCase() === ',
271
+ result.condition = [expr, '.toLowerCase() === ',
207
272
  javascript.stringify(node.value.toLowerCase())].join('');
208
273
  } else {
209
- result.condition = [reg, ' === ',
274
+ result.condition = [expr, ' === ',
210
275
  javascript.stringify(node.value)].join('');
211
276
  }
277
+ if (discard) {
278
+ result.onSuccess([`${reg} = true;`]);
279
+ }
280
+ }
281
+ if (!discardPos) {
282
+ result.onSuccess([`peg$currPos += ${node.value.length};`]);
212
283
  }
213
- result.onSuccess([['peg$currPos += ', node.value.length, ';'].join('')]);
214
284
  },
215
285
 
216
286
  initCache(/*opts*/) {
@@ -225,7 +295,7 @@ let javascript = {
225
295
  if (opts.params.length) {
226
296
  keyParts = keyParts.concat(opts.params);
227
297
  }
228
- let storeRefs = opts.storeRefs.filter(function(part) {
298
+ const storeRefs = opts.storeRefs.filter(function(part) {
229
299
  return part !== '';
230
300
  }).map(function(part) {
231
301
  return ' ' + part;
@@ -256,17 +326,21 @@ let javascript = {
256
326
  return ` if (cached.hasOwnProperty(${encName})) param_${name}.value = cached.$${name};`;
257
327
  },
258
328
 
259
- cacheStoreRef(name, store) {
260
- if (!store) return '';
261
- return `if (saved_${name} !== param_${name}.value) cached.$${name} = param_${name}.value;`;
329
+ cacheStoreRef(reg, name) {
330
+ if (!reg) { return ''; }
331
+ return `if (${reg} !== param_${name}.value) cached.$${name} = param_${name}.value;`;
332
+ },
333
+
334
+ cacheRestoreRef(reg, name) {
335
+ return `param_${name}.value = ${reg};`;
262
336
  },
263
337
 
264
338
  /**
265
339
  * Get a block which saves ref values to a temporary variable for later
266
- * comparison in getCacheStoreRefs().
340
+ * comparison in cacheStoreRefs() / cacheRestoreRef().
267
341
  */
268
- cacheSaveRef(name) {
269
- return `var saved_${name}=param_${name}.value;`;
342
+ cacheSaveRef(reg, name) {
343
+ return `${reg} = param_${name}.value;`;
270
344
  }
271
345
  };
272
346