prism 0.15.1 → 0.17.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (91) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +35 -1
  3. data/Makefile +12 -0
  4. data/README.md +3 -1
  5. data/config.yml +66 -50
  6. data/docs/configuration.md +2 -0
  7. data/docs/fuzzing.md +1 -1
  8. data/docs/javascript.md +90 -0
  9. data/docs/releasing.md +27 -0
  10. data/docs/ruby_api.md +2 -0
  11. data/docs/serialization.md +28 -29
  12. data/ext/prism/api_node.c +856 -826
  13. data/ext/prism/api_pack.c +20 -9
  14. data/ext/prism/extension.c +494 -119
  15. data/ext/prism/extension.h +1 -1
  16. data/include/prism/ast.h +3157 -747
  17. data/include/prism/defines.h +40 -8
  18. data/include/prism/diagnostic.h +36 -3
  19. data/include/prism/enc/pm_encoding.h +119 -28
  20. data/include/prism/node.h +38 -30
  21. data/include/prism/options.h +204 -0
  22. data/include/prism/pack.h +44 -33
  23. data/include/prism/parser.h +445 -199
  24. data/include/prism/prettyprint.h +26 -0
  25. data/include/prism/regexp.h +16 -2
  26. data/include/prism/util/pm_buffer.h +102 -18
  27. data/include/prism/util/pm_char.h +162 -48
  28. data/include/prism/util/pm_constant_pool.h +128 -34
  29. data/include/prism/util/pm_list.h +68 -38
  30. data/include/prism/util/pm_memchr.h +18 -3
  31. data/include/prism/util/pm_newline_list.h +71 -28
  32. data/include/prism/util/pm_state_stack.h +25 -7
  33. data/include/prism/util/pm_string.h +115 -27
  34. data/include/prism/util/pm_string_list.h +25 -6
  35. data/include/prism/util/pm_strncasecmp.h +32 -0
  36. data/include/prism/util/pm_strpbrk.h +31 -17
  37. data/include/prism/version.h +28 -3
  38. data/include/prism.h +229 -36
  39. data/lib/prism/compiler.rb +5 -5
  40. data/lib/prism/debug.rb +43 -13
  41. data/lib/prism/desugar_compiler.rb +1 -1
  42. data/lib/prism/dispatcher.rb +27 -26
  43. data/lib/prism/dsl.rb +16 -16
  44. data/lib/prism/ffi.rb +138 -61
  45. data/lib/prism/lex_compat.rb +26 -16
  46. data/lib/prism/mutation_compiler.rb +11 -11
  47. data/lib/prism/node.rb +426 -227
  48. data/lib/prism/node_ext.rb +23 -16
  49. data/lib/prism/node_inspector.rb +1 -1
  50. data/lib/prism/pack.rb +79 -40
  51. data/lib/prism/parse_result/comments.rb +7 -2
  52. data/lib/prism/parse_result/newlines.rb +4 -0
  53. data/lib/prism/parse_result.rb +157 -21
  54. data/lib/prism/pattern.rb +14 -3
  55. data/lib/prism/ripper_compat.rb +28 -10
  56. data/lib/prism/serialize.rb +935 -307
  57. data/lib/prism/visitor.rb +9 -5
  58. data/lib/prism.rb +20 -2
  59. data/prism.gemspec +11 -2
  60. data/rbi/prism.rbi +7305 -0
  61. data/rbi/prism_static.rbi +196 -0
  62. data/sig/prism.rbs +4468 -0
  63. data/sig/prism_static.rbs +123 -0
  64. data/src/diagnostic.c +56 -53
  65. data/src/enc/pm_big5.c +1 -0
  66. data/src/enc/pm_euc_jp.c +1 -0
  67. data/src/enc/pm_gbk.c +1 -0
  68. data/src/enc/pm_shift_jis.c +1 -0
  69. data/src/enc/pm_tables.c +316 -80
  70. data/src/enc/pm_unicode.c +54 -9
  71. data/src/enc/pm_windows_31j.c +1 -0
  72. data/src/node.c +357 -345
  73. data/src/options.c +170 -0
  74. data/src/prettyprint.c +7697 -1643
  75. data/src/prism.c +1964 -1125
  76. data/src/regexp.c +153 -95
  77. data/src/serialize.c +432 -397
  78. data/src/token_type.c +3 -1
  79. data/src/util/pm_buffer.c +88 -23
  80. data/src/util/pm_char.c +103 -57
  81. data/src/util/pm_constant_pool.c +52 -22
  82. data/src/util/pm_list.c +12 -4
  83. data/src/util/pm_memchr.c +5 -3
  84. data/src/util/pm_newline_list.c +25 -63
  85. data/src/util/pm_state_stack.c +9 -3
  86. data/src/util/pm_string.c +95 -85
  87. data/src/util/pm_string_list.c +14 -15
  88. data/src/util/pm_strncasecmp.c +10 -3
  89. data/src/util/pm_strpbrk.c +25 -19
  90. metadata +12 -3
  91. data/docs/prism.png +0 -0
data/src/regexp.c CHANGED
@@ -1,16 +1,31 @@
1
1
  #include "prism/regexp.h"
2
2
 
3
- // This is the parser that is going to handle parsing regular expressions.
3
+ /**
4
+ * This is the parser that is going to handle parsing regular expressions.
5
+ */
4
6
  typedef struct {
7
+ /** A pointer to the start of the source that we are parsing. */
5
8
  const uint8_t *start;
9
+
10
+ /** A pointer to the current position in the source. */
6
11
  const uint8_t *cursor;
12
+
13
+ /** A pointer to the end of the source that we are parsing. */
7
14
  const uint8_t *end;
15
+
16
+ /** A list of named captures that we've found. */
8
17
  pm_string_list_t *named_captures;
18
+
19
+ /** Whether the encoding has changed from the default. */
9
20
  bool encoding_changed;
21
+
22
+ /** The encoding of the source. */
10
23
  pm_encoding_t *encoding;
11
24
  } pm_regexp_parser_t;
12
25
 
13
- // This initializes a new parser with the given source.
26
+ /**
27
+ * This initializes a new parser with the given source.
28
+ */
14
29
  static void
15
30
  pm_regexp_parser_init(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end, pm_string_list_t *named_captures, bool encoding_changed, pm_encoding_t *encoding) {
16
31
  *parser = (pm_regexp_parser_t) {
@@ -23,7 +38,9 @@ pm_regexp_parser_init(pm_regexp_parser_t *parser, const uint8_t *start, const ui
23
38
  };
24
39
  }
25
40
 
26
- // This appends a new string to the list of named captures.
41
+ /**
42
+ * This appends a new string to the list of named captures.
43
+ */
27
44
  static void
28
45
  pm_regexp_parser_named_capture(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end) {
29
46
  pm_string_t string;
@@ -32,13 +49,17 @@ pm_regexp_parser_named_capture(pm_regexp_parser_t *parser, const uint8_t *start,
32
49
  pm_string_free(&string);
33
50
  }
34
51
 
35
- // Returns true if the next character is the end of the source.
52
+ /**
53
+ * Returns true if the next character is the end of the source.
54
+ */
36
55
  static inline bool
37
56
  pm_regexp_char_is_eof(pm_regexp_parser_t *parser) {
38
57
  return parser->cursor >= parser->end;
39
58
  }
40
59
 
41
- // Optionally accept a char and consume it if it exists.
60
+ /**
61
+ * Optionally accept a char and consume it if it exists.
62
+ */
42
63
  static inline bool
43
64
  pm_regexp_char_accept(pm_regexp_parser_t *parser, uint8_t value) {
44
65
  if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) {
@@ -48,7 +69,9 @@ pm_regexp_char_accept(pm_regexp_parser_t *parser, uint8_t value) {
48
69
  return false;
49
70
  }
50
71
 
51
- // Expect a character to be present and consume it.
72
+ /**
73
+ * Expect a character to be present and consume it.
74
+ */
52
75
  static inline bool
53
76
  pm_regexp_char_expect(pm_regexp_parser_t *parser, uint8_t value) {
54
77
  if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) {
@@ -58,7 +81,9 @@ pm_regexp_char_expect(pm_regexp_parser_t *parser, uint8_t value) {
58
81
  return false;
59
82
  }
60
83
 
61
- // This advances the current token to the next instance of the given character.
84
+ /**
85
+ * This advances the current token to the next instance of the given character.
86
+ */
62
87
  static bool
63
88
  pm_regexp_char_find(pm_regexp_parser_t *parser, uint8_t value) {
64
89
  if (pm_regexp_char_is_eof(parser)) {
@@ -74,37 +99,39 @@ pm_regexp_char_find(pm_regexp_parser_t *parser, uint8_t value) {
74
99
  return true;
75
100
  }
76
101
 
77
- // Range quantifiers are a special class of quantifiers that look like
78
- //
79
- // * {digit}
80
- // * {digit,}
81
- // * {digit,digit}
82
- // * {,digit}
83
- //
84
- // Unfortunately, if there are any spaces in between, then this just becomes a
85
- // regular character match expression and we have to backtrack. So when this
86
- // function first starts running, we'll create a "save" point and then attempt
87
- // to parse the quantifier. If it fails, we'll restore the save point and
88
- // return.
89
- //
90
- // To properly track everything, we're going to build a little state machine.
91
- // It looks something like the following:
92
- //
93
- // ┌───────┐ ┌─────────┐ ────────────┐
94
- // ──── lbrace ───> │ start │ ──── digit ───> │ minimum │ │
95
- // └───────┘ └─────────┘ <─── digit ─┘
96
- // │ │ │
97
- // ┌───────┐ │ │ rbrace
98
- // comma <───── comma ┌──── comma ───────┘ │
99
- // └───────┘ V V
100
- // │ ┌─────────┐ ┌─────────┐
101
- // └── digit ──> maximum │ ── rbrace ──> │| final |│
102
- // └─────────┘ └─────────┘
103
- // │ ^
104
- // └─ digit ─┘
105
- //
106
- // Note that by the time we've hit this function, the lbrace has already been
107
- // consumed so we're in the start state.
102
+ /**
103
+ * Range quantifiers are a special class of quantifiers that look like
104
+ *
105
+ * * {digit}
106
+ * * {digit,}
107
+ * * {digit,digit}
108
+ * * {,digit}
109
+ *
110
+ * Unfortunately, if there are any spaces in between, then this just becomes a
111
+ * regular character match expression and we have to backtrack. So when this
112
+ * function first starts running, we'll create a "save" point and then attempt
113
+ * to parse the quantifier. If it fails, we'll restore the save point and
114
+ * return.
115
+ *
116
+ * To properly track everything, we're going to build a little state machine.
117
+ * It looks something like the following:
118
+ *
119
+ *                  ┌───────┐                 ┌─────────┐ ────────────┐
120
+ * ──── lbrace ───> │ start │ ──── digit ───> │ minimum │             │
121
+ *                  └───────┘                 └─────────┘ <─── digit ─┘
122
+ *                      │                       │      │
123
+ *   ┌───────┐          │                       │   rbrace
124
+ *   │ comma │ <───── comma  ┌──── comma ───────┘      │
125
+ *   └───────┘               V                         V
126
+ *      │               ┌─────────┐               ┌─────────┐
127
+ *      └── digit ──>   │ maximum │ ── rbrace ──> │| final |│
128
+ *                      └─────────┘               └─────────┘
129
+ *                      │         ^
130
+ *                      └─ digit ─┘
131
+ *
132
+ * Note that by the time we've hit this function, the lbrace has already been
133
+ * consumed so we're in the start state.
134
+ */
108
135
  static bool
109
136
  pm_regexp_parse_range_quantifier(pm_regexp_parser_t *parser) {
110
137
  const uint8_t *savepoint = parser->cursor;
@@ -180,14 +207,18 @@ pm_regexp_parse_range_quantifier(pm_regexp_parser_t *parser) {
180
207
  return true;
181
208
  }
182
209
 
183
- // quantifier : star-quantifier
184
- // | plus-quantifier
185
- // | optional-quantifier
186
- // | range-quantifier
187
- // | <empty>
188
- // ;
210
+ /**
211
+ * quantifier : star-quantifier
212
+ * | plus-quantifier
213
+ * | optional-quantifier
214
+ * | range-quantifier
215
+ * | <empty>
216
+ * ;
217
+ */
189
218
  static bool
190
219
  pm_regexp_parse_quantifier(pm_regexp_parser_t *parser) {
220
+ if (pm_regexp_char_is_eof(parser)) return true;
221
+
191
222
  switch (*parser->cursor) {
192
223
  case '*':
193
224
  case '+':
@@ -203,8 +234,10 @@ pm_regexp_parse_quantifier(pm_regexp_parser_t *parser) {
203
234
  }
204
235
  }
205
236
 
206
- // match-posix-class : '[' '[' ':' '^'? CHAR+ ':' ']' ']'
207
- // ;
237
+ /**
238
+ * match-posix-class : '[' '[' ':' '^'? CHAR+ ':' ']' ']'
239
+ * ;
240
+ */
208
241
  static bool
209
242
  pm_regexp_parse_posix_class(pm_regexp_parser_t *parser) {
210
243
  if (!pm_regexp_char_expect(parser, ':')) {
@@ -224,8 +257,10 @@ pm_regexp_parse_posix_class(pm_regexp_parser_t *parser) {
224
257
  static bool
225
258
  pm_regexp_parse_lbracket(pm_regexp_parser_t *parser);
226
259
 
227
- // match-char-set : '[' '^'? (match-range | match-char)* ']'
228
- // ;
260
+ /**
261
+ * match-char-set : '[' '^'? (match-range | match-char)* ']'
262
+ * ;
263
+ */
229
264
  static bool
230
265
  pm_regexp_parse_character_set(pm_regexp_parser_t *parser) {
231
266
  pm_regexp_char_accept(parser, '^');
@@ -249,7 +284,9 @@ pm_regexp_parse_character_set(pm_regexp_parser_t *parser) {
249
284
  return pm_regexp_char_expect(parser, ']');
250
285
  }
251
286
 
252
- // A left bracket can either mean a POSIX class or a character set.
287
+ /**
288
+ * A left bracket can either mean a POSIX class or a character set.
289
+ */
253
290
  static bool
254
291
  pm_regexp_parse_lbracket(pm_regexp_parser_t *parser) {
255
292
  const uint8_t *reset = parser->cursor;
@@ -269,8 +306,10 @@ pm_regexp_parse_lbracket(pm_regexp_parser_t *parser) {
269
306
  static bool
270
307
  pm_regexp_parse_expression(pm_regexp_parser_t *parser);
271
308
 
272
- // These are the states of the options that are configurable on the regular
273
- // expression (or from within a group).
309
+ /**
310
+ * These are the states of the options that are configurable on the regular
311
+ * expression (or from within a group).
312
+ */
274
313
  typedef enum {
275
314
  PM_REGEXP_OPTION_STATE_INVALID,
276
315
  PM_REGEXP_OPTION_STATE_TOGGLEABLE,
@@ -281,16 +320,22 @@ typedef enum {
281
320
 
282
321
  // These are the options that are configurable on the regular expression (or
283
322
  // from within a group).
323
+
284
324
  #define PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM 'a'
285
325
  #define PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM 'x'
286
326
  #define PRISM_REGEXP_OPTION_STATE_SLOTS (PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM + 1)
287
327
 
288
- // This is the set of options that are configurable on the regular expression.
328
+ /**
329
+ * This is the set of options that are configurable on the regular expression.
330
+ */
289
331
  typedef struct {
332
+ /** The current state of each option. */
290
333
  uint8_t values[PRISM_REGEXP_OPTION_STATE_SLOTS];
291
334
  } pm_regexp_options_t;
292
335
 
293
- // Initialize a new set of options to their default values.
336
+ /**
337
+ * Initialize a new set of options to their default values.
338
+ */
294
339
  static void
295
340
  pm_regexp_options_init(pm_regexp_options_t *options) {
296
341
  memset(options, PM_REGEXP_OPTION_STATE_INVALID, sizeof(uint8_t) * PRISM_REGEXP_OPTION_STATE_SLOTS);
@@ -302,8 +347,10 @@ pm_regexp_options_init(pm_regexp_options_t *options) {
302
347
  options->values['u' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
303
348
  }
304
349
 
305
- // Attempt to add the given option to the set of options. Returns true if it was
306
- // added, false if it was already present.
350
+ /**
351
+ * Attempt to add the given option to the set of options. Returns true if it was
352
+ * added, false if it was already present.
353
+ */
307
354
  static bool
308
355
  pm_regexp_options_add(pm_regexp_options_t *options, uint8_t key) {
309
356
  if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
@@ -325,8 +372,10 @@ pm_regexp_options_add(pm_regexp_options_t *options, uint8_t key) {
325
372
  return false;
326
373
  }
327
374
 
328
- // Attempt to remove the given option from the set of options. Returns true if
329
- // it was removed, false if it was already absent.
375
+ /**
376
+ * Attempt to remove the given option from the set of options. Returns true if
377
+ * it was removed, false if it was already absent.
378
+ */
330
379
  static bool
331
380
  pm_regexp_options_remove(pm_regexp_options_t *options, uint8_t key) {
332
381
  if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
@@ -347,26 +396,27 @@ pm_regexp_options_remove(pm_regexp_options_t *options, uint8_t key) {
347
396
  return false;
348
397
  }
349
398
 
350
- // Groups can have quite a few different patterns for syntax. They basically
351
- // just wrap a set of expressions, but they can potentially have options after a
352
- // question mark. If there _isn't_ a question mark, then it's just a set of
353
- // expressions. If there _is_, then here are the options:
354
- //
355
- // * (?#...) - inline comments
356
- // * (?:subexp) - non-capturing group
357
- // * (?=subexp) - positive lookahead
358
- // * (?!subexp) - negative lookahead
359
- // * (?>subexp) - atomic group
360
- // * (?~subexp) - absence operator
361
- // * (?<=subexp) - positive lookbehind
362
- // * (?<!subexp) - negative lookbehind
363
- // * (?<name>subexp) - named capturing group
364
- // * (?'name'subexp) - named capturing group
365
- // * (?(cond)yes-subexp) - conditional expression
366
- // * (?(cond)yes-subexp|no-subexp) - conditional expression
367
- // * (?imxdau-imx) - turn on and off configuration
368
- // * (?imxdau-imx:subexp) - turn on and off configuration for an expression
369
- //
399
+ /**
400
+ * Groups can have quite a few different patterns for syntax. They basically
401
+ * just wrap a set of expressions, but they can potentially have options after a
402
+ * question mark. If there _isn't_ a question mark, then it's just a set of
403
+ * expressions. If there _is_, then here are the options:
404
+ *
405
+ * * (?#...) - inline comments
406
+ * * (?:subexp) - non-capturing group
407
+ * * (?=subexp) - positive lookahead
408
+ * * (?!subexp) - negative lookahead
409
+ * * (?>subexp) - atomic group
410
+ * * (?~subexp) - absence operator
411
+ * * (?<=subexp) - positive lookbehind
412
+ * * (?<!subexp) - negative lookbehind
413
+ * * (?<name>subexp) - named capturing group
414
+ * * (?'name'subexp) - named capturing group
415
+ * * (?(cond)yes-subexp) - conditional expression
416
+ * * (?(cond)yes-subexp|no-subexp) - conditional expression
417
+ * * (?imxdau-imx) - turn on and off configuration
418
+ * * (?imxdau-imx:subexp) - turn on and off configuration for an expression
419
+ */
370
420
  static bool
371
421
  pm_regexp_parse_group(pm_regexp_parser_t *parser) {
372
422
  // First, parse any options for the group.
@@ -501,16 +551,18 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) {
501
551
  return pm_regexp_char_expect(parser, ')');
502
552
  }
503
553
 
504
- // item : anchor
505
- // | match-posix-class
506
- // | match-char-set
507
- // | match-char-class
508
- // | match-char-prop
509
- // | match-char
510
- // | match-any
511
- // | group
512
- // | quantified
513
- // ;
554
+ /**
555
+ * item : anchor
556
+ * | match-posix-class
557
+ * | match-char-set
558
+ * | match-char-class
559
+ * | match-char-prop
560
+ * | match-char
561
+ * | match-any
562
+ * | group
563
+ * | quantified
564
+ * ;
565
+ */
514
566
  static bool
515
567
  pm_regexp_parse_item(pm_regexp_parser_t *parser) {
516
568
  switch (*parser->cursor++) {
@@ -531,8 +583,10 @@ pm_regexp_parse_item(pm_regexp_parser_t *parser) {
531
583
  }
532
584
  }
533
585
 
534
- // expression : item+
535
- // ;
586
+ /**
587
+ * expression : item+
588
+ * ;
589
+ */
536
590
  static bool
537
591
  pm_regexp_parse_expression(pm_regexp_parser_t *parser) {
538
592
  if (!pm_regexp_parse_item(parser)) {
@@ -548,10 +602,12 @@ pm_regexp_parse_expression(pm_regexp_parser_t *parser) {
548
602
  return true;
549
603
  }
550
604
 
551
- // pattern : EOF
552
- // | expression EOF
553
- // | expression '|' pattern
554
- // ;
605
+ /**
606
+ * pattern : EOF
607
+ * | expression EOF
608
+ * | expression '|' pattern
609
+ * ;
610
+ */
555
611
  static bool
556
612
  pm_regexp_parse_pattern(pm_regexp_parser_t *parser) {
557
613
  return (
@@ -570,8 +626,10 @@ pm_regexp_parse_pattern(pm_regexp_parser_t *parser) {
570
626
  );
571
627
  }
572
628
 
573
- // Parse a regular expression and extract the names of all of the named capture
574
- // groups.
629
+ /**
630
+ * Parse a regular expression and extract the names of all of the named capture
631
+ * groups.
632
+ */
575
633
  PRISM_EXPORTED_FUNCTION bool
576
634
  pm_regexp_named_capture_group_names(const uint8_t *source, size_t size, pm_string_list_t *named_captures, bool encoding_changed, pm_encoding_t *encoding) {
577
635
  pm_regexp_parser_t parser;