prism 0.15.1 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +35 -1
  3. data/Makefile +12 -0
  4. data/README.md +3 -1
  5. data/config.yml +66 -50
  6. data/docs/configuration.md +2 -0
  7. data/docs/fuzzing.md +1 -1
  8. data/docs/javascript.md +90 -0
  9. data/docs/releasing.md +27 -0
  10. data/docs/ruby_api.md +2 -0
  11. data/docs/serialization.md +28 -29
  12. data/ext/prism/api_node.c +856 -826
  13. data/ext/prism/api_pack.c +20 -9
  14. data/ext/prism/extension.c +494 -119
  15. data/ext/prism/extension.h +1 -1
  16. data/include/prism/ast.h +3157 -747
  17. data/include/prism/defines.h +40 -8
  18. data/include/prism/diagnostic.h +36 -3
  19. data/include/prism/enc/pm_encoding.h +119 -28
  20. data/include/prism/node.h +38 -30
  21. data/include/prism/options.h +204 -0
  22. data/include/prism/pack.h +44 -33
  23. data/include/prism/parser.h +445 -199
  24. data/include/prism/prettyprint.h +26 -0
  25. data/include/prism/regexp.h +16 -2
  26. data/include/prism/util/pm_buffer.h +102 -18
  27. data/include/prism/util/pm_char.h +162 -48
  28. data/include/prism/util/pm_constant_pool.h +128 -34
  29. data/include/prism/util/pm_list.h +68 -38
  30. data/include/prism/util/pm_memchr.h +18 -3
  31. data/include/prism/util/pm_newline_list.h +71 -28
  32. data/include/prism/util/pm_state_stack.h +25 -7
  33. data/include/prism/util/pm_string.h +115 -27
  34. data/include/prism/util/pm_string_list.h +25 -6
  35. data/include/prism/util/pm_strncasecmp.h +32 -0
  36. data/include/prism/util/pm_strpbrk.h +31 -17
  37. data/include/prism/version.h +28 -3
  38. data/include/prism.h +229 -36
  39. data/lib/prism/compiler.rb +5 -5
  40. data/lib/prism/debug.rb +43 -13
  41. data/lib/prism/desugar_compiler.rb +1 -1
  42. data/lib/prism/dispatcher.rb +27 -26
  43. data/lib/prism/dsl.rb +16 -16
  44. data/lib/prism/ffi.rb +138 -61
  45. data/lib/prism/lex_compat.rb +26 -16
  46. data/lib/prism/mutation_compiler.rb +11 -11
  47. data/lib/prism/node.rb +426 -227
  48. data/lib/prism/node_ext.rb +23 -16
  49. data/lib/prism/node_inspector.rb +1 -1
  50. data/lib/prism/pack.rb +79 -40
  51. data/lib/prism/parse_result/comments.rb +7 -2
  52. data/lib/prism/parse_result/newlines.rb +4 -0
  53. data/lib/prism/parse_result.rb +157 -21
  54. data/lib/prism/pattern.rb +14 -3
  55. data/lib/prism/ripper_compat.rb +28 -10
  56. data/lib/prism/serialize.rb +935 -307
  57. data/lib/prism/visitor.rb +9 -5
  58. data/lib/prism.rb +20 -2
  59. data/prism.gemspec +11 -2
  60. data/rbi/prism.rbi +7305 -0
  61. data/rbi/prism_static.rbi +196 -0
  62. data/sig/prism.rbs +4468 -0
  63. data/sig/prism_static.rbs +123 -0
  64. data/src/diagnostic.c +56 -53
  65. data/src/enc/pm_big5.c +1 -0
  66. data/src/enc/pm_euc_jp.c +1 -0
  67. data/src/enc/pm_gbk.c +1 -0
  68. data/src/enc/pm_shift_jis.c +1 -0
  69. data/src/enc/pm_tables.c +316 -80
  70. data/src/enc/pm_unicode.c +54 -9
  71. data/src/enc/pm_windows_31j.c +1 -0
  72. data/src/node.c +357 -345
  73. data/src/options.c +170 -0
  74. data/src/prettyprint.c +7697 -1643
  75. data/src/prism.c +1964 -1125
  76. data/src/regexp.c +153 -95
  77. data/src/serialize.c +432 -397
  78. data/src/token_type.c +3 -1
  79. data/src/util/pm_buffer.c +88 -23
  80. data/src/util/pm_char.c +103 -57
  81. data/src/util/pm_constant_pool.c +52 -22
  82. data/src/util/pm_list.c +12 -4
  83. data/src/util/pm_memchr.c +5 -3
  84. data/src/util/pm_newline_list.c +25 -63
  85. data/src/util/pm_state_stack.c +9 -3
  86. data/src/util/pm_string.c +95 -85
  87. data/src/util/pm_string_list.c +14 -15
  88. data/src/util/pm_strncasecmp.c +10 -3
  89. data/src/util/pm_strpbrk.c +25 -19
  90. metadata +12 -3
  91. data/docs/prism.png +0 -0
data/src/regexp.c CHANGED
@@ -1,16 +1,31 @@
1
1
  #include "prism/regexp.h"
2
2
 
3
- // This is the parser that is going to handle parsing regular expressions.
3
+ /**
4
+ * This is the parser that is going to handle parsing regular expressions.
5
+ */
4
6
  typedef struct {
7
+ /** A pointer to the start of the source that we are parsing. */
5
8
  const uint8_t *start;
9
+
10
+ /** A pointer to the current position in the source. */
6
11
  const uint8_t *cursor;
12
+
13
+ /** A pointer to the end of the source that we are parsing. */
7
14
  const uint8_t *end;
15
+
16
+ /** A list of named captures that we've found. */
8
17
  pm_string_list_t *named_captures;
18
+
19
+ /** Whether the encoding has changed from the default. */
9
20
  bool encoding_changed;
21
+
22
+ /** The encoding of the source. */
10
23
  pm_encoding_t *encoding;
11
24
  } pm_regexp_parser_t;
12
25
 
13
- // This initializes a new parser with the given source.
26
+ /**
27
+ * This initializes a new parser with the given source.
28
+ */
14
29
  static void
15
30
  pm_regexp_parser_init(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end, pm_string_list_t *named_captures, bool encoding_changed, pm_encoding_t *encoding) {
16
31
  *parser = (pm_regexp_parser_t) {
@@ -23,7 +38,9 @@ pm_regexp_parser_init(pm_regexp_parser_t *parser, const uint8_t *start, const ui
23
38
  };
24
39
  }
25
40
 
26
- // This appends a new string to the list of named captures.
41
+ /**
42
+ * This appends a new string to the list of named captures.
43
+ */
27
44
  static void
28
45
  pm_regexp_parser_named_capture(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end) {
29
46
  pm_string_t string;
@@ -32,13 +49,17 @@ pm_regexp_parser_named_capture(pm_regexp_parser_t *parser, const uint8_t *start,
32
49
  pm_string_free(&string);
33
50
  }
34
51
 
35
- // Returns true if the next character is the end of the source.
52
+ /**
53
+ * Returns true if the next character is the end of the source.
54
+ */
36
55
  static inline bool
37
56
  pm_regexp_char_is_eof(pm_regexp_parser_t *parser) {
38
57
  return parser->cursor >= parser->end;
39
58
  }
40
59
 
41
- // Optionally accept a char and consume it if it exists.
60
+ /**
61
+ * Optionally accept a char and consume it if it exists.
62
+ */
42
63
  static inline bool
43
64
  pm_regexp_char_accept(pm_regexp_parser_t *parser, uint8_t value) {
44
65
  if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) {
@@ -48,7 +69,9 @@ pm_regexp_char_accept(pm_regexp_parser_t *parser, uint8_t value) {
48
69
  return false;
49
70
  }
50
71
 
51
- // Expect a character to be present and consume it.
72
+ /**
73
+ * Expect a character to be present and consume it.
74
+ */
52
75
  static inline bool
53
76
  pm_regexp_char_expect(pm_regexp_parser_t *parser, uint8_t value) {
54
77
  if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) {
@@ -58,7 +81,9 @@ pm_regexp_char_expect(pm_regexp_parser_t *parser, uint8_t value) {
58
81
  return false;
59
82
  }
60
83
 
61
- // This advances the current token to the next instance of the given character.
84
+ /**
85
+ * This advances the current token to the next instance of the given character.
86
+ */
62
87
  static bool
63
88
  pm_regexp_char_find(pm_regexp_parser_t *parser, uint8_t value) {
64
89
  if (pm_regexp_char_is_eof(parser)) {
@@ -74,37 +99,39 @@ pm_regexp_char_find(pm_regexp_parser_t *parser, uint8_t value) {
74
99
  return true;
75
100
  }
76
101
 
77
- // Range quantifiers are a special class of quantifiers that look like
78
- //
79
- // * {digit}
80
- // * {digit,}
81
- // * {digit,digit}
82
- // * {,digit}
83
- //
84
- // Unfortunately, if there are any spaces in between, then this just becomes a
85
- // regular character match expression and we have to backtrack. So when this
86
- // function first starts running, we'll create a "save" point and then attempt
87
- // to parse the quantifier. If it fails, we'll restore the save point and
88
- // return.
89
- //
90
- // The properly track everything, we're going to build a little state machine.
91
- // It looks something like the following:
92
- //
93
- // ┌───────┐ ┌─────────┐ ────────────┐
94
- // ──── lbrace ───> │ start │ ──── digit ───> │ minimum │ │
95
- // └───────┘ └─────────┘ <─── digit ─┘
96
- // │ │ │
97
- // ┌───────┐ │ │ rbrace
98
- // comma <───── comma ┌──── comma ───────┘ │
99
- // └───────┘ V V
100
- // │ ┌─────────┐ ┌─────────┐
101
- // └── digit ──> maximum │ ── rbrace ──> │| final |│
102
- // └─────────┘ └─────────┘
103
- // │ ^
104
- // └─ digit ─┘
105
- //
106
- // Note that by the time we've hit this function, the lbrace has already been
107
- // consumed so we're in the start state.
102
+ /**
103
+ * Range quantifiers are a special class of quantifiers that look like
104
+ *
105
+ * * {digit}
106
+ * * {digit,}
107
+ * * {digit,digit}
108
+ * * {,digit}
109
+ *
110
+ * Unfortunately, if there are any spaces in between, then this just becomes a
111
+ * regular character match expression and we have to backtrack. So when this
112
+ * function first starts running, we'll create a "save" point and then attempt
113
+ * to parse the quantifier. If it fails, we'll restore the save point and
114
+ * return.
115
+ *
116
+ * To properly track everything, we're going to build a little state machine.
117
+ * It looks something like the following:
118
+ *
119
+ * ┌───────┐ ┌─────────┐ ────────────┐
120
+ * ──── lbrace ───> │ start │ ──── digit ───> │ minimum │ │
121
+ * └───────┘ └─────────┘ <─── digit ─┘
122
+ * │ │
123
+ * ┌───────┐ rbrace
124
+ * │ comma │ <───── comma ┌──── comma ───────┘ │
125
+ * └───────┘ V V
126
+ * ┌─────────┐ ┌─────────┐
127
+ * └── digit ──> │ maximum │ ── rbrace ──> │| final |│
128
+ * └─────────┘ └─────────┘
129
+ * │ ^
130
+ * └─ digit ─┘
131
+ *
132
+ * Note that by the time we've hit this function, the lbrace has already been
133
+ * consumed so we're in the start state.
134
+ */
108
135
  static bool
109
136
  pm_regexp_parse_range_quantifier(pm_regexp_parser_t *parser) {
110
137
  const uint8_t *savepoint = parser->cursor;
@@ -180,14 +207,18 @@ pm_regexp_parse_range_quantifier(pm_regexp_parser_t *parser) {
180
207
  return true;
181
208
  }
182
209
 
183
- // quantifier : star-quantifier
184
- // | plus-quantifier
185
- // | optional-quantifier
186
- // | range-quantifier
187
- // | <empty>
188
- // ;
210
+ /**
211
+ * quantifier : star-quantifier
212
+ * | plus-quantifier
213
+ * | optional-quantifier
214
+ * | range-quantifier
215
+ * | <empty>
216
+ * ;
217
+ */
189
218
  static bool
190
219
  pm_regexp_parse_quantifier(pm_regexp_parser_t *parser) {
220
+ if (pm_regexp_char_is_eof(parser)) return true;
221
+
191
222
  switch (*parser->cursor) {
192
223
  case '*':
193
224
  case '+':
@@ -203,8 +234,10 @@ pm_regexp_parse_quantifier(pm_regexp_parser_t *parser) {
203
234
  }
204
235
  }
205
236
 
206
- // match-posix-class : '[' '[' ':' '^'? CHAR+ ':' ']' ']'
207
- // ;
237
+ /**
238
+ * match-posix-class : '[' '[' ':' '^'? CHAR+ ':' ']' ']'
239
+ * ;
240
+ */
208
241
  static bool
209
242
  pm_regexp_parse_posix_class(pm_regexp_parser_t *parser) {
210
243
  if (!pm_regexp_char_expect(parser, ':')) {
@@ -224,8 +257,10 @@ pm_regexp_parse_posix_class(pm_regexp_parser_t *parser) {
224
257
  static bool
225
258
  pm_regexp_parse_lbracket(pm_regexp_parser_t *parser);
226
259
 
227
- // match-char-set : '[' '^'? (match-range | match-char)* ']'
228
- // ;
260
+ /**
261
+ * match-char-set : '[' '^'? (match-range | match-char)* ']'
262
+ * ;
263
+ */
229
264
  static bool
230
265
  pm_regexp_parse_character_set(pm_regexp_parser_t *parser) {
231
266
  pm_regexp_char_accept(parser, '^');
@@ -249,7 +284,9 @@ pm_regexp_parse_character_set(pm_regexp_parser_t *parser) {
249
284
  return pm_regexp_char_expect(parser, ']');
250
285
  }
251
286
 
252
- // A left bracket can either mean a POSIX class or a character set.
287
+ /**
288
+ * A left bracket can either mean a POSIX class or a character set.
289
+ */
253
290
  static bool
254
291
  pm_regexp_parse_lbracket(pm_regexp_parser_t *parser) {
255
292
  const uint8_t *reset = parser->cursor;
@@ -269,8 +306,10 @@ pm_regexp_parse_lbracket(pm_regexp_parser_t *parser) {
269
306
  static bool
270
307
  pm_regexp_parse_expression(pm_regexp_parser_t *parser);
271
308
 
272
- // These are the states of the options that are configurable on the regular
273
- // expression (or from within a group).
309
+ /**
310
+ * These are the states of the options that are configurable on the regular
311
+ * expression (or from within a group).
312
+ */
274
313
  typedef enum {
275
314
  PM_REGEXP_OPTION_STATE_INVALID,
276
315
  PM_REGEXP_OPTION_STATE_TOGGLEABLE,
@@ -281,16 +320,22 @@ typedef enum {
281
320
 
282
321
  // These are the options that are configurable on the regular expression (or
283
322
  // from within a group).
323
+
284
324
  #define PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM 'a'
285
325
  #define PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM 'x'
286
326
  #define PRISM_REGEXP_OPTION_STATE_SLOTS (PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM + 1)
287
327
 
288
- // This is the set of options that are configurable on the regular expression.
328
+ /**
329
+ * This is the set of options that are configurable on the regular expression.
330
+ */
289
331
  typedef struct {
332
+ /** The current state of each option. */
290
333
  uint8_t values[PRISM_REGEXP_OPTION_STATE_SLOTS];
291
334
  } pm_regexp_options_t;
292
335
 
293
- // Initialize a new set of options to their default values.
336
+ /**
337
+ * Initialize a new set of options to their default values.
338
+ */
294
339
  static void
295
340
  pm_regexp_options_init(pm_regexp_options_t *options) {
296
341
  memset(options, PM_REGEXP_OPTION_STATE_INVALID, sizeof(uint8_t) * PRISM_REGEXP_OPTION_STATE_SLOTS);
@@ -302,8 +347,10 @@ pm_regexp_options_init(pm_regexp_options_t *options) {
302
347
  options->values['u' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
303
348
  }
304
349
 
305
- // Attempt to add the given option to the set of options. Returns true if it was
306
- // added, false if it was already present.
350
+ /**
351
+ * Attempt to add the given option to the set of options. Returns true if it was
352
+ * added, false if it was already present.
353
+ */
307
354
  static bool
308
355
  pm_regexp_options_add(pm_regexp_options_t *options, uint8_t key) {
309
356
  if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
@@ -325,8 +372,10 @@ pm_regexp_options_add(pm_regexp_options_t *options, uint8_t key) {
325
372
  return false;
326
373
  }
327
374
 
328
- // Attempt to remove the given option from the set of options. Returns true if
329
- // it was removed, false if it was already absent.
375
+ /**
376
+ * Attempt to remove the given option from the set of options. Returns true if
377
+ * it was removed, false if it was already absent.
378
+ */
330
379
  static bool
331
380
  pm_regexp_options_remove(pm_regexp_options_t *options, uint8_t key) {
332
381
  if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
@@ -347,26 +396,27 @@ pm_regexp_options_remove(pm_regexp_options_t *options, uint8_t key) {
347
396
  return false;
348
397
  }
349
398
 
350
- // Groups can have quite a few different patterns for syntax. They basically
351
- // just wrap a set of expressions, but they can potentially have options after a
352
- // question mark. If there _isn't_ a question mark, then it's just a set of
353
- // expressions. If there _is_, then here are the options:
354
- //
355
- // * (?#...) - inline comments
356
- // * (?:subexp) - non-capturing group
357
- // * (?=subexp) - positive lookahead
358
- // * (?!subexp) - negative lookahead
359
- // * (?>subexp) - atomic group
360
- // * (?~subexp) - absence operator
361
- // * (?<=subexp) - positive lookbehind
362
- // * (?<!subexp) - negative lookbehind
363
- // * (?<name>subexp) - named capturing group
364
- // * (?'name'subexp) - named capturing group
365
- // * (?(cond)yes-subexp) - conditional expression
366
- // * (?(cond)yes-subexp|no-subexp) - conditional expression
367
- // * (?imxdau-imx) - turn on and off configuration
368
- // * (?imxdau-imx:subexp) - turn on and off configuration for an expression
369
- //
399
+ /**
400
+ * Groups can have quite a few different patterns for syntax. They basically
401
+ * just wrap a set of expressions, but they can potentially have options after a
402
+ * question mark. If there _isn't_ a question mark, then it's just a set of
403
+ * expressions. If there _is_, then here are the options:
404
+ *
405
+ * * (?#...) - inline comments
406
+ * * (?:subexp) - non-capturing group
407
+ * * (?=subexp) - positive lookahead
408
+ * * (?!subexp) - negative lookahead
409
+ * * (?>subexp) - atomic group
410
+ * * (?~subexp) - absence operator
411
+ * * (?<=subexp) - positive lookbehind
412
+ * * (?<!subexp) - negative lookbehind
413
+ * * (?<name>subexp) - named capturing group
414
+ * * (?'name'subexp) - named capturing group
415
+ * * (?(cond)yes-subexp) - conditional expression
416
+ * * (?(cond)yes-subexp|no-subexp) - conditional expression
417
+ * * (?imxdau-imx) - turn on and off configuration
418
+ * * (?imxdau-imx:subexp) - turn on and off configuration for an expression
419
+ */
370
420
  static bool
371
421
  pm_regexp_parse_group(pm_regexp_parser_t *parser) {
372
422
  // First, parse any options for the group.
@@ -501,16 +551,18 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) {
501
551
  return pm_regexp_char_expect(parser, ')');
502
552
  }
503
553
 
504
- // item : anchor
505
- // | match-posix-class
506
- // | match-char-set
507
- // | match-char-class
508
- // | match-char-prop
509
- // | match-char
510
- // | match-any
511
- // | group
512
- // | quantified
513
- // ;
554
+ /**
555
+ * item : anchor
556
+ * | match-posix-class
557
+ * | match-char-set
558
+ * | match-char-class
559
+ * | match-char-prop
560
+ * | match-char
561
+ * | match-any
562
+ * | group
563
+ * | quantified
564
+ * ;
565
+ */
514
566
  static bool
515
567
  pm_regexp_parse_item(pm_regexp_parser_t *parser) {
516
568
  switch (*parser->cursor++) {
@@ -531,8 +583,10 @@ pm_regexp_parse_item(pm_regexp_parser_t *parser) {
531
583
  }
532
584
  }
533
585
 
534
- // expression : item+
535
- // ;
586
+ /**
587
+ * expression : item+
588
+ * ;
589
+ */
536
590
  static bool
537
591
  pm_regexp_parse_expression(pm_regexp_parser_t *parser) {
538
592
  if (!pm_regexp_parse_item(parser)) {
@@ -548,10 +602,12 @@ pm_regexp_parse_expression(pm_regexp_parser_t *parser) {
548
602
  return true;
549
603
  }
550
604
 
551
- // pattern : EOF
552
- // | expression EOF
553
- // | expression '|' pattern
554
- // ;
605
+ /**
606
+ * pattern : EOF
607
+ * | expression EOF
608
+ * | expression '|' pattern
609
+ * ;
610
+ */
555
611
  static bool
556
612
  pm_regexp_parse_pattern(pm_regexp_parser_t *parser) {
557
613
  return (
@@ -570,8 +626,10 @@ pm_regexp_parse_pattern(pm_regexp_parser_t *parser) {
570
626
  );
571
627
  }
572
628
 
573
- // Parse a regular expression and extract the names of all of the named capture
574
- // groups.
629
+ /**
630
+ * Parse a regular expression and extract the names of all of the named capture
631
+ * groups.
632
+ */
575
633
  PRISM_EXPORTED_FUNCTION bool
576
634
  pm_regexp_named_capture_group_names(const uint8_t *source, size_t size, pm_string_list_t *named_captures, bool encoding_changed, pm_encoding_t *encoding) {
577
635
  pm_regexp_parser_t parser;