prism 0.29.0 → 0.30.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (67) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +22 -1
  3. data/CONTRIBUTING.md +0 -4
  4. data/README.md +1 -0
  5. data/config.yml +66 -9
  6. data/docs/fuzzing.md +1 -1
  7. data/docs/ripper_translation.md +22 -0
  8. data/ext/prism/api_node.c +30 -12
  9. data/ext/prism/extension.c +107 -372
  10. data/ext/prism/extension.h +1 -1
  11. data/include/prism/ast.h +138 -70
  12. data/include/prism/diagnostic.h +7 -2
  13. data/include/prism/node.h +0 -21
  14. data/include/prism/parser.h +23 -25
  15. data/include/prism/regexp.h +17 -8
  16. data/include/prism/static_literals.h +3 -2
  17. data/include/prism/util/pm_char.h +1 -2
  18. data/include/prism/util/pm_constant_pool.h +0 -8
  19. data/include/prism/util/pm_integer.h +16 -9
  20. data/include/prism/util/pm_string.h +0 -8
  21. data/include/prism/version.h +2 -2
  22. data/include/prism.h +0 -11
  23. data/lib/prism/compiler.rb +3 -0
  24. data/lib/prism/dispatcher.rb +14 -0
  25. data/lib/prism/dot_visitor.rb +22 -3
  26. data/lib/prism/dsl.rb +7 -2
  27. data/lib/prism/ffi.rb +24 -3
  28. data/lib/prism/inspect_visitor.rb +10 -8
  29. data/lib/prism/mutation_compiler.rb +6 -1
  30. data/lib/prism/node.rb +166 -241
  31. data/lib/prism/node_ext.rb +21 -5
  32. data/lib/prism/parse_result/comments.rb +0 -7
  33. data/lib/prism/parse_result/newlines.rb +101 -11
  34. data/lib/prism/parse_result.rb +17 -0
  35. data/lib/prism/reflection.rb +3 -1
  36. data/lib/prism/serialize.rb +80 -67
  37. data/lib/prism/translation/parser/compiler.rb +134 -114
  38. data/lib/prism/translation/parser.rb +6 -1
  39. data/lib/prism/translation/ripper.rb +8 -6
  40. data/lib/prism/translation/ruby_parser.rb +23 -5
  41. data/lib/prism/visitor.rb +3 -0
  42. data/lib/prism.rb +0 -4
  43. data/prism.gemspec +1 -4
  44. data/rbi/prism/node.rbi +63 -6
  45. data/rbi/prism/visitor.rbi +3 -0
  46. data/rbi/prism.rbi +6 -0
  47. data/sig/prism/dsl.rbs +4 -1
  48. data/sig/prism/mutation_compiler.rbs +1 -0
  49. data/sig/prism/node.rbs +28 -4
  50. data/sig/prism/visitor.rbs +1 -0
  51. data/sig/prism.rbs +21 -0
  52. data/src/diagnostic.c +27 -17
  53. data/src/node.c +408 -1666
  54. data/src/prettyprint.c +49 -6
  55. data/src/prism.c +958 -991
  56. data/src/regexp.c +133 -68
  57. data/src/serialize.c +6 -1
  58. data/src/static_literals.c +63 -84
  59. data/src/token_type.c +2 -2
  60. data/src/util/pm_constant_pool.c +0 -8
  61. data/src/util/pm_integer.c +39 -11
  62. data/src/util/pm_string.c +0 -12
  63. data/src/util/pm_strpbrk.c +32 -6
  64. metadata +2 -5
  65. data/include/prism/util/pm_string_list.h +0 -44
  66. data/lib/prism/debug.rb +0 -249
  67. data/src/util/pm_string_list.c +0 -28
data/src/regexp.c CHANGED
@@ -1,9 +1,14 @@
1
1
  #include "prism/regexp.h"
2
2
 
3
+ #define PM_REGEXP_PARSE_DEPTH_MAX 4096
4
+
3
5
  /**
4
6
  * This is the parser that is going to handle parsing regular expressions.
5
7
  */
6
8
  typedef struct {
9
+ /** The parser that is currently being used. */
10
+ pm_parser_t *parser;
11
+
7
12
  /** A pointer to the start of the source that we are parsing. */
8
13
  const uint8_t *start;
9
14
 
@@ -13,39 +18,42 @@ typedef struct {
13
18
  /** A pointer to the end of the source that we are parsing. */
14
19
  const uint8_t *end;
15
20
 
16
- /** A list of named captures that we've found. */
17
- pm_string_list_t *named_captures;
18
-
19
21
  /** Whether the encoding has changed from the default. */
20
22
  bool encoding_changed;
21
23
 
22
24
  /** The encoding of the source. */
23
25
  const pm_encoding_t *encoding;
26
+
27
+ /** The callback to call when a named capture group is found. */
28
+ pm_regexp_name_callback_t name_callback;
29
+
30
+ /** The data to pass to the name callback. */
31
+ void *name_data;
32
+
33
+ /** The callback to call when a parse error is found. */
34
+ pm_regexp_error_callback_t error_callback;
35
+
36
+ /** The data to pass to the error callback. */
37
+ void *error_data;
24
38
  } pm_regexp_parser_t;
25
39
 
26
40
  /**
27
- * This initializes a new parser with the given source.
41
+ * Append an error to the parser.
28
42
  */
29
- static void
30
- pm_regexp_parser_init(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end, pm_string_list_t *named_captures, bool encoding_changed, const pm_encoding_t *encoding) {
31
- *parser = (pm_regexp_parser_t) {
32
- .start = start,
33
- .cursor = start,
34
- .end = end,
35
- .named_captures = named_captures,
36
- .encoding_changed = encoding_changed,
37
- .encoding = encoding
38
- };
43
+ static inline void
44
+ pm_regexp_parse_error(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end, const char *message) {
45
+ parser->error_callback(start, end, message, parser->error_data);
39
46
  }
40
47
 
41
48
  /**
42
- * This appends a new string to the list of named captures.
49
+ * This appends a new string to the list of named captures. This function
50
+ * assumes the caller has already checked the validity of the name callback.
43
51
  */
44
52
  static void
45
53
  pm_regexp_parser_named_capture(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end) {
46
54
  pm_string_t string;
47
55
  pm_string_shared_init(&string, start, end);
48
- pm_string_list_append(parser->named_captures, &string);
56
+ parser->name_callback(&string, parser->name_data);
49
57
  pm_string_free(&string);
50
58
  }
51
59
 
@@ -217,21 +225,24 @@ pm_regexp_parse_range_quantifier(pm_regexp_parser_t *parser) {
217
225
  */
218
226
  static bool
219
227
  pm_regexp_parse_quantifier(pm_regexp_parser_t *parser) {
220
- if (pm_regexp_char_is_eof(parser)) return true;
221
-
222
- switch (*parser->cursor) {
223
- case '*':
224
- case '+':
225
- case '?':
226
- parser->cursor++;
227
- return true;
228
- case '{':
229
- parser->cursor++;
230
- return pm_regexp_parse_range_quantifier(parser);
231
- default:
232
- // In this case there is no quantifier.
233
- return true;
228
+ while (!pm_regexp_char_is_eof(parser)) {
229
+ switch (*parser->cursor) {
230
+ case '*':
231
+ case '+':
232
+ case '?':
233
+ parser->cursor++;
234
+ break;
235
+ case '{':
236
+ parser->cursor++;
237
+ if (!pm_regexp_parse_range_quantifier(parser)) return false;
238
+ break;
239
+ default:
240
+ // In this case there is no quantifier.
241
+ return true;
242
+ }
234
243
  }
244
+
245
+ return true;
235
246
  }
236
247
 
237
248
  /**
@@ -255,20 +266,20 @@ pm_regexp_parse_posix_class(pm_regexp_parser_t *parser) {
255
266
 
256
267
  // Forward declaration because character sets can be nested.
257
268
  static bool
258
- pm_regexp_parse_lbracket(pm_regexp_parser_t *parser);
269
+ pm_regexp_parse_lbracket(pm_regexp_parser_t *parser, uint16_t depth);
259
270
 
260
271
  /**
261
272
  * match-char-set : '[' '^'? (match-range | match-char)* ']'
262
273
  * ;
263
274
  */
264
275
  static bool
265
- pm_regexp_parse_character_set(pm_regexp_parser_t *parser) {
276
+ pm_regexp_parse_character_set(pm_regexp_parser_t *parser, uint16_t depth) {
266
277
  pm_regexp_char_accept(parser, '^');
267
278
 
268
279
  while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ']') {
269
280
  switch (*parser->cursor++) {
270
281
  case '[':
271
- pm_regexp_parse_lbracket(parser);
282
+ pm_regexp_parse_lbracket(parser, (uint16_t) (depth + 1));
272
283
  break;
273
284
  case '\\':
274
285
  if (!pm_regexp_char_is_eof(parser)) {
@@ -288,7 +299,18 @@ pm_regexp_parse_character_set(pm_regexp_parser_t *parser) {
288
299
  * A left bracket can either mean a POSIX class or a character set.
289
300
  */
290
301
  static bool
291
- pm_regexp_parse_lbracket(pm_regexp_parser_t *parser) {
302
+ pm_regexp_parse_lbracket(pm_regexp_parser_t *parser, uint16_t depth) {
303
+ if (depth >= PM_REGEXP_PARSE_DEPTH_MAX) {
304
+ pm_regexp_parse_error(parser, parser->start, parser->end, "parse depth limit over");
305
+ return false;
306
+ }
307
+
308
+ if ((parser->cursor < parser->end) && parser->cursor[0] == ']') {
309
+ parser->cursor++;
310
+ pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "empty char-class");
311
+ return true;
312
+ }
313
+
292
314
  const uint8_t *reset = parser->cursor;
293
315
 
294
316
  if ((parser->cursor + 2 < parser->end) && parser->cursor[0] == '[' && parser->cursor[1] == ':') {
@@ -298,13 +320,13 @@ pm_regexp_parse_lbracket(pm_regexp_parser_t *parser) {
298
320
  parser->cursor = reset;
299
321
  }
300
322
 
301
- return pm_regexp_parse_character_set(parser);
323
+ return pm_regexp_parse_character_set(parser, depth);
302
324
  }
303
325
 
304
326
  // Forward declaration here since parsing groups needs to go back up the grammar
305
327
  // to parse expressions within them.
306
328
  static bool
307
- pm_regexp_parse_expression(pm_regexp_parser_t *parser);
329
+ pm_regexp_parse_expression(pm_regexp_parser_t *parser, uint16_t depth);
308
330
 
309
331
  /**
310
332
  * These are the states of the options that are configurable on the regular
@@ -418,17 +440,27 @@ pm_regexp_options_remove(pm_regexp_options_t *options, uint8_t key) {
418
440
  * * (?imxdau-imx:subexp) - turn on and off configuration for an expression
419
441
  */
420
442
  static bool
421
- pm_regexp_parse_group(pm_regexp_parser_t *parser) {
443
+ pm_regexp_parse_group(pm_regexp_parser_t *parser, uint16_t depth) {
444
+ const uint8_t *group_start = parser->cursor;
445
+
422
446
  // First, parse any options for the group.
423
447
  if (pm_regexp_char_accept(parser, '?')) {
424
448
  if (pm_regexp_char_is_eof(parser)) {
449
+ pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern in group");
425
450
  return false;
426
451
  }
452
+
427
453
  pm_regexp_options_t options;
428
454
  pm_regexp_options_init(&options);
429
455
 
430
456
  switch (*parser->cursor) {
431
457
  case '#': { // inline comments
458
+ parser->cursor++;
459
+ if (pm_regexp_char_is_eof(parser)) {
460
+ pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern in group");
461
+ return false;
462
+ }
463
+
432
464
  if (parser->encoding_changed && parser->encoding->multibyte) {
433
465
  bool escaped = false;
434
466
 
@@ -472,6 +504,7 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) {
472
504
  case '<':
473
505
  parser->cursor++;
474
506
  if (pm_regexp_char_is_eof(parser)) {
507
+ pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern with unmatched parenthesis");
475
508
  return false;
476
509
  }
477
510
 
@@ -485,7 +518,15 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) {
485
518
  if (!pm_regexp_char_find(parser, '>')) {
486
519
  return false;
487
520
  }
488
- pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
521
+
522
+ if (parser->cursor - start == 1) {
523
+ pm_regexp_parse_error(parser, start, parser->cursor, "group name is empty");
524
+ }
525
+
526
+ if (parser->name_callback != NULL) {
527
+ pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
528
+ }
529
+
489
530
  break;
490
531
  }
491
532
  }
@@ -496,7 +537,10 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) {
496
537
  return false;
497
538
  }
498
539
 
499
- pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
540
+ if (parser->name_callback != NULL) {
541
+ pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
542
+ }
543
+
500
544
  break;
501
545
  }
502
546
  case '(': // conditional expression
@@ -535,20 +579,25 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) {
535
579
  }
536
580
  break;
537
581
  default:
538
- return false;
582
+ parser->cursor++;
583
+ pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "undefined group option");
584
+ break;
539
585
  }
540
586
  }
541
587
 
542
588
  // Now, parse the expressions within this group.
543
589
  while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')') {
544
- if (!pm_regexp_parse_expression(parser)) {
590
+ if (!pm_regexp_parse_expression(parser, (uint16_t) (depth + 1))) {
545
591
  return false;
546
592
  }
547
593
  pm_regexp_char_accept(parser, '|');
548
594
  }
549
595
 
550
596
  // Finally, make sure we have a closing parenthesis.
551
- return pm_regexp_char_expect(parser, ')');
597
+ if (pm_regexp_char_expect(parser, ')')) return true;
598
+
599
+ pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern with unmatched parenthesis");
600
+ return false;
552
601
  }
553
602
 
554
603
  /**
@@ -564,12 +613,12 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) {
564
613
  * ;
565
614
  */
566
615
  static bool
567
- pm_regexp_parse_item(pm_regexp_parser_t *parser) {
616
+ pm_regexp_parse_item(pm_regexp_parser_t *parser, uint16_t depth) {
568
617
  switch (*parser->cursor) {
569
618
  case '^':
570
619
  case '$':
571
620
  parser->cursor++;
572
- return true;
621
+ return pm_regexp_parse_quantifier(parser);
573
622
  case '\\':
574
623
  parser->cursor++;
575
624
  if (!pm_regexp_char_is_eof(parser)) {
@@ -578,10 +627,20 @@ pm_regexp_parse_item(pm_regexp_parser_t *parser) {
578
627
  return pm_regexp_parse_quantifier(parser);
579
628
  case '(':
580
629
  parser->cursor++;
581
- return pm_regexp_parse_group(parser) && pm_regexp_parse_quantifier(parser);
630
+ return pm_regexp_parse_group(parser, depth) && pm_regexp_parse_quantifier(parser);
582
631
  case '[':
583
632
  parser->cursor++;
584
- return pm_regexp_parse_lbracket(parser) && pm_regexp_parse_quantifier(parser);
633
+ return pm_regexp_parse_lbracket(parser, depth) && pm_regexp_parse_quantifier(parser);
634
+ case '*':
635
+ case '?':
636
+ case '+':
637
+ parser->cursor++;
638
+ pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "target of repeat operator is not specified");
639
+ return true;
640
+ case ')':
641
+ parser->cursor++;
642
+ pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "unmatched close parenthesis");
643
+ return true;
585
644
  default: {
586
645
  size_t width;
587
646
  if (!parser->encoding_changed) {
@@ -603,13 +662,18 @@ pm_regexp_parse_item(pm_regexp_parser_t *parser) {
603
662
  * ;
604
663
  */
605
664
  static bool
606
- pm_regexp_parse_expression(pm_regexp_parser_t *parser) {
607
- if (!pm_regexp_parse_item(parser)) {
665
+ pm_regexp_parse_expression(pm_regexp_parser_t *parser, uint16_t depth) {
666
+ if (depth >= PM_REGEXP_PARSE_DEPTH_MAX) {
667
+ pm_regexp_parse_error(parser, parser->start, parser->end, "parse depth limit over");
668
+ return false;
669
+ }
670
+
671
+ if (!pm_regexp_parse_item(parser, depth)) {
608
672
  return false;
609
673
  }
610
674
 
611
675
  while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')' && *parser->cursor != '|') {
612
- if (!pm_regexp_parse_item(parser)) {
676
+ if (!pm_regexp_parse_item(parser, depth)) {
613
677
  return false;
614
678
  }
615
679
  }
@@ -625,29 +689,30 @@ pm_regexp_parse_expression(pm_regexp_parser_t *parser) {
625
689
  */
626
690
  static bool
627
691
  pm_regexp_parse_pattern(pm_regexp_parser_t *parser) {
628
- return (
629
- (
630
- // Exit early if the pattern is empty.
631
- pm_regexp_char_is_eof(parser) ||
632
- // Parse the first expression in the pattern.
633
- pm_regexp_parse_expression(parser)
634
- ) &&
635
- (
636
- // Return now if we've parsed the entire pattern.
637
- pm_regexp_char_is_eof(parser) ||
638
- // Otherwise, we should have a pipe character.
639
- (pm_regexp_char_expect(parser, '|') && pm_regexp_parse_pattern(parser))
640
- )
641
- );
692
+ do {
693
+ if (pm_regexp_char_is_eof(parser)) return true;
694
+ if (!pm_regexp_parse_expression(parser, 0)) return false;
695
+ } while (pm_regexp_char_accept(parser, '|'));
696
+
697
+ return pm_regexp_char_is_eof(parser);
642
698
  }
643
699
 
644
700
  /**
645
701
  * Parse a regular expression and extract the names of all of the named capture
646
702
  * groups.
647
703
  */
648
- PRISM_EXPORTED_FUNCTION bool
649
- pm_regexp_named_capture_group_names(const uint8_t *source, size_t size, pm_string_list_t *named_captures, bool encoding_changed, const pm_encoding_t *encoding) {
650
- pm_regexp_parser_t parser;
651
- pm_regexp_parser_init(&parser, source, source + size, named_captures, encoding_changed, encoding);
652
- return pm_regexp_parse_pattern(&parser);
704
+ PRISM_EXPORTED_FUNCTION void
705
+ pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, pm_regexp_name_callback_t name_callback, void *name_data, pm_regexp_error_callback_t error_callback, void *error_data) {
706
+ pm_regexp_parse_pattern(&(pm_regexp_parser_t) {
707
+ .parser = parser,
708
+ .start = source,
709
+ .cursor = source,
710
+ .end = source + size,
711
+ .encoding_changed = parser->encoding_changed,
712
+ .encoding = parser->encoding,
713
+ .name_callback = name_callback,
714
+ .name_data = name_data,
715
+ .error_callback = error_callback,
716
+ .error_data = error_data
717
+ });
653
718
  }
data/src/serialize.c CHANGED
@@ -1198,6 +1198,9 @@ pm_serialize_node(pm_parser_t *parser, pm_node_t *node, pm_buffer_t *buffer) {
1198
1198
  pm_serialize_location(parser, &((pm_interpolated_x_string_node_t *)node)->closing_loc, buffer);
1199
1199
  break;
1200
1200
  }
1201
+ case PM_IT_LOCAL_VARIABLE_READ_NODE: {
1202
+ break;
1203
+ }
1201
1204
  case PM_IT_PARAMETERS_NODE: {
1202
1205
  break;
1203
1206
  }
@@ -1550,7 +1553,9 @@ pm_serialize_node(pm_parser_t *parser, pm_node_t *node, pm_buffer_t *buffer) {
1550
1553
  break;
1551
1554
  }
1552
1555
  case PM_RATIONAL_NODE: {
1553
- pm_serialize_node(parser, (pm_node_t *)((pm_rational_node_t *)node)->numeric, buffer);
1556
+ pm_buffer_append_varuint(buffer, (uint32_t)(node->flags & ~PM_NODE_FLAG_COMMON_MASK));
1557
+ pm_serialize_integer(&((pm_rational_node_t *)node)->numerator, buffer);
1558
+ pm_serialize_integer(&((pm_rational_node_t *)node)->denominator, buffer);
1554
1559
  break;
1555
1560
  }
1556
1561
  case PM_REDO_NODE: {
data/src/static_literals.c CHANGED
@@ -58,6 +58,25 @@ murmur_hash(const uint8_t *key, size_t length) {
58
58
  return hash;
59
59
  }
60
60
 
61
+ /**
62
+ * Hash the value of an integer and return it.
63
+ */
64
+ static uint32_t
65
+ integer_hash(const pm_integer_t *integer) {
66
+ uint32_t hash;
67
+ if (integer->values) {
68
+ hash = murmur_hash((const uint8_t *) integer->values, sizeof(uint32_t) * integer->length);
69
+ } else {
70
+ hash = murmur_hash((const uint8_t *) &integer->value, sizeof(uint32_t));
71
+ }
72
+
73
+ if (integer->negative) {
74
+ hash ^= murmur_scramble((uint32_t) 1);
75
+ }
76
+
77
+ return hash;
78
+ }
79
+
61
80
  /**
62
81
  * Return the hash of the given node. It is important that nodes that have
63
82
  * equivalent static literal values have the same hash. This is because we use
@@ -68,19 +87,8 @@ node_hash(const pm_static_literals_metadata_t *metadata, const pm_node_t *node)
68
87
  switch (PM_NODE_TYPE(node)) {
69
88
  case PM_INTEGER_NODE: {
70
89
  // Integers hash their value.
71
- const pm_integer_t *integer = &((const pm_integer_node_t *) node)->value;
72
- uint32_t hash;
73
- if (integer->values) {
74
- hash = murmur_hash((const uint8_t *) integer->values, sizeof(uint32_t) * integer->length);
75
- } else {
76
- hash = murmur_hash((const uint8_t *) &integer->value, sizeof(uint32_t));
77
- }
78
-
79
- if (integer->negative) {
80
- hash ^= murmur_scramble((uint32_t) 1);
81
- }
82
-
83
- return hash;
90
+ const pm_integer_node_t *cast = (const pm_integer_node_t *) node;
91
+ return integer_hash(&cast->value);
84
92
  }
85
93
  case PM_SOURCE_LINE_NODE: {
86
94
  // Source lines hash their line number.
@@ -94,11 +102,9 @@ node_hash(const pm_static_literals_metadata_t *metadata, const pm_node_t *node)
94
102
  return murmur_hash((const uint8_t *) value, sizeof(double));
95
103
  }
96
104
  case PM_RATIONAL_NODE: {
97
- // Rationals hash their numeric value. Because their numeric value
98
- // is stored as a subnode, we hash that node and then mix in the
99
- // fact that this is a rational node.
100
- const pm_node_t *numeric = ((const pm_rational_node_t *) node)->numeric;
101
- return node_hash(metadata, numeric) ^ murmur_scramble((uint32_t) node->type);
105
+ // Rationals hash their numerator and denominator.
106
+ const pm_rational_node_t *cast = (const pm_rational_node_t *) node;
107
+ return integer_hash(&cast->numerator) ^ integer_hash(&cast->denominator) ^ murmur_scramble((uint32_t) cast->base.type);
102
108
  }
103
109
  case PM_IMAGINARY_NODE: {
104
110
  // Imaginaries hash their numeric value. Because their numeric value
@@ -148,7 +154,7 @@ node_hash(const pm_static_literals_metadata_t *metadata, const pm_node_t *node)
148
154
  * and must be able to compare all node types that will be stored in this hash.
149
155
  */
150
156
  static pm_node_t *
151
- pm_node_hash_insert(pm_node_hash_t *hash, const pm_static_literals_metadata_t *metadata, pm_node_t *node, int (*compare)(const pm_static_literals_metadata_t *metadata, const pm_node_t *left, const pm_node_t *right)) {
157
+ pm_node_hash_insert(pm_node_hash_t *hash, const pm_static_literals_metadata_t *metadata, pm_node_t *node, bool replace, int (*compare)(const pm_static_literals_metadata_t *metadata, const pm_node_t *left, const pm_node_t *right)) {
152
158
  // If we are out of space, we need to resize the hash. This will cause all
153
159
  // of the nodes to be rehashed and reinserted into the new hash.
154
160
  if (hash->size * 2 >= hash->capacity) {
@@ -196,9 +202,14 @@ pm_node_hash_insert(pm_node_hash_t *hash, const pm_static_literals_metadata_t *m
196
202
  // already in the hash. Otherwise, we can just increment the size and insert
197
203
  // the new node.
198
204
  pm_node_t *result = hash->nodes[index];
199
- if (result == NULL) hash->size++;
200
205
 
201
- hash->nodes[index] = node;
206
+ if (result == NULL) {
207
+ hash->size++;
208
+ hash->nodes[index] = node;
209
+ } else if (replace) {
210
+ hash->nodes[index] = node;
211
+ }
212
+
202
213
  return result;
203
214
  }
204
215
 
@@ -275,8 +286,15 @@ pm_compare_number_nodes(const pm_static_literals_metadata_t *metadata, const pm_
275
286
  switch (PM_NODE_TYPE(left)) {
276
287
  case PM_IMAGINARY_NODE:
277
288
  return pm_compare_number_nodes(metadata, ((const pm_imaginary_node_t *) left)->numeric, ((const pm_imaginary_node_t *) right)->numeric);
278
- case PM_RATIONAL_NODE:
279
- return pm_compare_number_nodes(metadata, ((const pm_rational_node_t *) left)->numeric, ((const pm_rational_node_t *) right)->numeric);
289
+ case PM_RATIONAL_NODE: {
290
+ const pm_rational_node_t *left_rational = (const pm_rational_node_t *) left;
291
+ const pm_rational_node_t *right_rational = (const pm_rational_node_t *) right;
292
+
293
+ int result = pm_integer_compare(&left_rational->denominator, &right_rational->denominator);
294
+ if (result != 0) return result;
295
+
296
+ return pm_integer_compare(&left_rational->numerator, &right_rational->numerator);
297
+ }
280
298
  case PM_INTEGER_NODE:
281
299
  return pm_compare_integer_nodes(metadata, left, right);
282
300
  case PM_FLOAT_NODE:
@@ -335,7 +353,7 @@ pm_compare_regular_expression_nodes(PRISM_ATTRIBUTE_UNUSED const pm_static_liter
335
353
  * Add a node to the set of static literals.
336
354
  */
337
355
  pm_node_t *
338
- pm_static_literals_add(const pm_newline_list_t *newline_list, int32_t start_line, pm_static_literals_t *literals, pm_node_t *node) {
356
+ pm_static_literals_add(const pm_newline_list_t *newline_list, int32_t start_line, pm_static_literals_t *literals, pm_node_t *node, bool replace) {
339
357
  switch (PM_NODE_TYPE(node)) {
340
358
  case PM_INTEGER_NODE:
341
359
  case PM_SOURCE_LINE_NODE:
@@ -347,6 +365,7 @@ pm_static_literals_add(const pm_newline_list_t *newline_list, int32_t start_line
347
365
  .encoding_name = NULL
348
366
  },
349
367
  node,
368
+ replace,
350
369
  pm_compare_integer_nodes
351
370
  );
352
371
  case PM_FLOAT_NODE:
@@ -358,6 +377,7 @@ pm_static_literals_add(const pm_newline_list_t *newline_list, int32_t start_line
358
377
  .encoding_name = NULL
359
378
  },
360
379
  node,
380
+ replace,
361
381
  pm_compare_float_nodes
362
382
  );
363
383
  case PM_RATIONAL_NODE:
@@ -370,6 +390,7 @@ pm_static_literals_add(const pm_newline_list_t *newline_list, int32_t start_line
370
390
  .encoding_name = NULL
371
391
  },
372
392
  node,
393
+ replace,
373
394
  pm_compare_number_nodes
374
395
  );
375
396
  case PM_STRING_NODE:
@@ -382,6 +403,7 @@ pm_static_literals_add(const pm_newline_list_t *newline_list, int32_t start_line
382
403
  .encoding_name = NULL
383
404
  },
384
405
  node,
406
+ replace,
385
407
  pm_compare_string_nodes
386
408
  );
387
409
  case PM_REGULAR_EXPRESSION_NODE:
@@ -393,6 +415,7 @@ pm_static_literals_add(const pm_newline_list_t *newline_list, int32_t start_line
393
415
  .encoding_name = NULL
394
416
  },
395
417
  node,
418
+ replace,
396
419
  pm_compare_regular_expression_nodes
397
420
  );
398
421
  case PM_SYMBOL_NODE:
@@ -404,26 +427,27 @@ pm_static_literals_add(const pm_newline_list_t *newline_list, int32_t start_line
404
427
  .encoding_name = NULL
405
428
  },
406
429
  node,
430
+ replace,
407
431
  pm_compare_string_nodes
408
432
  );
409
433
  case PM_TRUE_NODE: {
410
434
  pm_node_t *duplicated = literals->true_node;
411
- literals->true_node = node;
435
+ if ((duplicated == NULL) || replace) literals->true_node = node;
412
436
  return duplicated;
413
437
  }
414
438
  case PM_FALSE_NODE: {
415
439
  pm_node_t *duplicated = literals->false_node;
416
- literals->false_node = node;
440
+ if ((duplicated == NULL) || replace) literals->false_node = node;
417
441
  return duplicated;
418
442
  }
419
443
  case PM_NIL_NODE: {
420
444
  pm_node_t *duplicated = literals->nil_node;
421
- literals->nil_node = node;
445
+ if ((duplicated == NULL) || replace) literals->nil_node = node;
422
446
  return duplicated;
423
447
  }
424
448
  case PM_SOURCE_ENCODING_NODE: {
425
449
  pm_node_t *duplicated = literals->source_encoding_node;
426
- literals->source_encoding_node = node;
450
+ if ((duplicated == NULL) || replace) literals->source_encoding_node = node;
427
451
  return duplicated;
428
452
  }
429
453
  default:
@@ -456,7 +480,7 @@ pm_static_literal_positive_p(const pm_node_t *node) {
456
480
  case PM_INTEGER_NODE:
457
481
  return !((const pm_integer_node_t *) node)->value.negative;
458
482
  case PM_RATIONAL_NODE:
459
- return pm_static_literal_positive_p(((const pm_rational_node_t *) node)->numeric);
483
+ return !((const pm_rational_node_t *) node)->numerator.negative;
460
484
  case PM_IMAGINARY_NODE:
461
485
  return pm_static_literal_positive_p(((const pm_imaginary_node_t *) node)->numeric);
462
486
  default:
@@ -465,43 +489,6 @@ pm_static_literal_positive_p(const pm_node_t *node) {
465
489
  }
466
490
  }
467
491
 
468
- /**
469
- * Inspect a rational node that wraps a float node. This is going to be a
470
- * poor-man's version of the Ruby `Rational#to_s` method, because we're not
471
- * going to try to reduce the rational by finding the GCD. We'll leave that for
472
- * a future improvement.
473
- */
474
- static void
475
- pm_rational_inspect(pm_buffer_t *buffer, pm_rational_node_t *node) {
476
- const uint8_t *start = node->base.location.start;
477
- const uint8_t *end = node->base.location.end - 1; // r
478
-
479
- while (start < end && *start == '0') start++; // 0.1 -> .1
480
- while (end > start && end[-1] == '0') end--; // 1.0 -> 1.
481
- size_t length = (size_t) (end - start);
482
-
483
- const uint8_t *point = memchr(start, '.', length);
484
- assert(point && "should have a decimal point");
485
-
486
- uint8_t *digits = malloc(length - 1);
487
- if (digits == NULL) return;
488
-
489
- memcpy(digits, start, (unsigned long) (point - start));
490
- memcpy(digits + (point - start), point + 1, (unsigned long) (end - point - 1));
491
-
492
- pm_integer_t numerator = { 0 };
493
- pm_integer_parse(&numerator, PM_INTEGER_BASE_DECIMAL, digits, digits + length - 1);
494
-
495
- pm_buffer_append_byte(buffer, '(');
496
- pm_integer_string(buffer, &numerator);
497
- pm_buffer_append_string(buffer, "/1", 2);
498
- for (size_t index = 0; index < (size_t) (end - point - 1); index++) pm_buffer_append_byte(buffer, '0');
499
- pm_buffer_append_byte(buffer, ')');
500
-
501
- pm_integer_free(&numerator);
502
- free(digits);
503
- }
504
-
505
492
  /**
506
493
  * Create a string-based representation of the given static literal.
507
494
  */
@@ -544,7 +531,9 @@ pm_static_literal_inspect_node(pm_buffer_t *buffer, const pm_static_literals_met
544
531
  pm_buffer_append_string(buffer, "(0", 2);
545
532
  if (pm_static_literal_positive_p(numeric)) pm_buffer_append_byte(buffer, '+');
546
533
  pm_static_literal_inspect_node(buffer, metadata, numeric);
547
- if (PM_NODE_TYPE_P(numeric, PM_RATIONAL_NODE)) pm_buffer_append_byte(buffer, '*');
534
+ if (PM_NODE_TYPE_P(numeric, PM_RATIONAL_NODE)) {
535
+ pm_buffer_append_byte(buffer, '*');
536
+ }
548
537
  pm_buffer_append_string(buffer, "i)", 2);
549
538
  break;
550
539
  }
@@ -555,22 +544,12 @@ pm_static_literal_inspect_node(pm_buffer_t *buffer, const pm_static_literals_met
555
544
  pm_buffer_append_string(buffer, "nil", 3);
556
545
  break;
557
546
  case PM_RATIONAL_NODE: {
558
- const pm_node_t *numeric = ((const pm_rational_node_t *) node)->numeric;
559
-
560
- switch (PM_NODE_TYPE(numeric)) {
561
- case PM_INTEGER_NODE:
562
- pm_buffer_append_byte(buffer, '(');
563
- pm_static_literal_inspect_node(buffer, metadata, numeric);
564
- pm_buffer_append_string(buffer, "/1)", 3);
565
- break;
566
- case PM_FLOAT_NODE:
567
- pm_rational_inspect(buffer, (pm_rational_node_t *) node);
568
- break;
569
- default:
570
- assert(false && "unreachable");
571
- break;
572
- }
573
-
547
+ const pm_rational_node_t *rational = (const pm_rational_node_t *) node;
548
+ pm_buffer_append_byte(buffer, '(');
549
+ pm_integer_string(buffer, &rational->numerator);
550
+ pm_buffer_append_byte(buffer, '/');
551
+ pm_integer_string(buffer, &rational->denominator);
552
+ pm_buffer_append_byte(buffer, ')');
574
553
  break;
575
554
  }
576
555
  case PM_REGULAR_EXPRESSION_NODE: {
@@ -624,7 +603,7 @@ pm_static_literal_inspect_node(pm_buffer_t *buffer, const pm_static_literals_met
624
603
  /**
625
604
  * Create a string-based representation of the given static literal.
626
605
  */
627
- PRISM_EXPORTED_FUNCTION void
606
+ void
628
607
  pm_static_literal_inspect(pm_buffer_t *buffer, const pm_newline_list_t *newline_list, int32_t start_line, const char *encoding_name, const pm_node_t *node) {
629
608
  pm_static_literal_inspect_node(
630
609
  buffer,