prism 0.22.0 → 0.23.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those versions as they appear in their respective public registries.
data/src/serialize.c CHANGED
@@ -1843,6 +1843,17 @@ pm_serialize_node(pm_parser_t *parser, pm_node_t *node, pm_buffer_t *buffer) {
1843
1843
  }
1844
1844
  }
1845
1845
 
1846
+ static void
1847
+ pm_serialize_newline_list(pm_newline_list_t *list, pm_buffer_t *buffer) {
1848
+ uint32_t size = pm_sizet_to_u32(list->size);
1849
+ pm_buffer_append_varuint(buffer, size);
1850
+
1851
+ for (uint32_t i = 0; i < size; i++) {
1852
+ uint32_t offset = pm_sizet_to_u32(list->offsets[i]);
1853
+ pm_buffer_append_varuint(buffer, offset);
1854
+ }
1855
+ }
1856
+
1846
1857
  static void
1847
1858
  pm_serialize_comment(pm_parser_t *parser, pm_comment_t *comment, pm_buffer_t *buffer) {
1848
1859
  // serialize type
@@ -1929,19 +1940,25 @@ pm_serialize_encoding(const pm_encoding_t *encoding, pm_buffer_t *buffer) {
1929
1940
  pm_buffer_append_string(buffer, encoding->name, encoding_length);
1930
1941
  }
1931
1942
 
1932
- #line 218 "serialize.c.erb"
1933
- /**
1934
- * Serialize the encoding, metadata, nodes, and constant pool.
1935
- */
1936
- void
1937
- pm_serialize_content(pm_parser_t *parser, pm_node_t *node, pm_buffer_t *buffer) {
1943
+ static void
1944
+ pm_serialize_metadata(pm_parser_t *parser, pm_buffer_t *buffer) {
1938
1945
  pm_serialize_encoding(parser->encoding, buffer);
1939
1946
  pm_buffer_append_varsint(buffer, parser->start_line);
1947
+ pm_serialize_newline_list(&parser->newline_list, buffer);
1940
1948
  pm_serialize_comment_list(parser, &parser->comment_list, buffer);
1941
1949
  pm_serialize_magic_comment_list(parser, &parser->magic_comment_list, buffer);
1942
1950
  pm_serialize_data_loc(parser, buffer);
1943
1951
  pm_serialize_diagnostic_list(parser, &parser->error_list, buffer);
1944
1952
  pm_serialize_diagnostic_list(parser, &parser->warning_list, buffer);
1953
+ }
1954
+
1955
+ #line 243 "serialize.c.erb"
1956
+ /**
1957
+ * Serialize the metadata, nodes, and constant pool.
1958
+ */
1959
+ void
1960
+ pm_serialize_content(pm_parser_t *parser, pm_node_t *node, pm_buffer_t *buffer) {
1961
+ pm_serialize_metadata(parser, buffer);
1945
1962
 
1946
1963
  // Here we're going to leave space for the offset of the constant pool in
1947
1964
  // the buffer.
@@ -2032,13 +2049,7 @@ pm_serialize_lex(pm_buffer_t *buffer, const uint8_t *source, size_t size, const
2032
2049
  // Append 0 to mark end of tokens.
2033
2050
  pm_buffer_append_byte(buffer, 0);
2034
2051
 
2035
- pm_serialize_encoding(parser.encoding, buffer);
2036
- pm_buffer_append_varsint(buffer, parser.start_line);
2037
- pm_serialize_comment_list(&parser, &parser.comment_list, buffer);
2038
- pm_serialize_magic_comment_list(&parser, &parser.magic_comment_list, buffer);
2039
- pm_serialize_data_loc(&parser, buffer);
2040
- pm_serialize_diagnostic_list(&parser, &parser.error_list, buffer);
2041
- pm_serialize_diagnostic_list(&parser, &parser.warning_list, buffer);
2052
+ pm_serialize_metadata(&parser, buffer);
2042
2053
 
2043
2054
  pm_node_destroy(&parser, node);
2044
2055
  pm_parser_free(&parser);
data/src/token_type.c CHANGED
@@ -469,7 +469,7 @@ pm_token_type_human(pm_token_type_t token_type) {
469
469
  case PM_TOKEN_HEREDOC_START:
470
470
  return "heredoc beginning";
471
471
  case PM_TOKEN_IDENTIFIER:
472
- return "local variable or method identifier";
472
+ return "local variable or method";
473
473
  case PM_TOKEN_IGNORED_NEWLINE:
474
474
  return "ignored newline";
475
475
  case PM_TOKEN_INSTANCE_VARIABLE:
@@ -579,7 +579,7 @@ pm_token_type_human(pm_token_type_t token_type) {
579
579
  case PM_TOKEN_LABEL:
580
580
  return "label";
581
581
  case PM_TOKEN_LABEL_END:
582
- return "':'";
582
+ return "label terminator";
583
583
  case PM_TOKEN_LAMBDA_BEGIN:
584
584
  return "'{'";
585
585
  case PM_TOKEN_LESS:
@@ -681,7 +681,7 @@ pm_token_type_human(pm_token_type_t token_type) {
681
681
  case PM_TOKEN_UPLUS:
682
682
  return "'+'";
683
683
  case PM_TOKEN_USTAR:
684
- return "'*'";
684
+ return "*";
685
685
  case PM_TOKEN_USTAR_STAR:
686
686
  return "'**'";
687
687
  case PM_TOKEN_WORDS_SEP:
@@ -186,7 +186,7 @@ pm_constant_pool_id_to_constant(const pm_constant_pool_t *pool, pm_constant_id_t
186
186
  * the constant is not found.
187
187
  */
188
188
  pm_constant_id_t
189
- pm_constant_pool_find(pm_constant_pool_t *pool, const uint8_t *start, size_t length) {
189
+ pm_constant_pool_find(const pm_constant_pool_t *pool, const uint8_t *start, size_t length) {
190
190
  assert(is_power_of_two(pool->capacity));
191
191
  const uint32_t mask = pool->capacity - 1;
192
192
 
@@ -1,10 +1,18 @@
1
1
  #include "prism/util/pm_strpbrk.h"
2
2
 
3
3
  /**
4
- * This is the slow path that does care about the encoding.
4
+ * Add an invalid multibyte character error to the parser.
5
+ */
6
+ static inline void
7
+ pm_strpbrk_invalid_multibyte_character(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) {
8
+ pm_diagnostic_list_append_format(&parser->error_list, start, end, PM_ERR_INVALID_MULTIBYTE_CHARACTER, *start);
9
+ }
10
+
11
+ /**
12
+ * This is the default path.
5
13
  */
6
14
  static inline const uint8_t *
7
- pm_strpbrk_multi_byte(const pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum) {
15
+ pm_strpbrk_utf8(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
8
16
  size_t index = 0;
9
17
 
10
18
  while (index < maximum) {
@@ -12,22 +20,39 @@ pm_strpbrk_multi_byte(const pm_parser_t *parser, const uint8_t *source, const ui
12
20
  return source + index;
13
21
  }
14
22
 
15
- size_t width = parser->encoding->char_width(source + index, (ptrdiff_t) (maximum - index));
16
- if (width == 0) {
17
- return NULL;
18
- }
23
+ if (source[index] < 0x80) {
24
+ index++;
25
+ } else {
26
+ size_t width = pm_encoding_utf_8_char_width(source + index, (ptrdiff_t) (maximum - index));
19
27
 
20
- index += width;
28
+ if (width > 0) {
29
+ index += width;
30
+ } else if (!validate) {
31
+ index++;
32
+ } else {
33
+ // At this point we know we have an invalid multibyte character.
34
+ // We'll walk forward as far as we can until we find the next
35
+ // valid character so that we don't spam the user with a ton of
36
+ // the same kind of error.
37
+ const size_t start = index;
38
+
39
+ do {
40
+ index++;
41
+ } while (index < maximum && pm_encoding_utf_8_char_width(source + index, (ptrdiff_t) (maximum - index)) == 0);
42
+
43
+ pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index);
44
+ }
45
+ }
21
46
  }
22
47
 
23
48
  return NULL;
24
49
  }
25
50
 
26
51
  /**
27
- * This is the fast path that does not care about the encoding.
52
+ * This is the path when the encoding is ASCII-8BIT.
28
53
  */
29
54
  static inline const uint8_t *
30
- pm_strpbrk_single_byte(const uint8_t *source, const uint8_t *charset, size_t maximum) {
55
+ pm_strpbrk_ascii_8bit(const uint8_t *source, const uint8_t *charset, size_t maximum) {
31
56
  size_t index = 0;
32
57
 
33
58
  while (index < maximum) {
@@ -41,6 +66,85 @@ pm_strpbrk_single_byte(const uint8_t *source, const uint8_t *charset, size_t max
41
66
  return NULL;
42
67
  }
43
68
 
69
+ /**
70
+ * This is the slow path that does care about the encoding.
71
+ */
72
+ static inline const uint8_t *
73
+ pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
74
+ size_t index = 0;
75
+
76
+ while (index < maximum) {
77
+ if (strchr((const char *) charset, source[index]) != NULL) {
78
+ return source + index;
79
+ }
80
+
81
+ if (source[index] < 0x80) {
82
+ index++;
83
+ } else {
84
+ size_t width = parser->encoding->char_width(source + index, (ptrdiff_t) (maximum - index));
85
+
86
+ if (width > 0) {
87
+ index += width;
88
+ } else if (!validate) {
89
+ index++;
90
+ } else {
91
+ // At this point we know we have an invalid multibyte character.
92
+ // We'll walk forward as far as we can until we find the next
93
+ // valid character so that we don't spam the user with a ton of
94
+ // the same kind of error.
95
+ const size_t start = index;
96
+
97
+ do {
98
+ index++;
99
+ } while (index < maximum && parser->encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0);
100
+
101
+ pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index);
102
+ }
103
+ }
104
+ }
105
+
106
+ return NULL;
107
+ }
108
+
109
+ /**
110
+ * This is the fast path that does not care about the encoding because we know
111
+ * the encoding only supports single-byte characters.
112
+ */
113
+ static inline const uint8_t *
114
+ pm_strpbrk_single_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
115
+ size_t index = 0;
116
+
117
+ while (index < maximum) {
118
+ if (strchr((const char *) charset, source[index]) != NULL) {
119
+ return source + index;
120
+ }
121
+
122
+ if (source[index] < 0x80 || !validate) {
123
+ index++;
124
+ } else {
125
+ size_t width = parser->encoding->char_width(source + index, (ptrdiff_t) (maximum - index));
126
+
127
+ if (width > 0) {
128
+ index += width;
129
+ } else {
130
+ // At this point we know we have an invalid multibyte character.
131
+ // We'll walk forward as far as we can until we find the next
132
+ // valid character so that we don't spam the user with a ton of
133
+ // the same kind of error.
134
+ const size_t start = index;
135
+
136
+ do {
137
+ index++;
138
+ } while (index < maximum && parser->encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0);
139
+
140
+ pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index);
141
+ }
142
+ }
143
+ }
144
+
145
+ return NULL;
146
+ }
147
+
44
148
  /**
45
149
  * Here we have rolled our own version of strpbrk. The standard library strpbrk
46
150
  * has undefined behavior when the source string is not null-terminated. We want
@@ -57,16 +161,20 @@ pm_strpbrk_single_byte(const uint8_t *source, const uint8_t *charset, size_t max
57
161
  *
58
162
  * Finally, we want to support encodings wherein the charset could contain
59
163
  * characters that are trailing bytes of multi-byte characters. For example, in
60
- * Shift-JIS, the backslash character can be a trailing byte. In that case we
164
+ * Shift_JIS, the backslash character can be a trailing byte. In that case we
61
165
  * need to take a slower path and iterate one multi-byte character at a time.
62
166
  */
63
167
  const uint8_t *
64
- pm_strpbrk(const pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length) {
168
+ pm_strpbrk(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length, bool validate) {
65
169
  if (length <= 0) {
66
170
  return NULL;
67
- } else if (parser->encoding_changed && parser->encoding->multibyte) {
68
- return pm_strpbrk_multi_byte(parser, source, charset, (size_t) length);
171
+ } else if (!parser->encoding_changed) {
172
+ return pm_strpbrk_utf8(parser, source, charset, (size_t) length, validate);
173
+ } else if (parser->encoding == PM_ENCODING_ASCII_8BIT_ENTRY) {
174
+ return pm_strpbrk_ascii_8bit(source, charset, (size_t) length);
175
+ } else if (parser->encoding->multibyte) {
176
+ return pm_strpbrk_multi_byte(parser, source, charset, (size_t) length, validate);
69
177
  } else {
70
- return pm_strpbrk_single_byte(source, charset, (size_t) length);
178
+ return pm_strpbrk_single_byte(parser, source, charset, (size_t) length, validate);
71
179
  }
72
180
  }
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: prism
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.22.0
4
+ version: 0.23.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Shopify
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-02-07 00:00:00.000000000 Z
11
+ date: 2024-02-14 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description:
14
14
  email:
@@ -40,6 +40,7 @@ files:
40
40
  - docs/releasing.md
41
41
  - docs/ripper.md
42
42
  - docs/ruby_api.md
43
+ - docs/ruby_parser_translation.md
43
44
  - docs/serialization.md
44
45
  - docs/testing.md
45
46
  - ext/prism/api_node.c
@@ -88,13 +89,14 @@ files:
88
89
  - lib/prism/parse_result/comments.rb
89
90
  - lib/prism/parse_result/newlines.rb
90
91
  - lib/prism/pattern.rb
91
- - lib/prism/ripper_compat.rb
92
92
  - lib/prism/serialize.rb
93
93
  - lib/prism/translation.rb
94
94
  - lib/prism/translation/parser.rb
95
95
  - lib/prism/translation/parser/compiler.rb
96
96
  - lib/prism/translation/parser/lexer.rb
97
97
  - lib/prism/translation/parser/rubocop.rb
98
+ - lib/prism/translation/ripper.rb
99
+ - lib/prism/translation/ruby_parser.rb
98
100
  - lib/prism/visitor.rb
99
101
  - prism.gemspec
100
102
  - rbi/prism.rbi
@@ -144,7 +146,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
144
146
  - !ruby/object:Gem::Version
145
147
  version: '0'
146
148
  requirements: []
147
- rubygems_version: 3.6.0.dev
149
+ rubygems_version: 3.4.1
148
150
  signing_key:
149
151
  specification_version: 4
150
152
  summary: Prism Ruby parser
@@ -1,285 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require "ripper"
4
-
5
- module Prism
6
- # Note: This integration is not finished, and therefore still has many
7
- # inconsistencies with Ripper. If you'd like to help out, pull requests would
8
- # be greatly appreciated!
9
- #
10
- # This class is meant to provide a compatibility layer between prism and
11
- # Ripper. It functions by parsing the entire tree first and then walking it
12
- # and executing each of the Ripper callbacks as it goes.
13
- #
14
- # This class is going to necessarily be slower than the native Ripper API. It
15
- # is meant as a stopgap until developers migrate to using prism. It is also
16
- # meant as a test harness for the prism parser.
17
- #
18
- # To use this class, you treat `Prism::RipperCompat` effectively as you would
19
- # treat the `Ripper` class.
20
- class RipperCompat < Visitor
21
- # This class mirrors the ::Ripper::SexpBuilder subclass of ::Ripper that
22
- # returns the arrays of [type, *children].
23
- class SexpBuilder < RipperCompat
24
- private
25
-
26
- Ripper::PARSER_EVENTS.each do |event|
27
- define_method(:"on_#{event}") do |*args|
28
- [event, *args]
29
- end
30
- end
31
-
32
- Ripper::SCANNER_EVENTS.each do |event|
33
- define_method(:"on_#{event}") do |value|
34
- [:"@#{event}", value, [lineno, column]]
35
- end
36
- end
37
- end
38
-
39
- # This class mirrors the ::Ripper::SexpBuilderPP subclass of ::Ripper that
40
- # returns the same values as ::Ripper::SexpBuilder except with a couple of
41
- # niceties that flatten linked lists into arrays.
42
- class SexpBuilderPP < SexpBuilder
43
- private
44
-
45
- def _dispatch_event_new # :nodoc:
46
- []
47
- end
48
-
49
- def _dispatch_event_push(list, item) # :nodoc:
50
- list << item
51
- list
52
- end
53
-
54
- Ripper::PARSER_EVENT_TABLE.each do |event, arity|
55
- case event
56
- when /_new\z/
57
- alias_method :"on_#{event}", :_dispatch_event_new if arity == 0
58
- when /_add\z/
59
- alias_method :"on_#{event}", :_dispatch_event_push
60
- end
61
- end
62
- end
63
-
64
- # The source that is being parsed.
65
- attr_reader :source
66
-
67
- # The current line number of the parser.
68
- attr_reader :lineno
69
-
70
- # The current column number of the parser.
71
- attr_reader :column
72
-
73
- # Create a new RipperCompat object with the given source.
74
- def initialize(source)
75
- @source = source
76
- @result = nil
77
- @lineno = nil
78
- @column = nil
79
- end
80
-
81
- ############################################################################
82
- # Public interface
83
- ############################################################################
84
-
85
- # True if the parser encountered an error during parsing.
86
- def error?
87
- result.failure?
88
- end
89
-
90
- # Parse the source and return the result.
91
- def parse
92
- result.magic_comments.each do |magic_comment|
93
- on_magic_comment(magic_comment.key, magic_comment.value)
94
- end
95
-
96
- if error?
97
- result.errors.each do |error|
98
- on_parse_error(error.message)
99
- end
100
-
101
- nil
102
- else
103
- result.value.accept(self)
104
- end
105
- end
106
-
107
- ############################################################################
108
- # Visitor methods
109
- ############################################################################
110
-
111
- # Visit an ArrayNode node.
112
- def visit_array_node(node)
113
- elements = visit_elements(node.elements) unless node.elements.empty?
114
- bounds(node.location)
115
- on_array(elements)
116
- end
117
-
118
- # Visit a CallNode node.
119
- def visit_call_node(node)
120
- if node.variable_call?
121
- if node.message.match?(/^[[:alpha:]_]/)
122
- bounds(node.message_loc)
123
- return on_vcall(on_ident(node.message))
124
- end
125
-
126
- raise NotImplementedError, "Non-alpha variable call"
127
- end
128
-
129
- if node.opening_loc.nil?
130
- left = visit(node.receiver)
131
- if node.arguments&.arguments&.length == 1
132
- right = visit(node.arguments.arguments.first)
133
-
134
- on_binary(left, node.name, right)
135
- elsif !node.arguments || node.arguments.empty?
136
- on_unary(node.name, left)
137
- else
138
- raise NotImplementedError, "More than two arguments for operator"
139
- end
140
- else
141
- raise NotImplementedError, "Non-nil opening_loc"
142
- end
143
- end
144
-
145
- # Visit a FloatNode node.
146
- def visit_float_node(node)
147
- visit_number(node) { |text| on_float(text) }
148
- end
149
-
150
- # Visit a ImaginaryNode node.
151
- def visit_imaginary_node(node)
152
- visit_number(node) { |text| on_imaginary(text) }
153
- end
154
-
155
- # Visit an IntegerNode node.
156
- def visit_integer_node(node)
157
- visit_number(node) { |text| on_int(text) }
158
- end
159
-
160
- # Visit a ParenthesesNode node.
161
- def visit_parentheses_node(node)
162
- body =
163
- if node.body.nil?
164
- on_stmts_add(on_stmts_new, on_void_stmt)
165
- else
166
- visit(node.body)
167
- end
168
-
169
- bounds(node.location)
170
- on_paren(body)
171
- end
172
-
173
- # Visit a ProgramNode node.
174
- def visit_program_node(node)
175
- statements = visit(node.statements)
176
- bounds(node.location)
177
- on_program(statements)
178
- end
179
-
180
- # Visit a RangeNode node.
181
- def visit_range_node(node)
182
- left = visit(node.left)
183
- right = visit(node.right)
184
-
185
- bounds(node.location)
186
- if node.exclude_end?
187
- on_dot3(left, right)
188
- else
189
- on_dot2(left, right)
190
- end
191
- end
192
-
193
- # Visit a RationalNode node.
194
- def visit_rational_node(node)
195
- visit_number(node) { |text| on_rational(text) }
196
- end
197
-
198
- # Visit a StatementsNode node.
199
- def visit_statements_node(node)
200
- bounds(node.location)
201
- node.body.inject(on_stmts_new) do |stmts, stmt|
202
- on_stmts_add(stmts, visit(stmt))
203
- end
204
- end
205
-
206
- ############################################################################
207
- # Entrypoints for subclasses
208
- ############################################################################
209
-
210
- # This is a convenience method that runs the SexpBuilder subclass parser.
211
- def self.sexp_raw(source)
212
- SexpBuilder.new(source).parse
213
- end
214
-
215
- # This is a convenience method that runs the SexpBuilderPP subclass parser.
216
- def self.sexp(source)
217
- SexpBuilderPP.new(source).parse
218
- end
219
-
220
- private
221
-
222
- # Visit a list of elements, like the elements of an array or arguments.
223
- def visit_elements(elements)
224
- bounds(elements.first.location)
225
- elements.inject(on_args_new) do |args, element|
226
- on_args_add(args, visit(element))
227
- end
228
- end
229
-
230
- # Visit a node that represents a number. We need to explicitly handle the
231
- # unary - operator.
232
- def visit_number(node)
233
- slice = node.slice
234
- location = node.location
235
-
236
- if slice[0] == "-"
237
- bounds_values(location.start_line, location.start_column + 1)
238
- value = yield slice[1..-1]
239
-
240
- bounds(node.location)
241
- on_unary(RUBY_ENGINE == "jruby" ? :- : :-@, value)
242
- else
243
- bounds(location)
244
- yield slice
245
- end
246
- end
247
-
248
- # This method is responsible for updating lineno and column information
249
- # to reflect the current node.
250
- #
251
- # This method could be drastically improved with some caching on the start
252
- # of every line, but for now it's good enough.
253
- def bounds(location)
254
- @lineno = location.start_line
255
- @column = location.start_column
256
- end
257
-
258
- # If we need to do something unusual, we can directly update the line number
259
- # and column to reflect the current node.
260
- def bounds_values(lineno, column)
261
- @lineno = lineno
262
- @column = column
263
- end
264
-
265
- # Lazily initialize the parse result.
266
- def result
267
- @result ||= Prism.parse(source)
268
- end
269
-
270
- def _dispatch0; end # :nodoc:
271
- def _dispatch1(_); end # :nodoc:
272
- def _dispatch2(_, _); end # :nodoc:
273
- def _dispatch3(_, _, _); end # :nodoc:
274
- def _dispatch4(_, _, _, _); end # :nodoc:
275
- def _dispatch5(_, _, _, _, _); end # :nodoc:
276
- def _dispatch7(_, _, _, _, _, _, _); end # :nodoc:
277
-
278
- alias_method :on_parse_error, :_dispatch1
279
- alias_method :on_magic_comment, :_dispatch2
280
-
281
- (Ripper::SCANNER_EVENT_TABLE.merge(Ripper::PARSER_EVENT_TABLE)).each do |event, arity|
282
- alias_method :"on_#{event}", :"_dispatch#{arity}"
283
- end
284
- end
285
- end