prism 0.22.0 → 0.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -542,8 +542,9 @@ parse_lex_input(pm_string_t *input, const pm_options_t *options, bool return_nod
542
542
  pm_parser_init(&parser, pm_string_source(input), pm_string_length(input), options);
543
543
  pm_parser_register_encoding_changed_callback(&parser, parse_lex_encoding_changed_callback);
544
544
 
545
+ VALUE source_string = rb_str_new((const char *) pm_string_source(input), pm_string_length(input));
545
546
  VALUE offsets = rb_ary_new();
546
- VALUE source_argv[] = { rb_str_new((const char *) pm_string_source(input), pm_string_length(input)), ULONG2NUM(parser.start_line), offsets };
547
+ VALUE source_argv[] = { source_string, LONG2NUM(parser.start_line), offsets };
547
548
  VALUE source = rb_class_new_instance(3, source_argv, rb_cPrismSource);
548
549
 
549
550
  parse_lex_data_t parse_lex_data = {
@@ -561,17 +562,21 @@ parse_lex_input(pm_string_t *input, const pm_options_t *options, bool return_nod
561
562
  parser.lex_callback = &lex_callback;
562
563
  pm_node_t *node = pm_parse(&parser);
563
564
 
564
- // Here we need to update the source range to have the correct newline
565
- // offsets. We do it here because we've already created the object and given
566
- // it over to all of the tokens.
565
+ // Here we need to update the Source object to have the correct
566
+ // encoding for the source string and the correct newline offsets.
567
+ // We do it here because we've already created the Source object and given
568
+ // it over to all of the tokens, and both of these are only set after pm_parse().
569
+ rb_encoding *encoding = rb_enc_find(parser.encoding->name);
570
+ rb_enc_associate(source_string, encoding);
571
+
567
572
  for (size_t index = 0; index < parser.newline_list.size; index++) {
568
- rb_ary_push(offsets, INT2FIX(parser.newline_list.offsets[index]));
573
+ rb_ary_push(offsets, ULONG2NUM(parser.newline_list.offsets[index]));
569
574
  }
570
575
 
571
576
  VALUE value;
572
577
  if (return_nodes) {
573
578
  value = rb_ary_new_capa(2);
574
- rb_ary_push(value, pm_ast_new(&parser, node, parse_lex_data.encoding));
579
+ rb_ary_push(value, pm_ast_new(&parser, node, parse_lex_data.encoding, source));
575
580
  rb_ary_push(value, parse_lex_data.tokens);
576
581
  } else {
577
582
  value = parse_lex_data.tokens;
@@ -650,7 +655,7 @@ parse_input(pm_string_t *input, const pm_options_t *options) {
650
655
 
651
656
  VALUE source = pm_source_new(&parser, encoding);
652
657
  VALUE result_argv[] = {
653
- pm_ast_new(&parser, node, encoding),
658
+ pm_ast_new(&parser, node, encoding, source),
654
659
  parser_comments(&parser, source),
655
660
  parser_magic_comments(&parser, source),
656
661
  parser_data_loc(&parser, source),
@@ -1,7 +1,7 @@
1
1
  #ifndef PRISM_EXT_NODE_H
2
2
  #define PRISM_EXT_NODE_H
3
3
 
4
- #define EXPECTED_PRISM_VERSION "0.22.0"
4
+ #define EXPECTED_PRISM_VERSION "0.23.0"
5
5
 
6
6
  #include <ruby.h>
7
7
  #include <ruby/encoding.h>
@@ -9,7 +9,7 @@
9
9
 
10
10
  VALUE pm_source_new(pm_parser_t *parser, rb_encoding *encoding);
11
11
  VALUE pm_token_new(pm_parser_t *parser, pm_token_t *token, rb_encoding *encoding, VALUE source);
12
- VALUE pm_ast_new(pm_parser_t *parser, pm_node_t *node, rb_encoding *encoding);
12
+ VALUE pm_ast_new(pm_parser_t *parser, pm_node_t *node, rb_encoding *encoding, VALUE source);
13
13
 
14
14
  void Init_prism_api_node(void);
15
15
  void Init_prism_pack(void);
@@ -219,6 +219,7 @@ typedef enum {
219
219
  PM_ERR_MODULE_NAME,
220
220
  PM_ERR_MODULE_TERM,
221
221
  PM_ERR_MULTI_ASSIGN_MULTI_SPLATS,
222
+ PM_ERR_MULTI_ASSIGN_UNEXPECTED_REST,
222
223
  PM_ERR_NOT_EXPRESSION,
223
224
  PM_ERR_NO_LOCAL_VARIABLE,
224
225
  PM_ERR_NUMBER_LITERAL_UNDERSCORE,
@@ -272,6 +273,7 @@ typedef enum {
272
273
  PM_ERR_STATEMENT_UNDEF,
273
274
  PM_ERR_STRING_CONCATENATION,
274
275
  PM_ERR_STRING_INTERPOLATED_TERM,
276
+ PM_ERR_STRING_LITERAL_EOF,
275
277
  PM_ERR_STRING_LITERAL_TERM,
276
278
  PM_ERR_SYMBOL_INVALID,
277
279
  PM_ERR_SYMBOL_TERM_DYNAMIC,
@@ -279,10 +281,7 @@ typedef enum {
279
281
  PM_ERR_TERNARY_COLON,
280
282
  PM_ERR_TERNARY_EXPRESSION_FALSE,
281
283
  PM_ERR_TERNARY_EXPRESSION_TRUE,
282
- PM_ERR_UNARY_RECEIVER_BANG,
283
- PM_ERR_UNARY_RECEIVER_MINUS,
284
- PM_ERR_UNARY_RECEIVER_PLUS,
285
- PM_ERR_UNARY_RECEIVER_TILDE,
284
+ PM_ERR_UNARY_RECEIVER,
286
285
  PM_ERR_UNEXPECTED_TOKEN_CLOSE_CONTEXT,
287
286
  PM_ERR_UNEXPECTED_TOKEN_IGNORE,
288
287
  PM_ERR_UNDEF_ARGUMENT,
@@ -245,6 +245,13 @@ extern const pm_encoding_t pm_encodings[PM_ENCODING_MAXIMUM];
245
245
  */
246
246
  #define PM_ENCODING_US_ASCII_ENTRY (&pm_encodings[PM_ENCODING_US_ASCII])
247
247
 
248
+ /**
249
+ * This is the ASCII-8BIT encoding. We need a reference to it so that pm_strpbrk
250
+ * can compare against it because invalid multibyte characters are not a thing
251
+ * in this encoding.
252
+ */
253
+ #define PM_ENCODING_ASCII_8BIT_ENTRY (&pm_encodings[PM_ENCODING_ASCII_8BIT])
254
+
248
255
  /**
249
256
  * Parse the given name of an encoding and return a pointer to the corresponding
250
257
  * encoding struct if one can be found, otherwise return NULL.
@@ -163,7 +163,7 @@ pm_constant_t * pm_constant_pool_id_to_constant(const pm_constant_pool_t *pool,
163
163
  * @param length The length of the constant.
164
164
  * @return The id of the constant.
165
165
  */
166
- pm_constant_id_t pm_constant_pool_find(pm_constant_pool_t *pool, const uint8_t *start, size_t length);
166
+ pm_constant_id_t pm_constant_pool_find(const pm_constant_pool_t *pool, const uint8_t *start, size_t length);
167
167
 
168
168
  /**
169
169
  * Insert a constant into a constant pool that is a slice of a source string.
@@ -7,6 +7,7 @@
7
7
  #define PRISM_STRPBRK_H
8
8
 
9
9
  #include "prism/defines.h"
10
+ #include "prism/diagnostic.h"
10
11
  #include "prism/parser.h"
11
12
 
12
13
  #include <stddef.h>
@@ -35,9 +36,11 @@
35
36
  * @param source The source to search.
36
37
  * @param charset The charset to search for.
37
38
  * @param length The maximum number of bytes to search.
39
+ * @param validate Whether to validate that the source string is valid in the
40
+ * current encoding of the parser.
38
41
  * @return A pointer to the first character in the source string that is in the
39
42
  * charset, or NULL if no such character exists.
40
43
  */
41
- const uint8_t * pm_strpbrk(const pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length);
44
+ const uint8_t * pm_strpbrk(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length, bool validate);
42
45
 
43
46
  #endif
@@ -14,7 +14,7 @@
14
14
  /**
15
15
  * The minor version of the Prism library as an int.
16
16
  */
17
- #define PRISM_VERSION_MINOR 22
17
+ #define PRISM_VERSION_MINOR 23
18
18
 
19
19
  /**
20
20
  * The patch version of the Prism library as an int.
@@ -24,6 +24,6 @@
24
24
  /**
25
25
  * The version of the Prism library as a constant string.
26
26
  */
27
- #define PRISM_VERSION "0.22.0"
27
+ #define PRISM_VERSION "0.23.0"
28
28
 
29
29
  #endif
data/lib/prism/ffi.rb CHANGED
@@ -317,7 +317,7 @@ module Prism
317
317
  values << (options.fetch(:frozen_string_literal, false) ? 1 : 0)
318
318
 
319
319
  template << "C"
320
- values << { nil => 0, "3.3.0" => 1, "latest" => 0 }.fetch(options[:version])
320
+ values << { nil => 0, "3.3.0" => 1, "3.4.0" => 0, "latest" => 0 }.fetch(options[:version])
321
321
 
322
322
  template << "L"
323
323
  if (scopes = options[:scopes])
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require "delegate"
4
+ require "ripper"
4
5
 
5
6
  module Prism
6
7
  # This class is responsible for lexing the source using prism and then
@@ -94,7 +94,7 @@ module Prism
94
94
 
95
95
  # Returns the full name of this constant. For example: "Foo"
96
96
  def full_name
97
- name.name
97
+ name.to_s
98
98
  end
99
99
  end
100
100
 
@@ -135,7 +135,17 @@ module Prism
135
135
  # Returns the list of parts for the full name of this constant path.
136
136
  # For example: [:Foo, :Bar]
137
137
  def full_name_parts
138
- (parent&.full_name_parts || [:""]).push(child.name)
138
+ parts = case parent
139
+ when ConstantPathNode, ConstantReadNode
140
+ parent.full_name_parts
141
+ when nil
142
+ [:""]
143
+ else
144
+ raise ConstantPathNode::DynamicPartsInConstantPathError,
145
+ "Constant path target contains dynamic parts. Cannot compute full name"
146
+ end
147
+
148
+ parts.push(child.name)
139
149
  end
140
150
 
141
151
  # Returns the full name of this constant path. For example: "Foo::Bar"
@@ -144,6 +154,19 @@ module Prism
144
154
  end
145
155
  end
146
156
 
157
+ class ConstantTargetNode < Node
158
+ # Returns the list of parts for the full name of this constant.
159
+ # For example: [:Foo]
160
+ def full_name_parts
161
+ [name]
162
+ end
163
+
164
+ # Returns the full name of this constant. For example: "Foo"
165
+ def full_name
166
+ name.to_s
167
+ end
168
+ end
169
+
147
170
  class ParametersNode < Node
148
171
  # Mirrors the Method#parameters method.
149
172
  def signature
@@ -9,18 +9,16 @@ module Prism
9
9
  attr_reader :source
10
10
 
11
11
  # The line number where this source starts.
12
- attr_accessor :start_line
12
+ attr_reader :start_line
13
13
 
14
14
  # The list of newline byte offsets in the source code.
15
15
  attr_reader :offsets
16
16
 
17
- # Create a new source object with the given source code and newline byte
18
- # offsets. If no newline byte offsets are given, they will be computed from
19
- # the source code.
20
- def initialize(source, start_line = 1, offsets = compute_offsets(source))
17
+ # Create a new source object with the given source code.
18
+ def initialize(source, start_line = 1, offsets = [])
21
19
  @source = source
22
- @start_line = start_line
23
- @offsets = offsets
20
+ @start_line = start_line # set after parsing is done
21
+ @offsets = offsets # set after parsing is done
24
22
  end
25
23
 
26
24
  # Perform a byteslice on the source code using the given byte offset and
@@ -56,6 +54,23 @@ module Prism
56
54
  character_offset(byte_offset) - character_offset(line_start(byte_offset))
57
55
  end
58
56
 
57
+ # Returns the offset from the start of the file for the given byte offset
58
+ # counting in code units for the given encoding.
59
+ #
60
+ # This method is tested with UTF-8, UTF-16, and UTF-32. If there is the
61
+ # concept of code units that differs from the number of characters in other
62
+ # encodings, it is not captured here.
63
+ def code_units_offset(byte_offset, encoding)
64
+ byteslice = source.byteslice(0, byte_offset).encode(encoding)
65
+ (encoding == Encoding::UTF_16LE || encoding == Encoding::UTF_16BE) ? (byteslice.bytesize / 2) : byteslice.length
66
+ end
67
+
68
+ # Returns the column number in code units for the given encoding for the
69
+ # given byte offset.
70
+ def code_units_column(byte_offset, encoding)
71
+ code_units_offset(byte_offset, encoding) - code_units_offset(line_start(byte_offset), encoding)
72
+ end
73
+
59
74
  private
60
75
 
61
76
  # Binary search through the offsets to find the line number for the given
@@ -77,14 +92,6 @@ module Prism
77
92
 
78
93
  left - 1
79
94
  end
80
-
81
- # Find all of the newlines in the source code and return their byte offsets
82
- # from the start of the string an array.
83
- def compute_offsets(code)
84
- offsets = [0]
85
- code.b.scan("\n") { offsets << $~.end(0) }
86
- offsets
87
- end
88
95
  end
89
96
 
90
97
  # This represents a location in the source.
@@ -138,6 +145,11 @@ module Prism
138
145
  source.character_offset(start_offset)
139
146
  end
140
147
 
148
+ # The offset from the start of the file in code units of the given encoding.
149
+ def start_code_units_offset(encoding = Encoding::UTF_16LE)
150
+ source.code_units_offset(start_offset, encoding)
151
+ end
152
+
141
153
  # The byte offset from the beginning of the source where this location ends.
142
154
  def end_offset
143
155
  start_offset + length
@@ -149,6 +161,11 @@ module Prism
149
161
  source.character_offset(end_offset)
150
162
  end
151
163
 
164
+ # The offset from the start of the file in code units of the given encoding.
165
+ def end_code_units_offset(encoding = Encoding::UTF_16LE)
166
+ source.code_units_offset(end_offset, encoding)
167
+ end
168
+
152
169
  # The line number where this location starts.
153
170
  def start_line
154
171
  source.line(start_offset)
@@ -177,6 +194,12 @@ module Prism
177
194
  source.character_column(start_offset)
178
195
  end
179
196
 
197
+ # The column number in code units of the given encoding where this location
198
+ # starts from the start of the line.
199
+ def start_code_units_column(encoding = Encoding::UTF_16LE)
200
+ source.code_units_column(start_offset, encoding)
201
+ end
202
+
180
203
  # The column number in bytes where this location ends from the start of the
181
204
  # line.
182
205
  def end_column
@@ -189,6 +212,12 @@ module Prism
189
212
  source.character_column(end_offset)
190
213
  end
191
214
 
215
+ # The column number in code units of the given encoding where this location
216
+ # ends from the start of the line.
217
+ def end_code_units_column(encoding = Encoding::UTF_16LE)
218
+ source.code_units_column(end_offset, encoding)
219
+ end
220
+
192
221
  # Implement the hash pattern matching interface for Location.
193
222
  def deconstruct_keys(keys)
194
223
  { start_offset: start_offset, end_offset: end_offset }
@@ -27,7 +27,7 @@ module Prism
27
27
 
28
28
  # The minor version of prism that we are expecting to find in the serialized
29
29
  # strings.
30
- MINOR_VERSION = 22
30
+ MINOR_VERSION = 23
31
31
 
32
32
  # The patch version of prism that we are expecting to find in the serialized
33
33
  # strings.
@@ -86,11 +86,15 @@ module Prism
86
86
  end
87
87
 
88
88
  def load_start_line
89
- source.start_line = load_varsint
89
+ source.instance_variable_set :@start_line, load_varsint
90
+ end
91
+
92
+ def load_line_offsets
93
+ source.instance_variable_set :@offsets, Array.new(load_varuint) { load_varuint }
90
94
  end
91
95
 
92
96
  def load_comments
93
- load_varuint.times.map do
97
+ Array.new(load_varuint) do
94
98
  case load_varuint
95
99
  when 0 then InlineComment.new(load_location)
96
100
  when 1 then EmbDocComment.new(load_location)
@@ -101,10 +105,10 @@ module Prism
101
105
 
102
106
  def load_metadata
103
107
  comments = load_comments
104
- magic_comments = load_varuint.times.map { MagicComment.new(load_location, load_location) }
108
+ magic_comments = Array.new(load_varuint) { MagicComment.new(load_location, load_location) }
105
109
  data_loc = load_optional_location
106
- errors = load_varuint.times.map { ParseError.new(load_embedded_string, load_location, load_error_level) }
107
- warnings = load_varuint.times.map { ParseWarning.new(load_embedded_string, load_location, load_warning_level) }
110
+ errors = Array.new(load_varuint) { ParseError.new(load_embedded_string, load_location, load_error_level) }
111
+ warnings = Array.new(load_varuint) { ParseWarning.new(load_embedded_string, load_location, load_warning_level) }
108
112
  [comments, magic_comments, data_loc, errors, warnings]
109
113
  end
110
114
 
@@ -125,6 +129,7 @@ module Prism
125
129
  tokens = load_tokens
126
130
  encoding = load_encoding
127
131
  load_start_line
132
+ load_line_offsets
128
133
  comments, magic_comments, data_loc, errors, warnings = load_metadata
129
134
  tokens.each { |token,| token.value.force_encoding(encoding) }
130
135
 
@@ -136,6 +141,7 @@ module Prism
136
141
  load_header
137
142
  load_encoding
138
143
  load_start_line
144
+ load_line_offsets
139
145
 
140
146
  comments, magic_comments, data_loc, errors, warnings = load_metadata
141
147
 
@@ -124,20 +124,21 @@ module Prism
124
124
  # build the parser gem AST.
125
125
  #
126
126
  # If the bytesize of the source is the same as the length, then we can
127
- # just use the offset directly. Otherwise, we build a hash that functions
128
- # as a cache for the conversion.
129
- #
130
- # This is a good opportunity for some optimizations. If the source file
131
- # has any multi-byte characters, this can tank the performance of the
132
- # translator. We could make this significantly faster by using a
133
- # different data structure for the cache.
127
+ # just use the offset directly. Otherwise, we build an array where the
128
+ # index is the byte offset and the value is the character offset.
134
129
  def build_offset_cache(source)
135
130
  if source.bytesize == source.length
136
131
  -> (offset) { offset }
137
132
  else
138
- Hash.new do |hash, offset|
139
- hash[offset] = source.byteslice(0, offset).length
133
+ offset_cache = []
134
+ offset = 0
135
+
136
+ source.each_char do |char|
137
+ char.bytesize.times { offset_cache << offset }
138
+ offset += 1
140
139
  end
140
+
141
+ offset_cache << offset
141
142
  end
142
143
  end
143
144