prism 0.22.0 → 0.23.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +24 -1
- data/README.md +2 -1
- data/docs/releasing.md +67 -17
- data/docs/ruby_parser_translation.md +19 -0
- data/docs/serialization.md +2 -0
- data/ext/prism/api_node.c +784 -785
- data/ext/prism/extension.c +12 -7
- data/ext/prism/extension.h +2 -2
- data/include/prism/diagnostic.h +3 -4
- data/include/prism/encoding.h +7 -0
- data/include/prism/util/pm_constant_pool.h +1 -1
- data/include/prism/util/pm_strpbrk.h +4 -1
- data/include/prism/version.h +2 -2
- data/lib/prism/ffi.rb +1 -1
- data/lib/prism/lex_compat.rb +1 -0
- data/lib/prism/node_ext.rb +25 -2
- data/lib/prism/parse_result.rb +44 -15
- data/lib/prism/serialize.rb +12 -6
- data/lib/prism/translation/parser.rb +10 -9
- data/lib/prism/translation/ripper.rb +577 -0
- data/lib/prism/translation/ruby_parser.rb +1521 -0
- data/lib/prism/translation.rb +3 -3
- data/lib/prism.rb +0 -1
- data/prism.gemspec +4 -2
- data/src/diagnostic.c +10 -11
- data/src/encoding.c +16 -17
- data/src/options.c +7 -2
- data/src/prism.c +124 -64
- data/src/serialize.c +24 -13
- data/src/token_type.c +3 -3
- data/src/util/pm_constant_pool.c +1 -1
- data/src/util/pm_strpbrk.c +122 -14
- metadata +6 -4
- data/lib/prism/ripper_compat.rb +0 -285
data/ext/prism/extension.c
CHANGED
@@ -542,8 +542,9 @@ parse_lex_input(pm_string_t *input, const pm_options_t *options, bool return_nod
|
|
542
542
|
pm_parser_init(&parser, pm_string_source(input), pm_string_length(input), options);
|
543
543
|
pm_parser_register_encoding_changed_callback(&parser, parse_lex_encoding_changed_callback);
|
544
544
|
|
545
|
+
VALUE source_string = rb_str_new((const char *) pm_string_source(input), pm_string_length(input));
|
545
546
|
VALUE offsets = rb_ary_new();
|
546
|
-
VALUE source_argv[] = {
|
547
|
+
VALUE source_argv[] = { source_string, LONG2NUM(parser.start_line), offsets };
|
547
548
|
VALUE source = rb_class_new_instance(3, source_argv, rb_cPrismSource);
|
548
549
|
|
549
550
|
parse_lex_data_t parse_lex_data = {
|
@@ -561,17 +562,21 @@ parse_lex_input(pm_string_t *input, const pm_options_t *options, bool return_nod
|
|
561
562
|
parser.lex_callback = &lex_callback;
|
562
563
|
pm_node_t *node = pm_parse(&parser);
|
563
564
|
|
564
|
-
// Here we need to update the
|
565
|
-
//
|
566
|
-
// it
|
565
|
+
// Here we need to update the Source object to have the correct
|
566
|
+
// encoding for the source string and the correct newline offsets.
|
567
|
+
// We do it here because we've already created the Source object and given
|
568
|
+
// it over to all of the tokens, and both of these are only set after pm_parse().
|
569
|
+
rb_encoding *encoding = rb_enc_find(parser.encoding->name);
|
570
|
+
rb_enc_associate(source_string, encoding);
|
571
|
+
|
567
572
|
for (size_t index = 0; index < parser.newline_list.size; index++) {
|
568
|
-
rb_ary_push(offsets,
|
573
|
+
rb_ary_push(offsets, ULONG2NUM(parser.newline_list.offsets[index]));
|
569
574
|
}
|
570
575
|
|
571
576
|
VALUE value;
|
572
577
|
if (return_nodes) {
|
573
578
|
value = rb_ary_new_capa(2);
|
574
|
-
rb_ary_push(value, pm_ast_new(&parser, node, parse_lex_data.encoding));
|
579
|
+
rb_ary_push(value, pm_ast_new(&parser, node, parse_lex_data.encoding, source));
|
575
580
|
rb_ary_push(value, parse_lex_data.tokens);
|
576
581
|
} else {
|
577
582
|
value = parse_lex_data.tokens;
|
@@ -650,7 +655,7 @@ parse_input(pm_string_t *input, const pm_options_t *options) {
|
|
650
655
|
|
651
656
|
VALUE source = pm_source_new(&parser, encoding);
|
652
657
|
VALUE result_argv[] = {
|
653
|
-
pm_ast_new(&parser, node, encoding),
|
658
|
+
pm_ast_new(&parser, node, encoding, source),
|
654
659
|
parser_comments(&parser, source),
|
655
660
|
parser_magic_comments(&parser, source),
|
656
661
|
parser_data_loc(&parser, source),
|
data/ext/prism/extension.h
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#ifndef PRISM_EXT_NODE_H
|
2
2
|
#define PRISM_EXT_NODE_H
|
3
3
|
|
4
|
-
#define EXPECTED_PRISM_VERSION "0.22.0"
|
4
|
+
#define EXPECTED_PRISM_VERSION "0.23.0"
|
5
5
|
|
6
6
|
#include <ruby.h>
|
7
7
|
#include <ruby/encoding.h>
|
@@ -9,7 +9,7 @@
|
|
9
9
|
|
10
10
|
VALUE pm_source_new(pm_parser_t *parser, rb_encoding *encoding);
|
11
11
|
VALUE pm_token_new(pm_parser_t *parser, pm_token_t *token, rb_encoding *encoding, VALUE source);
|
12
|
-
VALUE pm_ast_new(pm_parser_t *parser, pm_node_t *node, rb_encoding *encoding);
|
12
|
+
VALUE pm_ast_new(pm_parser_t *parser, pm_node_t *node, rb_encoding *encoding, VALUE source);
|
13
13
|
|
14
14
|
void Init_prism_api_node(void);
|
15
15
|
void Init_prism_pack(void);
|
data/include/prism/diagnostic.h
CHANGED
@@ -219,6 +219,7 @@ typedef enum {
|
|
219
219
|
PM_ERR_MODULE_NAME,
|
220
220
|
PM_ERR_MODULE_TERM,
|
221
221
|
PM_ERR_MULTI_ASSIGN_MULTI_SPLATS,
|
222
|
+
PM_ERR_MULTI_ASSIGN_UNEXPECTED_REST,
|
222
223
|
PM_ERR_NOT_EXPRESSION,
|
223
224
|
PM_ERR_NO_LOCAL_VARIABLE,
|
224
225
|
PM_ERR_NUMBER_LITERAL_UNDERSCORE,
|
@@ -272,6 +273,7 @@ typedef enum {
|
|
272
273
|
PM_ERR_STATEMENT_UNDEF,
|
273
274
|
PM_ERR_STRING_CONCATENATION,
|
274
275
|
PM_ERR_STRING_INTERPOLATED_TERM,
|
276
|
+
PM_ERR_STRING_LITERAL_EOF,
|
275
277
|
PM_ERR_STRING_LITERAL_TERM,
|
276
278
|
PM_ERR_SYMBOL_INVALID,
|
277
279
|
PM_ERR_SYMBOL_TERM_DYNAMIC,
|
@@ -279,10 +281,7 @@ typedef enum {
|
|
279
281
|
PM_ERR_TERNARY_COLON,
|
280
282
|
PM_ERR_TERNARY_EXPRESSION_FALSE,
|
281
283
|
PM_ERR_TERNARY_EXPRESSION_TRUE,
|
282
|
-
|
283
|
-
PM_ERR_UNARY_RECEIVER_MINUS,
|
284
|
-
PM_ERR_UNARY_RECEIVER_PLUS,
|
285
|
-
PM_ERR_UNARY_RECEIVER_TILDE,
|
284
|
+
PM_ERR_UNARY_RECEIVER,
|
286
285
|
PM_ERR_UNEXPECTED_TOKEN_CLOSE_CONTEXT,
|
287
286
|
PM_ERR_UNEXPECTED_TOKEN_IGNORE,
|
288
287
|
PM_ERR_UNDEF_ARGUMENT,
|
data/include/prism/encoding.h
CHANGED
@@ -245,6 +245,13 @@ extern const pm_encoding_t pm_encodings[PM_ENCODING_MAXIMUM];
|
|
245
245
|
*/
|
246
246
|
#define PM_ENCODING_US_ASCII_ENTRY (&pm_encodings[PM_ENCODING_US_ASCII])
|
247
247
|
|
248
|
+
/**
|
249
|
+
* This is the ASCII-8BIT encoding. We need a reference to it so that pm_strpbrk
|
250
|
+
* can compare against it because invalid multibyte characters are not a thing
|
251
|
+
* in this encoding.
|
252
|
+
*/
|
253
|
+
#define PM_ENCODING_ASCII_8BIT_ENTRY (&pm_encodings[PM_ENCODING_ASCII_8BIT])
|
254
|
+
|
248
255
|
/**
|
249
256
|
* Parse the given name of an encoding and return a pointer to the corresponding
|
250
257
|
* encoding struct if one can be found, otherwise return NULL.
|
@@ -163,7 +163,7 @@ pm_constant_t * pm_constant_pool_id_to_constant(const pm_constant_pool_t *pool,
|
|
163
163
|
* @param length The length of the constant.
|
164
164
|
* @return The id of the constant.
|
165
165
|
*/
|
166
|
-
pm_constant_id_t pm_constant_pool_find(pm_constant_pool_t *pool, const uint8_t *start, size_t length);
|
166
|
+
pm_constant_id_t pm_constant_pool_find(const pm_constant_pool_t *pool, const uint8_t *start, size_t length);
|
167
167
|
|
168
168
|
/**
|
169
169
|
* Insert a constant into a constant pool that is a slice of a source string.
|
@@ -7,6 +7,7 @@
|
|
7
7
|
#define PRISM_STRPBRK_H
|
8
8
|
|
9
9
|
#include "prism/defines.h"
|
10
|
+
#include "prism/diagnostic.h"
|
10
11
|
#include "prism/parser.h"
|
11
12
|
|
12
13
|
#include <stddef.h>
|
@@ -35,9 +36,11 @@
|
|
35
36
|
* @param source The source to search.
|
36
37
|
* @param charset The charset to search for.
|
37
38
|
* @param length The maximum number of bytes to search.
|
39
|
+
* @param validate Whether to validate that the source string is valid in the
|
40
|
+
* current encoding of the parser.
|
38
41
|
* @return A pointer to the first character in the source string that is in the
|
39
42
|
* charset, or NULL if no such character exists.
|
40
43
|
*/
|
41
|
-
const uint8_t * pm_strpbrk(
|
44
|
+
const uint8_t * pm_strpbrk(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length, bool validate);
|
42
45
|
|
43
46
|
#endif
|
data/include/prism/version.h
CHANGED
@@ -14,7 +14,7 @@
|
|
14
14
|
/**
|
15
15
|
* The minor version of the Prism library as an int.
|
16
16
|
*/
|
17
|
-
#define PRISM_VERSION_MINOR 22
|
17
|
+
#define PRISM_VERSION_MINOR 23
|
18
18
|
|
19
19
|
/**
|
20
20
|
* The patch version of the Prism library as an int.
|
@@ -24,6 +24,6 @@
|
|
24
24
|
/**
|
25
25
|
* The version of the Prism library as a constant string.
|
26
26
|
*/
|
27
|
-
#define PRISM_VERSION "0.22.0"
|
27
|
+
#define PRISM_VERSION "0.23.0"
|
28
28
|
|
29
29
|
#endif
|
data/lib/prism/ffi.rb
CHANGED
@@ -317,7 +317,7 @@ module Prism
|
|
317
317
|
values << (options.fetch(:frozen_string_literal, false) ? 1 : 0)
|
318
318
|
|
319
319
|
template << "C"
|
320
|
-
values << { nil => 0, "3.3.0" => 1, "latest" => 0 }.fetch(options[:version])
|
320
|
+
values << { nil => 0, "3.3.0" => 1, "3.4.0" => 0, "latest" => 0 }.fetch(options[:version])
|
321
321
|
|
322
322
|
template << "L"
|
323
323
|
if (scopes = options[:scopes])
|
data/lib/prism/lex_compat.rb
CHANGED
data/lib/prism/node_ext.rb
CHANGED
@@ -94,7 +94,7 @@ module Prism
|
|
94
94
|
|
95
95
|
# Returns the full name of this constant. For example: "Foo"
|
96
96
|
def full_name
|
97
|
-
name.
|
97
|
+
name.to_s
|
98
98
|
end
|
99
99
|
end
|
100
100
|
|
@@ -135,7 +135,17 @@ module Prism
|
|
135
135
|
# Returns the list of parts for the full name of this constant path.
|
136
136
|
# For example: [:Foo, :Bar]
|
137
137
|
def full_name_parts
|
138
|
-
|
138
|
+
parts = case parent
|
139
|
+
when ConstantPathNode, ConstantReadNode
|
140
|
+
parent.full_name_parts
|
141
|
+
when nil
|
142
|
+
[:""]
|
143
|
+
else
|
144
|
+
raise ConstantPathNode::DynamicPartsInConstantPathError,
|
145
|
+
"Constant path target contains dynamic parts. Cannot compute full name"
|
146
|
+
end
|
147
|
+
|
148
|
+
parts.push(child.name)
|
139
149
|
end
|
140
150
|
|
141
151
|
# Returns the full name of this constant path. For example: "Foo::Bar"
|
@@ -144,6 +154,19 @@ module Prism
|
|
144
154
|
end
|
145
155
|
end
|
146
156
|
|
157
|
+
class ConstantTargetNode < Node
|
158
|
+
# Returns the list of parts for the full name of this constant.
|
159
|
+
# For example: [:Foo]
|
160
|
+
def full_name_parts
|
161
|
+
[name]
|
162
|
+
end
|
163
|
+
|
164
|
+
# Returns the full name of this constant. For example: "Foo"
|
165
|
+
def full_name
|
166
|
+
name.to_s
|
167
|
+
end
|
168
|
+
end
|
169
|
+
|
147
170
|
class ParametersNode < Node
|
148
171
|
# Mirrors the Method#parameters method.
|
149
172
|
def signature
|
data/lib/prism/parse_result.rb
CHANGED
@@ -9,18 +9,16 @@ module Prism
|
|
9
9
|
attr_reader :source
|
10
10
|
|
11
11
|
# The line number where this source starts.
|
12
|
-
|
12
|
+
attr_reader :start_line
|
13
13
|
|
14
14
|
# The list of newline byte offsets in the source code.
|
15
15
|
attr_reader :offsets
|
16
16
|
|
17
|
-
# Create a new source object with the given source code
|
18
|
-
|
19
|
-
# the source code.
|
20
|
-
def initialize(source, start_line = 1, offsets = compute_offsets(source))
|
17
|
+
# Create a new source object with the given source code.
|
18
|
+
def initialize(source, start_line = 1, offsets = [])
|
21
19
|
@source = source
|
22
|
-
@start_line = start_line
|
23
|
-
@offsets = offsets
|
20
|
+
@start_line = start_line # set after parsing is done
|
21
|
+
@offsets = offsets # set after parsing is done
|
24
22
|
end
|
25
23
|
|
26
24
|
# Perform a byteslice on the source code using the given byte offset and
|
@@ -56,6 +54,23 @@ module Prism
|
|
56
54
|
character_offset(byte_offset) - character_offset(line_start(byte_offset))
|
57
55
|
end
|
58
56
|
|
57
|
+
# Returns the offset from the start of the file for the given byte offset
|
58
|
+
# counting in code units for the given encoding.
|
59
|
+
#
|
60
|
+
# This method is tested with UTF-8, UTF-16, and UTF-32. If there is the
|
61
|
+
# concept of code units that differs from the number of characters in other
|
62
|
+
# encodings, it is not captured here.
|
63
|
+
def code_units_offset(byte_offset, encoding)
|
64
|
+
byteslice = source.byteslice(0, byte_offset).encode(encoding)
|
65
|
+
(encoding == Encoding::UTF_16LE || encoding == Encoding::UTF_16BE) ? (byteslice.bytesize / 2) : byteslice.length
|
66
|
+
end
|
67
|
+
|
68
|
+
# Returns the column number in code units for the given encoding for the
|
69
|
+
# given byte offset.
|
70
|
+
def code_units_column(byte_offset, encoding)
|
71
|
+
code_units_offset(byte_offset, encoding) - code_units_offset(line_start(byte_offset), encoding)
|
72
|
+
end
|
73
|
+
|
59
74
|
private
|
60
75
|
|
61
76
|
# Binary search through the offsets to find the line number for the given
|
@@ -77,14 +92,6 @@ module Prism
|
|
77
92
|
|
78
93
|
left - 1
|
79
94
|
end
|
80
|
-
|
81
|
-
# Find all of the newlines in the source code and return their byte offsets
|
82
|
-
# from the start of the string an array.
|
83
|
-
def compute_offsets(code)
|
84
|
-
offsets = [0]
|
85
|
-
code.b.scan("\n") { offsets << $~.end(0) }
|
86
|
-
offsets
|
87
|
-
end
|
88
95
|
end
|
89
96
|
|
90
97
|
# This represents a location in the source.
|
@@ -138,6 +145,11 @@ module Prism
|
|
138
145
|
source.character_offset(start_offset)
|
139
146
|
end
|
140
147
|
|
148
|
+
# The offset from the start of the file in code units of the given encoding.
|
149
|
+
def start_code_units_offset(encoding = Encoding::UTF_16LE)
|
150
|
+
source.code_units_offset(start_offset, encoding)
|
151
|
+
end
|
152
|
+
|
141
153
|
# The byte offset from the beginning of the source where this location ends.
|
142
154
|
def end_offset
|
143
155
|
start_offset + length
|
@@ -149,6 +161,11 @@ module Prism
|
|
149
161
|
source.character_offset(end_offset)
|
150
162
|
end
|
151
163
|
|
164
|
+
# The offset from the start of the file in code units of the given encoding.
|
165
|
+
def end_code_units_offset(encoding = Encoding::UTF_16LE)
|
166
|
+
source.code_units_offset(end_offset, encoding)
|
167
|
+
end
|
168
|
+
|
152
169
|
# The line number where this location starts.
|
153
170
|
def start_line
|
154
171
|
source.line(start_offset)
|
@@ -177,6 +194,12 @@ module Prism
|
|
177
194
|
source.character_column(start_offset)
|
178
195
|
end
|
179
196
|
|
197
|
+
# The column number in code units of the given encoding where this location
|
198
|
+
# starts from the start of the line.
|
199
|
+
def start_code_units_column(encoding = Encoding::UTF_16LE)
|
200
|
+
source.code_units_column(start_offset, encoding)
|
201
|
+
end
|
202
|
+
|
180
203
|
# The column number in bytes where this location ends from the start of the
|
181
204
|
# line.
|
182
205
|
def end_column
|
@@ -189,6 +212,12 @@ module Prism
|
|
189
212
|
source.character_column(end_offset)
|
190
213
|
end
|
191
214
|
|
215
|
+
# The column number in code units of the given encoding where this location
|
216
|
+
# ends from the start of the line.
|
217
|
+
def end_code_units_column(encoding = Encoding::UTF_16LE)
|
218
|
+
source.code_units_column(end_offset, encoding)
|
219
|
+
end
|
220
|
+
|
192
221
|
# Implement the hash pattern matching interface for Location.
|
193
222
|
def deconstruct_keys(keys)
|
194
223
|
{ start_offset: start_offset, end_offset: end_offset }
|
data/lib/prism/serialize.rb
CHANGED
@@ -27,7 +27,7 @@ module Prism
|
|
27
27
|
|
28
28
|
# The minor version of prism that we are expecting to find in the serialized
|
29
29
|
# strings.
|
30
|
-
MINOR_VERSION = 22
|
30
|
+
MINOR_VERSION = 23
|
31
31
|
|
32
32
|
# The patch version of prism that we are expecting to find in the serialized
|
33
33
|
# strings.
|
@@ -86,11 +86,15 @@ module Prism
|
|
86
86
|
end
|
87
87
|
|
88
88
|
def load_start_line
|
89
|
-
source.start_line
|
89
|
+
source.instance_variable_set :@start_line, load_varsint
|
90
|
+
end
|
91
|
+
|
92
|
+
def load_line_offsets
|
93
|
+
source.instance_variable_set :@offsets, Array.new(load_varuint) { load_varuint }
|
90
94
|
end
|
91
95
|
|
92
96
|
def load_comments
|
93
|
-
load_varuint
|
97
|
+
Array.new(load_varuint) do
|
94
98
|
case load_varuint
|
95
99
|
when 0 then InlineComment.new(load_location)
|
96
100
|
when 1 then EmbDocComment.new(load_location)
|
@@ -101,10 +105,10 @@ module Prism
|
|
101
105
|
|
102
106
|
def load_metadata
|
103
107
|
comments = load_comments
|
104
|
-
magic_comments = load_varuint
|
108
|
+
magic_comments = Array.new(load_varuint) { MagicComment.new(load_location, load_location) }
|
105
109
|
data_loc = load_optional_location
|
106
|
-
errors = load_varuint
|
107
|
-
warnings = load_varuint
|
110
|
+
errors = Array.new(load_varuint) { ParseError.new(load_embedded_string, load_location, load_error_level) }
|
111
|
+
warnings = Array.new(load_varuint) { ParseWarning.new(load_embedded_string, load_location, load_warning_level) }
|
108
112
|
[comments, magic_comments, data_loc, errors, warnings]
|
109
113
|
end
|
110
114
|
|
@@ -125,6 +129,7 @@ module Prism
|
|
125
129
|
tokens = load_tokens
|
126
130
|
encoding = load_encoding
|
127
131
|
load_start_line
|
132
|
+
load_line_offsets
|
128
133
|
comments, magic_comments, data_loc, errors, warnings = load_metadata
|
129
134
|
tokens.each { |token,| token.value.force_encoding(encoding) }
|
130
135
|
|
@@ -136,6 +141,7 @@ module Prism
|
|
136
141
|
load_header
|
137
142
|
load_encoding
|
138
143
|
load_start_line
|
144
|
+
load_line_offsets
|
139
145
|
|
140
146
|
comments, magic_comments, data_loc, errors, warnings = load_metadata
|
141
147
|
|
@@ -124,20 +124,21 @@ module Prism
|
|
124
124
|
# build the parser gem AST.
|
125
125
|
#
|
126
126
|
# If the bytesize of the source is the same as the length, then we can
|
127
|
-
# just use the offset directly. Otherwise, we build
|
128
|
-
#
|
129
|
-
#
|
130
|
-
# This is a good opportunity for some optimizations. If the source file
|
131
|
-
# has any multi-byte characters, this can tank the performance of the
|
132
|
-
# translator. We could make this significantly faster by using a
|
133
|
-
# different data structure for the cache.
|
127
|
+
# just use the offset directly. Otherwise, we build an array where the
|
128
|
+
# index is the byte offset and the value is the character offset.
|
134
129
|
def build_offset_cache(source)
|
135
130
|
if source.bytesize == source.length
|
136
131
|
-> (offset) { offset }
|
137
132
|
else
|
138
|
-
|
139
|
-
|
133
|
+
offset_cache = []
|
134
|
+
offset = 0
|
135
|
+
|
136
|
+
source.each_char do |char|
|
137
|
+
char.bytesize.times { offset_cache << offset }
|
138
|
+
offset += 1
|
140
139
|
end
|
140
|
+
|
141
|
+
offset_cache << offset
|
141
142
|
end
|
142
143
|
end
|
143
144
|
|