prism 1.1.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +39 -1
- data/Makefile +1 -1
- data/config.yml +422 -3
- data/docs/build_system.md +8 -11
- data/docs/relocation.md +34 -0
- data/ext/prism/api_node.c +18 -10
- data/ext/prism/extconf.rb +13 -36
- data/ext/prism/extension.c +68 -0
- data/ext/prism/extension.h +1 -1
- data/include/prism/ast.h +427 -3
- data/include/prism/defines.h +22 -7
- data/include/prism/diagnostic.h +1 -0
- data/include/prism/parser.h +25 -12
- data/include/prism/version.h +2 -2
- data/include/prism.h +47 -0
- data/lib/prism/dot_visitor.rb +10 -0
- data/lib/prism/dsl.rb +4 -4
- data/lib/prism/ffi.rb +49 -2
- data/lib/prism/inspect_visitor.rb +2 -0
- data/lib/prism/node.rb +1839 -96
- data/lib/prism/parse_result/errors.rb +1 -1
- data/lib/prism/parse_result.rb +140 -3
- data/lib/prism/reflection.rb +2 -2
- data/lib/prism/relocation.rb +504 -0
- data/lib/prism/serialize.rb +17 -5
- data/lib/prism/string_query.rb +30 -0
- data/lib/prism/translation/parser/compiler.rb +36 -26
- data/lib/prism/translation/parser.rb +3 -3
- data/lib/prism/translation/ripper.rb +1 -5
- data/lib/prism/translation/ruby_parser.rb +14 -5
- data/lib/prism.rb +6 -4
- data/prism.gemspec +7 -1
- data/rbi/prism/dsl.rbi +4 -4
- data/rbi/prism/node.rbi +5118 -1030
- data/rbi/prism/parse_result.rbi +29 -0
- data/rbi/prism/string_query.rbi +12 -0
- data/rbi/prism.rbi +34 -34
- data/sig/prism/dsl.rbs +2 -2
- data/sig/prism/node.rbs +13 -98
- data/sig/prism/parse_result.rbs +20 -0
- data/sig/prism/relocation.rbs +185 -0
- data/sig/prism/string_query.rbs +11 -0
- data/src/diagnostic.c +3 -1
- data/src/node.c +18 -0
- data/src/prettyprint.c +32 -0
- data/src/prism.c +586 -195
- data/src/regexp.c +7 -3
- data/src/serialize.c +12 -0
- data/src/static_literals.c +1 -1
- data/src/util/pm_char.c +1 -1
- data/src/util/pm_string.c +1 -0
- metadata +9 -3
@@ -17,7 +17,7 @@ module Prism
|
|
17
17
|
|
18
18
|
# Formats the errors in a human-readable way and returns them as a string.
|
19
19
|
def format
|
20
|
-
error_lines = {}
|
20
|
+
error_lines = {} #: Hash[Integer, Array[ParseError]]
|
21
21
|
parse_result.errors.each do |error|
|
22
22
|
location = error.location
|
23
23
|
(location.start_line..location.end_line).each do |line|
|
data/lib/prism/parse_result.rb
CHANGED
@@ -12,6 +12,21 @@ module Prism
|
|
12
12
|
def self.for(source, start_line = 1, offsets = [])
|
13
13
|
if source.ascii_only?
|
14
14
|
ASCIISource.new(source, start_line, offsets)
|
15
|
+
elsif source.encoding == Encoding::BINARY
|
16
|
+
source.force_encoding(Encoding::UTF_8)
|
17
|
+
|
18
|
+
if source.valid_encoding?
|
19
|
+
new(source, start_line, offsets)
|
20
|
+
else
|
21
|
+
# This is an extremely niche use case where the file is marked as
|
22
|
+
# binary, contains multi-byte characters, and those characters are not
|
23
|
+
# valid UTF-8. In this case we'll mark it as binary and fall back to
|
24
|
+
# treating everything as a single-byte character. This _may_ cause
|
25
|
+
# problems when asking for code units, but it appears to be the
|
26
|
+
# cleanest solution at the moment.
|
27
|
+
source.force_encoding(Encoding::BINARY)
|
28
|
+
ASCIISource.new(source, start_line, offsets)
|
29
|
+
end
|
15
30
|
else
|
16
31
|
new(source, start_line, offsets)
|
17
32
|
end
|
@@ -89,8 +104,14 @@ module Prism
|
|
89
104
|
# This method is tested with UTF-8, UTF-16, and UTF-32. If there is the
|
90
105
|
# concept of code units that differs from the number of characters in other
|
91
106
|
# encodings, it is not captured here.
|
107
|
+
#
|
108
|
+
# We purposefully replace invalid and undefined characters with replacement
|
109
|
+
# characters in this conversion. This happens for two reasons. First, it's
|
110
|
+
# possible that the given byte offset will not occur on a character
|
111
|
+
# boundary. Second, it's possible that the source code will contain a
|
112
|
+
# character that has no equivalent in the given encoding.
|
92
113
|
def code_units_offset(byte_offset, encoding)
|
93
|
-
byteslice = (source.byteslice(0, byte_offset) or raise).encode(encoding)
|
114
|
+
byteslice = (source.byteslice(0, byte_offset) or raise).encode(encoding, invalid: :replace, undef: :replace)
|
94
115
|
|
95
116
|
if encoding == Encoding::UTF_16LE || encoding == Encoding::UTF_16BE
|
96
117
|
byteslice.bytesize / 2
|
@@ -99,6 +120,12 @@ module Prism
|
|
99
120
|
end
|
100
121
|
end
|
101
122
|
|
123
|
+
# Generate a cache that targets a specific encoding for calculating code
|
124
|
+
# unit offsets.
|
125
|
+
def code_units_cache(encoding)
|
126
|
+
CodeUnitsCache.new(source, encoding)
|
127
|
+
end
|
128
|
+
|
102
129
|
# Returns the column number in code units for the given encoding for the
|
103
130
|
# given byte offset.
|
104
131
|
def code_units_column(byte_offset, encoding)
|
@@ -128,10 +155,84 @@ module Prism
|
|
128
155
|
end
|
129
156
|
end
|
130
157
|
|
158
|
+
# A cache that can be used to quickly compute code unit offsets from byte
|
159
|
+
# offsets. It purposefully provides only a single #[] method to access the
|
160
|
+
# cache in order to minimize surface area.
|
161
|
+
#
|
162
|
+
# Note that there are some known issues here that may or may not be addressed
|
163
|
+
# in the future:
|
164
|
+
#
|
165
|
+
# * The first is that there are issues when the cache computes values that are
|
166
|
+
# not on character boundaries. This can result in subsequent computations
|
167
|
+
# being off by one or more code units.
|
168
|
+
# * The second is that this cache is currently unbounded. In theory we could
|
169
|
+
# introduce some kind of LRU cache to limit the number of entries, but this
|
170
|
+
# has not yet been implemented.
|
171
|
+
#
|
172
|
+
class CodeUnitsCache
|
173
|
+
class UTF16Counter # :nodoc:
|
174
|
+
def initialize(source, encoding)
|
175
|
+
@source = source
|
176
|
+
@encoding = encoding
|
177
|
+
end
|
178
|
+
|
179
|
+
def count(byte_offset, byte_length)
|
180
|
+
@source.byteslice(byte_offset, byte_length).encode(@encoding, invalid: :replace, undef: :replace).bytesize / 2
|
181
|
+
end
|
182
|
+
end
|
183
|
+
|
184
|
+
class LengthCounter # :nodoc:
|
185
|
+
def initialize(source, encoding)
|
186
|
+
@source = source
|
187
|
+
@encoding = encoding
|
188
|
+
end
|
189
|
+
|
190
|
+
def count(byte_offset, byte_length)
|
191
|
+
@source.byteslice(byte_offset, byte_length).encode(@encoding, invalid: :replace, undef: :replace).length
|
192
|
+
end
|
193
|
+
end
|
194
|
+
|
195
|
+
private_constant :UTF16Counter, :LengthCounter
|
196
|
+
|
197
|
+
# Initialize a new cache with the given source and encoding.
|
198
|
+
def initialize(source, encoding)
|
199
|
+
@source = source
|
200
|
+
@counter =
|
201
|
+
if encoding == Encoding::UTF_16LE || encoding == Encoding::UTF_16BE
|
202
|
+
UTF16Counter.new(source, encoding)
|
203
|
+
else
|
204
|
+
LengthCounter.new(source, encoding)
|
205
|
+
end
|
206
|
+
|
207
|
+
@cache = {} #: Hash[Integer, Integer]
|
208
|
+
@offsets = [] #: Array[Integer]
|
209
|
+
end
|
210
|
+
|
211
|
+
# Retrieve the code units offset from the given byte offset.
|
212
|
+
def [](byte_offset)
|
213
|
+
@cache[byte_offset] ||=
|
214
|
+
if (index = @offsets.bsearch_index { |offset| offset > byte_offset }).nil?
|
215
|
+
@offsets << byte_offset
|
216
|
+
@counter.count(0, byte_offset)
|
217
|
+
elsif index == 0
|
218
|
+
@offsets.unshift(byte_offset)
|
219
|
+
@counter.count(0, byte_offset)
|
220
|
+
else
|
221
|
+
@offsets.insert(index, byte_offset)
|
222
|
+
offset = @offsets[index - 1]
|
223
|
+
@cache[offset] + @counter.count(offset, byte_offset - offset)
|
224
|
+
end
|
225
|
+
end
|
226
|
+
end
|
227
|
+
|
131
228
|
# Specialized version of Prism::Source for source code that includes ASCII
|
132
229
|
# characters only. This class is used to apply performance optimizations that
|
133
|
-
# cannot be applied to sources that include multibyte characters.
|
134
|
-
#
|
230
|
+
# cannot be applied to sources that include multibyte characters.
|
231
|
+
#
|
232
|
+
# In the extremely rare case that a source includes multi-byte characters but
|
233
|
+
# is marked as binary because of a magic encoding comment and it cannot be
|
234
|
+
# eagerly converted to UTF-8, this class will be used as well. This is because
|
235
|
+
# at that point we will treat everything as single-byte characters.
|
135
236
|
class ASCIISource < Source
|
136
237
|
# Return the character offset for the given byte offset.
|
137
238
|
def character_offset(byte_offset)
|
@@ -153,6 +254,13 @@ module Prism
|
|
153
254
|
byte_offset
|
154
255
|
end
|
155
256
|
|
257
|
+
# Returns a cache that is the identity function in order to maintain the
|
258
|
+
# same interface. We can do this because code units are always equivalent to
|
259
|
+
# byte offsets for ASCII-only sources.
|
260
|
+
def code_units_cache(encoding)
|
261
|
+
->(byte_offset) { byte_offset }
|
262
|
+
end
|
263
|
+
|
156
264
|
# Specialized version of `code_units_column` that does not depend on
|
157
265
|
# `code_units_offset`, which is a more expensive operation. This is
|
158
266
|
# essentially the same as `Prism::Source#column`.
|
@@ -262,6 +370,12 @@ module Prism
|
|
262
370
|
source.code_units_offset(start_offset, encoding)
|
263
371
|
end
|
264
372
|
|
373
|
+
# The start offset from the start of the file in code units using the given
|
374
|
+
# cache to fetch or calculate the value.
|
375
|
+
def cached_start_code_units_offset(cache)
|
376
|
+
cache[start_offset]
|
377
|
+
end
|
378
|
+
|
265
379
|
# The byte offset from the beginning of the source where this location ends.
|
266
380
|
def end_offset
|
267
381
|
start_offset + length
|
@@ -278,6 +392,12 @@ module Prism
|
|
278
392
|
source.code_units_offset(end_offset, encoding)
|
279
393
|
end
|
280
394
|
|
395
|
+
# The end offset from the start of the file in code units using the given
|
396
|
+
# cache to fetch or calculate the value.
|
397
|
+
def cached_end_code_units_offset(cache)
|
398
|
+
cache[end_offset]
|
399
|
+
end
|
400
|
+
|
281
401
|
# The line number where this location starts.
|
282
402
|
def start_line
|
283
403
|
source.line(start_offset)
|
@@ -312,6 +432,12 @@ module Prism
|
|
312
432
|
source.code_units_column(start_offset, encoding)
|
313
433
|
end
|
314
434
|
|
435
|
+
# The start column in code units using the given cache to fetch or calculate
|
436
|
+
# the value.
|
437
|
+
def cached_start_code_units_column(cache)
|
438
|
+
cache[start_offset] - cache[source.line_start(start_offset)]
|
439
|
+
end
|
440
|
+
|
315
441
|
# The column number in bytes where this location ends from the start of the
|
316
442
|
# line.
|
317
443
|
def end_column
|
@@ -330,6 +456,12 @@ module Prism
|
|
330
456
|
source.code_units_column(end_offset, encoding)
|
331
457
|
end
|
332
458
|
|
459
|
+
# The end column in code units using the given cache to fetch or calculate
|
460
|
+
# the value.
|
461
|
+
def cached_end_code_units_column(cache)
|
462
|
+
cache[end_offset] - cache[source.line_start(end_offset)]
|
463
|
+
end
|
464
|
+
|
333
465
|
# Implement the hash pattern matching interface for Location.
|
334
466
|
def deconstruct_keys(keys)
|
335
467
|
{ start_offset: start_offset, end_offset: end_offset }
|
@@ -579,6 +711,11 @@ module Prism
|
|
579
711
|
def failure?
|
580
712
|
!success?
|
581
713
|
end
|
714
|
+
|
715
|
+
# Create a code units cache for the given encoding.
|
716
|
+
def code_units_cache(encoding)
|
717
|
+
source.code_units_cache(encoding)
|
718
|
+
end
|
582
719
|
end
|
583
720
|
|
584
721
|
# This is a result specific to the `parse` and `parse_file` methods.
|
data/lib/prism/reflection.rb
CHANGED
@@ -396,11 +396,11 @@ module Prism
|
|
396
396
|
when :unless_node
|
397
397
|
[LocationField.new(:keyword_loc), NodeField.new(:predicate), OptionalLocationField.new(:then_keyword_loc), OptionalNodeField.new(:statements), OptionalNodeField.new(:else_clause), OptionalLocationField.new(:end_keyword_loc)]
|
398
398
|
when :until_node
|
399
|
-
[FlagsField.new(:flags, [:begin_modifier?]), LocationField.new(:keyword_loc), OptionalLocationField.new(:closing_loc), NodeField.new(:predicate), OptionalNodeField.new(:statements)]
|
399
|
+
[FlagsField.new(:flags, [:begin_modifier?]), LocationField.new(:keyword_loc), OptionalLocationField.new(:do_keyword_loc), OptionalLocationField.new(:closing_loc), NodeField.new(:predicate), OptionalNodeField.new(:statements)]
|
400
400
|
when :when_node
|
401
401
|
[LocationField.new(:keyword_loc), NodeListField.new(:conditions), OptionalLocationField.new(:then_keyword_loc), OptionalNodeField.new(:statements)]
|
402
402
|
when :while_node
|
403
|
-
[FlagsField.new(:flags, [:begin_modifier?]), LocationField.new(:keyword_loc), OptionalLocationField.new(:closing_loc), NodeField.new(:predicate), OptionalNodeField.new(:statements)]
|
403
|
+
[FlagsField.new(:flags, [:begin_modifier?]), LocationField.new(:keyword_loc), OptionalLocationField.new(:do_keyword_loc), OptionalLocationField.new(:closing_loc), NodeField.new(:predicate), OptionalNodeField.new(:statements)]
|
404
404
|
when :x_string_node
|
405
405
|
[FlagsField.new(:flags, [:forced_utf8_encoding?, :forced_binary_encoding?]), LocationField.new(:opening_loc), LocationField.new(:content_loc), LocationField.new(:closing_loc), StringField.new(:unescaped)]
|
406
406
|
when :yield_node
|