prism 1.1.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +39 -1
  3. data/Makefile +1 -1
  4. data/config.yml +422 -3
  5. data/docs/build_system.md +8 -11
  6. data/docs/relocation.md +34 -0
  7. data/ext/prism/api_node.c +18 -10
  8. data/ext/prism/extconf.rb +13 -36
  9. data/ext/prism/extension.c +68 -0
  10. data/ext/prism/extension.h +1 -1
  11. data/include/prism/ast.h +427 -3
  12. data/include/prism/defines.h +22 -7
  13. data/include/prism/diagnostic.h +1 -0
  14. data/include/prism/parser.h +25 -12
  15. data/include/prism/version.h +2 -2
  16. data/include/prism.h +47 -0
  17. data/lib/prism/dot_visitor.rb +10 -0
  18. data/lib/prism/dsl.rb +4 -4
  19. data/lib/prism/ffi.rb +49 -2
  20. data/lib/prism/inspect_visitor.rb +2 -0
  21. data/lib/prism/node.rb +1839 -96
  22. data/lib/prism/parse_result/errors.rb +1 -1
  23. data/lib/prism/parse_result.rb +140 -3
  24. data/lib/prism/reflection.rb +2 -2
  25. data/lib/prism/relocation.rb +504 -0
  26. data/lib/prism/serialize.rb +17 -5
  27. data/lib/prism/string_query.rb +30 -0
  28. data/lib/prism/translation/parser/compiler.rb +36 -26
  29. data/lib/prism/translation/parser.rb +3 -3
  30. data/lib/prism/translation/ripper.rb +1 -5
  31. data/lib/prism/translation/ruby_parser.rb +14 -5
  32. data/lib/prism.rb +6 -4
  33. data/prism.gemspec +7 -1
  34. data/rbi/prism/dsl.rbi +4 -4
  35. data/rbi/prism/node.rbi +5118 -1030
  36. data/rbi/prism/parse_result.rbi +29 -0
  37. data/rbi/prism/string_query.rbi +12 -0
  38. data/rbi/prism.rbi +34 -34
  39. data/sig/prism/dsl.rbs +2 -2
  40. data/sig/prism/node.rbs +13 -98
  41. data/sig/prism/parse_result.rbs +20 -0
  42. data/sig/prism/relocation.rbs +185 -0
  43. data/sig/prism/string_query.rbs +11 -0
  44. data/src/diagnostic.c +3 -1
  45. data/src/node.c +18 -0
  46. data/src/prettyprint.c +32 -0
  47. data/src/prism.c +586 -195
  48. data/src/regexp.c +7 -3
  49. data/src/serialize.c +12 -0
  50. data/src/static_literals.c +1 -1
  51. data/src/util/pm_char.c +1 -1
  52. data/src/util/pm_string.c +1 -0
  53. metadata +9 -3
@@ -17,7 +17,7 @@ module Prism
17
17
 
18
18
  # Formats the errors in a human-readable way and return them as a string.
19
19
  def format
20
- error_lines = {}
20
+ error_lines = {} #: Hash[Integer, Array[ParseError]]
21
21
  parse_result.errors.each do |error|
22
22
  location = error.location
23
23
  (location.start_line..location.end_line).each do |line|
@@ -12,6 +12,21 @@ module Prism
12
12
  def self.for(source, start_line = 1, offsets = [])
13
13
  if source.ascii_only?
14
14
  ASCIISource.new(source, start_line, offsets)
15
+ elsif source.encoding == Encoding::BINARY
16
+ source.force_encoding(Encoding::UTF_8)
17
+
18
+ if source.valid_encoding?
19
+ new(source, start_line, offsets)
20
+ else
21
+ # This is an extremely niche use case where the file is marked as
22
+ # binary, contains multi-byte characters, and those characters are not
23
+ # valid UTF-8. In this case we'll mark it as binary and fall back to
24
+ # treating everything as a single-byte character. This _may_ cause
25
+ # problems when asking for code units, but it appears to be the
26
+ # cleanest solution at the moment.
27
+ source.force_encoding(Encoding::BINARY)
28
+ ASCIISource.new(source, start_line, offsets)
29
+ end
15
30
  else
16
31
  new(source, start_line, offsets)
17
32
  end
@@ -89,8 +104,14 @@ module Prism
89
104
  # This method is tested with UTF-8, UTF-16, and UTF-32. If there is the
90
105
  # concept of code units that differs from the number of characters in other
91
106
  # encodings, it is not captured here.
107
+ #
108
+ # We purposefully replace invalid and undefined characters with replacement
109
+ # characters in this conversion. This happens for two reasons. First, it's
110
+ # possible that the given byte offset will not occur on a character
111
+ # boundary. Second, it's possible that the source code will contain a
112
+ # character that has no equivalent in the given encoding.
92
113
  def code_units_offset(byte_offset, encoding)
93
- byteslice = (source.byteslice(0, byte_offset) or raise).encode(encoding)
114
+ byteslice = (source.byteslice(0, byte_offset) or raise).encode(encoding, invalid: :replace, undef: :replace)
94
115
 
95
116
  if encoding == Encoding::UTF_16LE || encoding == Encoding::UTF_16BE
96
117
  byteslice.bytesize / 2
@@ -99,6 +120,12 @@ module Prism
99
120
  end
100
121
  end
101
122
 
123
+ # Generate a cache that targets a specific encoding for calculating code
124
+ # unit offsets.
125
+ def code_units_cache(encoding)
126
+ CodeUnitsCache.new(source, encoding)
127
+ end
128
+
102
129
  # Returns the column number in code units for the given encoding for the
103
130
  # given byte offset.
104
131
  def code_units_column(byte_offset, encoding)
@@ -128,10 +155,84 @@ module Prism
128
155
  end
129
156
  end
130
157
 
158
+ # A cache that can be used to quickly compute code unit offsets from byte
159
+ # offsets. It purposefully provides only a single #[] method to access the
160
+ # cache in order to minimize surface area.
161
+ #
162
+ # Note that there are some known issues here that may or may not be addressed
163
+ # in the future:
164
+ #
165
+ # * The first is that there are issues when the cache computes values that are
166
+ # not on character boundaries. This can result in subsequent computations
167
+ # being off by one or more code units.
168
+ # * The second is that this cache is currently unbounded. In theory we could
169
+ # introduce some kind of LRU cache to limit the number of entries, but this
170
+ # has not yet been implemented.
171
+ #
172
+ class CodeUnitsCache
173
+ class UTF16Counter # :nodoc:
174
+ def initialize(source, encoding)
175
+ @source = source
176
+ @encoding = encoding
177
+ end
178
+
179
+ def count(byte_offset, byte_length)
180
+ @source.byteslice(byte_offset, byte_length).encode(@encoding, invalid: :replace, undef: :replace).bytesize / 2
181
+ end
182
+ end
183
+
184
+ class LengthCounter # :nodoc:
185
+ def initialize(source, encoding)
186
+ @source = source
187
+ @encoding = encoding
188
+ end
189
+
190
+ def count(byte_offset, byte_length)
191
+ @source.byteslice(byte_offset, byte_length).encode(@encoding, invalid: :replace, undef: :replace).length
192
+ end
193
+ end
194
+
195
+ private_constant :UTF16Counter, :LengthCounter
196
+
197
+ # Initialize a new cache with the given source and encoding.
198
+ def initialize(source, encoding)
199
+ @source = source
200
+ @counter =
201
+ if encoding == Encoding::UTF_16LE || encoding == Encoding::UTF_16BE
202
+ UTF16Counter.new(source, encoding)
203
+ else
204
+ LengthCounter.new(source, encoding)
205
+ end
206
+
207
+ @cache = {} #: Hash[Integer, Integer]
208
+ @offsets = [] #: Array[Integer]
209
+ end
210
+
211
+ # Retrieve the code units offset from the given byte offset.
212
+ def [](byte_offset)
213
+ @cache[byte_offset] ||=
214
+ if (index = @offsets.bsearch_index { |offset| offset > byte_offset }).nil?
215
+ @offsets << byte_offset
216
+ @counter.count(0, byte_offset)
217
+ elsif index == 0
218
+ @offsets.unshift(byte_offset)
219
+ @counter.count(0, byte_offset)
220
+ else
221
+ @offsets.insert(index, byte_offset)
222
+ offset = @offsets[index - 1]
223
+ @cache[offset] + @counter.count(offset, byte_offset - offset)
224
+ end
225
+ end
226
+ end
227
+
131
228
  # Specialized version of Prism::Source for source code that includes ASCII
132
229
  # characters only. This class is used to apply performance optimizations that
133
- # cannot be applied to sources that include multibyte characters. Sources that
134
- # include multibyte characters are represented by the Prism::Source class.
230
+ # cannot be applied to sources that include multibyte characters.
231
+ #
232
+ # In the extremely rare case that a source includes multi-byte characters but
233
+ # is marked as binary because of a magic encoding comment and it cannot be
234
+ # eagerly converted to UTF-8, this class will be used as well. This is because
235
+ # at that point we will treat everything as single-byte characters.
135
236
  class ASCIISource < Source
136
237
  # Return the character offset for the given byte offset.
137
238
  def character_offset(byte_offset)
@@ -153,6 +254,13 @@ module Prism
153
254
  byte_offset
154
255
  end
155
256
 
257
+ # Returns a cache that is the identity function in order to maintain the
258
+ # same interface. We can do this because code units are always equivalent to
259
+ # byte offsets for ASCII-only sources.
260
+ def code_units_cache(encoding)
261
+ ->(byte_offset) { byte_offset }
262
+ end
263
+
156
264
  # Specialized version of `code_units_column` that does not depend on
157
265
  # `code_units_offset`, which is a more expensive operation. This is
158
266
  # essentially the same as `Prism::Source#column`.
@@ -262,6 +370,12 @@ module Prism
262
370
  source.code_units_offset(start_offset, encoding)
263
371
  end
264
372
 
373
+ # The start offset from the start of the file in code units using the given
374
+ # cache to fetch or calculate the value.
375
+ def cached_start_code_units_offset(cache)
376
+ cache[start_offset]
377
+ end
378
+
265
379
  # The byte offset from the beginning of the source where this location ends.
266
380
  def end_offset
267
381
  start_offset + length
@@ -278,6 +392,12 @@ module Prism
278
392
  source.code_units_offset(end_offset, encoding)
279
393
  end
280
394
 
395
+ # The end offset from the start of the file in code units using the given
396
+ # cache to fetch or calculate the value.
397
+ def cached_end_code_units_offset(cache)
398
+ cache[end_offset]
399
+ end
400
+
281
401
  # The line number where this location starts.
282
402
  def start_line
283
403
  source.line(start_offset)
@@ -312,6 +432,12 @@ module Prism
312
432
  source.code_units_column(start_offset, encoding)
313
433
  end
314
434
 
435
+ # The start column in code units using the given cache to fetch or calculate
436
+ # the value.
437
+ def cached_start_code_units_column(cache)
438
+ cache[start_offset] - cache[source.line_start(start_offset)]
439
+ end
440
+
315
441
  # The column number in bytes where this location ends from the start of the
316
442
  # line.
317
443
  def end_column
@@ -330,6 +456,12 @@ module Prism
330
456
  source.code_units_column(end_offset, encoding)
331
457
  end
332
458
 
459
+ # The end column in code units using the given cache to fetch or calculate
460
+ # the value.
461
+ def cached_end_code_units_column(cache)
462
+ cache[end_offset] - cache[source.line_start(end_offset)]
463
+ end
464
+
333
465
  # Implement the hash pattern matching interface for Location.
334
466
  def deconstruct_keys(keys)
335
467
  { start_offset: start_offset, end_offset: end_offset }
@@ -579,6 +711,11 @@ module Prism
579
711
  def failure?
580
712
  !success?
581
713
  end
714
+
715
+ # Create a code units cache for the given encoding.
716
+ def code_units_cache(encoding)
717
+ source.code_units_cache(encoding)
718
+ end
582
719
  end
583
720
 
584
721
  # This is a result specific to the `parse` and `parse_file` methods.
@@ -396,11 +396,11 @@ module Prism
396
396
  when :unless_node
397
397
  [LocationField.new(:keyword_loc), NodeField.new(:predicate), OptionalLocationField.new(:then_keyword_loc), OptionalNodeField.new(:statements), OptionalNodeField.new(:else_clause), OptionalLocationField.new(:end_keyword_loc)]
398
398
  when :until_node
399
- [FlagsField.new(:flags, [:begin_modifier?]), LocationField.new(:keyword_loc), OptionalLocationField.new(:closing_loc), NodeField.new(:predicate), OptionalNodeField.new(:statements)]
399
+ [FlagsField.new(:flags, [:begin_modifier?]), LocationField.new(:keyword_loc), OptionalLocationField.new(:do_keyword_loc), OptionalLocationField.new(:closing_loc), NodeField.new(:predicate), OptionalNodeField.new(:statements)]
400
400
  when :when_node
401
401
  [LocationField.new(:keyword_loc), NodeListField.new(:conditions), OptionalLocationField.new(:then_keyword_loc), OptionalNodeField.new(:statements)]
402
402
  when :while_node
403
- [FlagsField.new(:flags, [:begin_modifier?]), LocationField.new(:keyword_loc), OptionalLocationField.new(:closing_loc), NodeField.new(:predicate), OptionalNodeField.new(:statements)]
403
+ [FlagsField.new(:flags, [:begin_modifier?]), LocationField.new(:keyword_loc), OptionalLocationField.new(:do_keyword_loc), OptionalLocationField.new(:closing_loc), NodeField.new(:predicate), OptionalNodeField.new(:statements)]
404
404
  when :x_string_node
405
405
  [FlagsField.new(:flags, [:forced_utf8_encoding?, :forced_binary_encoding?]), LocationField.new(:opening_loc), LocationField.new(:content_loc), LocationField.new(:closing_loc), StringField.new(:unescaped)]
406
406
  when :yield_node