prism 1.1.0 → 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +39 -1
  3. data/Makefile +1 -1
  4. data/config.yml +422 -3
  5. data/docs/build_system.md +8 -11
  6. data/docs/relocation.md +34 -0
  7. data/ext/prism/api_node.c +18 -10
  8. data/ext/prism/extconf.rb +13 -36
  9. data/ext/prism/extension.c +68 -0
  10. data/ext/prism/extension.h +1 -1
  11. data/include/prism/ast.h +427 -3
  12. data/include/prism/defines.h +22 -7
  13. data/include/prism/diagnostic.h +1 -0
  14. data/include/prism/parser.h +25 -12
  15. data/include/prism/version.h +2 -2
  16. data/include/prism.h +47 -0
  17. data/lib/prism/dot_visitor.rb +10 -0
  18. data/lib/prism/dsl.rb +4 -4
  19. data/lib/prism/ffi.rb +49 -2
  20. data/lib/prism/inspect_visitor.rb +2 -0
  21. data/lib/prism/node.rb +1839 -96
  22. data/lib/prism/parse_result/errors.rb +1 -1
  23. data/lib/prism/parse_result.rb +140 -3
  24. data/lib/prism/reflection.rb +2 -2
  25. data/lib/prism/relocation.rb +504 -0
  26. data/lib/prism/serialize.rb +17 -5
  27. data/lib/prism/string_query.rb +30 -0
  28. data/lib/prism/translation/parser/compiler.rb +36 -26
  29. data/lib/prism/translation/parser.rb +3 -3
  30. data/lib/prism/translation/ripper.rb +1 -5
  31. data/lib/prism/translation/ruby_parser.rb +14 -5
  32. data/lib/prism.rb +6 -4
  33. data/prism.gemspec +7 -1
  34. data/rbi/prism/dsl.rbi +4 -4
  35. data/rbi/prism/node.rbi +5118 -1030
  36. data/rbi/prism/parse_result.rbi +29 -0
  37. data/rbi/prism/string_query.rbi +12 -0
  38. data/rbi/prism.rbi +34 -34
  39. data/sig/prism/dsl.rbs +2 -2
  40. data/sig/prism/node.rbs +13 -98
  41. data/sig/prism/parse_result.rbs +20 -0
  42. data/sig/prism/relocation.rbs +185 -0
  43. data/sig/prism/string_query.rbs +11 -0
  44. data/src/diagnostic.c +3 -1
  45. data/src/node.c +18 -0
  46. data/src/prettyprint.c +32 -0
  47. data/src/prism.c +586 -195
  48. data/src/regexp.c +7 -3
  49. data/src/serialize.c +12 -0
  50. data/src/static_literals.c +1 -1
  51. data/src/util/pm_char.c +1 -1
  52. data/src/util/pm_string.c +1 -0
  53. metadata +9 -3
@@ -17,7 +17,7 @@ module Prism
17
17
 
18
18
  # Formats the errors in a human-readable way and returns them as a string.
19
19
  def format
20
- error_lines = {}
20
+ error_lines = {} #: Hash[Integer, Array[ParseError]]
21
21
  parse_result.errors.each do |error|
22
22
  location = error.location
23
23
  (location.start_line..location.end_line).each do |line|
@@ -12,6 +12,21 @@ module Prism
12
12
  def self.for(source, start_line = 1, offsets = [])
13
13
  if source.ascii_only?
14
14
  ASCIISource.new(source, start_line, offsets)
15
+ elsif source.encoding == Encoding::BINARY
16
+ source.force_encoding(Encoding::UTF_8)
17
+
18
+ if source.valid_encoding?
19
+ new(source, start_line, offsets)
20
+ else
21
+ # This is an extremely niche use case where the file is marked as
22
+ # binary, contains multi-byte characters, and those characters are not
23
+ # valid UTF-8. In this case we'll mark it as binary and fall back to
24
+ # treating everything as a single-byte character. This _may_ cause
25
+ # problems when asking for code units, but it appears to be the
26
+ # cleanest solution at the moment.
27
+ source.force_encoding(Encoding::BINARY)
28
+ ASCIISource.new(source, start_line, offsets)
29
+ end
15
30
  else
16
31
  new(source, start_line, offsets)
17
32
  end
@@ -89,8 +104,14 @@ module Prism
89
104
  # This method is tested with UTF-8, UTF-16, and UTF-32. If there is the
90
105
  # concept of code units that differs from the number of characters in other
91
106
  # encodings, it is not captured here.
107
+ #
108
+ # We purposefully replace invalid and undefined characters with replacement
109
+ # characters in this conversion. This happens for two reasons. First, it's
110
+ # possible that the given byte offset will not occur on a character
111
+ # boundary. Second, it's possible that the source code will contain a
112
+ # character that has no equivalent in the given encoding.
92
113
  def code_units_offset(byte_offset, encoding)
93
- byteslice = (source.byteslice(0, byte_offset) or raise).encode(encoding)
114
+ byteslice = (source.byteslice(0, byte_offset) or raise).encode(encoding, invalid: :replace, undef: :replace)
94
115
 
95
116
  if encoding == Encoding::UTF_16LE || encoding == Encoding::UTF_16BE
96
117
  byteslice.bytesize / 2
@@ -99,6 +120,12 @@ module Prism
99
120
  end
100
121
  end
101
122
 
123
+ # Generate a cache that targets a specific encoding for calculating code
124
+ # unit offsets.
125
+ def code_units_cache(encoding)
126
+ CodeUnitsCache.new(source, encoding)
127
+ end
128
+
102
129
  # Returns the column number in code units for the given encoding for the
103
130
  # given byte offset.
104
131
  def code_units_column(byte_offset, encoding)
@@ -128,10 +155,84 @@ module Prism
128
155
  end
129
156
  end
130
157
 
158
+ # A cache that can be used to quickly compute code unit offsets from byte
159
+ # offsets. It purposefully provides only a single #[] method to access the
160
+ # cache in order to minimize surface area.
161
+ #
162
+ # Note that there are some known issues here that may or may not be addressed
163
+ # in the future:
164
+ #
165
+ # * The first is that there are issues when the cache computes values that are
166
+ # not on character boundaries. This can result in subsequent computations
167
+ # being off by one or more code units.
168
+ # * The second is that this cache is currently unbounded. In theory we could
169
+ # introduce some kind of LRU cache to limit the number of entries, but this
170
+ # has not yet been implemented.
171
+ #
172
+ class CodeUnitsCache
173
+ class UTF16Counter # :nodoc:
174
+ def initialize(source, encoding)
175
+ @source = source
176
+ @encoding = encoding
177
+ end
178
+
179
+ def count(byte_offset, byte_length)
180
+ @source.byteslice(byte_offset, byte_length).encode(@encoding, invalid: :replace, undef: :replace).bytesize / 2
181
+ end
182
+ end
183
+
184
+ class LengthCounter # :nodoc:
185
+ def initialize(source, encoding)
186
+ @source = source
187
+ @encoding = encoding
188
+ end
189
+
190
+ def count(byte_offset, byte_length)
191
+ @source.byteslice(byte_offset, byte_length).encode(@encoding, invalid: :replace, undef: :replace).length
192
+ end
193
+ end
194
+
195
+ private_constant :UTF16Counter, :LengthCounter
196
+
197
+ # Initialize a new cache with the given source and encoding.
198
+ def initialize(source, encoding)
199
+ @source = source
200
+ @counter =
201
+ if encoding == Encoding::UTF_16LE || encoding == Encoding::UTF_16BE
202
+ UTF16Counter.new(source, encoding)
203
+ else
204
+ LengthCounter.new(source, encoding)
205
+ end
206
+
207
+ @cache = {} #: Hash[Integer, Integer]
208
+ @offsets = [] #: Array[Integer]
209
+ end
210
+
211
+ # Retrieve the code units offset from the given byte offset.
212
+ def [](byte_offset)
213
+ @cache[byte_offset] ||=
214
+ if (index = @offsets.bsearch_index { |offset| offset > byte_offset }).nil?
215
+ @offsets << byte_offset
216
+ @counter.count(0, byte_offset)
217
+ elsif index == 0
218
+ @offsets.unshift(byte_offset)
219
+ @counter.count(0, byte_offset)
220
+ else
221
+ @offsets.insert(index, byte_offset)
222
+ offset = @offsets[index - 1]
223
+ @cache[offset] + @counter.count(offset, byte_offset - offset)
224
+ end
225
+ end
226
+ end
227
+
131
228
  # Specialized version of Prism::Source for source code that includes ASCII
132
229
  # characters only. This class is used to apply performance optimizations that
133
- # cannot be applied to sources that include multibyte characters. Sources that
134
- # include multibyte characters are represented by the Prism::Source class.
230
+ # cannot be applied to sources that include multibyte characters.
231
+ #
232
+ # In the extremely rare case that a source includes multi-byte characters but
233
+ # is marked as binary because of a magic encoding comment and it cannot be
234
+ # eagerly converted to UTF-8, this class will be used as well. This is because
235
+ # at that point we will treat everything as single-byte characters.
135
236
  class ASCIISource < Source
136
237
  # Return the character offset for the given byte offset.
137
238
  def character_offset(byte_offset)
@@ -153,6 +254,13 @@ module Prism
153
254
  byte_offset
154
255
  end
155
256
 
257
+ # Returns a cache that is the identity function in order to maintain the
258
+ # same interface. We can do this because code units are always equivalent to
259
+ # byte offsets for ASCII-only sources.
260
+ def code_units_cache(encoding)
261
+ ->(byte_offset) { byte_offset }
262
+ end
263
+
156
264
  # Specialized version of `code_units_column` that does not depend on
157
265
  # `code_units_offset`, which is a more expensive operation. This is
158
266
  # essentially the same as `Prism::Source#column`.
@@ -262,6 +370,12 @@ module Prism
262
370
  source.code_units_offset(start_offset, encoding)
263
371
  end
264
372
 
373
+ # The start offset from the start of the file in code units using the given
374
+ # cache to fetch or calculate the value.
375
+ def cached_start_code_units_offset(cache)
376
+ cache[start_offset]
377
+ end
378
+
265
379
  # The byte offset from the beginning of the source where this location ends.
266
380
  def end_offset
267
381
  start_offset + length
@@ -278,6 +392,12 @@ module Prism
278
392
  source.code_units_offset(end_offset, encoding)
279
393
  end
280
394
 
395
+ # The end offset from the start of the file in code units using the given
396
+ # cache to fetch or calculate the value.
397
+ def cached_end_code_units_offset(cache)
398
+ cache[end_offset]
399
+ end
400
+
281
401
  # The line number where this location starts.
282
402
  def start_line
283
403
  source.line(start_offset)
@@ -312,6 +432,12 @@ module Prism
312
432
  source.code_units_column(start_offset, encoding)
313
433
  end
314
434
 
435
+ # The start column in code units using the given cache to fetch or calculate
436
+ # the value.
437
+ def cached_start_code_units_column(cache)
438
+ cache[start_offset] - cache[source.line_start(start_offset)]
439
+ end
440
+
315
441
  # The column number in bytes where this location ends from the start of the
316
442
  # line.
317
443
  def end_column
@@ -330,6 +456,12 @@ module Prism
330
456
  source.code_units_column(end_offset, encoding)
331
457
  end
332
458
 
459
+ # The end column in code units using the given cache to fetch or calculate
460
+ # the value.
461
+ def cached_end_code_units_column(cache)
462
+ cache[end_offset] - cache[source.line_start(end_offset)]
463
+ end
464
+
333
465
  # Implement the hash pattern matching interface for Location.
334
466
  def deconstruct_keys(keys)
335
467
  { start_offset: start_offset, end_offset: end_offset }
@@ -579,6 +711,11 @@ module Prism
579
711
  def failure?
580
712
  !success?
581
713
  end
714
+
715
+ # Create a code units cache for the given encoding.
716
+ def code_units_cache(encoding)
717
+ source.code_units_cache(encoding)
718
+ end
582
719
  end
583
720
 
584
721
  # This is a result specific to the `parse` and `parse_file` methods.
@@ -396,11 +396,11 @@ module Prism
396
396
  when :unless_node
397
397
  [LocationField.new(:keyword_loc), NodeField.new(:predicate), OptionalLocationField.new(:then_keyword_loc), OptionalNodeField.new(:statements), OptionalNodeField.new(:else_clause), OptionalLocationField.new(:end_keyword_loc)]
398
398
  when :until_node
399
- [FlagsField.new(:flags, [:begin_modifier?]), LocationField.new(:keyword_loc), OptionalLocationField.new(:closing_loc), NodeField.new(:predicate), OptionalNodeField.new(:statements)]
399
+ [FlagsField.new(:flags, [:begin_modifier?]), LocationField.new(:keyword_loc), OptionalLocationField.new(:do_keyword_loc), OptionalLocationField.new(:closing_loc), NodeField.new(:predicate), OptionalNodeField.new(:statements)]
400
400
  when :when_node
401
401
  [LocationField.new(:keyword_loc), NodeListField.new(:conditions), OptionalLocationField.new(:then_keyword_loc), OptionalNodeField.new(:statements)]
402
402
  when :while_node
403
- [FlagsField.new(:flags, [:begin_modifier?]), LocationField.new(:keyword_loc), OptionalLocationField.new(:closing_loc), NodeField.new(:predicate), OptionalNodeField.new(:statements)]
403
+ [FlagsField.new(:flags, [:begin_modifier?]), LocationField.new(:keyword_loc), OptionalLocationField.new(:do_keyword_loc), OptionalLocationField.new(:closing_loc), NodeField.new(:predicate), OptionalNodeField.new(:statements)]
404
404
  when :x_string_node
405
405
  [FlagsField.new(:flags, [:forced_utf8_encoding?, :forced_binary_encoding?]), LocationField.new(:opening_loc), LocationField.new(:content_loc), LocationField.new(:closing_loc), StringField.new(:unescaped)]
406
406
  when :yield_node