ed-precompiled_prism 1.5.2-x86_64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/BSDmakefile +58 -0
- data/CHANGELOG.md +723 -0
- data/CODE_OF_CONDUCT.md +76 -0
- data/CONTRIBUTING.md +58 -0
- data/LICENSE.md +7 -0
- data/Makefile +110 -0
- data/README.md +143 -0
- data/config.yml +4714 -0
- data/docs/build_system.md +119 -0
- data/docs/configuration.md +68 -0
- data/docs/cruby_compilation.md +27 -0
- data/docs/design.md +53 -0
- data/docs/encoding.md +121 -0
- data/docs/fuzzing.md +88 -0
- data/docs/heredocs.md +36 -0
- data/docs/javascript.md +118 -0
- data/docs/local_variable_depth.md +229 -0
- data/docs/mapping.md +117 -0
- data/docs/parser_translation.md +24 -0
- data/docs/parsing_rules.md +22 -0
- data/docs/releasing.md +98 -0
- data/docs/relocation.md +34 -0
- data/docs/ripper_translation.md +72 -0
- data/docs/ruby_api.md +44 -0
- data/docs/ruby_parser_translation.md +19 -0
- data/docs/serialization.md +233 -0
- data/docs/testing.md +55 -0
- data/ext/prism/api_node.c +6941 -0
- data/ext/prism/api_pack.c +276 -0
- data/ext/prism/extconf.rb +127 -0
- data/ext/prism/extension.c +1419 -0
- data/ext/prism/extension.h +19 -0
- data/include/prism/ast.h +8220 -0
- data/include/prism/defines.h +260 -0
- data/include/prism/diagnostic.h +456 -0
- data/include/prism/encoding.h +283 -0
- data/include/prism/node.h +129 -0
- data/include/prism/options.h +482 -0
- data/include/prism/pack.h +163 -0
- data/include/prism/parser.h +933 -0
- data/include/prism/prettyprint.h +34 -0
- data/include/prism/regexp.h +43 -0
- data/include/prism/static_literals.h +121 -0
- data/include/prism/util/pm_buffer.h +236 -0
- data/include/prism/util/pm_char.h +204 -0
- data/include/prism/util/pm_constant_pool.h +218 -0
- data/include/prism/util/pm_integer.h +130 -0
- data/include/prism/util/pm_list.h +103 -0
- data/include/prism/util/pm_memchr.h +29 -0
- data/include/prism/util/pm_newline_list.h +113 -0
- data/include/prism/util/pm_string.h +200 -0
- data/include/prism/util/pm_strncasecmp.h +32 -0
- data/include/prism/util/pm_strpbrk.h +46 -0
- data/include/prism/version.h +29 -0
- data/include/prism.h +408 -0
- data/lib/prism/3.0/prism.so +0 -0
- data/lib/prism/3.1/prism.so +0 -0
- data/lib/prism/3.2/prism.so +0 -0
- data/lib/prism/3.3/prism.so +0 -0
- data/lib/prism/3.4/prism.so +0 -0
- data/lib/prism/compiler.rb +801 -0
- data/lib/prism/desugar_compiler.rb +392 -0
- data/lib/prism/dispatcher.rb +2210 -0
- data/lib/prism/dot_visitor.rb +4762 -0
- data/lib/prism/dsl.rb +1003 -0
- data/lib/prism/ffi.rb +570 -0
- data/lib/prism/inspect_visitor.rb +2392 -0
- data/lib/prism/lex_compat.rb +928 -0
- data/lib/prism/mutation_compiler.rb +772 -0
- data/lib/prism/node.rb +18816 -0
- data/lib/prism/node_ext.rb +511 -0
- data/lib/prism/pack.rb +230 -0
- data/lib/prism/parse_result/comments.rb +188 -0
- data/lib/prism/parse_result/errors.rb +66 -0
- data/lib/prism/parse_result/newlines.rb +155 -0
- data/lib/prism/parse_result.rb +911 -0
- data/lib/prism/pattern.rb +269 -0
- data/lib/prism/polyfill/append_as_bytes.rb +15 -0
- data/lib/prism/polyfill/byteindex.rb +13 -0
- data/lib/prism/polyfill/scan_byte.rb +14 -0
- data/lib/prism/polyfill/unpack1.rb +14 -0
- data/lib/prism/polyfill/warn.rb +36 -0
- data/lib/prism/prism.so +0 -0
- data/lib/prism/reflection.rb +416 -0
- data/lib/prism/relocation.rb +505 -0
- data/lib/prism/serialize.rb +2398 -0
- data/lib/prism/string_query.rb +31 -0
- data/lib/prism/translation/parser/builder.rb +62 -0
- data/lib/prism/translation/parser/compiler.rb +2234 -0
- data/lib/prism/translation/parser/lexer.rb +820 -0
- data/lib/prism/translation/parser.rb +374 -0
- data/lib/prism/translation/parser33.rb +13 -0
- data/lib/prism/translation/parser34.rb +13 -0
- data/lib/prism/translation/parser35.rb +13 -0
- data/lib/prism/translation/parser_current.rb +24 -0
- data/lib/prism/translation/ripper/sexp.rb +126 -0
- data/lib/prism/translation/ripper/shim.rb +5 -0
- data/lib/prism/translation/ripper.rb +3474 -0
- data/lib/prism/translation/ruby_parser.rb +1929 -0
- data/lib/prism/translation.rb +16 -0
- data/lib/prism/visitor.rb +813 -0
- data/lib/prism.rb +97 -0
- data/prism.gemspec +174 -0
- data/rbi/prism/compiler.rbi +12 -0
- data/rbi/prism/dsl.rbi +524 -0
- data/rbi/prism/inspect_visitor.rbi +12 -0
- data/rbi/prism/node.rbi +8734 -0
- data/rbi/prism/node_ext.rbi +107 -0
- data/rbi/prism/parse_result.rbi +404 -0
- data/rbi/prism/reflection.rbi +58 -0
- data/rbi/prism/string_query.rbi +12 -0
- data/rbi/prism/translation/parser.rbi +11 -0
- data/rbi/prism/translation/parser33.rbi +6 -0
- data/rbi/prism/translation/parser34.rbi +6 -0
- data/rbi/prism/translation/parser35.rbi +6 -0
- data/rbi/prism/translation/ripper.rbi +15 -0
- data/rbi/prism/visitor.rbi +473 -0
- data/rbi/prism.rbi +66 -0
- data/sig/prism/compiler.rbs +9 -0
- data/sig/prism/dispatcher.rbs +19 -0
- data/sig/prism/dot_visitor.rbs +6 -0
- data/sig/prism/dsl.rbs +351 -0
- data/sig/prism/inspect_visitor.rbs +22 -0
- data/sig/prism/lex_compat.rbs +10 -0
- data/sig/prism/mutation_compiler.rbs +159 -0
- data/sig/prism/node.rbs +4028 -0
- data/sig/prism/node_ext.rbs +149 -0
- data/sig/prism/pack.rbs +43 -0
- data/sig/prism/parse_result/comments.rbs +38 -0
- data/sig/prism/parse_result.rbs +196 -0
- data/sig/prism/pattern.rbs +13 -0
- data/sig/prism/reflection.rbs +50 -0
- data/sig/prism/relocation.rbs +185 -0
- data/sig/prism/serialize.rbs +8 -0
- data/sig/prism/string_query.rbs +11 -0
- data/sig/prism/visitor.rbs +169 -0
- data/sig/prism.rbs +254 -0
- data/src/diagnostic.c +850 -0
- data/src/encoding.c +5235 -0
- data/src/node.c +8676 -0
- data/src/options.c +328 -0
- data/src/pack.c +509 -0
- data/src/prettyprint.c +8941 -0
- data/src/prism.c +23361 -0
- data/src/regexp.c +790 -0
- data/src/serialize.c +2268 -0
- data/src/static_literals.c +617 -0
- data/src/token_type.c +703 -0
- data/src/util/pm_buffer.c +357 -0
- data/src/util/pm_char.c +318 -0
- data/src/util/pm_constant_pool.c +342 -0
- data/src/util/pm_integer.c +670 -0
- data/src/util/pm_list.c +49 -0
- data/src/util/pm_memchr.c +35 -0
- data/src/util/pm_newline_list.c +125 -0
- data/src/util/pm_string.c +381 -0
- data/src/util/pm_strncasecmp.c +36 -0
- data/src/util/pm_strpbrk.c +206 -0
- metadata +203 -0
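The hunk below adds the new translation lexer, data/lib/prism/translation/parser/lexer.rb (the +820 entry in the list above). As orientation for reading it, here is a rough usage sketch, not taken from the package itself: it assumes the parser gem is installed and uses that gem's Parser::Base class-level tokenize interface, with an illustrative source string.

    # Hypothetical sketch: tokenize a snippet through the prism-backed drop-in
    # for the parser gem, then look at the translated tokens.
    require "prism"
    require "parser"  # the translation layer emits tokens in this gem's format

    ast, comments, tokens = Prism::Translation::Parser34.tokenize("foo ||= [1, 2]\n")

    # Each entry is [type, [value, range]]; the type symbols come from the
    # TYPES table defined in the lexer below.
    tokens.first(3).each { |type, (value, _range)| puts "#{type} #{value.inspect}" }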
--- /dev/null
+++ data/lib/prism/translation/parser/lexer.rb
@@ -0,0 +1,820 @@
+# frozen_string_literal: true
+# :markup: markdown
+
+require "strscan"
+require_relative "../../polyfill/append_as_bytes"
+require_relative "../../polyfill/scan_byte"
+
+module Prism
+  module Translation
+    class Parser
+      # Accepts a list of prism tokens and converts them into the expected
+      # format for the parser gem.
+      class Lexer
+        # These tokens are always skipped
+        TYPES_ALWAYS_SKIP = Set.new(%i[IGNORED_NEWLINE __END__ EOF])
+        private_constant :TYPES_ALWAYS_SKIP
+
+        # The direct translating of types between the two lexers.
+        TYPES = {
+          # These tokens should never appear in the output of the lexer.
+          MISSING: nil,
+          NOT_PROVIDED: nil,
+          EMBDOC_END: nil,
+          EMBDOC_LINE: nil,
+
+          # These tokens have more or less direct mappings.
+          AMPERSAND: :tAMPER2,
+          AMPERSAND_AMPERSAND: :tANDOP,
+          AMPERSAND_AMPERSAND_EQUAL: :tOP_ASGN,
+          AMPERSAND_DOT: :tANDDOT,
+          AMPERSAND_EQUAL: :tOP_ASGN,
+          BACK_REFERENCE: :tBACK_REF,
+          BACKTICK: :tXSTRING_BEG,
+          BANG: :tBANG,
+          BANG_EQUAL: :tNEQ,
+          BANG_TILDE: :tNMATCH,
+          BRACE_LEFT: :tLCURLY,
+          BRACE_RIGHT: :tRCURLY,
+          BRACKET_LEFT: :tLBRACK2,
+          BRACKET_LEFT_ARRAY: :tLBRACK,
+          BRACKET_LEFT_RIGHT: :tAREF,
+          BRACKET_LEFT_RIGHT_EQUAL: :tASET,
+          BRACKET_RIGHT: :tRBRACK,
+          CARET: :tCARET,
+          CARET_EQUAL: :tOP_ASGN,
+          CHARACTER_LITERAL: :tCHARACTER,
+          CLASS_VARIABLE: :tCVAR,
+          COLON: :tCOLON,
+          COLON_COLON: :tCOLON2,
+          COMMA: :tCOMMA,
+          COMMENT: :tCOMMENT,
+          CONSTANT: :tCONSTANT,
+          DOT: :tDOT,
+          DOT_DOT: :tDOT2,
+          DOT_DOT_DOT: :tDOT3,
+          EMBDOC_BEGIN: :tCOMMENT,
+          EMBEXPR_BEGIN: :tSTRING_DBEG,
+          EMBEXPR_END: :tSTRING_DEND,
+          EMBVAR: :tSTRING_DVAR,
+          EQUAL: :tEQL,
+          EQUAL_EQUAL: :tEQ,
+          EQUAL_EQUAL_EQUAL: :tEQQ,
+          EQUAL_GREATER: :tASSOC,
+          EQUAL_TILDE: :tMATCH,
+          FLOAT: :tFLOAT,
+          FLOAT_IMAGINARY: :tIMAGINARY,
+          FLOAT_RATIONAL: :tRATIONAL,
+          FLOAT_RATIONAL_IMAGINARY: :tIMAGINARY,
+          GLOBAL_VARIABLE: :tGVAR,
+          GREATER: :tGT,
+          GREATER_EQUAL: :tGEQ,
+          GREATER_GREATER: :tRSHFT,
+          GREATER_GREATER_EQUAL: :tOP_ASGN,
+          HEREDOC_START: :tSTRING_BEG,
+          HEREDOC_END: :tSTRING_END,
+          IDENTIFIER: :tIDENTIFIER,
+          INSTANCE_VARIABLE: :tIVAR,
+          INTEGER: :tINTEGER,
+          INTEGER_IMAGINARY: :tIMAGINARY,
+          INTEGER_RATIONAL: :tRATIONAL,
+          INTEGER_RATIONAL_IMAGINARY: :tIMAGINARY,
+          KEYWORD_ALIAS: :kALIAS,
+          KEYWORD_AND: :kAND,
+          KEYWORD_BEGIN: :kBEGIN,
+          KEYWORD_BEGIN_UPCASE: :klBEGIN,
+          KEYWORD_BREAK: :kBREAK,
+          KEYWORD_CASE: :kCASE,
+          KEYWORD_CLASS: :kCLASS,
+          KEYWORD_DEF: :kDEF,
+          KEYWORD_DEFINED: :kDEFINED,
+          KEYWORD_DO: :kDO,
+          KEYWORD_DO_LOOP: :kDO_COND,
+          KEYWORD_END: :kEND,
+          KEYWORD_END_UPCASE: :klEND,
+          KEYWORD_ENSURE: :kENSURE,
+          KEYWORD_ELSE: :kELSE,
+          KEYWORD_ELSIF: :kELSIF,
+          KEYWORD_FALSE: :kFALSE,
+          KEYWORD_FOR: :kFOR,
+          KEYWORD_IF: :kIF,
+          KEYWORD_IF_MODIFIER: :kIF_MOD,
+          KEYWORD_IN: :kIN,
+          KEYWORD_MODULE: :kMODULE,
+          KEYWORD_NEXT: :kNEXT,
+          KEYWORD_NIL: :kNIL,
+          KEYWORD_NOT: :kNOT,
+          KEYWORD_OR: :kOR,
+          KEYWORD_REDO: :kREDO,
+          KEYWORD_RESCUE: :kRESCUE,
+          KEYWORD_RESCUE_MODIFIER: :kRESCUE_MOD,
+          KEYWORD_RETRY: :kRETRY,
+          KEYWORD_RETURN: :kRETURN,
+          KEYWORD_SELF: :kSELF,
+          KEYWORD_SUPER: :kSUPER,
+          KEYWORD_THEN: :kTHEN,
+          KEYWORD_TRUE: :kTRUE,
+          KEYWORD_UNDEF: :kUNDEF,
+          KEYWORD_UNLESS: :kUNLESS,
+          KEYWORD_UNLESS_MODIFIER: :kUNLESS_MOD,
+          KEYWORD_UNTIL: :kUNTIL,
+          KEYWORD_UNTIL_MODIFIER: :kUNTIL_MOD,
+          KEYWORD_WHEN: :kWHEN,
+          KEYWORD_WHILE: :kWHILE,
+          KEYWORD_WHILE_MODIFIER: :kWHILE_MOD,
+          KEYWORD_YIELD: :kYIELD,
+          KEYWORD___ENCODING__: :k__ENCODING__,
+          KEYWORD___FILE__: :k__FILE__,
+          KEYWORD___LINE__: :k__LINE__,
+          LABEL: :tLABEL,
+          LABEL_END: :tLABEL_END,
+          LAMBDA_BEGIN: :tLAMBEG,
+          LESS: :tLT,
+          LESS_EQUAL: :tLEQ,
+          LESS_EQUAL_GREATER: :tCMP,
+          LESS_LESS: :tLSHFT,
+          LESS_LESS_EQUAL: :tOP_ASGN,
+          METHOD_NAME: :tFID,
+          MINUS: :tMINUS,
+          MINUS_EQUAL: :tOP_ASGN,
+          MINUS_GREATER: :tLAMBDA,
+          NEWLINE: :tNL,
+          NUMBERED_REFERENCE: :tNTH_REF,
+          PARENTHESIS_LEFT: :tLPAREN2,
+          PARENTHESIS_LEFT_PARENTHESES: :tLPAREN_ARG,
+          PARENTHESIS_RIGHT: :tRPAREN,
+          PERCENT: :tPERCENT,
+          PERCENT_EQUAL: :tOP_ASGN,
+          PERCENT_LOWER_I: :tQSYMBOLS_BEG,
+          PERCENT_LOWER_W: :tQWORDS_BEG,
+          PERCENT_UPPER_I: :tSYMBOLS_BEG,
+          PERCENT_UPPER_W: :tWORDS_BEG,
+          PERCENT_LOWER_X: :tXSTRING_BEG,
+          PLUS: :tPLUS,
+          PLUS_EQUAL: :tOP_ASGN,
+          PIPE_EQUAL: :tOP_ASGN,
+          PIPE: :tPIPE,
+          PIPE_PIPE: :tOROP,
+          PIPE_PIPE_EQUAL: :tOP_ASGN,
+          QUESTION_MARK: :tEH,
+          REGEXP_BEGIN: :tREGEXP_BEG,
+          REGEXP_END: :tSTRING_END,
+          SEMICOLON: :tSEMI,
+          SLASH: :tDIVIDE,
+          SLASH_EQUAL: :tOP_ASGN,
+          STAR: :tSTAR2,
+          STAR_EQUAL: :tOP_ASGN,
+          STAR_STAR: :tPOW,
+          STAR_STAR_EQUAL: :tOP_ASGN,
+          STRING_BEGIN: :tSTRING_BEG,
+          STRING_CONTENT: :tSTRING_CONTENT,
+          STRING_END: :tSTRING_END,
+          SYMBOL_BEGIN: :tSYMBEG,
+          TILDE: :tTILDE,
+          UAMPERSAND: :tAMPER,
+          UCOLON_COLON: :tCOLON3,
+          UDOT_DOT: :tBDOT2,
+          UDOT_DOT_DOT: :tBDOT3,
+          UMINUS: :tUMINUS,
+          UMINUS_NUM: :tUNARY_NUM,
+          UPLUS: :tUPLUS,
+          USTAR: :tSTAR,
+          USTAR_STAR: :tDSTAR,
+          WORDS_SEP: :tSPACE
+        }
+
+        # These constants represent flags in our lex state. We really, really
+        # don't want to be using them and we really, really don't want to be
+        # exposing them as part of our public API. Unfortunately, we don't have
+        # another way of matching the exact tokens that the parser gem expects
+        # without them. We should find another way to do this, but in the
+        # meantime we'll hide them from the documentation and mark them as
+        # private constants.
+        EXPR_BEG = 0x1 # :nodoc:
+        EXPR_LABEL = 0x400 # :nodoc:
+
+        # It is used to determine whether `do` is of the token type `kDO` or `kDO_LAMBDA`.
+        #
+        # NOTE: In edge cases like `-> (foo = -> (bar) {}) do end`, please note that `kDO` is still returned
+        # instead of `kDO_LAMBDA`, which is expected: https://github.com/ruby/prism/pull/3046
+        LAMBDA_TOKEN_TYPES = Set.new([:kDO_LAMBDA, :tLAMBDA, :tLAMBEG])
+
+        # The `PARENTHESIS_LEFT` token in Prism is classified as either `tLPAREN` or `tLPAREN2` in the Parser gem.
+        # The following token types are listed as those classified as `tLPAREN`.
+        LPAREN_CONVERSION_TOKEN_TYPES = Set.new([
+          :kBREAK, :tCARET, :kCASE, :tDIVIDE, :kFOR, :kIF, :kNEXT, :kRETURN, :kUNTIL, :kWHILE, :tAMPER, :tANDOP, :tBANG, :tCOMMA, :tDOT2, :tDOT3,
+          :tEQL, :tLPAREN, :tLPAREN2, :tLPAREN_ARG, :tLSHFT, :tNL, :tOP_ASGN, :tOROP, :tPIPE, :tSEMI, :tSTRING_DBEG, :tUMINUS, :tUPLUS, :tLCURLY
+        ])
+
+        # Types of tokens that are allowed to continue a method call with comments in-between.
+        # For these, the parser gem doesn't emit a newline token after the last comment.
+        COMMENT_CONTINUATION_TYPES = Set.new([:COMMENT, :AMPERSAND_DOT, :DOT])
+        private_constant :COMMENT_CONTINUATION_TYPES
+
+        # Heredocs are complex and require us to keep track of a bit of info to refer to later
+        HeredocData = Struct.new(:identifier, :common_whitespace, keyword_init: true)
+
+        private_constant :TYPES, :EXPR_BEG, :EXPR_LABEL, :LAMBDA_TOKEN_TYPES, :LPAREN_CONVERSION_TOKEN_TYPES, :HeredocData
+
+        # The Parser::Source::Buffer that the tokens were lexed from.
+        attr_reader :source_buffer
+
+        # An array of tuples that contain prism tokens and their associated lex
+        # state when they were lexed.
+        attr_reader :lexed
+
+        # A hash that maps offsets in bytes to offsets in characters.
+        attr_reader :offset_cache
+
+        # Initialize the lexer with the given source buffer, prism tokens, and
+        # offset cache.
+        def initialize(source_buffer, lexed, offset_cache)
+          @source_buffer = source_buffer
+          @lexed = lexed
+          @offset_cache = offset_cache
+        end
+
+        Range = ::Parser::Source::Range # :nodoc:
+        private_constant :Range
+
+        # Convert the prism tokens into the expected format for the parser gem.
+        def to_a
+          tokens = []
+
+          index = 0
+          length = lexed.length
+
+          heredoc_stack = []
+          quote_stack = []
+
+          # The parser gem emits the newline tokens for comments out of order. This saves
+          # that token location to emit at a later time to properly line everything up.
+          # https://github.com/whitequark/parser/issues/1025
+          comment_newline_location = nil
+
+          while index < length
+            token, state = lexed[index]
+            index += 1
+            next if TYPES_ALWAYS_SKIP.include?(token.type)
+
+            type = TYPES.fetch(token.type)
+            value = token.value
+            location = range(token.location.start_offset, token.location.end_offset)
+
+            case type
+            when :kDO
+              nearest_lambda_token = tokens.reverse_each.find do |token|
+                LAMBDA_TOKEN_TYPES.include?(token.first)
+              end
+
+              if nearest_lambda_token&.first == :tLAMBDA
+                type = :kDO_LAMBDA
+              end
+            when :tCHARACTER
+              value.delete_prefix!("?")
+              # Character literals behave similar to double-quoted strings. We can use the same escaping mechanism.
+              value = unescape_string(value, "?")
+            when :tCOMMENT
+              if token.type == :EMBDOC_BEGIN
+
+                while !((next_token = lexed[index]&.first) && next_token.type == :EMBDOC_END) && (index < length - 1)
+                  value += next_token.value
+                  index += 1
+                end
+
+                value += next_token.value
+                location = range(token.location.start_offset, next_token.location.end_offset)
+                index += 1
+              else
+                is_at_eol = value.chomp!.nil?
+                location = range(token.location.start_offset, token.location.end_offset + (is_at_eol ? 0 : -1))
+
+                prev_token, _ = lexed[index - 2] if index - 2 >= 0
+                next_token, _ = lexed[index]
+
+                is_inline_comment = prev_token&.location&.start_line == token.location.start_line
+                if is_inline_comment && !is_at_eol && !COMMENT_CONTINUATION_TYPES.include?(next_token&.type)
+                  tokens << [:tCOMMENT, [value, location]]
+
+                  nl_location = range(token.location.end_offset - 1, token.location.end_offset)
+                  tokens << [:tNL, [nil, nl_location]]
+                  next
+                elsif is_inline_comment && next_token&.type == :COMMENT
+                  comment_newline_location = range(token.location.end_offset - 1, token.location.end_offset)
+                elsif comment_newline_location && !COMMENT_CONTINUATION_TYPES.include?(next_token&.type)
+                  tokens << [:tCOMMENT, [value, location]]
+                  tokens << [:tNL, [nil, comment_newline_location]]
+                  comment_newline_location = nil
+                  next
+                end
+              end
+            when :tNL
+              next_token, _ = lexed[index]
+              # Newlines after comments are emitted out of order.
+              if next_token&.type == :COMMENT
+                comment_newline_location = location
+                next
+              end
+
+              value = nil
+            when :tFLOAT
+              value = parse_float(value)
+            when :tIMAGINARY
+              value = parse_complex(value)
+            when :tINTEGER
+              if value.start_with?("+")
+                tokens << [:tUNARY_NUM, ["+", range(token.location.start_offset, token.location.start_offset + 1)]]
+                location = range(token.location.start_offset + 1, token.location.end_offset)
+              end
+
+              value = parse_integer(value)
+            when :tLABEL
+              value.chomp!(":")
+            when :tLABEL_END
+              value.chomp!(":")
+            when :tLCURLY
+              type = :tLBRACE if state == EXPR_BEG | EXPR_LABEL
+            when :tLPAREN2
+              type = :tLPAREN if tokens.empty? || LPAREN_CONVERSION_TOKEN_TYPES.include?(tokens.dig(-1, 0))
+            when :tNTH_REF
+              value = parse_integer(value.delete_prefix("$"))
+            when :tOP_ASGN
+              value.chomp!("=")
+            when :tRATIONAL
+              value = parse_rational(value)
+            when :tSPACE
+              location = range(token.location.start_offset, token.location.start_offset + percent_array_leading_whitespace(value))
+              value = nil
+            when :tSTRING_BEG
+              next_token, _ = lexed[index]
+              next_next_token, _ = lexed[index + 1]
+              basic_quotes = value == '"' || value == "'"
+
+              if basic_quotes && next_token&.type == :STRING_END
+                next_location = token.location.join(next_token.location)
+                type = :tSTRING
+                value = ""
+                location = range(next_location.start_offset, next_location.end_offset)
+                index += 1
+              elsif value.start_with?("'", '"', "%")
+                if next_token&.type == :STRING_CONTENT && next_next_token&.type == :STRING_END
+                  string_value = next_token.value
+                  if simplify_string?(string_value, value)
+                    next_location = token.location.join(next_next_token.location)
+                    if percent_array?(value)
+                      value = percent_array_unescape(string_value)
+                    else
+                      value = unescape_string(string_value, value)
+                    end
+                    type = :tSTRING
+                    location = range(next_location.start_offset, next_location.end_offset)
+                    index += 2
+                    tokens << [type, [value, location]]
+
+                    next
+                  end
+                end
+
+                quote_stack.push(value)
+              elsif token.type == :HEREDOC_START
+                quote = value[2] == "-" || value[2] == "~" ? value[3] : value[2]
+                heredoc_type = value[2] == "-" || value[2] == "~" ? value[2] : ""
+                heredoc = HeredocData.new(
+                  identifier: value.match(/<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z/)[:heredoc_identifier],
+                  common_whitespace: 0,
+                )
+
+                if quote == "`"
+                  type = :tXSTRING_BEG
+                end
+
+                # The parser gem trims whitespace from squiggly heredocs. We must record
+                # the most common whitespace to later remove.
+                if heredoc_type == "~" || heredoc_type == "`"
+                  heredoc.common_whitespace = calculate_heredoc_whitespace(index)
+                end
+
+                if quote == "'" || quote == '"' || quote == "`"
+                  value = "<<#{quote}"
+                else
+                  value = '<<"'
+                end
+
+                heredoc_stack.push(heredoc)
+                quote_stack.push(value)
+              end
+            when :tSTRING_CONTENT
+              is_percent_array = percent_array?(quote_stack.last)
+
+              if (lines = token.value.lines).one?
+                # Prism usually emits a single token for strings with line continuations.
+                # For squiggly heredocs they are not joined so we do that manually here.
+                current_string = +""
+                current_length = 0
+                start_offset = token.location.start_offset
+                while token.type == :STRING_CONTENT
+                  current_length += token.value.bytesize
+                  # Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line.
+                  prev_token, _ = lexed[index - 2] if index - 2 >= 0
+                  is_first_token_on_line = prev_token && token.location.start_line != prev_token.location.start_line
+                  # The parser gem only removes indentation when the heredoc is not nested
+                  not_nested = heredoc_stack.size == 1
+                  if is_percent_array
+                    value = percent_array_unescape(token.value)
+                  elsif is_first_token_on_line && not_nested && (current_heredoc = heredoc_stack.last).common_whitespace > 0
+                    value = trim_heredoc_whitespace(token.value, current_heredoc)
+                  end
+
+                  current_string << unescape_string(value, quote_stack.last)
+                  relevant_backslash_count = if quote_stack.last.start_with?("%W", "%I")
+                    0 # the last backslash escapes the newline
+                  else
+                    token.value[/(\\{1,})\n/, 1]&.length || 0
+                  end
+                  if relevant_backslash_count.even? || !interpolation?(quote_stack.last)
+                    tokens << [:tSTRING_CONTENT, [current_string, range(start_offset, start_offset + current_length)]]
+                    break
+                  end
+                  token, _ = lexed[index]
+                  index += 1
+                end
+              else
+                # When the parser gem encounters a line continuation inside of a multiline string,
+                # it emits a single string node. The backslash (and remaining newline) is removed.
+                current_line = +""
+                adjustment = 0
+                start_offset = token.location.start_offset
+                emit = false
+
+                lines.each.with_index do |line, index|
+                  chomped_line = line.chomp
+                  backslash_count = chomped_line[/\\{1,}\z/]&.length || 0
+                  is_interpolation = interpolation?(quote_stack.last)
+
+                  if backslash_count.odd? && (is_interpolation || is_percent_array)
+                    if is_percent_array
+                      current_line << percent_array_unescape(line)
+                      adjustment += 1
+                    else
+                      chomped_line.delete_suffix!("\\")
+                      current_line << chomped_line
+                      adjustment += 2
+                    end
+                    # If the string ends with a line continuation emit the remainder
+                    emit = index == lines.count - 1
+                  else
+                    current_line << line
+                    emit = true
+                  end
+
+                  if emit
+                    end_offset = start_offset + current_line.bytesize + adjustment
+                    tokens << [:tSTRING_CONTENT, [unescape_string(current_line, quote_stack.last), range(start_offset, end_offset)]]
+                    start_offset = end_offset
+                    current_line = +""
+                    adjustment = 0
+                  end
+                end
+              end
+              next
+            when :tSTRING_DVAR
+              value = nil
+            when :tSTRING_END
+              if token.type == :HEREDOC_END && value.end_with?("\n")
+                newline_length = value.end_with?("\r\n") ? 2 : 1
+                value = heredoc_stack.pop.identifier
+                location = range(token.location.start_offset, token.location.end_offset - newline_length)
+              elsif token.type == :REGEXP_END
+                value = value[0]
+                location = range(token.location.start_offset, token.location.start_offset + 1)
+              end
+
+              if percent_array?(quote_stack.pop)
+                prev_token, _ = lexed[index - 2] if index - 2 >= 0
+                empty = %i[PERCENT_LOWER_I PERCENT_LOWER_W PERCENT_UPPER_I PERCENT_UPPER_W].include?(prev_token&.type)
+                ends_with_whitespace = prev_token&.type == :WORDS_SEP
+                # parser always emits a space token after content in a percent array, even if no actual whitespace is present.
+                if !empty && !ends_with_whitespace
+                  tokens << [:tSPACE, [nil, range(token.location.start_offset, token.location.start_offset)]]
+                end
+              end
+            when :tSYMBEG
+              if (next_token = lexed[index]&.first) && next_token.type != :STRING_CONTENT && next_token.type != :EMBEXPR_BEGIN && next_token.type != :EMBVAR && next_token.type != :STRING_END
+                next_location = token.location.join(next_token.location)
+                type = :tSYMBOL
+                value = next_token.value
+                value = { "~@" => "~", "!@" => "!" }.fetch(value, value)
+                location = range(next_location.start_offset, next_location.end_offset)
+                index += 1
+              else
+                quote_stack.push(value)
+              end
+            when :tFID
+              if !tokens.empty? && tokens.dig(-1, 0) == :kDEF
+                type = :tIDENTIFIER
+              end
+            when :tXSTRING_BEG
+              if (next_token = lexed[index]&.first) && !%i[STRING_CONTENT STRING_END EMBEXPR_BEGIN].include?(next_token.type)
+                # self.`()
+                type = :tBACK_REF2
+              end
+              quote_stack.push(value)
+            when :tSYMBOLS_BEG, :tQSYMBOLS_BEG, :tWORDS_BEG, :tQWORDS_BEG
+              if (next_token = lexed[index]&.first) && next_token.type == :WORDS_SEP
+                index += 1
+              end
+
+              quote_stack.push(value)
+            when :tREGEXP_BEG
+              quote_stack.push(value)
+            end
+
+            tokens << [type, [value, location]]
+
+            if token.type == :REGEXP_END
+              tokens << [:tREGEXP_OPT, [token.value[1..], range(token.location.start_offset + 1, token.location.end_offset)]]
+            end
+          end
+
+          tokens
+        end
+
+        private
+
+        # Creates a new parser range, taking prisms byte offsets into account
+        def range(start_offset, end_offset)
+          Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])
+        end
+
+        # Parse an integer from the string representation.
+        def parse_integer(value)
+          Integer(value)
+        rescue ArgumentError
+          0
+        end
+
+        # Parse a float from the string representation.
+        def parse_float(value)
+          Float(value)
+        rescue ArgumentError
+          0.0
+        end
+
+        # Parse a complex from the string representation.
+        def parse_complex(value)
+          value.chomp!("i")
+
+          if value.end_with?("r")
+            Complex(0, parse_rational(value))
+          elsif value.start_with?(/0[BbOoDdXx]/)
+            Complex(0, parse_integer(value))
+          else
+            Complex(0, value)
+          end
+        rescue ArgumentError
+          0i
+        end
+
+        # Parse a rational from the string representation.
+        def parse_rational(value)
+          value.chomp!("r")
+
+          if value.start_with?(/0[BbOoDdXx]/)
+            Rational(parse_integer(value))
+          else
+            Rational(value)
+          end
+        rescue ArgumentError
+          0r
+        end
+
+        # Wonky heredoc tab/spaces rules.
+        # https://github.com/ruby/prism/blob/v1.3.0/src/prism.c#L10548-L10558
+        def calculate_heredoc_whitespace(heredoc_token_index)
+          next_token_index = heredoc_token_index
+          nesting_level = 0
+          previous_line = -1
+          result = Float::MAX
+
+          while (next_token = lexed[next_token_index]&.first)
+            next_token_index += 1
+            next_next_token, _ = lexed[next_token_index]
+            first_token_on_line = next_token.location.start_column == 0
+
+            # String content inside nested heredocs and interpolation is ignored
+            if next_token.type == :HEREDOC_START || next_token.type == :EMBEXPR_BEGIN
+              # When interpolation is the first token of a line there is no string
+              # content to check against. There will be no common whitespace.
+              if nesting_level == 0 && first_token_on_line
+                result = 0
+              end
+              nesting_level += 1
+            elsif next_token.type == :HEREDOC_END || next_token.type == :EMBEXPR_END
+              nesting_level -= 1
+              # When we encountered the matching heredoc end, we can exit
+              break if nesting_level == -1
+            elsif next_token.type == :STRING_CONTENT && nesting_level == 0 && first_token_on_line
+              common_whitespace = 0
+              next_token.value[/^\s*/].each_char do |char|
+                if char == "\t"
+                  common_whitespace = (common_whitespace / 8 + 1) * 8;
+                else
+                  common_whitespace += 1
+                end
+              end
+
+              is_first_token_on_line = next_token.location.start_line != previous_line
+              # Whitespace is significant if followed by interpolation
+              whitespace_only = common_whitespace == next_token.value.length && next_next_token&.location&.start_line != next_token.location.start_line
+              if is_first_token_on_line && !whitespace_only && common_whitespace < result
+                result = common_whitespace
+                previous_line = next_token.location.start_line
+              end
+            end
+          end
+          result
+        end
+
+        # Wonky heredoc tab/spaces rules.
+        # https://github.com/ruby/prism/blob/v1.3.0/src/prism.c#L16528-L16545
+        def trim_heredoc_whitespace(string, heredoc)
+          trimmed_whitespace = 0
+          trimmed_characters = 0
+          while (string[trimmed_characters] == "\t" || string[trimmed_characters] == " ") && trimmed_whitespace < heredoc.common_whitespace
+            if string[trimmed_characters] == "\t"
+              trimmed_whitespace = (trimmed_whitespace / 8 + 1) * 8;
+              break if trimmed_whitespace > heredoc.common_whitespace
+            else
+              trimmed_whitespace += 1
+            end
+            trimmed_characters += 1
+          end
+
+          string[trimmed_characters..]
+        end
+
+        # Escape sequences that have special and should appear unescaped in the resulting string.
+        ESCAPES = {
+          "a" => "\a", "b" => "\b", "e" => "\e", "f" => "\f",
+          "n" => "\n", "r" => "\r", "s" => "\s", "t" => "\t",
+          "v" => "\v", "\\" => "\\"
+        }.freeze
+        private_constant :ESCAPES
+
+        # When one of these delimiters is encountered, then the other
+        # one is allowed to be escaped as well.
+        DELIMITER_SYMETRY = { "[" => "]", "(" => ")", "{" => "}", "<" => ">" }.freeze
+        private_constant :DELIMITER_SYMETRY
+
+
+        # https://github.com/whitequark/parser/blob/v3.3.6.0/lib/parser/lexer-strings.rl#L14
+        REGEXP_META_CHARACTERS = ["\\", "$", "(", ")", "*", "+", ".", "<", ">", "?", "[", "]", "^", "{", "|", "}"]
+        private_constant :REGEXP_META_CHARACTERS
+
+        # Apply Ruby string escaping rules
+        def unescape_string(string, quote)
+          # In single-quoted heredocs, everything is taken literally.
+          return string if quote == "<<'"
+
+          # OPTIMIZATION: Assume that few strings need escaping to speed up the common case.
+          return string unless string.include?("\\")
+
+          # Enclosing character for the string. `"` for `"foo"`, `{` for `%w{foo}`, etc.
+          delimiter = quote[-1]
+
+          if regexp?(quote)
+            # Should be escaped handled to single-quoted heredocs. The only character that is
+            # allowed to be escaped is the delimiter, except when that also has special meaning
+            # in the regexp. Since all the symetry delimiters have special meaning, they don't need
+            # to be considered separately.
+            if REGEXP_META_CHARACTERS.include?(delimiter)
+              string
+            else
+              # There can never be an even amount of backslashes. It would be a syntax error.
+              string.gsub(/\\(#{Regexp.escape(delimiter)})/, '\1')
+            end
+          elsif interpolation?(quote)
+            # Appending individual escape sequences may force the string out of its intended
+            # encoding. Start out with binary and force it back later.
+            result = "".b
+
+            scanner = StringScanner.new(string)
+            while (skipped = scanner.skip_until(/\\/))
+              # Append what was just skipped over, excluding the found backslash.
+              result.append_as_bytes(string.byteslice(scanner.pos - skipped, skipped - 1))
+              escape_read(result, scanner, false, false)
+            end
+
+            # Add remaining chars
+            result.append_as_bytes(string.byteslice(scanner.pos..))
+            result.force_encoding(source_buffer.source.encoding)
+          else
+            delimiters = Regexp.escape("#{delimiter}#{DELIMITER_SYMETRY[delimiter]}")
+            string.gsub(/\\([\\#{delimiters}])/, '\1')
+          end
+        end
+
+        # Certain strings are merged into a single string token.
+        def simplify_string?(value, quote)
+          case quote
+          when "'"
+            # Only simplify 'foo'
+            !value.include?("\n")
+          when '"'
+            # Simplify when every line ends with a line continuation, or it is the last line
+            value.lines.all? do |line|
+              !line.end_with?("\n") || line[/(\\*)$/, 1]&.length&.odd?
+            end
+          else
+            # %q and similar are never simplified
+            false
+          end
+        end
+
+        # Escape a byte value, given the control and meta flags.
+        def escape_build(value, control, meta)
+          value &= 0x9f if control
+          value |= 0x80 if meta
+          value
+        end
+
+        # Read an escape out of the string scanner, given the control and meta
+        # flags, and push the unescaped value into the result.
+        def escape_read(result, scanner, control, meta)
+          if scanner.skip("\n")
+            # Line continuation
+          elsif (value = ESCAPES[scanner.peek(1)])
+            # Simple single-character escape sequences like \n
+            result.append_as_bytes(value)
+            scanner.pos += 1
+          elsif (value = scanner.scan(/[0-7]{1,3}/))
+            # \nnn
+            result.append_as_bytes(escape_build(value.to_i(8), control, meta))
+          elsif (value = scanner.scan(/x[0-9a-fA-F]{1,2}/))
+            # \xnn
+            result.append_as_bytes(escape_build(value[1..].to_i(16), control, meta))
+          elsif (value = scanner.scan(/u[0-9a-fA-F]{4}/))
+            # \unnnn
+            result.append_as_bytes(value[1..].hex.chr(Encoding::UTF_8))
+          elsif scanner.skip("u{}")
+            # https://github.com/whitequark/parser/issues/856
+          elsif (value = scanner.scan(/u{.*?}/))
+            # \u{nnnn ...}
+            value[2..-2].split.each do |unicode|
+              result.append_as_bytes(unicode.hex.chr(Encoding::UTF_8))
+            end
+          elsif (value = scanner.scan(/c\\?(?=[[:print:]])|C-\\?(?=[[:print:]])/))
+            # \cx or \C-x where x is an ASCII printable character
+            escape_read(result, scanner, true, meta)
+          elsif (value = scanner.scan(/M-\\?(?=[[:print:]])/))
+            # \M-x where x is an ASCII printable character
+            escape_read(result, scanner, control, true)
+          elsif (byte = scanner.scan_byte)
+            # Something else after an escape.
+            if control && byte == 0x3f # ASCII '?'
+              result.append_as_bytes(escape_build(0x7f, false, meta))
+            else
+              result.append_as_bytes(escape_build(byte, control, meta))
+            end
+          end
+        end
+
+        # In a percent array, certain whitespace can be preceeded with a backslash,
+        # causing the following characters to be part of the previous element.
+        def percent_array_unescape(string)
+          string.gsub(/(\\)+[ \f\n\r\t\v]/) do |full_match|
+            full_match.delete_prefix!("\\") if Regexp.last_match[1].length.odd?
+            full_match
+          end
+        end
+
+        # For %-arrays whitespace, the parser gem only considers whitespace before the newline.
+        def percent_array_leading_whitespace(string)
+          return 1 if string.start_with?("\n")
+
+          leading_whitespace = 0
+          string.each_char do |c|
+            break if c == "\n"
+            leading_whitespace += 1
+          end
+          leading_whitespace
+        end
+
+        # Determine if characters preceeded by a backslash should be escaped or not
+        def interpolation?(quote)
+          !quote.end_with?("'") && !quote.start_with?("%q", "%w", "%i", "%s")
+        end
+
+        # Regexp allow interpolation but are handled differently during unescaping
+        def regexp?(quote)
+          quote == "/" || quote.start_with?("%r")
+        end
+
+        # Determine if the string is part of a %-style array.
+        def percent_array?(quote)
+          quote.start_with?("%w", "%W", "%i", "%I")
+        end
+      end
+    end
+  end
+end
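A note on the squiggly-heredoc handling in the file above: calculate_heredoc_whitespace and trim_heredoc_whitespace implement the rule that a tab advances the indentation width to the next multiple of 8 when computing the common leading whitespace to strip. A self-contained sketch of just that counting rule (the helper name is illustrative, not part of the gem):

    # Count leading spaces/tabs with the same arithmetic the lexer uses:
    # a space adds 1, a tab jumps the width up to the next multiple of 8.
    def indentation_width(line)
      width = 0
      line[/\A[ \t]*/].each_char do |char|
        width = char == "\t" ? (width / 8 + 1) * 8 : width + 1
      end
      width
    end

    indentation_width("    foo") # => 4
    indentation_width("\tfoo")   # => 8
    indentation_width("  \tfoo") # => 8 (the tab rounds a width of 2 up to 8)

The smallest such width across the heredoc's own content lines (skipping nested heredocs, interpolation, and whitespace-only lines) is what trim_heredoc_whitespace later strips from the start of each line.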