ruby_parser 3.13.0 → 3.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,40 +4,9 @@
4
4
  $DEBUG = true if ENV["DEBUG"]
5
5
 
6
6
  class RubyLexer
7
-
8
7
  # :stopdoc:
9
- HAS_ENC = "".respond_to? :encoding
10
-
11
- IDENT_CHAR = if HAS_ENC then
12
- /[\w\u0080-\u{10ffff}]/u
13
- else
14
- /[\w\x80-\xFF]/n
15
- end
16
-
17
8
  EOF = :eof_haha!
18
9
 
19
- # ruby constants for strings (should this be moved somewhere else?)
20
-
21
- STR_FUNC_BORING = 0x00
22
- STR_FUNC_ESCAPE = 0x01 # TODO: remove and replace with REGEXP
23
- STR_FUNC_EXPAND = 0x02
24
- STR_FUNC_REGEXP = 0x04
25
- STR_FUNC_QWORDS = 0x08
26
- STR_FUNC_SYMBOL = 0x10
27
- STR_FUNC_INDENT = 0x20 # <<-HEREDOC
28
- STR_FUNC_ICNTNT = 0x40 # <<~HEREDOC
29
-
30
- STR_SQUOTE = STR_FUNC_BORING
31
- STR_DQUOTE = STR_FUNC_BORING | STR_FUNC_EXPAND
32
- STR_XQUOTE = STR_FUNC_BORING | STR_FUNC_EXPAND
33
- STR_REGEXP = STR_FUNC_REGEXP | STR_FUNC_ESCAPE | STR_FUNC_EXPAND
34
- STR_SSYM = STR_FUNC_SYMBOL
35
- STR_DSYM = STR_FUNC_SYMBOL | STR_FUNC_EXPAND
36
-
37
- EXPR_BEG_ANY = [:expr_beg, :expr_mid, :expr_class ]
38
- EXPR_ARG_ANY = [:expr_arg, :expr_cmdarg, ]
39
- EXPR_END_ANY = [:expr_end, :expr_endarg, :expr_endfn]
40
-
41
10
  ESCAPES = {
42
11
  "a" => "\007",
43
12
  "b" => "\010",
@@ -54,6 +23,8 @@ class RubyLexer
54
23
  "c\?" => 127.chr,
55
24
  }
56
25
 
26
+ HAS_ENC = "".respond_to? :encoding
27
+
57
28
  TOKENS = {
58
29
  "!" => :tBANG,
59
30
  "!=" => :tNEQ,
@@ -70,13 +41,26 @@ class RubyLexer
70
41
  "->" => :tLAMBDA,
71
42
  }
72
43
 
73
- TAB_WIDTH = 8
74
-
75
- @@regexp_cache = Hash.new { |h,k| h[k] = Regexp.new(Regexp.escape(k)) }
44
+ @@regexp_cache = Hash.new { |h, k| h[k] = Regexp.new(Regexp.escape(k)) }
76
45
  @@regexp_cache[nil] = nil
77
46
 
47
+ if $DEBUG then
48
+ attr_reader :lex_state
49
+
50
+ def lex_state= o
51
+ return if @lex_state == o
52
+ raise ArgumentError, "bad state: %p" % [o] unless State === o
53
+
54
+ warn "lex_state: %p -> %p" % [lex_state, o]
55
+
56
+ @lex_state = o
57
+ end
58
+ end
59
+
78
60
  # :startdoc:
79
61
 
62
+ attr_accessor :lex_state unless $DEBUG
63
+
80
64
  attr_accessor :lineno # we're bypassing oedipus' lineno handling.
81
65
  attr_accessor :brace_nest
82
66
  attr_accessor :cmdarg
@@ -90,7 +74,6 @@ class RubyLexer
90
74
  # Additional context surrounding tokens that both the lexer and
91
75
  # grammar use.
92
76
 
93
- attr_accessor :lex_state
94
77
  attr_accessor :lex_strterm
95
78
  attr_accessor :lpar_beg
96
79
  attr_accessor :paren_nest
@@ -99,24 +82,14 @@ class RubyLexer
99
82
  attr_accessor :string_buffer
100
83
  attr_accessor :string_nest
101
84
 
102
- if $DEBUG then
103
- alias lex_state= lex_state=
104
- def lex_state=o
105
- return if @lex_state == o
106
- c = caller.first
107
- c = caller[1] if c =~ /\bresult\b/
108
- warn "lex_state: %p -> %p from %s" % [@lex_state, o, c.clean_caller]
109
- @lex_state = o
110
- end
111
- end
112
-
113
85
  # Last token read via next_token.
114
86
  attr_accessor :token
115
87
 
116
88
  attr_writer :comments
117
89
 
118
90
  def initialize _ = nil
119
- @lex_state = :expr_none
91
+ @lex_state = nil # remove one warning under $DEBUG
92
+ self.lex_state = EXPR_NONE
120
93
 
121
94
  self.cond = RubyParserStuff::StackState.new(:cond, $DEBUG)
122
95
  self.cmdarg = RubyParserStuff::StackState.new(:cmdarg, $DEBUG)
@@ -125,17 +98,22 @@ class RubyLexer
125
98
  end
126
99
 
127
100
  def arg_ambiguous
128
- self.warning("Ambiguous first argument. make sure.")
101
+ self.warning "Ambiguous first argument. make sure."
129
102
  end
130
103
 
131
104
  def arg_state
132
- in_arg_state? ? :expr_arg : :expr_beg
105
+ is_after_operator? ? EXPR_ARG : EXPR_BEG
133
106
  end
134
107
 
135
108
  def beginning_of_line?
136
109
  ss.bol?
137
110
  end
138
- alias :bol? :beginning_of_line? # to make .rex file more readable
111
+
112
+ alias bol? beginning_of_line? # to make .rex file more readable
113
+
114
+ def check re
115
+ ss.check re
116
+ end
139
117
 
140
118
  def comments # TODO: remove this... maybe comment_string + attr_accessor
141
119
  c = @comments.join
@@ -143,30 +121,41 @@ class RubyLexer
143
121
  c
144
122
  end
145
123
 
124
+ def eat_whitespace
125
+ r = scan(/\s+/)
126
+ self.extra_lineno += r.count("\n") if r
127
+ r
128
+ end
129
+
146
130
  def end_of_stream?
147
131
  ss.eos?
148
132
  end
149
133
 
150
134
  def expr_dot?
151
- lex_state == :expr_dot
135
+ lex_state =~ EXPR_DOT
152
136
  end
153
137
 
154
- def expr_fname?
155
- lex_state == :expr_fname
138
+ def expr_fname? # REFACTOR
139
+ lex_state =~ EXPR_FNAME
156
140
  end
157
141
 
158
142
  def expr_result token, text
159
143
  cond.push false
160
144
  cmdarg.push false
161
- result :expr_beg, token, text
145
+ result EXPR_BEG, token, text
146
+ end
147
+
148
+ def fixup_lineno extra = 0
149
+ self.lineno += self.extra_lineno + extra
150
+ self.extra_lineno = 0
162
151
  end
163
152
 
164
153
  def heredoc here # TODO: rewrite / remove
165
154
  _, eos, func, last_line = here
166
155
 
167
- indent = (func & STR_FUNC_INDENT) != 0 ? "[ \t]*" : nil
168
- content_indent = (func & STR_FUNC_ICNTNT) != 0
169
- expand = (func & STR_FUNC_EXPAND) != 0
156
+ indent = func =~ STR_FUNC_INDENT ? "[ \t]*" : nil
157
+ expand = func =~ STR_FUNC_EXPAND
158
+ eol = last_line && last_line.end_with?("\r\n") ? "\r\n" : "\n"
170
159
  eos_re = /#{indent}#{Regexp.escape eos}(\r*\n|\z)/
171
160
  err_msg = "can't match #{eos_re.inspect} anywhere in "
172
161
 
@@ -175,30 +164,35 @@ class RubyLexer
175
164
  if beginning_of_line? && scan(eos_re) then
176
165
  self.lineno += 1
177
166
  ss.unread_many last_line # TODO: figure out how to remove this
178
- return :tSTRING_END, eos
167
+ return :tSTRING_END, [eos, func] # TODO: calculate squiggle width at lex?
179
168
  end
180
169
 
181
170
  self.string_buffer = []
182
171
 
183
172
  if expand then
184
173
  case
185
- when scan(/#[$@]/) then
186
- ss.pos -= 1 # FIX omg stupid
174
+ when scan(/#(?=\$(-.|[a-zA-Z_0-9~\*\$\?!@\/\\;,\.=:<>\"\&\`\'+]))/) then
175
+ # TODO: !ISASCII
176
+ # ?! see parser_peek_variable_name
177
+ return :tSTRING_DVAR, matched
178
+ when scan(/#(?=\@\@?[a-zA-Z_])/) then
179
+ # TODO: !ISASCII
187
180
  return :tSTRING_DVAR, matched
188
181
  when scan(/#[{]/) then
182
+ self.command_start = true
189
183
  return :tSTRING_DBEG, matched
190
184
  when scan(/#/) then
191
- string_buffer << '#'
185
+ string_buffer << "#"
192
186
  end
193
187
 
194
188
  begin
195
- c = tokadd_string func, "\n", nil
189
+ c = tokadd_string func, eol, nil
196
190
 
197
191
  rb_compile_error err_msg if
198
192
  c == RubyLexer::EOF
199
193
 
200
- if c != "\n" then
201
- return :tSTRING_CONTENT, string_buffer.join.delete("\r")
194
+ if c != eol then
195
+ return :tSTRING_CONTENT, string_buffer.join
202
196
  else
203
197
  string_buffer << scan(/\n/)
204
198
  end
@@ -214,64 +208,26 @@ class RubyLexer
214
208
 
215
209
  self.lex_strterm = [:heredoc, eos, func, last_line]
216
210
 
217
- string_content = string_buffer.join.delete("\r")
218
-
219
- string_content = heredoc_dedent(string_content) if content_indent && ruby23plus?
211
+ string_content = begin
212
+ s = string_buffer.join
213
+ s.b.force_encoding Encoding::UTF_8
214
+ end
220
215
 
221
216
  return :tSTRING_CONTENT, string_content
222
217
  end
223
218
 
224
- def heredoc_dedent(string_content)
225
- width = string_content.scan(/^[ \t]*(?=\S)/).map do |whitespace|
226
- heredoc_whitespace_indent_size whitespace
227
- end.min || 0
228
-
229
- string_content.split("\n", -1).map do |line|
230
- dedent_string line, width
231
- end.join "\n"
232
- end
233
-
234
- def dedent_string(string, width)
235
- characters_skipped = 0
236
- indentation_skipped = 0
237
-
238
- string.chars.each do |char|
239
- break if indentation_skipped >= width
240
- if char == ' '
241
- characters_skipped += 1
242
- indentation_skipped += 1
243
- elsif char == "\t"
244
- proposed = TAB_WIDTH * (indentation_skipped / TAB_WIDTH + 1)
245
- break if (proposed > width)
246
- characters_skipped += 1
247
- indentation_skipped = proposed
248
- end
249
- end
250
- string[characters_skipped..-1]
251
- end
252
-
253
- def heredoc_whitespace_indent_size(whitespace)
254
- whitespace.chars.inject 0 do |size, char|
255
- if char == "\t"
256
- size + TAB_WIDTH
257
- else
258
- size + 1
259
- end
260
- end
261
- end
262
-
263
219
  def heredoc_identifier # TODO: remove / rewrite
264
220
  term, func = nil, STR_FUNC_BORING
265
221
  self.string_buffer = []
266
222
 
267
- heredoc_indent_mods = '-'
223
+ heredoc_indent_mods = "-"
268
224
  heredoc_indent_mods += '\~' if ruby23plus?
269
225
 
270
226
  case
271
227
  when scan(/([#{heredoc_indent_mods}]?)([\'\"\`])(.*?)\2/) then
272
228
  term = ss[2]
273
- func |= STR_FUNC_INDENT unless ss[1].empty?
274
- func |= STR_FUNC_ICNTNT if ss[1] == '~'
229
+ func |= STR_FUNC_INDENT unless ss[1].empty? # TODO: this seems wrong
230
+ func |= STR_FUNC_ICNTNT if ss[1] == "~"
275
231
  func |= case term
276
232
  when "\'" then
277
233
  STR_SQUOTE
@@ -288,7 +244,7 @@ class RubyLexer
288
244
  func |= STR_DQUOTE
289
245
  unless ss[1].empty? then
290
246
  func |= STR_FUNC_INDENT
291
- func |= STR_FUNC_ICNTNT if ss[1] == '~'
247
+ func |= STR_FUNC_ICNTNT if ss[1] == "~"
292
248
  end
293
249
  string_buffer << ss[2]
294
250
  else
@@ -304,23 +260,15 @@ class RubyLexer
304
260
 
305
261
  self.lex_strterm = [:heredoc, string_buffer.join, func, line]
306
262
 
307
- if term == '`' then
263
+ if term == "`" then
308
264
  result nil, :tXSTRING_BEG, "`"
309
265
  else
310
266
  result nil, :tSTRING_BEG, "\""
311
267
  end
312
268
  end
313
269
 
314
- def in_fname?
315
- in_lex_state? :expr_fname
316
- end
317
-
318
- def in_arg_state? # TODO: rename is_after_operator?
319
- in_lex_state? :expr_fname, :expr_dot
320
- end
321
-
322
- def in_lex_state?(*states)
323
- states.include? lex_state
270
+ def in_fname? # REFACTOR
271
+ lex_state =~ EXPR_FNAME
324
272
  end
325
273
 
326
274
  def int_with_base base
@@ -328,42 +276,35 @@ class RubyLexer
328
276
 
329
277
  text = matched
330
278
  case
331
- when text.end_with?('ri')
332
- return result(:expr_end, :tIMAGINARY, Complex(0, Rational(text.chop.chop.to_i(base))))
333
- when text.end_with?('r')
334
- return result(:expr_end, :tRATIONAL, Rational(text.chop.to_i(base)))
335
- when text.end_with?('i')
336
- return result(:expr_end, :tIMAGINARY, Complex(0, text.chop.to_i(base)))
279
+ when text.end_with?("ri")
280
+ return result(EXPR_NUM, :tIMAGINARY, Complex(0, Rational(text.chop.chop.to_i(base))))
281
+ when text.end_with?("r")
282
+ return result(EXPR_NUM, :tRATIONAL, Rational(text.chop.to_i(base)))
283
+ when text.end_with?("i")
284
+ return result(EXPR_NUM, :tIMAGINARY, Complex(0, text.chop.to_i(base)))
337
285
  else
338
- return result(:expr_end, :tINTEGER, text.to_i(base))
286
+ return result(EXPR_NUM, :tINTEGER, text.to_i(base))
339
287
  end
340
288
  end
341
289
 
290
+ def is_after_operator?
291
+ lex_state =~ EXPR_FNAME|EXPR_DOT
292
+ end
293
+
342
294
  def is_arg?
343
- in_lex_state?(*EXPR_ARG_ANY)
295
+ lex_state =~ EXPR_ARG_ANY
344
296
  end
345
297
 
346
298
  def is_beg?
347
- # TODO: in_lex_state?(*EXPR_BEG_ANY) || lex_state == [:expr_arg, :expr_labeled]
348
- in_lex_state?(*EXPR_BEG_ANY, :expr_value, :expr_labeled)
299
+ lex_state =~ EXPR_BEG_ANY || lex_state == EXPR_LAB # yes, == EXPR_LAB
349
300
  end
350
301
 
351
302
  def is_end?
352
- in_lex_state?(*EXPR_END_ANY)
353
- end
354
-
355
- def lvar_defined? id
356
- # TODO: (dyna_in_block? && dvar_defined?(id)) || local_id?(id)
357
- self.parser.env[id.to_sym] == :lvar
358
- end
359
-
360
-
361
- def ruby22_label?
362
- ruby22plus? and is_label_possible?
303
+ lex_state =~ EXPR_END_ANY
363
304
  end
364
305
 
365
306
  def is_label_possible?
366
- (in_lex_state?(:expr_beg, :expr_endfn) && !cmd_state) || is_arg?
307
+ (lex_state =~ EXPR_LABEL|EXPR_ENDFN && !cmd_state) || is_arg?
367
308
  end
368
309
 
369
310
  def is_label_suffix?
@@ -378,6 +319,16 @@ class RubyLexer
378
319
  lpar_beg && lpar_beg == paren_nest
379
320
  end
380
321
 
322
+ def is_local_id id
323
+ # maybe just make this false for now
324
+ self.parser.env[id.to_sym] == :lvar # HACK: this isn't remotely right
325
+ end
326
+
327
+ def lvar_defined? id
328
+ # TODO: (dyna_in_block? && dvar_defined?(id)) || local_id?(id)
329
+ self.parser.env[id.to_sym] == :lvar
330
+ end
331
+
381
332
  def matched
382
333
  ss.matched
383
334
  end
@@ -386,11 +337,139 @@ class RubyLexer
386
337
  not is_end?
387
338
  end
388
339
 
340
+ def parse_quote # TODO: remove / rewrite
341
+ beg, nnd, short_hand, c = nil, nil, false, nil
342
+
343
+ if scan(/[a-z0-9]{1,2}/i) then # Long-hand (e.g. %Q{}).
344
+ rb_compile_error "unknown type of %string" if ss.matched_size == 2
345
+ c, beg, short_hand = matched, getch, false
346
+ else # Short-hand (e.g. %{, %., %!, etc)
347
+ c, beg, short_hand = "Q", getch, true
348
+ end
349
+
350
+ if end_of_stream? or c == RubyLexer::EOF or beg == RubyLexer::EOF then
351
+ rb_compile_error "unterminated quoted string meets end of file"
352
+ end
353
+
354
+ # Figure nnd-char. "\0" is special to indicate beg=nnd and that no nesting?
355
+ nnd = { "(" => ")", "[" => "]", "{" => "}", "<" => ">" }[beg]
356
+ nnd, beg = beg, "\0" if nnd.nil?
357
+
358
+ token_type, text = nil, "%#{c}#{beg}"
359
+ token_type, string_type = case c
360
+ when "Q" then
361
+ ch = short_hand ? nnd : c + beg
362
+ text = "%#{ch}"
363
+ [:tSTRING_BEG, STR_DQUOTE]
364
+ when "q" then
365
+ [:tSTRING_BEG, STR_SQUOTE]
366
+ when "W" then
367
+ eat_whitespace
368
+ [:tWORDS_BEG, STR_DQUOTE | STR_FUNC_QWORDS]
369
+ when "w" then
370
+ eat_whitespace
371
+ [:tQWORDS_BEG, STR_SQUOTE | STR_FUNC_QWORDS]
372
+ when "x" then
373
+ [:tXSTRING_BEG, STR_XQUOTE]
374
+ when "r" then
375
+ [:tREGEXP_BEG, STR_REGEXP]
376
+ when "s" then
377
+ self.lex_state = EXPR_FNAME
378
+ [:tSYMBEG, STR_SSYM]
379
+ when "I" then
380
+ eat_whitespace
381
+ [:tSYMBOLS_BEG, STR_DQUOTE | STR_FUNC_QWORDS]
382
+ when "i" then
383
+ eat_whitespace
384
+ [:tQSYMBOLS_BEG, STR_SQUOTE | STR_FUNC_QWORDS]
385
+ end
386
+
387
+ rb_compile_error "Bad %string type. Expected [QqWwIixrs], found '#{c}'." if
388
+ token_type.nil?
389
+
390
+ raise "huh" unless string_type
391
+
392
+ string string_type, nnd, beg
393
+
394
+ return token_type, text
395
+ end
396
+
397
+ def parse_string quote # TODO: rewrite / remove
398
+ _, string_type, term, open = quote
399
+
400
+ space = false # FIX: remove these
401
+ func = string_type
402
+ paren = open
403
+ term_re = @@regexp_cache[term]
404
+
405
+ qwords = func =~ STR_FUNC_QWORDS
406
+ regexp = func =~ STR_FUNC_REGEXP
407
+ expand = func =~ STR_FUNC_EXPAND
408
+
409
+ unless func then # nil'ed from qwords below. *sigh*
410
+ return :tSTRING_END, nil
411
+ end
412
+
413
+ space = true if qwords and eat_whitespace
414
+
415
+ if self.string_nest == 0 && scan(/#{term_re}/) then
416
+ if qwords then
417
+ quote[1] = nil
418
+ return :tSPACE, nil
419
+ elsif regexp then
420
+ return :tREGEXP_END, self.regx_options
421
+ else
422
+ return :tSTRING_END, term
423
+ end
424
+ end
425
+
426
+ return :tSPACE, nil if space
427
+
428
+ self.string_buffer = []
429
+
430
+ if expand
431
+ case
432
+ when scan(/#(?=\$(-.|[a-zA-Z_0-9~\*\$\?!@\/\\;,\.=:<>\"\&\`\'+]))/) then
433
+ # TODO: !ISASCII
434
+ # ?! see parser_peek_variable_name
435
+ return :tSTRING_DVAR, nil
436
+ when scan(/#(?=\@\@?[a-zA-Z_])/) then
437
+ # TODO: !ISASCII
438
+ return :tSTRING_DVAR, nil
439
+ when scan(/#[{]/) then
440
+ self.command_start = true
441
+ return :tSTRING_DBEG, nil
442
+ when scan(/#/) then
443
+ string_buffer << "#"
444
+ end
445
+ end
446
+
447
+ if tokadd_string(func, term, paren) == RubyLexer::EOF then
448
+ if func =~ STR_FUNC_REGEXP then
449
+ rb_compile_error "unterminated regexp meets end of file"
450
+ else
451
+ rb_compile_error "unterminated string meets end of file"
452
+ end
453
+ end
454
+
455
+ return :tSTRING_CONTENT, string_buffer.join
456
+ end
457
+
458
+ def possibly_escape_string text, check
459
+ content = match[1]
460
+
461
+ if text =~ check then
462
+ content.gsub(ESC) { unescape $1 }
463
+ else
464
+ content.gsub(/\\\\/, "\\").gsub(/\\\'/, "'")
465
+ end
466
+ end
467
+
389
468
  def process_amper text
390
469
  token = if is_arg? && space_seen && !check(/\s/) then
391
470
  warning("`&' interpreted as argument prefix")
392
471
  :tAMPER
393
- elsif in_lex_state? :expr_beg, :expr_mid then
472
+ elsif lex_state =~ EXPR_BEG|EXPR_MID then
394
473
  :tAMPER
395
474
  else
396
475
  :tAMPER2
@@ -402,7 +481,7 @@ class RubyLexer
402
481
  def process_backref text
403
482
  token = ss[1].to_sym
404
483
  # TODO: can't do lineno hack w/ symbol
405
- result :expr_end, :tBACK_REF, token
484
+ result EXPR_END, :tBACK_REF, token
406
485
  end
407
486
 
408
487
  def process_begin text
@@ -420,54 +499,33 @@ class RubyLexer
420
499
  end
421
500
 
422
501
  def process_brace_close text
423
- # matching compare/parse23.y:8561
424
- cond.lexpop
425
- cmdarg.lexpop
426
-
427
502
  case matched
428
503
  when "}" then
429
504
  self.brace_nest -= 1
430
- self.lex_state = :expr_endarg # TODO: :expr_end ? Look at 2.6
431
-
432
505
  return :tSTRING_DEND, matched if brace_nest < 0
506
+ end
507
+
508
+ # matching compare/parse26.y:8099
509
+ cond.pop
510
+ cmdarg.pop
511
+
512
+ case matched
513
+ when "}" then
514
+ self.lex_state = ruby24minus? ? EXPR_ENDARG : EXPR_END
433
515
  return :tRCURLY, matched
434
516
  when "]" then
435
517
  self.paren_nest -= 1
436
- self.lex_state = :expr_endarg
518
+ self.lex_state = ruby24minus? ? EXPR_ENDARG : EXPR_END
437
519
  return :tRBRACK, matched
438
520
  when ")" then
439
521
  self.paren_nest -= 1
440
- self.lex_state = :expr_endfn
522
+ self.lex_state = EXPR_ENDFN
441
523
  return :tRPAREN, matched
442
524
  else
443
525
  raise "Unknown bracing: #{matched.inspect}"
444
526
  end
445
527
  end
446
528
 
447
- def process_colon1 text
448
- # ?: / then / when
449
- if is_end? || check(/\s/) then
450
- return result :expr_beg, :tCOLON, text
451
- end
452
-
453
- case
454
- when scan(/\'/) then
455
- string STR_SSYM
456
- when scan(/\"/) then
457
- string STR_DSYM
458
- end
459
-
460
- result :expr_fname, :tSYMBEG, text
461
- end
462
-
463
- def process_colon2 text
464
- if is_beg? || in_lex_state?(:expr_class) || is_space_arg? then
465
- result :expr_beg, :tCOLON3, text
466
- else
467
- result :expr_dot, :tCOLON2, text
468
- end
469
- end
470
-
471
529
  def process_brace_open text
472
530
  # matching compare/parse23.y:8694
473
531
  self.brace_nest += 1
@@ -479,67 +537,111 @@ class RubyLexer
479
537
  return expr_result(:tLAMBEG, "{")
480
538
  end
481
539
 
482
- token = case lex_state
483
- when :expr_labeled then
540
+ token = case
541
+ when lex_state =~ EXPR_LABELED then
484
542
  :tLBRACE # hash
485
- when *EXPR_ARG_ANY, :expr_end, :expr_endfn then
486
- :tLCURLY # block (primary)
487
- when :expr_endarg
543
+ when lex_state =~ EXPR_ARG_ANY|EXPR_END|EXPR_ENDFN then
544
+ :tLCURLY # block (primary) "{" in parse.y
545
+ when lex_state =~ EXPR_ENDARG then
488
546
  :tLBRACE_ARG # block (expr)
489
547
  else
490
548
  :tLBRACE # hash
491
549
  end
492
550
 
493
- # TODO: self.lex_state |= :expr_label if token != :tLBRACE_ARG
551
+ state = token == :tLBRACE_ARG ? EXPR_BEG : EXPR_PAR
494
552
  self.command_start = true if token != :tLBRACE
495
553
 
496
- return expr_result(token, "{")
554
+ cond.push false
555
+ cmdarg.push false
556
+ result state, token, text
557
+ end
558
+
559
+ def process_colon1 text
560
+ # ?: / then / when
561
+ if is_end? || check(/\s/) then
562
+ return result EXPR_BEG, :tCOLON, text
563
+ end
564
+
565
+ case
566
+ when scan(/\'/) then
567
+ string STR_SSYM
568
+ when scan(/\"/) then
569
+ string STR_DSYM
570
+ end
571
+
572
+ result EXPR_FNAME, :tSYMBEG, text
573
+ end
574
+
575
+ def process_colon2 text
576
+ if is_beg? || lex_state =~ EXPR_CLASS || is_space_arg? then
577
+ result EXPR_BEG, :tCOLON3, text
578
+ else
579
+ result EXPR_DOT, :tCOLON2, text
580
+ end
497
581
  end
498
582
 
499
583
  def process_float text
500
584
  rb_compile_error "Invalid numeric format" if text =~ /__/
501
585
 
502
586
  case
503
- when text.end_with?('ri')
504
- return result(:expr_end, :tIMAGINARY, Complex(0, Rational(text.chop.chop)))
505
- when text.end_with?('r')
506
- return result(:expr_end, :tRATIONAL, Rational(text.chop))
507
- when text.end_with?('i')
508
- return result(:expr_end, :tIMAGINARY, Complex(0, text.chop.to_f))
587
+ when text.end_with?("ri")
588
+ return result EXPR_NUM, :tIMAGINARY, Complex(0, Rational(text.chop.chop))
589
+ when text.end_with?("i")
590
+ return result EXPR_NUM, :tIMAGINARY, Complex(0, text.chop.to_f)
591
+ when text.end_with?("r")
592
+ return result EXPR_NUM, :tRATIONAL, Rational(text.chop)
509
593
  else
510
- return result(:expr_end, :tFLOAT, text.to_f)
594
+ return result EXPR_NUM, :tFLOAT, text.to_f
511
595
  end
512
596
  end
513
597
 
514
598
  def process_gvar text
515
599
  text.lineno = self.lineno
516
- result(:expr_end, :tGVAR, text)
600
+ result EXPR_END, :tGVAR, text
517
601
  end
518
602
 
519
603
  def process_gvar_oddity text
520
- return result :expr_end, "$", "$" if text == "$" # TODO: wtf is this?
604
+ return result EXPR_END, "$", "$" if text == "$" # TODO: wtf is this?
521
605
  rb_compile_error "#{text.inspect} is not allowed as a global variable name"
522
606
  end
523
607
 
524
608
  def process_ivar text
525
609
  tok_id = text =~ /^@@/ ? :tCVAR : :tIVAR
526
610
  text.lineno = self.lineno
527
- return result(:expr_end, tok_id, text)
611
+ result EXPR_END, tok_id, text
612
+ end
613
+
614
+ def process_label text
615
+ symbol = possibly_escape_string text, /^\"/
616
+
617
+ result EXPR_LAB, :tLABEL, [symbol, self.lineno]
618
+ end
619
+
620
+ def process_label_or_string text
621
+ if @was_label && text =~ /:\Z/ then
622
+ @was_label = nil
623
+ return process_label text
624
+ elsif text =~ /:\Z/ then
625
+ ss.pos -= 1 # put back ":"
626
+ text = text[0..-2]
627
+ end
628
+
629
+ result EXPR_END, :tSTRING, text[1..-2].gsub(/\\\\/, "\\").gsub(/\\\'/, "\'")
528
630
  end
529
631
 
530
632
  def process_lchevron text
531
- if (!in_lex_state?(:expr_dot, :expr_class) &&
633
+ if (lex_state !~ EXPR_DOT|EXPR_CLASS &&
532
634
  !is_end? &&
533
- (!is_arg? || space_seen)) then # TODO: || in_state(:expr_labeled)
635
+ (!is_arg? || lex_state =~ EXPR_LABELED || space_seen)) then
534
636
  tok = self.heredoc_identifier
535
637
  return tok if tok
536
638
  end
537
639
 
538
- if in_arg_state? then
539
- self.lex_state = :expr_arg
640
+ if is_after_operator? then
641
+ self.lex_state = EXPR_ARG
540
642
  else
541
- self.command_start = true if lex_state == :expr_class
542
- self.lex_state = :expr_beg
643
+ self.command_start = true if lex_state =~ EXPR_CLASS
644
+ self.lex_state = EXPR_BEG
543
645
  end
544
646
 
545
647
  return result(lex_state, :tLSHFT, "\<\<")
@@ -549,14 +651,14 @@ class RubyLexer
549
651
  c = matched
550
652
  hit = false
551
653
 
552
- if c == '#' then
654
+ if c == "#" then
553
655
  ss.pos -= 1
554
656
 
555
657
  # TODO: handle magic comments
556
658
  while scan(/\s*\#.*(\n+|\z)/) do
557
659
  hit = true
558
660
  self.lineno += matched.lines.to_a.size
559
- @comments << matched.gsub(/^ +#/, '#').gsub(/^ +$/, '')
661
+ @comments << matched.gsub(/^ +#/, "#").gsub(/^ +$/, "")
560
662
  end
561
663
 
562
664
  return nil if end_of_stream?
@@ -567,17 +669,15 @@ class RubyLexer
567
669
  # Replace a string of newlines with a single one
568
670
  self.lineno += matched.lines.to_a.size if scan(/\n+/)
569
671
 
570
- # TODO: remove :expr_value -- audit all uses of it
571
- c = in_lex_state?(:expr_beg, :expr_value, :expr_class,
572
- :expr_fname, :expr_dot) && !in_lex_state?(:expr_labeled)
573
-
672
+ c = (lex_state =~ EXPR_BEG|EXPR_CLASS|EXPR_FNAME|EXPR_DOT &&
673
+ lex_state !~ EXPR_LABELED)
574
674
  # TODO: figure out what token_seen is for
575
- # TODO: if c || self.lex_state == [:expr_beg, :expr_labeled] then
576
- if c || self.lex_state == :expr_labeled then
675
+ if c || self.lex_state == EXPR_LAB then # yes, == EXPR_LAB
577
676
  # ignore if !fallthrough?
578
677
  if !c && parser.in_kwarg then
579
678
  # normal newline
580
- return result(:expr_beg, :tNL, nil)
679
+ self.command_start = true
680
+ return result EXPR_BEG, :tNL, nil
581
681
  else
582
682
  return # skip
583
683
  end
@@ -592,41 +692,46 @@ class RubyLexer
592
692
 
593
693
  self.command_start = true
594
694
 
595
- return result(:expr_beg, :tNL, nil)
695
+ return result(EXPR_BEG, :tNL, nil)
596
696
  end
597
697
 
598
698
  def process_nthref text
599
699
  # TODO: can't do lineno hack w/ number
600
- result :expr_end, :tNTH_REF, ss[1].to_i
700
+ result EXPR_END, :tNTH_REF, ss[1].to_i
601
701
  end
602
702
 
603
703
  def process_paren text
604
- token = process_paren19
704
+ token = if is_beg? then
705
+ :tLPAREN
706
+ elsif !space_seen then
707
+ # foo( ... ) => method call, no ambiguity
708
+ :tLPAREN2
709
+ elsif is_space_arg? then
710
+ :tLPAREN_ARG
711
+ elsif lex_state =~ EXPR_ENDFN && !lambda_beginning? then
712
+ # TODO:
713
+ # warn("parentheses after method name is interpreted as " \
714
+ # "an argument list, not a decomposed argument")
715
+ :tLPAREN2
716
+ else
717
+ :tLPAREN2 # plain "(" in parse.y
718
+ end
605
719
 
606
720
  self.paren_nest += 1
607
721
 
608
- # TODO: add :expr_label to :expr_beg (set in expr_result below)
609
- return expr_result(token, "(")
610
- end
611
-
612
- def process_paren19
613
- if is_beg? then
614
- :tLPAREN
615
- elsif is_space_arg? then
616
- :tLPAREN_ARG
617
- else
618
- :tLPAREN2 # plain '(' in parse.y
619
- end
722
+ cond.push false
723
+ cmdarg.push false
724
+ result EXPR_PAR, token, text
620
725
  end
621
726
 
622
727
  def process_percent text
623
728
  return parse_quote if is_beg?
624
729
 
625
- return result(:expr_beg, :tOP_ASGN, "%") if scan(/\=/)
730
+ return result EXPR_BEG, :tOP_ASGN, "%" if scan(/\=/)
626
731
 
627
- return parse_quote if is_arg? && space_seen && ! check(/\s/)
732
+ return parse_quote if is_space_arg?(check(/\s/)) || (lex_state =~ EXPR_FITEM && check(/s/))
628
733
 
629
- return result(:arg_state, :tPERCENT, "%")
734
+ return result :arg_state, :tPERCENT, "%"
630
735
  end
631
736
 
632
737
  def process_plus_minus text
@@ -637,33 +742,33 @@ class RubyLexer
637
742
  [:tUMINUS, :tMINUS]
638
743
  end
639
744
 
640
- if in_arg_state? then
745
+ if is_after_operator? then
641
746
  if scan(/@/) then
642
- return result(:expr_arg, utype, "#{sign}@")
747
+ return result(EXPR_ARG, utype, "#{sign}@")
643
748
  else
644
- return result(:expr_arg, type, sign)
749
+ return result(EXPR_ARG, type, sign)
645
750
  end
646
751
  end
647
752
 
648
- return result(:expr_beg, :tOP_ASGN, sign) if scan(/\=/)
753
+ return result(EXPR_BEG, :tOP_ASGN, sign) if scan(/\=/)
649
754
 
650
- if (is_beg? || (is_arg? && space_seen && !check(/\s/))) then
755
+ if is_beg? || (is_arg? && space_seen && !check(/\s/)) then
651
756
  arg_ambiguous if is_arg?
652
757
 
653
758
  if check(/\d/) then
654
759
  return nil if utype == :tUPLUS
655
- return result(:expr_beg, :tUMINUS_NUM, sign)
760
+ return result EXPR_BEG, :tUMINUS_NUM, sign
656
761
  end
657
762
 
658
- return result(:expr_beg, utype, sign)
763
+ return result EXPR_BEG, utype, sign
659
764
  end
660
765
 
661
- return result(:expr_beg, type, sign)
766
+ result EXPR_BEG, type, sign
662
767
  end
663
768
 
664
769
  def process_questionmark text
665
770
  if is_end? then
666
- return result(:expr_value, :tEH, "?")
771
+ return result EXPR_BEG, :tEH, "?"
667
772
  end
668
773
 
669
774
  if end_of_stream? then
@@ -672,12 +777,12 @@ class RubyLexer
672
777
 
673
778
  if check(/\s|\v/) then
674
779
  unless is_arg? then
675
- c2 = { " " => 's',
676
- "\n" => 'n',
677
- "\t" => 't',
678
- "\v" => 'v',
679
- "\r" => 'r',
680
- "\f" => 'f' }[matched]
780
+ c2 = { " " => "s",
781
+ "\n" => "n",
782
+ "\t" => "t",
783
+ "\v" => "v",
784
+ "\r" => "r",
785
+ "\f" => "f" }[matched]
681
786
 
682
787
  if c2 then
683
788
  warning("invalid character syntax; use ?\\" + c2)
@@ -685,18 +790,28 @@ class RubyLexer
685
790
  end
686
791
 
687
792
  # ternary
688
- return result(:expr_value, :tEH, "?")
793
+ return result EXPR_BEG, :tEH, "?"
689
794
  elsif check(/\w(?=\w)/) then # ternary, also
690
- return result(:expr_beg, :tEH, "?")
795
+ return result EXPR_BEG, :tEH, "?"
691
796
  end
692
797
 
693
798
  c = if scan(/\\/) then
694
799
  self.read_escape
695
800
  else
696
- ss.getch
801
+ getch
697
802
  end
698
803
 
699
- return result(:expr_end, :tSTRING, c)
804
+ result EXPR_END, :tSTRING, c
805
+ end
806
+
807
+ def process_simple_string text
808
+ replacement = text[1..-2].gsub(ESC) {
809
+ unescape($1).b.force_encoding Encoding::UTF_8
810
+ }
811
+
812
+ replacement = replacement.b unless replacement.valid_encoding?
813
+
814
+ result EXPR_END, :tSTRING, replacement
700
815
  end
701
816
 
702
817
  def process_slash text
@@ -707,7 +822,7 @@ class RubyLexer
707
822
  end
708
823
 
709
824
  if scan(/\=/) then
710
- return result(:expr_beg, :tOP_ASGN, "/")
825
+ return result(EXPR_BEG, :tOP_ASGN, "/")
711
826
  end
712
827
 
713
828
  if is_arg? && space_seen then
@@ -726,73 +841,68 @@ class RubyLexer
726
841
 
727
842
  token = nil
728
843
 
729
- if in_arg_state? then
844
+ if is_after_operator? then
730
845
  case
731
846
  when scan(/\]\=/) then
732
847
  self.paren_nest -= 1 # HACK? I dunno, or bug in MRI
733
- return result(:expr_arg, :tASET, "[]=")
848
+ return result EXPR_ARG, :tASET, "[]="
734
849
  when scan(/\]/) then
735
850
  self.paren_nest -= 1 # HACK? I dunno, or bug in MRI
736
- return result(:expr_arg, :tAREF, "[]")
851
+ return result EXPR_ARG, :tAREF, "[]"
737
852
  else
738
853
  rb_compile_error "unexpected '['"
739
854
  end
740
855
  elsif is_beg? then
741
856
  token = :tLBRACK
742
- elsif is_arg? && space_seen then
857
+ elsif is_arg? && (space_seen || lex_state =~ EXPR_LABELED) then
743
858
  token = :tLBRACK
744
859
  else
745
860
  token = :tLBRACK2
746
861
  end
747
862
 
748
- # TODO: this is done by expr_result except "|EXPR_LABEL")
749
- # SET_LEX_STATE(EXPR_BEG|EXPR_LABEL);
750
- expr_result token, "["
751
- end
752
-
753
- def possibly_escape_string text, check
754
- content = match[1]
755
-
756
- if text =~ check then
757
- content.gsub(ESC) { unescape $1 }
758
- else
759
- content.gsub(/\\\\/, "\\").gsub(/\\'/, "'")
760
- end
863
+ cond.push false
864
+ cmdarg.push false
865
+ result EXPR_PAR, token, text
761
866
  end
762
867
 
763
- def process_symbol text
764
- symbol = possibly_escape_string text, /^:"/
868
+ def process_string # TODO: rewrite / remove
869
+ # matches top of parser_yylex in compare/parse23.y:8113
870
+ token = if lex_strterm[0] == :heredoc then
871
+ self.heredoc lex_strterm
872
+ else
873
+ self.parse_string lex_strterm
874
+ end
765
875
 
766
- return result(:expr_end, :tSYMBOL, symbol)
767
- end
876
+ token_type, c = token
768
877
 
769
- def was_label?
770
- @was_label = ruby22_label?
771
- true
772
- end
878
+ # matches parser_string_term from 2.3, but way off from 2.5
879
+ if ruby22plus? && token_type == :tSTRING_END && ["'", '"'].include?(c) then
880
+ if ((lex_state =~ EXPR_BEG|EXPR_ENDFN &&
881
+ !cond.is_in_state) || is_arg?) &&
882
+ is_label_suffix? then
883
+ scan(/:/)
884
+ token_type = token[0] = :tLABEL_END
885
+ end
886
+ end
773
887
 
774
- def process_label_or_string text
775
- if @was_label && text =~ /:\Z/ then
776
- @was_label = nil
777
- return process_label text
778
- elsif text =~ /:\Z/ then
779
- ss.pos -= 1 # put back ":"
780
- text = text[0..-2]
888
+ if [:tSTRING_END, :tREGEXP_END, :tLABEL_END].include? token_type then
889
+ self.lex_strterm = nil
890
+ self.lex_state = (token_type == :tLABEL_END) ? EXPR_PAR : EXPR_LIT
781
891
  end
782
892
 
783
- result :expr_end, :tSTRING, text[1..-2].gsub(/\\\\/, "\\").gsub(/\\'/, "'")
893
+ return token
784
894
  end
785
895
 
786
- def process_label text
787
- symbol = possibly_escape_string text, /^"/
896
+ def process_symbol text
897
+ symbol = possibly_escape_string text, /^:\"/ # stupid emacs
788
898
 
789
- result(:expr_labeled, :tLABEL, [symbol, self.lineno]) # TODO: expr_arg|expr_labeled
899
+ result EXPR_LIT, :tSYMBOL, symbol
790
900
  end
791
901
 
792
902
  def process_token text
793
903
  # matching: parse_ident in compare/parse23.y:7989
794
904
  # TODO: make this always return [token, lineno]
795
- self.last_state = lex_state
905
+ # FIX: remove: self.last_state = lex_state
796
906
 
797
907
  token = self.token = text
798
908
  token << matched if scan(/[\!\?](?!=)/)
@@ -801,7 +911,7 @@ class RubyLexer
801
911
  case
802
912
  when token =~ /[!?]$/ then
803
913
  :tFID
804
- when in_lex_state?(:expr_fname) && scan(/=(?:(?![~>=])|(?==>))/) then
914
+ when lex_state =~ EXPR_FNAME && scan(/=(?:(?![~>=])|(?==>))/) then
805
915
  # ident=, not =~ => == or followed by =>
806
916
  # TODO test lexing of a=>b vs a==>b
807
917
  token << matched
@@ -814,31 +924,33 @@ class RubyLexer
814
924
 
815
925
  if is_label_possible? and is_label_suffix? then
816
926
  scan(/:/)
817
- # TODO: :expr_arg|:expr_labeled
818
- return result :expr_labeled, :tLABEL, [token, self.lineno]
927
+ # TODO: propagate the lineno to ALL results
928
+ return result EXPR_LAB, :tLABEL, [token, self.lineno]
819
929
  end
820
930
 
821
- # TODO: mb == ENC_CODERANGE_7BIT && !in_lex_state?(:expr_dot)
822
- unless in_lex_state? :expr_dot then
931
+ # TODO: mb == ENC_CODERANGE_7BIT && lex_state !~ EXPR_DOT
932
+ if lex_state !~ EXPR_DOT then
823
933
  # See if it is a reserved word.
824
934
  keyword = RubyParserStuff::Keyword.keyword token
825
935
 
826
936
  return process_token_keyword keyword if keyword
827
- end # unless in_lex_state? :expr_dot
937
+ end
828
938
 
829
939
  # matching: compare/parse23.y:8079
830
- state = if is_beg? or is_arg? or in_lex_state? :expr_dot then
831
- cmd_state ? :expr_cmdarg : :expr_arg
832
- elsif in_lex_state? :expr_fname then
833
- :expr_endfn
940
+ state = if is_beg? or is_arg? or lex_state =~ EXPR_DOT then
941
+ cmd_state ? EXPR_CMDARG : EXPR_ARG
942
+ elsif lex_state =~ EXPR_FNAME then
943
+ EXPR_ENDFN
834
944
  else
835
- :expr_end
945
+ EXPR_END
836
946
  end
837
947
 
838
- if not [:expr_dot, :expr_fname].include? last_state and
839
- (tok_id == :tIDENTIFIER) and # not :expr_fname, not attrasgn
948
+ tok_id = :tIDENTIFIER if tok_id == :tCONSTANT && is_local_id(token)
949
+
950
+ if last_state !~ EXPR_DOT|EXPR_FNAME and
951
+ (tok_id == :tIDENTIFIER) and # not EXPR_FNAME, not attrasgn
840
952
  lvar_defined?(token) then
841
- state = :expr_end # TODO: EXPR_END|EXPR_LABEL
953
+ state = EXPR_END|EXPR_LABEL
842
954
  end
843
955
 
844
956
  token.lineno = self.lineno # yes, on a string. I know... I know...
@@ -853,32 +965,30 @@ class RubyLexer
853
965
 
854
966
  value = [token, self.lineno]
855
967
 
856
- return result(lex_state, keyword.id0, value) if state == :expr_fname
968
+ return result(lex_state, keyword.id0, value) if state =~ EXPR_FNAME
857
969
 
858
- self.command_start = true if lex_state == :expr_beg
970
+ self.command_start = true if lex_state =~ EXPR_BEG
859
971
 
860
972
  case
861
- when keyword.id0 == :kDO then
973
+ when keyword.id0 == :kDO then # parse26.y line 7591
862
974
  case
863
975
  when lambda_beginning? then
864
976
  self.lpar_beg = nil # lambda_beginning? == FALSE in the body of "-> do ... end"
865
- self.paren_nest -= 1
866
- result(lex_state, :kDO_LAMBDA, value)
977
+ self.paren_nest -= 1 # TODO: question this?
978
+ result lex_state, :kDO_LAMBDA, value
867
979
  when cond.is_in_state then
868
- result(lex_state, :kDO_COND, value)
869
- when cmdarg.is_in_state && state != :expr_cmdarg then
870
- result(lex_state, :kDO_BLOCK, value)
871
- when [:expr_beg, :expr_endarg].include?(state) then
872
- result(lex_state, :kDO_BLOCK, value)
980
+ result lex_state, :kDO_COND, value
981
+ when cmdarg.is_in_state && state != EXPR_CMDARG then
982
+ result lex_state, :kDO_BLOCK, value
873
983
  else
874
- result(lex_state, :kDO, value)
984
+ result lex_state, :kDO, value
875
985
  end
876
- when [:expr_beg, :expr_labeled].include?(state) then
877
- result(lex_state, keyword.id0, value)
986
+ when state =~ EXPR_PAD then
987
+ result lex_state, keyword.id0, value
878
988
  when keyword.id0 != keyword.id1 then
879
- result(:expr_beg, keyword.id1, value) # TODO: :expr_beg|:expr_label
989
+ result EXPR_PAR, keyword.id1, value
880
990
  else
881
- result(lex_state, keyword.id1, value)
991
+ result lex_state, keyword.id1, value
882
992
  end
883
993
  end
884
994
 
@@ -886,9 +996,9 @@ class RubyLexer
886
996
  ss.unscan # put back "_"
887
997
 
888
998
  if beginning_of_line? && scan(/\__END__(\r?\n|\Z)/) then
889
- return [RubyLexer::EOF, RubyLexer::EOF]
890
- elsif scan(/\_\w*/) then
891
- return process_token matched
999
+ [RubyLexer::EOF, RubyLexer::EOF]
1000
+ elsif scan(/#{IDENT_CHAR}+/) then
1001
+ process_token matched
892
1002
  end
893
1003
  end
894
1004
 
@@ -921,10 +1031,11 @@ class RubyLexer
921
1031
  when scan(/s/) then # space
922
1032
  " "
923
1033
  when scan(/[0-7]{1,3}/) then # octal constant
924
- (matched.to_i(8) & 0xFF).chr
1034
+ (matched.to_i(8) & 0xFF).chr.force_encoding Encoding::UTF_8
925
1035
  when scan(/x([0-9a-fA-F]{1,2})/) then # hex constant
926
- ss[1].to_i(16).chr
927
- when check(/M-\\[\\MCc]/) then
1036
+ # TODO: force encode everything to UTF-8?
1037
+ ss[1].to_i(16).chr.force_encoding Encoding::UTF_8
1038
+ when check(/M-\\./) then
928
1039
  scan(/M-\\/) # eat it
929
1040
  c = self.read_escape
930
1041
  c[0] = (c[0].ord | 0x80).chr
@@ -938,6 +1049,11 @@ class RubyLexer
938
1049
  c = self.read_escape
939
1050
  c[0] = (c[0].ord & 0x9f).chr
940
1051
  c
1052
+ when check(/(C-|c)\\(?!u|\\)/) then
1053
+ scan(/(C-|c)\\/) # eat it
1054
+ c = read_escape
1055
+ c[0] = (c[0].ord & 0x9f).chr
1056
+ c
941
1057
  when scan(/C-\?|c\?/) then
942
1058
  127.chr
943
1059
  when scan(/(C-|c)(.)/) then
@@ -946,15 +1062,25 @@ class RubyLexer
946
1062
  c
947
1063
  when scan(/^[89]/i) then # bad octal or hex... MRI ignores them :(
948
1064
  matched
949
- when scan(/u([0-9a-fA-F]{2,4}|\{[0-9a-fA-F]{2,6}\})/) then
950
- [ss[1].delete("{}").to_i(16)].pack("U")
1065
+ when scan(/u(\h{4})/) then
1066
+ [ss[1].to_i(16)].pack("U")
1067
+ when scan(/u(\h{1,3})/) then
1068
+ rb_compile_error "Invalid escape character syntax"
1069
+ when scan(/u\{(\h+(?:\s+\h+)*)\}/) then
1070
+ ss[1].split.map { |s| s.to_i(16) }.pack("U*")
951
1071
  when scan(/[McCx0-9]/) || end_of_stream? then
952
1072
  rb_compile_error("Invalid escape character syntax")
953
1073
  else
954
- ss.getch
1074
+ getch
955
1075
  end.dup
956
1076
  end
957
1077
 
1078
+ def getch
1079
+ c = ss.getch
1080
+ c = ss.getch if c == "\r" && ss.peek(1) == "\n"
1081
+ c
1082
+ end
1083
+
958
1084
  def regx_options # TODO: rewrite / remove
959
1085
  good, bad = [], []
960
1086
 
@@ -974,7 +1100,7 @@ class RubyLexer
974
1100
  self.brace_nest = 0
975
1101
  self.command_start = true
976
1102
  self.comments = []
977
- self.lex_state = :expr_none
1103
+ self.lex_state = EXPR_NONE
978
1104
  self.lex_strterm = nil
979
1105
  self.lineno = 1
980
1106
  self.lpar_beg = nil
@@ -988,29 +1114,30 @@ class RubyLexer
988
1114
  self.cmdarg.reset
989
1115
  end
990
1116
 
991
- def result lex_state, token, text # :nodoc:
992
- lex_state = self.arg_state if lex_state == :arg_state
993
- self.lex_state = lex_state if lex_state
1117
+ def result new_state, token, text # :nodoc:
1118
+ new_state = self.arg_state if new_state == :arg_state
1119
+ self.lex_state = new_state if new_state
994
1120
  [token, text]
995
1121
  end
996
1122
 
997
- def scan re
998
- ss.scan re
1123
+ def ruby22_label?
1124
+ ruby22plus? and is_label_possible?
999
1125
  end
1000
1126
 
1001
- def check re
1002
- ss.check re
1127
+ def ruby22plus?
1128
+ parser.class.version >= 22
1003
1129
  end
1004
1130
 
1005
- def eat_whitespace
1006
- r = scan(/\s+/)
1007
- self.extra_lineno += r.count("\n") if r
1008
- r
1131
+ def ruby23plus?
1132
+ parser.class.version >= 23
1009
1133
  end
1010
1134
 
1011
- def fixup_lineno extra = 0
1012
- self.lineno += self.extra_lineno + extra
1013
- self.extra_lineno = 0
1135
+ def ruby24minus?
1136
+ parser.class.version <= 24
1137
+ end
1138
+
1139
+ def scan re
1140
+ ss.scan re
1014
1141
  end
1015
1142
 
1016
1143
  def scanner_class # TODO: design this out of oedipus_lex. or something.
@@ -1033,12 +1160,6 @@ class RubyLexer
1033
1160
  self.lex_strterm = [:strterm, type, beg, nnd]
1034
1161
  end
1035
1162
 
1036
- # TODO: consider
1037
- # def src= src
1038
- # raise "bad src: #{src.inspect}" unless String === src
1039
- # @src = RPStringScanner.new(src)
1040
- # end
1041
-
1042
1163
  def tokadd_escape term # TODO: rewrite / remove
1043
1164
  case
1044
1165
  when scan(/\\\n/) then
@@ -1057,8 +1178,10 @@ class RubyLexer
1057
1178
  prev = self.string_buffer.last
1058
1179
  if term == chr && prev && prev.end_with?("(?") then
1059
1180
  self.string_buffer << chr
1181
+ elsif term == chr || chr.ascii_only? then
1182
+ self.string_buffer << matched # dunno why we keep them for ascii
1060
1183
  else
1061
- self.string_buffer << matched
1184
+ self.string_buffer << chr # HACK? this is such a rat's nest
1062
1185
  end
1063
1186
  else
1064
1187
  rb_compile_error "Invalid escape character syntax"
@@ -1066,22 +1189,24 @@ class RubyLexer
1066
1189
  end
1067
1190
 
1068
1191
  def tokadd_string(func, term, paren) # TODO: rewrite / remove
1069
- qwords = (func & STR_FUNC_QWORDS) != 0
1070
- escape = (func & STR_FUNC_ESCAPE) != 0
1071
- expand = (func & STR_FUNC_EXPAND) != 0
1072
- regexp = (func & STR_FUNC_REGEXP) != 0
1073
- symbol = (func & STR_FUNC_SYMBOL) != 0
1192
+ qwords = func =~ STR_FUNC_QWORDS
1193
+ escape = func =~ STR_FUNC_ESCAPE
1194
+ expand = func =~ STR_FUNC_EXPAND
1195
+ regexp = func =~ STR_FUNC_REGEXP
1196
+ symbol = func =~ STR_FUNC_SYMBOL
1074
1197
 
1075
1198
  paren_re = @@regexp_cache[paren]
1076
- term_re = @@regexp_cache[term]
1199
+ term_re = if term == "\n"
1200
+ /#{Regexp.escape "\r"}?#{Regexp.escape "\n"}/
1201
+ else
1202
+ @@regexp_cache[term]
1203
+ end
1077
1204
 
1078
1205
  until end_of_stream? do
1079
1206
  c = nil
1080
1207
  handled = true
1081
1208
 
1082
1209
  case
1083
- when paren_re && scan(paren_re) then
1084
- self.string_nest += 1
1085
1210
  when scan(term_re) then
1086
1211
  if self.string_nest == 0 then
1087
1212
  ss.pos -= 1
@@ -1089,7 +1214,9 @@ class RubyLexer
1089
1214
  else
1090
1215
  self.string_nest -= 1
1091
1216
  end
1092
- when expand && scan(/#(?=[\$\@\{])/) then
1217
+ when paren_re && scan(paren_re) then
1218
+ self.string_nest += 1
1219
+ when expand && scan(/#(?=[\$\@\{])/) then # TODO: this seems wrong
1093
1220
  ss.pos -= 1
1094
1221
  break
1095
1222
  when qwords && scan(/\s/) then
@@ -1103,7 +1230,7 @@ class RubyLexer
1103
1230
  string_buffer << "\n"
1104
1231
  next
1105
1232
  when qwords && scan(/\\\s/) then
1106
- c = ' '
1233
+ c = " "
1107
1234
  when expand && scan(/\\\n/) then
1108
1235
  next
1109
1236
  when regexp && check(/\\/) then
@@ -1128,12 +1255,16 @@ class RubyLexer
1128
1255
  end # top case
1129
1256
 
1130
1257
  unless handled then
1131
- t = Regexp.escape term
1132
- x = Regexp.escape(paren) if paren && paren != "\000"
1258
+ t = if term == "\n"
1259
+ Regexp.escape "\r\n"
1260
+ else
1261
+ Regexp.escape term
1262
+ end
1263
+ x = Regexp.escape paren if paren && paren != "\000"
1133
1264
  re = if qwords then
1134
- /[^#{t}#{x}\#\0\\\s]+|./ # |. to pick up whatever
1265
+ /[^#{t}#{x}\#\\\s]+|./ # |. to pick up whatever
1135
1266
  else
1136
- /[^#{t}#{x}\#\0\\]+|./
1267
+ /[^#{t}#{x}\#\\]+|./
1137
1268
  end
1138
1269
 
1139
1270
  scan re
@@ -1173,12 +1304,15 @@ class RubyLexer
1173
1304
  s
1174
1305
  when /^[McCx0-9]/ then
1175
1306
  rb_compile_error("Invalid escape character syntax")
1176
- when /u([0-9a-fA-F]{2,4}|\{[0-9a-fA-F]{2,6}\})/ then
1307
+ when /u(\h{4})/ then
1177
1308
  [$1.delete("{}").to_i(16)].pack("U")
1309
+ when /u(\h{1,3})/ then
1310
+ rb_compile_error("Invalid escape character syntax")
1311
+ when /u\{(\h+(?:\s+\h+)*)\}/ then
1312
+ $1.split.map { |s| s.to_i(16) }.pack("U*")
1178
1313
  else
1179
1314
  s
1180
1315
  end
1181
- x.force_encoding "UTF-8" if HAS_ENC
1182
1316
  x
1183
1317
  end
1184
1318
 
@@ -1186,172 +1320,154 @@ class RubyLexer
1186
1320
  # do nothing for now
1187
1321
  end
1188
1322
 
1189
- def ruby22plus?
1190
- parser.class.version >= 22
1191
- end
1192
-
1193
- def ruby23plus?
1194
- parser.class.version >= 23
1323
+ def was_label?
1324
+ @was_label = ruby22_label?
1325
+ true
1195
1326
  end
1196
1327
 
1197
- def process_string # TODO: rewrite / remove
1198
- # matches top of parser_yylex in compare/parse23.y:8113
1199
- token = if lex_strterm[0] == :heredoc then
1200
- self.heredoc lex_strterm
1201
- else
1202
- self.parse_string lex_strterm
1203
- end
1328
+ class State
1329
+ attr_accessor :n
1330
+ attr_accessor :names
1204
1331
 
1205
- token_type, c = token
1332
+ # TODO: take a shared hash of strings for inspect/to_s
1333
+ def initialize o, names
1334
+ raise ArgumentError, "bad state: %p" % [o] unless Integer === o # TODO: remove
1206
1335
 
1207
- # matches parser_string_term
1208
- if ruby22plus? && token_type == :tSTRING_END && ["'", '"'].include?(c) then
1209
- if (([:expr_beg, :expr_endfn].include?(lex_state) &&
1210
- !cond.is_in_state) || is_arg?) &&
1211
- is_label_suffix? then
1212
- scan(/:/)
1213
- token_type = token[0] = :tLABEL_END
1214
- end
1336
+ self.n = o
1337
+ self.names = names
1215
1338
  end
1216
1339
 
1217
- if [:tSTRING_END, :tREGEXP_END, :tLABEL_END].include? token_type then
1218
- self.lex_strterm = nil
1219
- # TODO: :expr_beg|:expr_label
1220
- self.lex_state = (token_type == :tLABEL_END) ? :expr_label : :expr_end
1340
+ def == o
1341
+ self.equal?(o) || (o.class == self.class && o.n == self.n)
1221
1342
  end
1222
1343
 
1223
- return token
1224
- end
1225
-
1226
- def parse_quote # TODO: remove / rewrite
1227
- beg, nnd, short_hand, c = nil, nil, false, nil
1228
-
1229
- if scan(/[a-z0-9]{1,2}/i) then # Long-hand (e.g. %Q{}).
1230
- rb_compile_error "unknown type of %string" if ss.matched_size == 2
1231
- c, beg, short_hand = matched, ss.getch, false
1232
- else # Short-hand (e.g. %{, %., %!, etc)
1233
- c, beg, short_hand = 'Q', ss.getch, true
1234
- end
1235
-
1236
- if end_of_stream? or c == RubyLexer::EOF or beg == RubyLexer::EOF then
1237
- rb_compile_error "unterminated quoted string meets end of file"
1238
- end
1239
-
1240
- # Figure nnd-char. "\0" is special to indicate beg=nnd and that no nesting?
1241
- nnd = { "(" => ")", "[" => "]", "{" => "}", "<" => ">" }[beg]
1242
- nnd, beg = beg, "\0" if nnd.nil?
1243
-
1244
- token_type, text = nil, "%#{c}#{beg}"
1245
- token_type, string_type = case c
1246
- when 'Q' then
1247
- ch = short_hand ? nnd : c + beg
1248
- text = "%#{ch}"
1249
- [:tSTRING_BEG, STR_DQUOTE]
1250
- when 'q' then
1251
- [:tSTRING_BEG, STR_SQUOTE]
1252
- when 'W' then
1253
- eat_whitespace
1254
- [:tWORDS_BEG, STR_DQUOTE | STR_FUNC_QWORDS]
1255
- when 'w' then
1256
- eat_whitespace
1257
- [:tQWORDS_BEG, STR_SQUOTE | STR_FUNC_QWORDS]
1258
- when 'x' then
1259
- [:tXSTRING_BEG, STR_XQUOTE]
1260
- when 'r' then
1261
- [:tREGEXP_BEG, STR_REGEXP]
1262
- when 's' then
1263
- self.lex_state = :expr_fname
1264
- [:tSYMBEG, STR_SSYM]
1265
- when 'I' then
1266
- eat_whitespace
1267
- [:tSYMBOLS_BEG, STR_DQUOTE | STR_FUNC_QWORDS]
1268
- when 'i' then
1269
- eat_whitespace
1270
- [:tQSYMBOLS_BEG, STR_SQUOTE | STR_FUNC_QWORDS]
1271
- end
1272
-
1273
- rb_compile_error "Bad %string type. Expected [QqWwIixrs], found '#{c}'." if
1274
- token_type.nil?
1275
-
1276
- raise "huh" unless string_type
1277
-
1278
- string string_type, nnd, beg
1279
-
1280
- return token_type, text
1281
- end
1282
-
1283
- def parse_string quote # TODO: rewrite / remove
1284
- _, string_type, term, open = quote
1285
-
1286
- space = false # FIX: remove these
1287
- func = string_type
1288
- paren = open
1289
- term_re = @@regexp_cache[term]
1290
-
1291
- qwords = (func & STR_FUNC_QWORDS) != 0
1292
- regexp = (func & STR_FUNC_REGEXP) != 0
1293
- expand = (func & STR_FUNC_EXPAND) != 0
1294
-
1295
- unless func then # nil'ed from qwords below. *sigh*
1296
- return :tSTRING_END, nil
1344
+ def =~ v
1345
+ (self.n & v.n) != 0
1297
1346
  end
1298
1347
 
1299
- space = true if qwords and eat_whitespace
1300
-
1301
- if self.string_nest == 0 && scan(/#{term_re}/) then
1302
- if qwords then
1303
- quote[1] = nil
1304
- return :tSPACE, nil
1305
- elsif regexp then
1306
- return :tREGEXP_END, self.regx_options
1307
- else
1308
- return :tSTRING_END, term
1309
- end
1348
+ def | v
1349
+ raise ArgumentError, "Incompatible State: %p vs %p" % [self, v] unless
1350
+ self.names == v.names
1351
+ self.class.new(self.n | v.n, self.names)
1310
1352
  end
1311
1353
 
1312
- return :tSPACE, nil if space
1354
+ def inspect
1355
+ return "Value(0)" if n.zero? # HACK?
1313
1356
 
1314
- self.string_buffer = []
1315
-
1316
- if expand
1317
- case
1318
- when scan(/#(?=\$(-.|[a-zA-Z_0-9~\*\$\?!@\/\\;,\.=:<>\"\&\`\'+]))/) then
1319
- # TODO: !ISASCII
1320
- # ?! see parser_peek_variable_name
1321
- return :tSTRING_DVAR, nil
1322
- when scan(/#(?=\@\@?[a-zA-Z_])/) then
1323
- # TODO: !ISASCII
1324
- return :tSTRING_DVAR, nil
1325
- when scan(/#[{]/) then
1326
- self.command_start = true
1327
- return :tSTRING_DBEG, nil
1328
- when scan(/#/) then
1329
- string_buffer << '#'
1330
- end
1357
+ names.map { |v, k| k if self =~ v }.
1358
+ compact.
1359
+ join("|").
1360
+ gsub(/(?:EXPR_|STR_(?:FUNC_)?)/, "")
1331
1361
  end
1332
1362
 
1333
- if tokadd_string(func, term, paren) == RubyLexer::EOF then
1334
- rb_compile_error "unterminated string meets end of file"
1363
+ alias to_s inspect
1364
+
1365
+ module Values
1366
+ expr_names = {}
1367
+
1368
+ EXPR_NONE = State.new 0x0, expr_names
1369
+ EXPR_BEG = State.new 0x1, expr_names
1370
+ EXPR_END = State.new 0x2, expr_names
1371
+ EXPR_ENDARG = State.new 0x4, expr_names
1372
+ EXPR_ENDFN = State.new 0x8, expr_names
1373
+ EXPR_ARG = State.new 0x10, expr_names
1374
+ EXPR_CMDARG = State.new 0x20, expr_names
1375
+ EXPR_MID = State.new 0x40, expr_names
1376
+ EXPR_FNAME = State.new 0x80, expr_names
1377
+ EXPR_DOT = State.new 0x100, expr_names
1378
+ EXPR_CLASS = State.new 0x200, expr_names
1379
+ EXPR_LABEL = State.new 0x400, expr_names
1380
+ EXPR_LABELED = State.new 0x800, expr_names
1381
+ EXPR_FITEM = State.new 0x1000, expr_names
1382
+
1383
+ EXPR_BEG_ANY = EXPR_BEG | EXPR_MID | EXPR_CLASS
1384
+ EXPR_ARG_ANY = EXPR_ARG | EXPR_CMDARG
1385
+ EXPR_END_ANY = EXPR_END | EXPR_ENDARG | EXPR_ENDFN
1386
+
1387
+ # extra fake lex_state names to make things a bit cleaner
1388
+
1389
+ EXPR_LAB = EXPR_ARG|EXPR_LABELED
1390
+ EXPR_LIT = EXPR_END|EXPR_ENDARG
1391
+ EXPR_PAR = EXPR_BEG|EXPR_LABEL
1392
+ EXPR_PAD = EXPR_BEG|EXPR_LABELED
1393
+
1394
+ EXPR_NUM = EXPR_LIT
1395
+
1396
+ expr_names.merge!(EXPR_NONE => "EXPR_NONE",
1397
+ EXPR_BEG => "EXPR_BEG",
1398
+ EXPR_END => "EXPR_END",
1399
+ EXPR_ENDARG => "EXPR_ENDARG",
1400
+ EXPR_ENDFN => "EXPR_ENDFN",
1401
+ EXPR_ARG => "EXPR_ARG",
1402
+ EXPR_CMDARG => "EXPR_CMDARG",
1403
+ EXPR_MID => "EXPR_MID",
1404
+ EXPR_FNAME => "EXPR_FNAME",
1405
+ EXPR_DOT => "EXPR_DOT",
1406
+ EXPR_CLASS => "EXPR_CLASS",
1407
+ EXPR_LABEL => "EXPR_LABEL",
1408
+ EXPR_LABELED => "EXPR_LABELED",
1409
+ EXPR_FITEM => "EXPR_FITEM")
1410
+
1411
+ # ruby constants for strings
1412
+
1413
+ str_func_names = {}
1414
+
1415
+ STR_FUNC_BORING = State.new 0x00, str_func_names
1416
+ STR_FUNC_ESCAPE = State.new 0x01, str_func_names
1417
+ STR_FUNC_EXPAND = State.new 0x02, str_func_names
1418
+ STR_FUNC_REGEXP = State.new 0x04, str_func_names
1419
+ STR_FUNC_QWORDS = State.new 0x08, str_func_names
1420
+ STR_FUNC_SYMBOL = State.new 0x10, str_func_names
1421
+ STR_FUNC_INDENT = State.new 0x20, str_func_names # <<-HEREDOC
1422
+ STR_FUNC_LABEL = State.new 0x40, str_func_names
1423
+ STR_FUNC_LIST = State.new 0x4000, str_func_names
1424
+ STR_FUNC_TERM = State.new 0x8000, str_func_names
1425
+ STR_FUNC_ICNTNT = State.new 0x10000, str_func_names # <<~HEREDOC -- TODO: remove?
1426
+
1427
+ # TODO: check parser25.y on how they do STR_FUNC_INDENT
1428
+
1429
+ STR_SQUOTE = STR_FUNC_BORING
1430
+ STR_DQUOTE = STR_FUNC_EXPAND
1431
+ STR_XQUOTE = STR_FUNC_EXPAND
1432
+ STR_REGEXP = STR_FUNC_REGEXP | STR_FUNC_ESCAPE | STR_FUNC_EXPAND
1433
+ STR_SWORD = STR_FUNC_QWORDS | STR_FUNC_LIST
1434
+ STR_DWORD = STR_FUNC_QWORDS | STR_FUNC_EXPAND | STR_FUNC_LIST
1435
+ STR_SSYM = STR_FUNC_SYMBOL
1436
+ STR_DSYM = STR_FUNC_SYMBOL | STR_FUNC_EXPAND
1437
+
1438
+ str_func_names.merge!(STR_FUNC_ESCAPE => "STR_FUNC_ESCAPE",
1439
+ STR_FUNC_EXPAND => "STR_FUNC_EXPAND",
1440
+ STR_FUNC_REGEXP => "STR_FUNC_REGEXP",
1441
+ STR_FUNC_QWORDS => "STR_FUNC_QWORDS",
1442
+ STR_FUNC_SYMBOL => "STR_FUNC_SYMBOL",
1443
+ STR_FUNC_INDENT => "STR_FUNC_INDENT",
1444
+ STR_FUNC_LABEL => "STR_FUNC_LABEL",
1445
+ STR_FUNC_LIST => "STR_FUNC_LIST",
1446
+ STR_FUNC_TERM => "STR_FUNC_TERM",
1447
+ STR_FUNC_ICNTNT => "STR_FUNC_ICNTNT",
1448
+ STR_SQUOTE => "STR_SQUOTE")
1335
1449
  end
1336
1450
 
1337
- return :tSTRING_CONTENT, string_buffer.join
1451
+ include Values
1338
1452
  end
1453
+
1454
+ include State::Values
1339
1455
  end
1340
1456
 
1341
1457
  require "ruby_lexer.rex"
1342
1458
 
1343
1459
  if ENV["RP_LINENO_DEBUG"] then
1344
1460
  class RubyLexer
1345
- alias :old_lineno= :lineno=
1346
-
1347
1461
  def d o
1348
1462
  $stderr.puts o.inspect
1349
1463
  end
1350
1464
 
1465
+ alias old_lineno= lineno=
1466
+
1351
1467
  def lineno= n
1352
1468
  self.old_lineno= n
1353
1469
  where = caller.first.split(/:/).first(2).join(":")
1354
- d :lineno => [n, where, ss && ss.rest[0,40]]
1470
+ d :lineno => [n, where, ss && ss.rest[0, 40]]
1355
1471
  end
1356
1472
  end
1357
1473
  end