ruby_parser 3.13.0 → 3.15.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -4,40 +4,9 @@
4
4
  $DEBUG = true if ENV["DEBUG"]
5
5
 
6
6
  class RubyLexer
7
-
8
7
  # :stopdoc:
9
- HAS_ENC = "".respond_to? :encoding
10
-
11
- IDENT_CHAR = if HAS_ENC then
12
- /[\w\u0080-\u{10ffff}]/u
13
- else
14
- /[\w\x80-\xFF]/n
15
- end
16
-
17
8
  EOF = :eof_haha!
18
9
 
19
- # ruby constants for strings (should this be moved somewhere else?)
20
-
21
- STR_FUNC_BORING = 0x00
22
- STR_FUNC_ESCAPE = 0x01 # TODO: remove and replace with REGEXP
23
- STR_FUNC_EXPAND = 0x02
24
- STR_FUNC_REGEXP = 0x04
25
- STR_FUNC_QWORDS = 0x08
26
- STR_FUNC_SYMBOL = 0x10
27
- STR_FUNC_INDENT = 0x20 # <<-HEREDOC
28
- STR_FUNC_ICNTNT = 0x40 # <<~HEREDOC
29
-
30
- STR_SQUOTE = STR_FUNC_BORING
31
- STR_DQUOTE = STR_FUNC_BORING | STR_FUNC_EXPAND
32
- STR_XQUOTE = STR_FUNC_BORING | STR_FUNC_EXPAND
33
- STR_REGEXP = STR_FUNC_REGEXP | STR_FUNC_ESCAPE | STR_FUNC_EXPAND
34
- STR_SSYM = STR_FUNC_SYMBOL
35
- STR_DSYM = STR_FUNC_SYMBOL | STR_FUNC_EXPAND
36
-
37
- EXPR_BEG_ANY = [:expr_beg, :expr_mid, :expr_class ]
38
- EXPR_ARG_ANY = [:expr_arg, :expr_cmdarg, ]
39
- EXPR_END_ANY = [:expr_end, :expr_endarg, :expr_endfn]
40
-
41
10
  ESCAPES = {
42
11
  "a" => "\007",
43
12
  "b" => "\010",
@@ -54,6 +23,8 @@ class RubyLexer
54
23
  "c\?" => 127.chr,
55
24
  }
56
25
 
26
+ HAS_ENC = "".respond_to? :encoding
27
+
57
28
  TOKENS = {
58
29
  "!" => :tBANG,
59
30
  "!=" => :tNEQ,
@@ -70,13 +41,26 @@ class RubyLexer
70
41
  "->" => :tLAMBDA,
71
42
  }
72
43
 
73
- TAB_WIDTH = 8
74
-
75
- @@regexp_cache = Hash.new { |h,k| h[k] = Regexp.new(Regexp.escape(k)) }
44
+ @@regexp_cache = Hash.new { |h, k| h[k] = Regexp.new(Regexp.escape(k)) }
76
45
  @@regexp_cache[nil] = nil
77
46
 
47
+ if $DEBUG then
48
+ attr_reader :lex_state
49
+
50
+ def lex_state= o
51
+ return if @lex_state == o
52
+ raise ArgumentError, "bad state: %p" % [o] unless State === o
53
+
54
+ warn "lex_state: %p -> %p" % [lex_state, o]
55
+
56
+ @lex_state = o
57
+ end
58
+ end
59
+
78
60
  # :startdoc:
79
61
 
62
+ attr_accessor :lex_state unless $DEBUG
63
+
80
64
  attr_accessor :lineno # we're bypassing oedipus' lineno handling.
81
65
  attr_accessor :brace_nest
82
66
  attr_accessor :cmdarg
@@ -90,7 +74,6 @@ class RubyLexer
90
74
  # Additional context surrounding tokens that both the lexer and
91
75
  # grammar use.
92
76
 
93
- attr_accessor :lex_state
94
77
  attr_accessor :lex_strterm
95
78
  attr_accessor :lpar_beg
96
79
  attr_accessor :paren_nest
@@ -99,24 +82,14 @@ class RubyLexer
99
82
  attr_accessor :string_buffer
100
83
  attr_accessor :string_nest
101
84
 
102
- if $DEBUG then
103
- alias lex_state= lex_state=
104
- def lex_state=o
105
- return if @lex_state == o
106
- c = caller.first
107
- c = caller[1] if c =~ /\bresult\b/
108
- warn "lex_state: %p -> %p from %s" % [@lex_state, o, c.clean_caller]
109
- @lex_state = o
110
- end
111
- end
112
-
113
85
  # Last token read via next_token.
114
86
  attr_accessor :token
115
87
 
116
88
  attr_writer :comments
117
89
 
118
90
  def initialize _ = nil
119
- @lex_state = :expr_none
91
+ @lex_state = nil # remove one warning under $DEBUG
92
+ self.lex_state = EXPR_NONE
120
93
 
121
94
  self.cond = RubyParserStuff::StackState.new(:cond, $DEBUG)
122
95
  self.cmdarg = RubyParserStuff::StackState.new(:cmdarg, $DEBUG)
@@ -125,17 +98,22 @@ class RubyLexer
125
98
  end
126
99
 
127
100
  def arg_ambiguous
128
- self.warning("Ambiguous first argument. make sure.")
101
+ self.warning "Ambiguous first argument. make sure."
129
102
  end
130
103
 
131
104
  def arg_state
132
- in_arg_state? ? :expr_arg : :expr_beg
105
+ is_after_operator? ? EXPR_ARG : EXPR_BEG
133
106
  end
134
107
 
135
108
  def beginning_of_line?
136
109
  ss.bol?
137
110
  end
138
- alias :bol? :beginning_of_line? # to make .rex file more readable
111
+
112
+ alias bol? beginning_of_line? # to make .rex file more readable
113
+
114
+ def check re
115
+ ss.check re
116
+ end
139
117
 
140
118
  def comments # TODO: remove this... maybe comment_string + attr_accessor
141
119
  c = @comments.join
@@ -143,30 +121,41 @@ class RubyLexer
143
121
  c
144
122
  end
145
123
 
124
+ def eat_whitespace
125
+ r = scan(/\s+/)
126
+ self.extra_lineno += r.count("\n") if r
127
+ r
128
+ end
129
+
146
130
  def end_of_stream?
147
131
  ss.eos?
148
132
  end
149
133
 
150
134
  def expr_dot?
151
- lex_state == :expr_dot
135
+ lex_state =~ EXPR_DOT
152
136
  end
153
137
 
154
- def expr_fname?
155
- lex_state == :expr_fname
138
+ def expr_fname? # REFACTOR
139
+ lex_state =~ EXPR_FNAME
156
140
  end
157
141
 
158
142
  def expr_result token, text
159
143
  cond.push false
160
144
  cmdarg.push false
161
- result :expr_beg, token, text
145
+ result EXPR_BEG, token, text
146
+ end
147
+
148
+ def fixup_lineno extra = 0
149
+ self.lineno += self.extra_lineno + extra
150
+ self.extra_lineno = 0
162
151
  end
163
152
 
164
153
  def heredoc here # TODO: rewrite / remove
165
154
  _, eos, func, last_line = here
166
155
 
167
- indent = (func & STR_FUNC_INDENT) != 0 ? "[ \t]*" : nil
168
- content_indent = (func & STR_FUNC_ICNTNT) != 0
169
- expand = (func & STR_FUNC_EXPAND) != 0
156
+ indent = func =~ STR_FUNC_INDENT ? "[ \t]*" : nil
157
+ expand = func =~ STR_FUNC_EXPAND
158
+ eol = last_line && last_line.end_with?("\r\n") ? "\r\n" : "\n"
170
159
  eos_re = /#{indent}#{Regexp.escape eos}(\r*\n|\z)/
171
160
  err_msg = "can't match #{eos_re.inspect} anywhere in "
172
161
 
@@ -175,30 +164,35 @@ class RubyLexer
175
164
  if beginning_of_line? && scan(eos_re) then
176
165
  self.lineno += 1
177
166
  ss.unread_many last_line # TODO: figure out how to remove this
178
- return :tSTRING_END, eos
167
+ return :tSTRING_END, [eos, func] # TODO: calculate squiggle width at lex?
179
168
  end
180
169
 
181
170
  self.string_buffer = []
182
171
 
183
172
  if expand then
184
173
  case
185
- when scan(/#[$@]/) then
186
- ss.pos -= 1 # FIX omg stupid
174
+ when scan(/#(?=\$(-.|[a-zA-Z_0-9~\*\$\?!@\/\\;,\.=:<>\"\&\`\'+]))/) then
175
+ # TODO: !ISASCII
176
+ # ?! see parser_peek_variable_name
177
+ return :tSTRING_DVAR, matched
178
+ when scan(/#(?=\@\@?[a-zA-Z_])/) then
179
+ # TODO: !ISASCII
187
180
  return :tSTRING_DVAR, matched
188
181
  when scan(/#[{]/) then
182
+ self.command_start = true
189
183
  return :tSTRING_DBEG, matched
190
184
  when scan(/#/) then
191
- string_buffer << '#'
185
+ string_buffer << "#"
192
186
  end
193
187
 
194
188
  begin
195
- c = tokadd_string func, "\n", nil
189
+ c = tokadd_string func, eol, nil
196
190
 
197
191
  rb_compile_error err_msg if
198
192
  c == RubyLexer::EOF
199
193
 
200
- if c != "\n" then
201
- return :tSTRING_CONTENT, string_buffer.join.delete("\r")
194
+ if c != eol then
195
+ return :tSTRING_CONTENT, string_buffer.join
202
196
  else
203
197
  string_buffer << scan(/\n/)
204
198
  end
@@ -214,64 +208,26 @@ class RubyLexer
214
208
 
215
209
  self.lex_strterm = [:heredoc, eos, func, last_line]
216
210
 
217
- string_content = string_buffer.join.delete("\r")
218
-
219
- string_content = heredoc_dedent(string_content) if content_indent && ruby23plus?
211
+ string_content = begin
212
+ s = string_buffer.join
213
+ s.b.force_encoding Encoding::UTF_8
214
+ end
220
215
 
221
216
  return :tSTRING_CONTENT, string_content
222
217
  end
223
218
 
224
- def heredoc_dedent(string_content)
225
- width = string_content.scan(/^[ \t]*(?=\S)/).map do |whitespace|
226
- heredoc_whitespace_indent_size whitespace
227
- end.min || 0
228
-
229
- string_content.split("\n", -1).map do |line|
230
- dedent_string line, width
231
- end.join "\n"
232
- end
233
-
234
- def dedent_string(string, width)
235
- characters_skipped = 0
236
- indentation_skipped = 0
237
-
238
- string.chars.each do |char|
239
- break if indentation_skipped >= width
240
- if char == ' '
241
- characters_skipped += 1
242
- indentation_skipped += 1
243
- elsif char == "\t"
244
- proposed = TAB_WIDTH * (indentation_skipped / TAB_WIDTH + 1)
245
- break if (proposed > width)
246
- characters_skipped += 1
247
- indentation_skipped = proposed
248
- end
249
- end
250
- string[characters_skipped..-1]
251
- end
252
-
253
- def heredoc_whitespace_indent_size(whitespace)
254
- whitespace.chars.inject 0 do |size, char|
255
- if char == "\t"
256
- size + TAB_WIDTH
257
- else
258
- size + 1
259
- end
260
- end
261
- end
262
-
263
219
  def heredoc_identifier # TODO: remove / rewrite
264
220
  term, func = nil, STR_FUNC_BORING
265
221
  self.string_buffer = []
266
222
 
267
- heredoc_indent_mods = '-'
223
+ heredoc_indent_mods = "-"
268
224
  heredoc_indent_mods += '\~' if ruby23plus?
269
225
 
270
226
  case
271
227
  when scan(/([#{heredoc_indent_mods}]?)([\'\"\`])(.*?)\2/) then
272
228
  term = ss[2]
273
- func |= STR_FUNC_INDENT unless ss[1].empty?
274
- func |= STR_FUNC_ICNTNT if ss[1] == '~'
229
+ func |= STR_FUNC_INDENT unless ss[1].empty? # TODO: this seems wrong
230
+ func |= STR_FUNC_ICNTNT if ss[1] == "~"
275
231
  func |= case term
276
232
  when "\'" then
277
233
  STR_SQUOTE
@@ -288,7 +244,7 @@ class RubyLexer
288
244
  func |= STR_DQUOTE
289
245
  unless ss[1].empty? then
290
246
  func |= STR_FUNC_INDENT
291
- func |= STR_FUNC_ICNTNT if ss[1] == '~'
247
+ func |= STR_FUNC_ICNTNT if ss[1] == "~"
292
248
  end
293
249
  string_buffer << ss[2]
294
250
  else
@@ -304,23 +260,15 @@ class RubyLexer
304
260
 
305
261
  self.lex_strterm = [:heredoc, string_buffer.join, func, line]
306
262
 
307
- if term == '`' then
263
+ if term == "`" then
308
264
  result nil, :tXSTRING_BEG, "`"
309
265
  else
310
266
  result nil, :tSTRING_BEG, "\""
311
267
  end
312
268
  end
313
269
 
314
- def in_fname?
315
- in_lex_state? :expr_fname
316
- end
317
-
318
- def in_arg_state? # TODO: rename is_after_operator?
319
- in_lex_state? :expr_fname, :expr_dot
320
- end
321
-
322
- def in_lex_state?(*states)
323
- states.include? lex_state
270
+ def in_fname? # REFACTOR
271
+ lex_state =~ EXPR_FNAME
324
272
  end
325
273
 
326
274
  def int_with_base base
@@ -328,42 +276,35 @@ class RubyLexer
328
276
 
329
277
  text = matched
330
278
  case
331
- when text.end_with?('ri')
332
- return result(:expr_end, :tIMAGINARY, Complex(0, Rational(text.chop.chop.to_i(base))))
333
- when text.end_with?('r')
334
- return result(:expr_end, :tRATIONAL, Rational(text.chop.to_i(base)))
335
- when text.end_with?('i')
336
- return result(:expr_end, :tIMAGINARY, Complex(0, text.chop.to_i(base)))
279
+ when text.end_with?("ri")
280
+ return result(EXPR_NUM, :tIMAGINARY, Complex(0, Rational(text.chop.chop.to_i(base))))
281
+ when text.end_with?("r")
282
+ return result(EXPR_NUM, :tRATIONAL, Rational(text.chop.to_i(base)))
283
+ when text.end_with?("i")
284
+ return result(EXPR_NUM, :tIMAGINARY, Complex(0, text.chop.to_i(base)))
337
285
  else
338
- return result(:expr_end, :tINTEGER, text.to_i(base))
286
+ return result(EXPR_NUM, :tINTEGER, text.to_i(base))
339
287
  end
340
288
  end
341
289
 
290
+ def is_after_operator?
291
+ lex_state =~ EXPR_FNAME|EXPR_DOT
292
+ end
293
+
342
294
  def is_arg?
343
- in_lex_state?(*EXPR_ARG_ANY)
295
+ lex_state =~ EXPR_ARG_ANY
344
296
  end
345
297
 
346
298
  def is_beg?
347
- # TODO: in_lex_state?(*EXPR_BEG_ANY) || lex_state == [:expr_arg, :expr_labeled]
348
- in_lex_state?(*EXPR_BEG_ANY, :expr_value, :expr_labeled)
299
+ lex_state =~ EXPR_BEG_ANY || lex_state == EXPR_LAB # yes, == EXPR_LAB
349
300
  end
350
301
 
351
302
  def is_end?
352
- in_lex_state?(*EXPR_END_ANY)
353
- end
354
-
355
- def lvar_defined? id
356
- # TODO: (dyna_in_block? && dvar_defined?(id)) || local_id?(id)
357
- self.parser.env[id.to_sym] == :lvar
358
- end
359
-
360
-
361
- def ruby22_label?
362
- ruby22plus? and is_label_possible?
303
+ lex_state =~ EXPR_END_ANY
363
304
  end
364
305
 
365
306
  def is_label_possible?
366
- (in_lex_state?(:expr_beg, :expr_endfn) && !cmd_state) || is_arg?
307
+ (lex_state =~ EXPR_LABEL|EXPR_ENDFN && !cmd_state) || is_arg?
367
308
  end
368
309
 
369
310
  def is_label_suffix?
@@ -378,6 +319,16 @@ class RubyLexer
378
319
  lpar_beg && lpar_beg == paren_nest
379
320
  end
380
321
 
322
+ def is_local_id id
323
+ # maybe just make this false for now
324
+ self.parser.env[id.to_sym] == :lvar # HACK: this isn't remotely right
325
+ end
326
+
327
+ def lvar_defined? id
328
+ # TODO: (dyna_in_block? && dvar_defined?(id)) || local_id?(id)
329
+ self.parser.env[id.to_sym] == :lvar
330
+ end
331
+
381
332
  def matched
382
333
  ss.matched
383
334
  end
@@ -386,11 +337,139 @@ class RubyLexer
386
337
  not is_end?
387
338
  end
388
339
 
340
+ def parse_quote # TODO: remove / rewrite
341
+ beg, nnd, short_hand, c = nil, nil, false, nil
342
+
343
+ if scan(/[a-z0-9]{1,2}/i) then # Long-hand (e.g. %Q{}).
344
+ rb_compile_error "unknown type of %string" if ss.matched_size == 2
345
+ c, beg, short_hand = matched, getch, false
346
+ else # Short-hand (e.g. %{, %., %!, etc)
347
+ c, beg, short_hand = "Q", getch, true
348
+ end
349
+
350
+ if end_of_stream? or c == RubyLexer::EOF or beg == RubyLexer::EOF then
351
+ rb_compile_error "unterminated quoted string meets end of file"
352
+ end
353
+
354
+ # Figure nnd-char. "\0" is special to indicate beg=nnd and that no nesting?
355
+ nnd = { "(" => ")", "[" => "]", "{" => "}", "<" => ">" }[beg]
356
+ nnd, beg = beg, "\0" if nnd.nil?
357
+
358
+ token_type, text = nil, "%#{c}#{beg}"
359
+ token_type, string_type = case c
360
+ when "Q" then
361
+ ch = short_hand ? nnd : c + beg
362
+ text = "%#{ch}"
363
+ [:tSTRING_BEG, STR_DQUOTE]
364
+ when "q" then
365
+ [:tSTRING_BEG, STR_SQUOTE]
366
+ when "W" then
367
+ eat_whitespace
368
+ [:tWORDS_BEG, STR_DQUOTE | STR_FUNC_QWORDS]
369
+ when "w" then
370
+ eat_whitespace
371
+ [:tQWORDS_BEG, STR_SQUOTE | STR_FUNC_QWORDS]
372
+ when "x" then
373
+ [:tXSTRING_BEG, STR_XQUOTE]
374
+ when "r" then
375
+ [:tREGEXP_BEG, STR_REGEXP]
376
+ when "s" then
377
+ self.lex_state = EXPR_FNAME
378
+ [:tSYMBEG, STR_SSYM]
379
+ when "I" then
380
+ eat_whitespace
381
+ [:tSYMBOLS_BEG, STR_DQUOTE | STR_FUNC_QWORDS]
382
+ when "i" then
383
+ eat_whitespace
384
+ [:tQSYMBOLS_BEG, STR_SQUOTE | STR_FUNC_QWORDS]
385
+ end
386
+
387
+ rb_compile_error "Bad %string type. Expected [QqWwIixrs], found '#{c}'." if
388
+ token_type.nil?
389
+
390
+ raise "huh" unless string_type
391
+
392
+ string string_type, nnd, beg
393
+
394
+ return token_type, text
395
+ end
396
+
397
+ def parse_string quote # TODO: rewrite / remove
398
+ _, string_type, term, open = quote
399
+
400
+ space = false # FIX: remove these
401
+ func = string_type
402
+ paren = open
403
+ term_re = @@regexp_cache[term]
404
+
405
+ qwords = func =~ STR_FUNC_QWORDS
406
+ regexp = func =~ STR_FUNC_REGEXP
407
+ expand = func =~ STR_FUNC_EXPAND
408
+
409
+ unless func then # nil'ed from qwords below. *sigh*
410
+ return :tSTRING_END, nil
411
+ end
412
+
413
+ space = true if qwords and eat_whitespace
414
+
415
+ if self.string_nest == 0 && scan(/#{term_re}/) then
416
+ if qwords then
417
+ quote[1] = nil
418
+ return :tSPACE, nil
419
+ elsif regexp then
420
+ return :tREGEXP_END, self.regx_options
421
+ else
422
+ return :tSTRING_END, term
423
+ end
424
+ end
425
+
426
+ return :tSPACE, nil if space
427
+
428
+ self.string_buffer = []
429
+
430
+ if expand
431
+ case
432
+ when scan(/#(?=\$(-.|[a-zA-Z_0-9~\*\$\?!@\/\\;,\.=:<>\"\&\`\'+]))/) then
433
+ # TODO: !ISASCII
434
+ # ?! see parser_peek_variable_name
435
+ return :tSTRING_DVAR, nil
436
+ when scan(/#(?=\@\@?[a-zA-Z_])/) then
437
+ # TODO: !ISASCII
438
+ return :tSTRING_DVAR, nil
439
+ when scan(/#[{]/) then
440
+ self.command_start = true
441
+ return :tSTRING_DBEG, nil
442
+ when scan(/#/) then
443
+ string_buffer << "#"
444
+ end
445
+ end
446
+
447
+ if tokadd_string(func, term, paren) == RubyLexer::EOF then
448
+ if func =~ STR_FUNC_REGEXP then
449
+ rb_compile_error "unterminated regexp meets end of file"
450
+ else
451
+ rb_compile_error "unterminated string meets end of file"
452
+ end
453
+ end
454
+
455
+ return :tSTRING_CONTENT, string_buffer.join
456
+ end
457
+
458
+ def possibly_escape_string text, check
459
+ content = match[1]
460
+
461
+ if text =~ check then
462
+ content.gsub(ESC) { unescape $1 }
463
+ else
464
+ content.gsub(/\\\\/, "\\").gsub(/\\\'/, "'")
465
+ end
466
+ end
467
+
389
468
  def process_amper text
390
469
  token = if is_arg? && space_seen && !check(/\s/) then
391
470
  warning("`&' interpreted as argument prefix")
392
471
  :tAMPER
393
- elsif in_lex_state? :expr_beg, :expr_mid then
472
+ elsif lex_state =~ EXPR_BEG|EXPR_MID then
394
473
  :tAMPER
395
474
  else
396
475
  :tAMPER2
@@ -402,7 +481,7 @@ class RubyLexer
402
481
  def process_backref text
403
482
  token = ss[1].to_sym
404
483
  # TODO: can't do lineno hack w/ symbol
405
- result :expr_end, :tBACK_REF, token
484
+ result EXPR_END, :tBACK_REF, token
406
485
  end
407
486
 
408
487
  def process_begin text
@@ -420,54 +499,33 @@ class RubyLexer
420
499
  end
421
500
 
422
501
  def process_brace_close text
423
- # matching compare/parse23.y:8561
424
- cond.lexpop
425
- cmdarg.lexpop
426
-
427
502
  case matched
428
503
  when "}" then
429
504
  self.brace_nest -= 1
430
- self.lex_state = :expr_endarg # TODO: :expr_end ? Look at 2.6
431
-
432
505
  return :tSTRING_DEND, matched if brace_nest < 0
506
+ end
507
+
508
+ # matching compare/parse26.y:8099
509
+ cond.pop
510
+ cmdarg.pop
511
+
512
+ case matched
513
+ when "}" then
514
+ self.lex_state = ruby24minus? ? EXPR_ENDARG : EXPR_END
433
515
  return :tRCURLY, matched
434
516
  when "]" then
435
517
  self.paren_nest -= 1
436
- self.lex_state = :expr_endarg
518
+ self.lex_state = ruby24minus? ? EXPR_ENDARG : EXPR_END
437
519
  return :tRBRACK, matched
438
520
  when ")" then
439
521
  self.paren_nest -= 1
440
- self.lex_state = :expr_endfn
522
+ self.lex_state = EXPR_ENDFN
441
523
  return :tRPAREN, matched
442
524
  else
443
525
  raise "Unknown bracing: #{matched.inspect}"
444
526
  end
445
527
  end
446
528
 
447
- def process_colon1 text
448
- # ?: / then / when
449
- if is_end? || check(/\s/) then
450
- return result :expr_beg, :tCOLON, text
451
- end
452
-
453
- case
454
- when scan(/\'/) then
455
- string STR_SSYM
456
- when scan(/\"/) then
457
- string STR_DSYM
458
- end
459
-
460
- result :expr_fname, :tSYMBEG, text
461
- end
462
-
463
- def process_colon2 text
464
- if is_beg? || in_lex_state?(:expr_class) || is_space_arg? then
465
- result :expr_beg, :tCOLON3, text
466
- else
467
- result :expr_dot, :tCOLON2, text
468
- end
469
- end
470
-
471
529
  def process_brace_open text
472
530
  # matching compare/parse23.y:8694
473
531
  self.brace_nest += 1
@@ -479,67 +537,111 @@ class RubyLexer
479
537
  return expr_result(:tLAMBEG, "{")
480
538
  end
481
539
 
482
- token = case lex_state
483
- when :expr_labeled then
540
+ token = case
541
+ when lex_state =~ EXPR_LABELED then
484
542
  :tLBRACE # hash
485
- when *EXPR_ARG_ANY, :expr_end, :expr_endfn then
486
- :tLCURLY # block (primary)
487
- when :expr_endarg
543
+ when lex_state =~ EXPR_ARG_ANY|EXPR_END|EXPR_ENDFN then
544
+ :tLCURLY # block (primary) "{" in parse.y
545
+ when lex_state =~ EXPR_ENDARG then
488
546
  :tLBRACE_ARG # block (expr)
489
547
  else
490
548
  :tLBRACE # hash
491
549
  end
492
550
 
493
- # TODO: self.lex_state |= :expr_label if token != :tLBRACE_ARG
551
+ state = token == :tLBRACE_ARG ? EXPR_BEG : EXPR_PAR
494
552
  self.command_start = true if token != :tLBRACE
495
553
 
496
- return expr_result(token, "{")
554
+ cond.push false
555
+ cmdarg.push false
556
+ result state, token, text
557
+ end
558
+
559
+ def process_colon1 text
560
+ # ?: / then / when
561
+ if is_end? || check(/\s/) then
562
+ return result EXPR_BEG, :tCOLON, text
563
+ end
564
+
565
+ case
566
+ when scan(/\'/) then
567
+ string STR_SSYM
568
+ when scan(/\"/) then
569
+ string STR_DSYM
570
+ end
571
+
572
+ result EXPR_FNAME, :tSYMBEG, text
573
+ end
574
+
575
+ def process_colon2 text
576
+ if is_beg? || lex_state =~ EXPR_CLASS || is_space_arg? then
577
+ result EXPR_BEG, :tCOLON3, text
578
+ else
579
+ result EXPR_DOT, :tCOLON2, text
580
+ end
497
581
  end
498
582
 
499
583
  def process_float text
500
584
  rb_compile_error "Invalid numeric format" if text =~ /__/
501
585
 
502
586
  case
503
- when text.end_with?('ri')
504
- return result(:expr_end, :tIMAGINARY, Complex(0, Rational(text.chop.chop)))
505
- when text.end_with?('r')
506
- return result(:expr_end, :tRATIONAL, Rational(text.chop))
507
- when text.end_with?('i')
508
- return result(:expr_end, :tIMAGINARY, Complex(0, text.chop.to_f))
587
+ when text.end_with?("ri")
588
+ return result EXPR_NUM, :tIMAGINARY, Complex(0, Rational(text.chop.chop))
589
+ when text.end_with?("i")
590
+ return result EXPR_NUM, :tIMAGINARY, Complex(0, text.chop.to_f)
591
+ when text.end_with?("r")
592
+ return result EXPR_NUM, :tRATIONAL, Rational(text.chop)
509
593
  else
510
- return result(:expr_end, :tFLOAT, text.to_f)
594
+ return result EXPR_NUM, :tFLOAT, text.to_f
511
595
  end
512
596
  end
513
597
 
514
598
  def process_gvar text
515
599
  text.lineno = self.lineno
516
- result(:expr_end, :tGVAR, text)
600
+ result EXPR_END, :tGVAR, text
517
601
  end
518
602
 
519
603
  def process_gvar_oddity text
520
- return result :expr_end, "$", "$" if text == "$" # TODO: wtf is this?
604
+ return result EXPR_END, "$", "$" if text == "$" # TODO: wtf is this?
521
605
  rb_compile_error "#{text.inspect} is not allowed as a global variable name"
522
606
  end
523
607
 
524
608
  def process_ivar text
525
609
  tok_id = text =~ /^@@/ ? :tCVAR : :tIVAR
526
610
  text.lineno = self.lineno
527
- return result(:expr_end, tok_id, text)
611
+ result EXPR_END, tok_id, text
612
+ end
613
+
614
+ def process_label text
615
+ symbol = possibly_escape_string text, /^\"/
616
+
617
+ result EXPR_LAB, :tLABEL, [symbol, self.lineno]
618
+ end
619
+
620
+ def process_label_or_string text
621
+ if @was_label && text =~ /:\Z/ then
622
+ @was_label = nil
623
+ return process_label text
624
+ elsif text =~ /:\Z/ then
625
+ ss.pos -= 1 # put back ":"
626
+ text = text[0..-2]
627
+ end
628
+
629
+ result EXPR_END, :tSTRING, text[1..-2].gsub(/\\\\/, "\\").gsub(/\\\'/, "\'")
528
630
  end
529
631
 
530
632
  def process_lchevron text
531
- if (!in_lex_state?(:expr_dot, :expr_class) &&
633
+ if (lex_state !~ EXPR_DOT|EXPR_CLASS &&
532
634
  !is_end? &&
533
- (!is_arg? || space_seen)) then # TODO: || in_state(:expr_labeled)
635
+ (!is_arg? || lex_state =~ EXPR_LABELED || space_seen)) then
534
636
  tok = self.heredoc_identifier
535
637
  return tok if tok
536
638
  end
537
639
 
538
- if in_arg_state? then
539
- self.lex_state = :expr_arg
640
+ if is_after_operator? then
641
+ self.lex_state = EXPR_ARG
540
642
  else
541
- self.command_start = true if lex_state == :expr_class
542
- self.lex_state = :expr_beg
643
+ self.command_start = true if lex_state =~ EXPR_CLASS
644
+ self.lex_state = EXPR_BEG
543
645
  end
544
646
 
545
647
  return result(lex_state, :tLSHFT, "\<\<")
@@ -549,14 +651,14 @@ class RubyLexer
549
651
  c = matched
550
652
  hit = false
551
653
 
552
- if c == '#' then
654
+ if c == "#" then
553
655
  ss.pos -= 1
554
656
 
555
657
  # TODO: handle magic comments
556
658
  while scan(/\s*\#.*(\n+|\z)/) do
557
659
  hit = true
558
660
  self.lineno += matched.lines.to_a.size
559
- @comments << matched.gsub(/^ +#/, '#').gsub(/^ +$/, '')
661
+ @comments << matched.gsub(/^ +#/, "#").gsub(/^ +$/, "")
560
662
  end
561
663
 
562
664
  return nil if end_of_stream?
@@ -567,17 +669,15 @@ class RubyLexer
567
669
  # Replace a string of newlines with a single one
568
670
  self.lineno += matched.lines.to_a.size if scan(/\n+/)
569
671
 
570
- # TODO: remove :expr_value -- audit all uses of it
571
- c = in_lex_state?(:expr_beg, :expr_value, :expr_class,
572
- :expr_fname, :expr_dot) && !in_lex_state?(:expr_labeled)
573
-
672
+ c = (lex_state =~ EXPR_BEG|EXPR_CLASS|EXPR_FNAME|EXPR_DOT &&
673
+ lex_state !~ EXPR_LABELED)
574
674
  # TODO: figure out what token_seen is for
575
- # TODO: if c || self.lex_state == [:expr_beg, :expr_labeled] then
576
- if c || self.lex_state == :expr_labeled then
675
+ if c || self.lex_state == EXPR_LAB then # yes, == EXPR_LAB
577
676
  # ignore if !fallthrough?
578
677
  if !c && parser.in_kwarg then
579
678
  # normal newline
580
- return result(:expr_beg, :tNL, nil)
679
+ self.command_start = true
680
+ return result EXPR_BEG, :tNL, nil
581
681
  else
582
682
  return # skip
583
683
  end
@@ -592,41 +692,46 @@ class RubyLexer
592
692
 
593
693
  self.command_start = true
594
694
 
595
- return result(:expr_beg, :tNL, nil)
695
+ return result(EXPR_BEG, :tNL, nil)
596
696
  end
597
697
 
598
698
  def process_nthref text
599
699
  # TODO: can't do lineno hack w/ number
600
- result :expr_end, :tNTH_REF, ss[1].to_i
700
+ result EXPR_END, :tNTH_REF, ss[1].to_i
601
701
  end
602
702
 
603
703
  def process_paren text
604
- token = process_paren19
704
+ token = if is_beg? then
705
+ :tLPAREN
706
+ elsif !space_seen then
707
+ # foo( ... ) => method call, no ambiguity
708
+ :tLPAREN2
709
+ elsif is_space_arg? then
710
+ :tLPAREN_ARG
711
+ elsif lex_state =~ EXPR_ENDFN && !lambda_beginning? then
712
+ # TODO:
713
+ # warn("parentheses after method name is interpreted as " \
714
+ # "an argument list, not a decomposed argument")
715
+ :tLPAREN2
716
+ else
717
+ :tLPAREN2 # plain "(" in parse.y
718
+ end
605
719
 
606
720
  self.paren_nest += 1
607
721
 
608
- # TODO: add :expr_label to :expr_beg (set in expr_result below)
609
- return expr_result(token, "(")
610
- end
611
-
612
- def process_paren19
613
- if is_beg? then
614
- :tLPAREN
615
- elsif is_space_arg? then
616
- :tLPAREN_ARG
617
- else
618
- :tLPAREN2 # plain '(' in parse.y
619
- end
722
+ cond.push false
723
+ cmdarg.push false
724
+ result EXPR_PAR, token, text
620
725
  end
621
726
 
622
727
  def process_percent text
623
728
  return parse_quote if is_beg?
624
729
 
625
- return result(:expr_beg, :tOP_ASGN, "%") if scan(/\=/)
730
+ return result EXPR_BEG, :tOP_ASGN, "%" if scan(/\=/)
626
731
 
627
- return parse_quote if is_arg? && space_seen && ! check(/\s/)
732
+ return parse_quote if is_space_arg?(check(/\s/)) || (lex_state =~ EXPR_FITEM && check(/s/))
628
733
 
629
- return result(:arg_state, :tPERCENT, "%")
734
+ return result :arg_state, :tPERCENT, "%"
630
735
  end
631
736
 
632
737
  def process_plus_minus text
@@ -637,33 +742,33 @@ class RubyLexer
637
742
  [:tUMINUS, :tMINUS]
638
743
  end
639
744
 
640
- if in_arg_state? then
745
+ if is_after_operator? then
641
746
  if scan(/@/) then
642
- return result(:expr_arg, utype, "#{sign}@")
747
+ return result(EXPR_ARG, utype, "#{sign}@")
643
748
  else
644
- return result(:expr_arg, type, sign)
749
+ return result(EXPR_ARG, type, sign)
645
750
  end
646
751
  end
647
752
 
648
- return result(:expr_beg, :tOP_ASGN, sign) if scan(/\=/)
753
+ return result(EXPR_BEG, :tOP_ASGN, sign) if scan(/\=/)
649
754
 
650
- if (is_beg? || (is_arg? && space_seen && !check(/\s/))) then
755
+ if is_beg? || (is_arg? && space_seen && !check(/\s/)) then
651
756
  arg_ambiguous if is_arg?
652
757
 
653
758
  if check(/\d/) then
654
759
  return nil if utype == :tUPLUS
655
- return result(:expr_beg, :tUMINUS_NUM, sign)
760
+ return result EXPR_BEG, :tUMINUS_NUM, sign
656
761
  end
657
762
 
658
- return result(:expr_beg, utype, sign)
763
+ return result EXPR_BEG, utype, sign
659
764
  end
660
765
 
661
- return result(:expr_beg, type, sign)
766
+ result EXPR_BEG, type, sign
662
767
  end
663
768
 
664
769
  def process_questionmark text
665
770
  if is_end? then
666
- return result(:expr_value, :tEH, "?")
771
+ return result EXPR_BEG, :tEH, "?"
667
772
  end
668
773
 
669
774
  if end_of_stream? then
@@ -672,12 +777,12 @@ class RubyLexer
672
777
 
673
778
  if check(/\s|\v/) then
674
779
  unless is_arg? then
675
- c2 = { " " => 's',
676
- "\n" => 'n',
677
- "\t" => 't',
678
- "\v" => 'v',
679
- "\r" => 'r',
680
- "\f" => 'f' }[matched]
780
+ c2 = { " " => "s",
781
+ "\n" => "n",
782
+ "\t" => "t",
783
+ "\v" => "v",
784
+ "\r" => "r",
785
+ "\f" => "f" }[matched]
681
786
 
682
787
  if c2 then
683
788
  warning("invalid character syntax; use ?\\" + c2)
@@ -685,18 +790,28 @@ class RubyLexer
685
790
  end
686
791
 
687
792
  # ternary
688
- return result(:expr_value, :tEH, "?")
793
+ return result EXPR_BEG, :tEH, "?"
689
794
  elsif check(/\w(?=\w)/) then # ternary, also
690
- return result(:expr_beg, :tEH, "?")
795
+ return result EXPR_BEG, :tEH, "?"
691
796
  end
692
797
 
693
798
  c = if scan(/\\/) then
694
799
  self.read_escape
695
800
  else
696
- ss.getch
801
+ getch
697
802
  end
698
803
 
699
- return result(:expr_end, :tSTRING, c)
804
+ result EXPR_END, :tSTRING, c
805
+ end
806
+
807
+ def process_simple_string text
808
+ replacement = text[1..-2].gsub(ESC) {
809
+ unescape($1).b.force_encoding Encoding::UTF_8
810
+ }
811
+
812
+ replacement = replacement.b unless replacement.valid_encoding?
813
+
814
+ result EXPR_END, :tSTRING, replacement
700
815
  end
701
816
 
702
817
  def process_slash text
@@ -707,7 +822,7 @@ class RubyLexer
707
822
  end
708
823
 
709
824
  if scan(/\=/) then
710
- return result(:expr_beg, :tOP_ASGN, "/")
825
+ return result(EXPR_BEG, :tOP_ASGN, "/")
711
826
  end
712
827
 
713
828
  if is_arg? && space_seen then
@@ -726,73 +841,68 @@ class RubyLexer
726
841
 
727
842
  token = nil
728
843
 
729
- if in_arg_state? then
844
+ if is_after_operator? then
730
845
  case
731
846
  when scan(/\]\=/) then
732
847
  self.paren_nest -= 1 # HACK? I dunno, or bug in MRI
733
- return result(:expr_arg, :tASET, "[]=")
848
+ return result EXPR_ARG, :tASET, "[]="
734
849
  when scan(/\]/) then
735
850
  self.paren_nest -= 1 # HACK? I dunno, or bug in MRI
736
- return result(:expr_arg, :tAREF, "[]")
851
+ return result EXPR_ARG, :tAREF, "[]"
737
852
  else
738
853
  rb_compile_error "unexpected '['"
739
854
  end
740
855
  elsif is_beg? then
741
856
  token = :tLBRACK
742
- elsif is_arg? && space_seen then
857
+ elsif is_arg? && (space_seen || lex_state =~ EXPR_LABELED) then
743
858
  token = :tLBRACK
744
859
  else
745
860
  token = :tLBRACK2
746
861
  end
747
862
 
748
- # TODO: this is done by expr_result except "|EXPR_LABEL")
749
- # SET_LEX_STATE(EXPR_BEG|EXPR_LABEL);
750
- expr_result token, "["
751
- end
752
-
753
- def possibly_escape_string text, check
754
- content = match[1]
755
-
756
- if text =~ check then
757
- content.gsub(ESC) { unescape $1 }
758
- else
759
- content.gsub(/\\\\/, "\\").gsub(/\\'/, "'")
760
- end
863
+ cond.push false
864
+ cmdarg.push false
865
+ result EXPR_PAR, token, text
761
866
  end
762
867
 
763
- def process_symbol text
764
- symbol = possibly_escape_string text, /^:"/
868
+ def process_string # TODO: rewrite / remove
869
+ # matches top of parser_yylex in compare/parse23.y:8113
870
+ token = if lex_strterm[0] == :heredoc then
871
+ self.heredoc lex_strterm
872
+ else
873
+ self.parse_string lex_strterm
874
+ end
765
875
 
766
- return result(:expr_end, :tSYMBOL, symbol)
767
- end
876
+ token_type, c = token
768
877
 
769
- def was_label?
770
- @was_label = ruby22_label?
771
- true
772
- end
878
+ # matches parser_string_term from 2.3, but way off from 2.5
879
+ if ruby22plus? && token_type == :tSTRING_END && ["'", '"'].include?(c) then
880
+ if ((lex_state =~ EXPR_BEG|EXPR_ENDFN &&
881
+ !cond.is_in_state) || is_arg?) &&
882
+ is_label_suffix? then
883
+ scan(/:/)
884
+ token_type = token[0] = :tLABEL_END
885
+ end
886
+ end
773
887
 
774
- def process_label_or_string text
775
- if @was_label && text =~ /:\Z/ then
776
- @was_label = nil
777
- return process_label text
778
- elsif text =~ /:\Z/ then
779
- ss.pos -= 1 # put back ":"
780
- text = text[0..-2]
888
+ if [:tSTRING_END, :tREGEXP_END, :tLABEL_END].include? token_type then
889
+ self.lex_strterm = nil
890
+ self.lex_state = (token_type == :tLABEL_END) ? EXPR_PAR : EXPR_LIT
781
891
  end
782
892
 
783
- result :expr_end, :tSTRING, text[1..-2].gsub(/\\\\/, "\\").gsub(/\\'/, "'")
893
+ return token
784
894
  end
785
895
 
786
- def process_label text
787
- symbol = possibly_escape_string text, /^"/
896
+ def process_symbol text
897
+ symbol = possibly_escape_string text, /^:\"/ # stupid emacs
788
898
 
789
- result(:expr_labeled, :tLABEL, [symbol, self.lineno]) # TODO: expr_arg|expr_labeled
899
+ result EXPR_LIT, :tSYMBOL, symbol
790
900
  end
791
901
 
792
902
  def process_token text
793
903
  # matching: parse_ident in compare/parse23.y:7989
794
904
  # TODO: make this always return [token, lineno]
795
- self.last_state = lex_state
905
+ # FIX: remove: self.last_state = lex_state
796
906
 
797
907
  token = self.token = text
798
908
  token << matched if scan(/[\!\?](?!=)/)
@@ -801,7 +911,7 @@ class RubyLexer
801
911
  case
802
912
  when token =~ /[!?]$/ then
803
913
  :tFID
804
- when in_lex_state?(:expr_fname) && scan(/=(?:(?![~>=])|(?==>))/) then
914
+ when lex_state =~ EXPR_FNAME && scan(/=(?:(?![~>=])|(?==>))/) then
805
915
  # ident=, not =~ => == or followed by =>
806
916
  # TODO test lexing of a=>b vs a==>b
807
917
  token << matched
@@ -814,31 +924,33 @@ class RubyLexer
814
924
 
815
925
  if is_label_possible? and is_label_suffix? then
816
926
  scan(/:/)
817
- # TODO: :expr_arg|:expr_labeled
818
- return result :expr_labeled, :tLABEL, [token, self.lineno]
927
+ # TODO: propagate the lineno to ALL results
928
+ return result EXPR_LAB, :tLABEL, [token, self.lineno]
819
929
  end
820
930
 
821
- # TODO: mb == ENC_CODERANGE_7BIT && !in_lex_state?(:expr_dot)
822
- unless in_lex_state? :expr_dot then
931
+ # TODO: mb == ENC_CODERANGE_7BIT && lex_state !~ EXPR_DOT
932
+ if lex_state !~ EXPR_DOT then
823
933
  # See if it is a reserved word.
824
934
  keyword = RubyParserStuff::Keyword.keyword token
825
935
 
826
936
  return process_token_keyword keyword if keyword
827
- end # unless in_lex_state? :expr_dot
937
+ end
828
938
 
829
939
  # matching: compare/parse23.y:8079
830
- state = if is_beg? or is_arg? or in_lex_state? :expr_dot then
831
- cmd_state ? :expr_cmdarg : :expr_arg
832
- elsif in_lex_state? :expr_fname then
833
- :expr_endfn
940
+ state = if is_beg? or is_arg? or lex_state =~ EXPR_DOT then
941
+ cmd_state ? EXPR_CMDARG : EXPR_ARG
942
+ elsif lex_state =~ EXPR_FNAME then
943
+ EXPR_ENDFN
834
944
  else
835
- :expr_end
945
+ EXPR_END
836
946
  end
837
947
 
838
- if not [:expr_dot, :expr_fname].include? last_state and
839
- (tok_id == :tIDENTIFIER) and # not :expr_fname, not attrasgn
948
+ tok_id = :tIDENTIFIER if tok_id == :tCONSTANT && is_local_id(token)
949
+
950
+ if last_state !~ EXPR_DOT|EXPR_FNAME and
951
+ (tok_id == :tIDENTIFIER) and # not EXPR_FNAME, not attrasgn
840
952
  lvar_defined?(token) then
841
- state = :expr_end # TODO: EXPR_END|EXPR_LABEL
953
+ state = EXPR_END|EXPR_LABEL
842
954
  end
843
955
 
844
956
  token.lineno = self.lineno # yes, on a string. I know... I know...
@@ -853,32 +965,30 @@ class RubyLexer
853
965
 
854
966
  value = [token, self.lineno]
855
967
 
856
- return result(lex_state, keyword.id0, value) if state == :expr_fname
968
+ return result(lex_state, keyword.id0, value) if state =~ EXPR_FNAME
857
969
 
858
- self.command_start = true if lex_state == :expr_beg
970
+ self.command_start = true if lex_state =~ EXPR_BEG
859
971
 
860
972
  case
861
- when keyword.id0 == :kDO then
973
+ when keyword.id0 == :kDO then # parse26.y line 7591
862
974
  case
863
975
  when lambda_beginning? then
864
976
  self.lpar_beg = nil # lambda_beginning? == FALSE in the body of "-> do ... end"
865
- self.paren_nest -= 1
866
- result(lex_state, :kDO_LAMBDA, value)
977
+ self.paren_nest -= 1 # TODO: question this?
978
+ result lex_state, :kDO_LAMBDA, value
867
979
  when cond.is_in_state then
868
- result(lex_state, :kDO_COND, value)
869
- when cmdarg.is_in_state && state != :expr_cmdarg then
870
- result(lex_state, :kDO_BLOCK, value)
871
- when [:expr_beg, :expr_endarg].include?(state) then
872
- result(lex_state, :kDO_BLOCK, value)
980
+ result lex_state, :kDO_COND, value
981
+ when cmdarg.is_in_state && state != EXPR_CMDARG then
982
+ result lex_state, :kDO_BLOCK, value
873
983
  else
874
- result(lex_state, :kDO, value)
984
+ result lex_state, :kDO, value
875
985
  end
876
- when [:expr_beg, :expr_labeled].include?(state) then
877
- result(lex_state, keyword.id0, value)
986
+ when state =~ EXPR_PAD then
987
+ result lex_state, keyword.id0, value
878
988
  when keyword.id0 != keyword.id1 then
879
- result(:expr_beg, keyword.id1, value) # TODO: :expr_beg|:expr_label
989
+ result EXPR_PAR, keyword.id1, value
880
990
  else
881
- result(lex_state, keyword.id1, value)
991
+ result lex_state, keyword.id1, value
882
992
  end
883
993
  end
884
994
 
@@ -886,9 +996,9 @@ class RubyLexer
886
996
  ss.unscan # put back "_"
887
997
 
888
998
  if beginning_of_line? && scan(/\__END__(\r?\n|\Z)/) then
889
- return [RubyLexer::EOF, RubyLexer::EOF]
890
- elsif scan(/\_\w*/) then
891
- return process_token matched
999
+ [RubyLexer::EOF, RubyLexer::EOF]
1000
+ elsif scan(/#{IDENT_CHAR}+/) then
1001
+ process_token matched
892
1002
  end
893
1003
  end
894
1004
 
@@ -921,10 +1031,11 @@ class RubyLexer
921
1031
  when scan(/s/) then # space
922
1032
  " "
923
1033
  when scan(/[0-7]{1,3}/) then # octal constant
924
- (matched.to_i(8) & 0xFF).chr
1034
+ (matched.to_i(8) & 0xFF).chr.force_encoding Encoding::UTF_8
925
1035
  when scan(/x([0-9a-fA-F]{1,2})/) then # hex constant
926
- ss[1].to_i(16).chr
927
- when check(/M-\\[\\MCc]/) then
1036
+ # TODO: force encode everything to UTF-8?
1037
+ ss[1].to_i(16).chr.force_encoding Encoding::UTF_8
1038
+ when check(/M-\\./) then
928
1039
  scan(/M-\\/) # eat it
929
1040
  c = self.read_escape
930
1041
  c[0] = (c[0].ord | 0x80).chr
@@ -938,6 +1049,11 @@ class RubyLexer
938
1049
  c = self.read_escape
939
1050
  c[0] = (c[0].ord & 0x9f).chr
940
1051
  c
1052
+ when check(/(C-|c)\\(?!u|\\)/) then
1053
+ scan(/(C-|c)\\/) # eat it
1054
+ c = read_escape
1055
+ c[0] = (c[0].ord & 0x9f).chr
1056
+ c
941
1057
  when scan(/C-\?|c\?/) then
942
1058
  127.chr
943
1059
  when scan(/(C-|c)(.)/) then
@@ -946,15 +1062,25 @@ class RubyLexer
946
1062
  c
947
1063
  when scan(/^[89]/i) then # bad octal or hex... MRI ignores them :(
948
1064
  matched
949
- when scan(/u([0-9a-fA-F]{2,4}|\{[0-9a-fA-F]{2,6}\})/) then
950
- [ss[1].delete("{}").to_i(16)].pack("U")
1065
+ when scan(/u(\h{4})/) then
1066
+ [ss[1].to_i(16)].pack("U")
1067
+ when scan(/u(\h{1,3})/) then
1068
+ rb_compile_error "Invalid escape character syntax"
1069
+ when scan(/u\{(\h+(?:\s+\h+)*)\}/) then
1070
+ ss[1].split.map { |s| s.to_i(16) }.pack("U*")
951
1071
  when scan(/[McCx0-9]/) || end_of_stream? then
952
1072
  rb_compile_error("Invalid escape character syntax")
953
1073
  else
954
- ss.getch
1074
+ getch
955
1075
  end.dup
956
1076
  end
957
1077
 
1078
+ def getch
1079
+ c = ss.getch
1080
+ c = ss.getch if c == "\r" && ss.peek(1) == "\n"
1081
+ c
1082
+ end
1083
+
958
1084
  def regx_options # TODO: rewrite / remove
959
1085
  good, bad = [], []
960
1086
 
@@ -974,7 +1100,7 @@ class RubyLexer
974
1100
  self.brace_nest = 0
975
1101
  self.command_start = true
976
1102
  self.comments = []
977
- self.lex_state = :expr_none
1103
+ self.lex_state = EXPR_NONE
978
1104
  self.lex_strterm = nil
979
1105
  self.lineno = 1
980
1106
  self.lpar_beg = nil
@@ -988,29 +1114,30 @@ class RubyLexer
988
1114
  self.cmdarg.reset
989
1115
  end
990
1116
 
991
- def result lex_state, token, text # :nodoc:
992
- lex_state = self.arg_state if lex_state == :arg_state
993
- self.lex_state = lex_state if lex_state
1117
+ def result new_state, token, text # :nodoc:
1118
+ new_state = self.arg_state if new_state == :arg_state
1119
+ self.lex_state = new_state if new_state
994
1120
  [token, text]
995
1121
  end
996
1122
 
997
- def scan re
998
- ss.scan re
1123
+ def ruby22_label?
1124
+ ruby22plus? and is_label_possible?
999
1125
  end
1000
1126
 
1001
- def check re
1002
- ss.check re
1127
+ def ruby22plus?
1128
+ parser.class.version >= 22
1003
1129
  end
1004
1130
 
1005
- def eat_whitespace
1006
- r = scan(/\s+/)
1007
- self.extra_lineno += r.count("\n") if r
1008
- r
1131
+ def ruby23plus?
1132
+ parser.class.version >= 23
1009
1133
  end
1010
1134
 
1011
- def fixup_lineno extra = 0
1012
- self.lineno += self.extra_lineno + extra
1013
- self.extra_lineno = 0
1135
+ def ruby24minus?
1136
+ parser.class.version <= 24
1137
+ end
1138
+
1139
+ def scan re
1140
+ ss.scan re
1014
1141
  end
1015
1142
 
1016
1143
  def scanner_class # TODO: design this out of oedipus_lex. or something.
@@ -1033,12 +1160,6 @@ class RubyLexer
1033
1160
  self.lex_strterm = [:strterm, type, beg, nnd]
1034
1161
  end
1035
1162
 
1036
- # TODO: consider
1037
- # def src= src
1038
- # raise "bad src: #{src.inspect}" unless String === src
1039
- # @src = RPStringScanner.new(src)
1040
- # end
1041
-
1042
1163
  def tokadd_escape term # TODO: rewrite / remove
1043
1164
  case
1044
1165
  when scan(/\\\n/) then
@@ -1057,8 +1178,10 @@ class RubyLexer
1057
1178
  prev = self.string_buffer.last
1058
1179
  if term == chr && prev && prev.end_with?("(?") then
1059
1180
  self.string_buffer << chr
1181
+ elsif term == chr || chr.ascii_only? then
1182
+ self.string_buffer << matched # dunno why we keep them for ascii
1060
1183
  else
1061
- self.string_buffer << matched
1184
+ self.string_buffer << chr # HACK? this is such a rat's nest
1062
1185
  end
1063
1186
  else
1064
1187
  rb_compile_error "Invalid escape character syntax"
@@ -1066,22 +1189,24 @@ class RubyLexer
1066
1189
  end
1067
1190
 
1068
1191
  def tokadd_string(func, term, paren) # TODO: rewrite / remove
1069
- qwords = (func & STR_FUNC_QWORDS) != 0
1070
- escape = (func & STR_FUNC_ESCAPE) != 0
1071
- expand = (func & STR_FUNC_EXPAND) != 0
1072
- regexp = (func & STR_FUNC_REGEXP) != 0
1073
- symbol = (func & STR_FUNC_SYMBOL) != 0
1192
+ qwords = func =~ STR_FUNC_QWORDS
1193
+ escape = func =~ STR_FUNC_ESCAPE
1194
+ expand = func =~ STR_FUNC_EXPAND
1195
+ regexp = func =~ STR_FUNC_REGEXP
1196
+ symbol = func =~ STR_FUNC_SYMBOL
1074
1197
 
1075
1198
  paren_re = @@regexp_cache[paren]
1076
- term_re = @@regexp_cache[term]
1199
+ term_re = if term == "\n"
1200
+ /#{Regexp.escape "\r"}?#{Regexp.escape "\n"}/
1201
+ else
1202
+ @@regexp_cache[term]
1203
+ end
1077
1204
 
1078
1205
  until end_of_stream? do
1079
1206
  c = nil
1080
1207
  handled = true
1081
1208
 
1082
1209
  case
1083
- when paren_re && scan(paren_re) then
1084
- self.string_nest += 1
1085
1210
  when scan(term_re) then
1086
1211
  if self.string_nest == 0 then
1087
1212
  ss.pos -= 1
@@ -1089,7 +1214,9 @@ class RubyLexer
1089
1214
  else
1090
1215
  self.string_nest -= 1
1091
1216
  end
1092
- when expand && scan(/#(?=[\$\@\{])/) then
1217
+ when paren_re && scan(paren_re) then
1218
+ self.string_nest += 1
1219
+ when expand && scan(/#(?=[\$\@\{])/) then # TODO: this seems wrong
1093
1220
  ss.pos -= 1
1094
1221
  break
1095
1222
  when qwords && scan(/\s/) then
@@ -1103,7 +1230,7 @@ class RubyLexer
1103
1230
  string_buffer << "\n"
1104
1231
  next
1105
1232
  when qwords && scan(/\\\s/) then
1106
- c = ' '
1233
+ c = " "
1107
1234
  when expand && scan(/\\\n/) then
1108
1235
  next
1109
1236
  when regexp && check(/\\/) then
@@ -1128,12 +1255,16 @@ class RubyLexer
1128
1255
  end # top case
1129
1256
 
1130
1257
  unless handled then
1131
- t = Regexp.escape term
1132
- x = Regexp.escape(paren) if paren && paren != "\000"
1258
+ t = if term == "\n"
1259
+ Regexp.escape "\r\n"
1260
+ else
1261
+ Regexp.escape term
1262
+ end
1263
+ x = Regexp.escape paren if paren && paren != "\000"
1133
1264
  re = if qwords then
1134
- /[^#{t}#{x}\#\0\\\s]+|./ # |. to pick up whatever
1265
+ /[^#{t}#{x}\#\\\s]+|./ # |. to pick up whatever
1135
1266
  else
1136
- /[^#{t}#{x}\#\0\\]+|./
1267
+ /[^#{t}#{x}\#\\]+|./
1137
1268
  end
1138
1269
 
1139
1270
  scan re
@@ -1173,12 +1304,15 @@ class RubyLexer
1173
1304
  s
1174
1305
  when /^[McCx0-9]/ then
1175
1306
  rb_compile_error("Invalid escape character syntax")
1176
- when /u([0-9a-fA-F]{2,4}|\{[0-9a-fA-F]{2,6}\})/ then
1307
+ when /u(\h{4})/ then
1177
1308
  [$1.delete("{}").to_i(16)].pack("U")
1309
+ when /u(\h{1,3})/ then
1310
+ rb_compile_error("Invalid escape character syntax")
1311
+ when /u\{(\h+(?:\s+\h+)*)\}/ then
1312
+ $1.split.map { |s| s.to_i(16) }.pack("U*")
1178
1313
  else
1179
1314
  s
1180
1315
  end
1181
- x.force_encoding "UTF-8" if HAS_ENC
1182
1316
  x
1183
1317
  end
1184
1318
 
@@ -1186,172 +1320,154 @@ class RubyLexer
1186
1320
  # do nothing for now
1187
1321
  end
1188
1322
 
1189
- def ruby22plus?
1190
- parser.class.version >= 22
1191
- end
1192
-
1193
- def ruby23plus?
1194
- parser.class.version >= 23
1323
+ def was_label?
1324
+ @was_label = ruby22_label?
1325
+ true
1195
1326
  end
1196
1327
 
1197
- def process_string # TODO: rewrite / remove
1198
- # matches top of parser_yylex in compare/parse23.y:8113
1199
- token = if lex_strterm[0] == :heredoc then
1200
- self.heredoc lex_strterm
1201
- else
1202
- self.parse_string lex_strterm
1203
- end
1328
+ class State
1329
+ attr_accessor :n
1330
+ attr_accessor :names
1204
1331
 
1205
- token_type, c = token
1332
+ # TODO: take a shared hash of strings for inspect/to_s
1333
+ def initialize o, names
1334
+ raise ArgumentError, "bad state: %p" % [o] unless Integer === o # TODO: remove
1206
1335
 
1207
- # matches parser_string_term
1208
- if ruby22plus? && token_type == :tSTRING_END && ["'", '"'].include?(c) then
1209
- if (([:expr_beg, :expr_endfn].include?(lex_state) &&
1210
- !cond.is_in_state) || is_arg?) &&
1211
- is_label_suffix? then
1212
- scan(/:/)
1213
- token_type = token[0] = :tLABEL_END
1214
- end
1336
+ self.n = o
1337
+ self.names = names
1215
1338
  end
1216
1339
 
1217
- if [:tSTRING_END, :tREGEXP_END, :tLABEL_END].include? token_type then
1218
- self.lex_strterm = nil
1219
- # TODO: :expr_beg|:expr_label
1220
- self.lex_state = (token_type == :tLABEL_END) ? :expr_label : :expr_end
1340
+ def == o
1341
+ self.equal?(o) || (o.class == self.class && o.n == self.n)
1221
1342
  end
1222
1343
 
1223
- return token
1224
- end
1225
-
1226
- def parse_quote # TODO: remove / rewrite
1227
- beg, nnd, short_hand, c = nil, nil, false, nil
1228
-
1229
- if scan(/[a-z0-9]{1,2}/i) then # Long-hand (e.g. %Q{}).
1230
- rb_compile_error "unknown type of %string" if ss.matched_size == 2
1231
- c, beg, short_hand = matched, ss.getch, false
1232
- else # Short-hand (e.g. %{, %., %!, etc)
1233
- c, beg, short_hand = 'Q', ss.getch, true
1234
- end
1235
-
1236
- if end_of_stream? or c == RubyLexer::EOF or beg == RubyLexer::EOF then
1237
- rb_compile_error "unterminated quoted string meets end of file"
1238
- end
1239
-
1240
- # Figure nnd-char. "\0" is special to indicate beg=nnd and that no nesting?
1241
- nnd = { "(" => ")", "[" => "]", "{" => "}", "<" => ">" }[beg]
1242
- nnd, beg = beg, "\0" if nnd.nil?
1243
-
1244
- token_type, text = nil, "%#{c}#{beg}"
1245
- token_type, string_type = case c
1246
- when 'Q' then
1247
- ch = short_hand ? nnd : c + beg
1248
- text = "%#{ch}"
1249
- [:tSTRING_BEG, STR_DQUOTE]
1250
- when 'q' then
1251
- [:tSTRING_BEG, STR_SQUOTE]
1252
- when 'W' then
1253
- eat_whitespace
1254
- [:tWORDS_BEG, STR_DQUOTE | STR_FUNC_QWORDS]
1255
- when 'w' then
1256
- eat_whitespace
1257
- [:tQWORDS_BEG, STR_SQUOTE | STR_FUNC_QWORDS]
1258
- when 'x' then
1259
- [:tXSTRING_BEG, STR_XQUOTE]
1260
- when 'r' then
1261
- [:tREGEXP_BEG, STR_REGEXP]
1262
- when 's' then
1263
- self.lex_state = :expr_fname
1264
- [:tSYMBEG, STR_SSYM]
1265
- when 'I' then
1266
- eat_whitespace
1267
- [:tSYMBOLS_BEG, STR_DQUOTE | STR_FUNC_QWORDS]
1268
- when 'i' then
1269
- eat_whitespace
1270
- [:tQSYMBOLS_BEG, STR_SQUOTE | STR_FUNC_QWORDS]
1271
- end
1272
-
1273
- rb_compile_error "Bad %string type. Expected [QqWwIixrs], found '#{c}'." if
1274
- token_type.nil?
1275
-
1276
- raise "huh" unless string_type
1277
-
1278
- string string_type, nnd, beg
1279
-
1280
- return token_type, text
1281
- end
1282
-
1283
- def parse_string quote # TODO: rewrite / remove
1284
- _, string_type, term, open = quote
1285
-
1286
- space = false # FIX: remove these
1287
- func = string_type
1288
- paren = open
1289
- term_re = @@regexp_cache[term]
1290
-
1291
- qwords = (func & STR_FUNC_QWORDS) != 0
1292
- regexp = (func & STR_FUNC_REGEXP) != 0
1293
- expand = (func & STR_FUNC_EXPAND) != 0
1294
-
1295
- unless func then # nil'ed from qwords below. *sigh*
1296
- return :tSTRING_END, nil
1344
+ def =~ v
1345
+ (self.n & v.n) != 0
1297
1346
  end
1298
1347
 
1299
- space = true if qwords and eat_whitespace
1300
-
1301
- if self.string_nest == 0 && scan(/#{term_re}/) then
1302
- if qwords then
1303
- quote[1] = nil
1304
- return :tSPACE, nil
1305
- elsif regexp then
1306
- return :tREGEXP_END, self.regx_options
1307
- else
1308
- return :tSTRING_END, term
1309
- end
1348
+ def | v
1349
+ raise ArgumentError, "Incompatible State: %p vs %p" % [self, v] unless
1350
+ self.names == v.names
1351
+ self.class.new(self.n | v.n, self.names)
1310
1352
  end
1311
1353
 
1312
- return :tSPACE, nil if space
1354
+ def inspect
1355
+ return "Value(0)" if n.zero? # HACK?
1313
1356
 
1314
- self.string_buffer = []
1315
-
1316
- if expand
1317
- case
1318
- when scan(/#(?=\$(-.|[a-zA-Z_0-9~\*\$\?!@\/\\;,\.=:<>\"\&\`\'+]))/) then
1319
- # TODO: !ISASCII
1320
- # ?! see parser_peek_variable_name
1321
- return :tSTRING_DVAR, nil
1322
- when scan(/#(?=\@\@?[a-zA-Z_])/) then
1323
- # TODO: !ISASCII
1324
- return :tSTRING_DVAR, nil
1325
- when scan(/#[{]/) then
1326
- self.command_start = true
1327
- return :tSTRING_DBEG, nil
1328
- when scan(/#/) then
1329
- string_buffer << '#'
1330
- end
1357
+ names.map { |v, k| k if self =~ v }.
1358
+ compact.
1359
+ join("|").
1360
+ gsub(/(?:EXPR_|STR_(?:FUNC_)?)/, "")
1331
1361
  end
1332
1362
 
1333
- if tokadd_string(func, term, paren) == RubyLexer::EOF then
1334
- rb_compile_error "unterminated string meets end of file"
1363
+ alias to_s inspect
1364
+
1365
+ module Values
1366
+ expr_names = {}
1367
+
1368
+ EXPR_NONE = State.new 0x0, expr_names
1369
+ EXPR_BEG = State.new 0x1, expr_names
1370
+ EXPR_END = State.new 0x2, expr_names
1371
+ EXPR_ENDARG = State.new 0x4, expr_names
1372
+ EXPR_ENDFN = State.new 0x8, expr_names
1373
+ EXPR_ARG = State.new 0x10, expr_names
1374
+ EXPR_CMDARG = State.new 0x20, expr_names
1375
+ EXPR_MID = State.new 0x40, expr_names
1376
+ EXPR_FNAME = State.new 0x80, expr_names
1377
+ EXPR_DOT = State.new 0x100, expr_names
1378
+ EXPR_CLASS = State.new 0x200, expr_names
1379
+ EXPR_LABEL = State.new 0x400, expr_names
1380
+ EXPR_LABELED = State.new 0x800, expr_names
1381
+ EXPR_FITEM = State.new 0x1000, expr_names
1382
+
1383
+ EXPR_BEG_ANY = EXPR_BEG | EXPR_MID | EXPR_CLASS
1384
+ EXPR_ARG_ANY = EXPR_ARG | EXPR_CMDARG
1385
+ EXPR_END_ANY = EXPR_END | EXPR_ENDARG | EXPR_ENDFN
1386
+
1387
+ # extra fake lex_state names to make things a bit cleaner
1388
+
1389
+ EXPR_LAB = EXPR_ARG|EXPR_LABELED
1390
+ EXPR_LIT = EXPR_END|EXPR_ENDARG
1391
+ EXPR_PAR = EXPR_BEG|EXPR_LABEL
1392
+ EXPR_PAD = EXPR_BEG|EXPR_LABELED
1393
+
1394
+ EXPR_NUM = EXPR_LIT
1395
+
1396
+ expr_names.merge!(EXPR_NONE => "EXPR_NONE",
1397
+ EXPR_BEG => "EXPR_BEG",
1398
+ EXPR_END => "EXPR_END",
1399
+ EXPR_ENDARG => "EXPR_ENDARG",
1400
+ EXPR_ENDFN => "EXPR_ENDFN",
1401
+ EXPR_ARG => "EXPR_ARG",
1402
+ EXPR_CMDARG => "EXPR_CMDARG",
1403
+ EXPR_MID => "EXPR_MID",
1404
+ EXPR_FNAME => "EXPR_FNAME",
1405
+ EXPR_DOT => "EXPR_DOT",
1406
+ EXPR_CLASS => "EXPR_CLASS",
1407
+ EXPR_LABEL => "EXPR_LABEL",
1408
+ EXPR_LABELED => "EXPR_LABELED",
1409
+ EXPR_FITEM => "EXPR_FITEM")
1410
+
1411
+ # ruby constants for strings
1412
+
1413
+ str_func_names = {}
1414
+
1415
+ STR_FUNC_BORING = State.new 0x00, str_func_names
1416
+ STR_FUNC_ESCAPE = State.new 0x01, str_func_names
1417
+ STR_FUNC_EXPAND = State.new 0x02, str_func_names
1418
+ STR_FUNC_REGEXP = State.new 0x04, str_func_names
1419
+ STR_FUNC_QWORDS = State.new 0x08, str_func_names
1420
+ STR_FUNC_SYMBOL = State.new 0x10, str_func_names
1421
+ STR_FUNC_INDENT = State.new 0x20, str_func_names # <<-HEREDOC
1422
+ STR_FUNC_LABEL = State.new 0x40, str_func_names
1423
+ STR_FUNC_LIST = State.new 0x4000, str_func_names
1424
+ STR_FUNC_TERM = State.new 0x8000, str_func_names
1425
+ STR_FUNC_ICNTNT = State.new 0x10000, str_func_names # <<~HEREDOC -- TODO: remove?
1426
+
1427
+ # TODO: check parser25.y on how they do STR_FUNC_INDENT
1428
+
1429
+ STR_SQUOTE = STR_FUNC_BORING
1430
+ STR_DQUOTE = STR_FUNC_EXPAND
1431
+ STR_XQUOTE = STR_FUNC_EXPAND
1432
+ STR_REGEXP = STR_FUNC_REGEXP | STR_FUNC_ESCAPE | STR_FUNC_EXPAND
1433
+ STR_SWORD = STR_FUNC_QWORDS | STR_FUNC_LIST
1434
+ STR_DWORD = STR_FUNC_QWORDS | STR_FUNC_EXPAND | STR_FUNC_LIST
1435
+ STR_SSYM = STR_FUNC_SYMBOL
1436
+ STR_DSYM = STR_FUNC_SYMBOL | STR_FUNC_EXPAND
1437
+
1438
+ str_func_names.merge!(STR_FUNC_ESCAPE => "STR_FUNC_ESCAPE",
1439
+ STR_FUNC_EXPAND => "STR_FUNC_EXPAND",
1440
+ STR_FUNC_REGEXP => "STR_FUNC_REGEXP",
1441
+ STR_FUNC_QWORDS => "STR_FUNC_QWORDS",
1442
+ STR_FUNC_SYMBOL => "STR_FUNC_SYMBOL",
1443
+ STR_FUNC_INDENT => "STR_FUNC_INDENT",
1444
+ STR_FUNC_LABEL => "STR_FUNC_LABEL",
1445
+ STR_FUNC_LIST => "STR_FUNC_LIST",
1446
+ STR_FUNC_TERM => "STR_FUNC_TERM",
1447
+ STR_FUNC_ICNTNT => "STR_FUNC_ICNTNT",
1448
+ STR_SQUOTE => "STR_SQUOTE")
1335
1449
  end
1336
1450
 
1337
- return :tSTRING_CONTENT, string_buffer.join
1451
+ include Values
1338
1452
  end
1453
+
1454
+ include State::Values
1339
1455
  end
1340
1456
 
1341
1457
  require "ruby_lexer.rex"
1342
1458
 
1343
1459
  if ENV["RP_LINENO_DEBUG"] then
1344
1460
  class RubyLexer
1345
- alias :old_lineno= :lineno=
1346
-
1347
1461
  def d o
1348
1462
  $stderr.puts o.inspect
1349
1463
  end
1350
1464
 
1465
+ alias old_lineno= lineno=
1466
+
1351
1467
  def lineno= n
1352
1468
  self.old_lineno= n
1353
1469
  where = caller.first.split(/:/).first(2).join(":")
1354
- d :lineno => [n, where, ss && ss.rest[0,40]]
1470
+ d :lineno => [n, where, ss && ss.rest[0, 40]]
1355
1471
  end
1356
1472
  end
1357
1473
  end