ruby_parser 3.15.0 → 3.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/ruby_lexer.rb CHANGED
@@ -25,10 +25,15 @@ class RubyLexer
25
25
 
26
26
  HAS_ENC = "".respond_to? :encoding
27
27
 
28
+ BTOKENS = {
29
+ ".." => :tBDOT2,
30
+ "..." => :tBDOT3,
31
+ }
32
+
28
33
  TOKENS = {
29
34
  "!" => :tBANG,
30
35
  "!=" => :tNEQ,
31
- # "!@" => :tUBANG,
36
+ "!@" => :tBANG,
32
37
  "!~" => :tNMATCH,
33
38
  "," => :tCOMMA,
34
39
  ".." => :tDOT2,
@@ -41,17 +46,38 @@ class RubyLexer
41
46
  "->" => :tLAMBDA,
42
47
  }
43
48
 
49
+ PERCENT_END = {
50
+ "(" => ")",
51
+ "[" => "]",
52
+ "{" => "}",
53
+ "<" => ">",
54
+ }
55
+
56
+ SIMPLE_RE_META = /[\$\*\+\.\?\^\|\)\]\}\>]/
57
+
44
58
  @@regexp_cache = Hash.new { |h, k| h[k] = Regexp.new(Regexp.escape(k)) }
45
59
  @@regexp_cache[nil] = nil
46
60
 
61
+ def regexp_cache
62
+ @@regexp_cache
63
+ end
64
+
47
65
  if $DEBUG then
48
66
  attr_reader :lex_state
49
67
 
50
68
  def lex_state= o
51
69
  return if @lex_state == o
52
- raise ArgumentError, "bad state: %p" % [o] unless State === o
53
70
 
54
- warn "lex_state: %p -> %p" % [lex_state, o]
71
+ from = ""
72
+ if ENV["VERBOSE"]
73
+ path = caller[0]
74
+ path = caller[1] if path =~ /result/
75
+ path, line, *_ = path.split(/:/)
76
+ path.delete_prefix! File.dirname File.dirname __FILE__
77
+ from = " at .%s:%s" % [path, line]
78
+ end
79
+
80
+ warn "lex_state: %p -> %p%s" % [lex_state, o, from]
55
81
 
56
82
  @lex_state = o
57
83
  end
@@ -61,14 +87,16 @@ class RubyLexer
61
87
 
62
88
  attr_accessor :lex_state unless $DEBUG
63
89
 
64
- attr_accessor :lineno # we're bypassing oedipus' lineno handling.
65
90
  attr_accessor :brace_nest
66
91
  attr_accessor :cmdarg
67
92
  attr_accessor :command_start
68
93
  attr_accessor :cmd_state # temporary--ivar to avoid passing everywhere
69
94
  attr_accessor :last_state
70
95
  attr_accessor :cond
71
- attr_accessor :extra_lineno
96
+ attr_accessor :old_ss
97
+ attr_accessor :old_lineno
98
+
99
+ # these are generated via ruby_lexer.rex: ss, lineno
72
100
 
73
101
  ##
74
102
  # Additional context surrounding tokens that both the lexer and
@@ -93,6 +121,7 @@ class RubyLexer
93
121
 
94
122
  self.cond = RubyParserStuff::StackState.new(:cond, $DEBUG)
95
123
  self.cmdarg = RubyParserStuff::StackState.new(:cmdarg, $DEBUG)
124
+ self.ss = RPStringScanner.new ""
96
125
 
97
126
  reset
98
127
  end
@@ -105,14 +134,8 @@ class RubyLexer
105
134
  is_after_operator? ? EXPR_ARG : EXPR_BEG
106
135
  end
107
136
 
108
- def beginning_of_line?
109
- ss.bol?
110
- end
111
-
112
- alias bol? beginning_of_line? # to make .rex file more readable
113
-
114
- def check re
115
- ss.check re
137
+ def ignore_body_comments
138
+ @comments.clear
116
139
  end
117
140
 
118
141
  def comments # TODO: remove this... maybe comment_string + attr_accessor
@@ -121,14 +144,8 @@ class RubyLexer
121
144
  c
122
145
  end
123
146
 
124
- def eat_whitespace
125
- r = scan(/\s+/)
126
- self.extra_lineno += r.count("\n") if r
127
- r
128
- end
129
-
130
- def end_of_stream?
131
- ss.eos?
147
+ def debug n
148
+ raise "debug #{n}"
132
149
  end
133
150
 
134
151
  def expr_dot?
@@ -145,128 +162,6 @@ class RubyLexer
145
162
  result EXPR_BEG, token, text
146
163
  end
147
164
 
148
- def fixup_lineno extra = 0
149
- self.lineno += self.extra_lineno + extra
150
- self.extra_lineno = 0
151
- end
152
-
153
- def heredoc here # TODO: rewrite / remove
154
- _, eos, func, last_line = here
155
-
156
- indent = func =~ STR_FUNC_INDENT ? "[ \t]*" : nil
157
- expand = func =~ STR_FUNC_EXPAND
158
- eol = last_line && last_line.end_with?("\r\n") ? "\r\n" : "\n"
159
- eos_re = /#{indent}#{Regexp.escape eos}(\r*\n|\z)/
160
- err_msg = "can't match #{eos_re.inspect} anywhere in "
161
-
162
- rb_compile_error err_msg if end_of_stream?
163
-
164
- if beginning_of_line? && scan(eos_re) then
165
- self.lineno += 1
166
- ss.unread_many last_line # TODO: figure out how to remove this
167
- return :tSTRING_END, [eos, func] # TODO: calculate squiggle width at lex?
168
- end
169
-
170
- self.string_buffer = []
171
-
172
- if expand then
173
- case
174
- when scan(/#(?=\$(-.|[a-zA-Z_0-9~\*\$\?!@\/\\;,\.=:<>\"\&\`\'+]))/) then
175
- # TODO: !ISASCII
176
- # ?! see parser_peek_variable_name
177
- return :tSTRING_DVAR, matched
178
- when scan(/#(?=\@\@?[a-zA-Z_])/) then
179
- # TODO: !ISASCII
180
- return :tSTRING_DVAR, matched
181
- when scan(/#[{]/) then
182
- self.command_start = true
183
- return :tSTRING_DBEG, matched
184
- when scan(/#/) then
185
- string_buffer << "#"
186
- end
187
-
188
- begin
189
- c = tokadd_string func, eol, nil
190
-
191
- rb_compile_error err_msg if
192
- c == RubyLexer::EOF
193
-
194
- if c != eol then
195
- return :tSTRING_CONTENT, string_buffer.join
196
- else
197
- string_buffer << scan(/\n/)
198
- end
199
-
200
- rb_compile_error err_msg if end_of_stream?
201
- end until check(eos_re)
202
- else
203
- until check(eos_re) do
204
- string_buffer << scan(/.*(\n|\z)/)
205
- rb_compile_error err_msg if end_of_stream?
206
- end
207
- end
208
-
209
- self.lex_strterm = [:heredoc, eos, func, last_line]
210
-
211
- string_content = begin
212
- s = string_buffer.join
213
- s.b.force_encoding Encoding::UTF_8
214
- end
215
-
216
- return :tSTRING_CONTENT, string_content
217
- end
218
-
219
- def heredoc_identifier # TODO: remove / rewrite
220
- term, func = nil, STR_FUNC_BORING
221
- self.string_buffer = []
222
-
223
- heredoc_indent_mods = "-"
224
- heredoc_indent_mods += '\~' if ruby23plus?
225
-
226
- case
227
- when scan(/([#{heredoc_indent_mods}]?)([\'\"\`])(.*?)\2/) then
228
- term = ss[2]
229
- func |= STR_FUNC_INDENT unless ss[1].empty? # TODO: this seems wrong
230
- func |= STR_FUNC_ICNTNT if ss[1] == "~"
231
- func |= case term
232
- when "\'" then
233
- STR_SQUOTE
234
- when '"' then
235
- STR_DQUOTE
236
- else
237
- STR_XQUOTE
238
- end
239
- string_buffer << ss[3]
240
- when scan(/[#{heredoc_indent_mods}]?([\'\"\`])(?!\1*\Z)/) then
241
- rb_compile_error "unterminated here document identifier"
242
- when scan(/([#{heredoc_indent_mods}]?)(#{IDENT_CHAR}+)/) then
243
- term = '"'
244
- func |= STR_DQUOTE
245
- unless ss[1].empty? then
246
- func |= STR_FUNC_INDENT
247
- func |= STR_FUNC_ICNTNT if ss[1] == "~"
248
- end
249
- string_buffer << ss[2]
250
- else
251
- return nil
252
- end
253
-
254
- if scan(/.*\n/) then
255
- # TODO: think about storing off the char range instead
256
- line = matched
257
- else
258
- line = nil
259
- end
260
-
261
- self.lex_strterm = [:heredoc, string_buffer.join, func, line]
262
-
263
- if term == "`" then
264
- result nil, :tXSTRING_BEG, "`"
265
- else
266
- result nil, :tSTRING_BEG, "\""
267
- end
268
- end
269
-
270
165
  def in_fname? # REFACTOR
271
166
  lex_state =~ EXPR_FNAME
272
167
  end
@@ -277,13 +172,13 @@ class RubyLexer
277
172
  text = matched
278
173
  case
279
174
  when text.end_with?("ri")
280
- return result(EXPR_NUM, :tIMAGINARY, Complex(0, Rational(text.chop.chop.to_i(base))))
175
+ result EXPR_NUM, :tIMAGINARY, Complex(0, Rational(text.chop.chop.to_i(base)))
281
176
  when text.end_with?("r")
282
- return result(EXPR_NUM, :tRATIONAL, Rational(text.chop.to_i(base)))
177
+ result EXPR_NUM, :tRATIONAL, Rational(text.chop.to_i(base))
283
178
  when text.end_with?("i")
284
- return result(EXPR_NUM, :tIMAGINARY, Complex(0, text.chop.to_i(base)))
179
+ result EXPR_NUM, :tIMAGINARY, Complex(0, text.chop.to_i(base))
285
180
  else
286
- return result(EXPR_NUM, :tINTEGER, text.to_i(base))
181
+ result EXPR_NUM, :tINTEGER, text.to_i(base)
287
182
  end
288
183
  end
289
184
 
@@ -329,132 +224,10 @@ class RubyLexer
329
224
  self.parser.env[id.to_sym] == :lvar
330
225
  end
331
226
 
332
- def matched
333
- ss.matched
334
- end
335
-
336
227
  def not_end?
337
228
  not is_end?
338
229
  end
339
230
 
340
- def parse_quote # TODO: remove / rewrite
341
- beg, nnd, short_hand, c = nil, nil, false, nil
342
-
343
- if scan(/[a-z0-9]{1,2}/i) then # Long-hand (e.g. %Q{}).
344
- rb_compile_error "unknown type of %string" if ss.matched_size == 2
345
- c, beg, short_hand = matched, getch, false
346
- else # Short-hand (e.g. %{, %., %!, etc)
347
- c, beg, short_hand = "Q", getch, true
348
- end
349
-
350
- if end_of_stream? or c == RubyLexer::EOF or beg == RubyLexer::EOF then
351
- rb_compile_error "unterminated quoted string meets end of file"
352
- end
353
-
354
- # Figure nnd-char. "\0" is special to indicate beg=nnd and that no nesting?
355
- nnd = { "(" => ")", "[" => "]", "{" => "}", "<" => ">" }[beg]
356
- nnd, beg = beg, "\0" if nnd.nil?
357
-
358
- token_type, text = nil, "%#{c}#{beg}"
359
- token_type, string_type = case c
360
- when "Q" then
361
- ch = short_hand ? nnd : c + beg
362
- text = "%#{ch}"
363
- [:tSTRING_BEG, STR_DQUOTE]
364
- when "q" then
365
- [:tSTRING_BEG, STR_SQUOTE]
366
- when "W" then
367
- eat_whitespace
368
- [:tWORDS_BEG, STR_DQUOTE | STR_FUNC_QWORDS]
369
- when "w" then
370
- eat_whitespace
371
- [:tQWORDS_BEG, STR_SQUOTE | STR_FUNC_QWORDS]
372
- when "x" then
373
- [:tXSTRING_BEG, STR_XQUOTE]
374
- when "r" then
375
- [:tREGEXP_BEG, STR_REGEXP]
376
- when "s" then
377
- self.lex_state = EXPR_FNAME
378
- [:tSYMBEG, STR_SSYM]
379
- when "I" then
380
- eat_whitespace
381
- [:tSYMBOLS_BEG, STR_DQUOTE | STR_FUNC_QWORDS]
382
- when "i" then
383
- eat_whitespace
384
- [:tQSYMBOLS_BEG, STR_SQUOTE | STR_FUNC_QWORDS]
385
- end
386
-
387
- rb_compile_error "Bad %string type. Expected [QqWwIixrs], found '#{c}'." if
388
- token_type.nil?
389
-
390
- raise "huh" unless string_type
391
-
392
- string string_type, nnd, beg
393
-
394
- return token_type, text
395
- end
396
-
397
- def parse_string quote # TODO: rewrite / remove
398
- _, string_type, term, open = quote
399
-
400
- space = false # FIX: remove these
401
- func = string_type
402
- paren = open
403
- term_re = @@regexp_cache[term]
404
-
405
- qwords = func =~ STR_FUNC_QWORDS
406
- regexp = func =~ STR_FUNC_REGEXP
407
- expand = func =~ STR_FUNC_EXPAND
408
-
409
- unless func then # nil'ed from qwords below. *sigh*
410
- return :tSTRING_END, nil
411
- end
412
-
413
- space = true if qwords and eat_whitespace
414
-
415
- if self.string_nest == 0 && scan(/#{term_re}/) then
416
- if qwords then
417
- quote[1] = nil
418
- return :tSPACE, nil
419
- elsif regexp then
420
- return :tREGEXP_END, self.regx_options
421
- else
422
- return :tSTRING_END, term
423
- end
424
- end
425
-
426
- return :tSPACE, nil if space
427
-
428
- self.string_buffer = []
429
-
430
- if expand
431
- case
432
- when scan(/#(?=\$(-.|[a-zA-Z_0-9~\*\$\?!@\/\\;,\.=:<>\"\&\`\'+]))/) then
433
- # TODO: !ISASCII
434
- # ?! see parser_peek_variable_name
435
- return :tSTRING_DVAR, nil
436
- when scan(/#(?=\@\@?[a-zA-Z_])/) then
437
- # TODO: !ISASCII
438
- return :tSTRING_DVAR, nil
439
- when scan(/#[{]/) then
440
- self.command_start = true
441
- return :tSTRING_DBEG, nil
442
- when scan(/#/) then
443
- string_buffer << "#"
444
- end
445
- end
446
-
447
- if tokadd_string(func, term, paren) == RubyLexer::EOF then
448
- if func =~ STR_FUNC_REGEXP then
449
- rb_compile_error "unterminated regexp meets end of file"
450
- else
451
- rb_compile_error "unterminated string meets end of file"
452
- end
453
- end
454
-
455
- return :tSTRING_CONTENT, string_buffer.join
456
- end
457
-
458
231
  def possibly_escape_string text, check
459
232
  content = match[1]
460
233
 
@@ -475,11 +248,11 @@ class RubyLexer
475
248
  :tAMPER2
476
249
  end
477
250
 
478
- return result(:arg_state, token, "&")
251
+ result :arg_state, token, "&"
479
252
  end
480
253
 
481
254
  def process_backref text
482
- token = ss[1].to_sym
255
+ token = match[1].to_sym
483
256
  # TODO: can't do lineno hack w/ symbol
484
257
  result EXPR_END, :tBACK_REF, token
485
258
  end
@@ -493,7 +266,7 @@ class RubyLexer
493
266
  end
494
267
 
495
268
  @comments << matched
496
- self.lineno += matched.count("\n")
269
+ self.lineno += matched.count("\n") # HACK?
497
270
 
498
271
  nil # TODO
499
272
  end
@@ -564,9 +337,9 @@ class RubyLexer
564
337
 
565
338
  case
566
339
  when scan(/\'/) then
567
- string STR_SSYM
340
+ string STR_SSYM, matched
568
341
  when scan(/\"/) then
569
- string STR_DSYM
342
+ string STR_DSYM, matched
570
343
  end
571
344
 
572
345
  result EXPR_FNAME, :tSYMBEG, text
@@ -580,41 +353,48 @@ class RubyLexer
580
353
  end
581
354
  end
582
355
 
356
+ def process_dots text
357
+ tokens = ruby27plus? && is_beg? ? BTOKENS : TOKENS
358
+
359
+ result EXPR_BEG, tokens[text], text
360
+ end
361
+
583
362
  def process_float text
584
363
  rb_compile_error "Invalid numeric format" if text =~ /__/
585
364
 
586
365
  case
587
366
  when text.end_with?("ri")
588
- return result EXPR_NUM, :tIMAGINARY, Complex(0, Rational(text.chop.chop))
367
+ result EXPR_NUM, :tIMAGINARY, Complex(0, Rational(text.chop.chop))
589
368
  when text.end_with?("i")
590
- return result EXPR_NUM, :tIMAGINARY, Complex(0, text.chop.to_f)
369
+ result EXPR_NUM, :tIMAGINARY, Complex(0, text.chop.to_f)
591
370
  when text.end_with?("r")
592
- return result EXPR_NUM, :tRATIONAL, Rational(text.chop)
371
+ result EXPR_NUM, :tRATIONAL, Rational(text.chop)
593
372
  else
594
- return result EXPR_NUM, :tFLOAT, text.to_f
373
+ result EXPR_NUM, :tFLOAT, text.to_f
595
374
  end
596
375
  end
597
376
 
598
377
  def process_gvar text
599
- text.lineno = self.lineno
378
+ if parser.class.version > 20 && text == "$-" then
379
+ rb_compile_error "unexpected $undefined"
380
+ end
381
+
600
382
  result EXPR_END, :tGVAR, text
601
383
  end
602
384
 
603
385
  def process_gvar_oddity text
604
- return result EXPR_END, "$", "$" if text == "$" # TODO: wtf is this?
605
386
  rb_compile_error "#{text.inspect} is not allowed as a global variable name"
606
387
  end
607
388
 
608
389
  def process_ivar text
609
390
  tok_id = text =~ /^@@/ ? :tCVAR : :tIVAR
610
- text.lineno = self.lineno
611
391
  result EXPR_END, tok_id, text
612
392
  end
613
393
 
614
394
  def process_label text
615
395
  symbol = possibly_escape_string text, /^\"/
616
396
 
617
- result EXPR_LAB, :tLABEL, [symbol, self.lineno]
397
+ result EXPR_LAB, :tLABEL, symbol
618
398
  end
619
399
 
620
400
  def process_label_or_string text
@@ -622,11 +402,15 @@ class RubyLexer
622
402
  @was_label = nil
623
403
  return process_label text
624
404
  elsif text =~ /:\Z/ then
625
- ss.pos -= 1 # put back ":"
405
+ self.pos -= 1 # put back ":"
626
406
  text = text[0..-2]
627
407
  end
628
408
 
629
- result EXPR_END, :tSTRING, text[1..-2].gsub(/\\\\/, "\\").gsub(/\\\'/, "\'")
409
+ orig_line = lineno
410
+ str = text[1..-2].gsub(/\\\\/, "\\").gsub(/\\\'/, "\'")
411
+ self.lineno += str.count("\n")
412
+
413
+ result EXPR_END, :tSTRING, str, orig_line
630
414
  end
631
415
 
632
416
  def process_lchevron text
@@ -644,34 +428,25 @@ class RubyLexer
644
428
  self.lex_state = EXPR_BEG
645
429
  end
646
430
 
647
- return result(lex_state, :tLSHFT, "\<\<")
431
+ result lex_state, :tLSHFT, "\<\<"
648
432
  end
649
433
 
650
- def process_newline_or_comment text
434
+ def process_newline_or_comment text # ../compare/parse30.y:9126 ish
651
435
  c = matched
652
- hit = false
653
436
 
654
437
  if c == "#" then
655
- ss.pos -= 1
438
+ self.pos -= 1
656
439
 
657
- # TODO: handle magic comments
658
440
  while scan(/\s*\#.*(\n+|\z)/) do
659
- hit = true
660
- self.lineno += matched.lines.to_a.size
441
+ self.lineno += matched.count "\n"
661
442
  @comments << matched.gsub(/^ +#/, "#").gsub(/^ +$/, "")
662
443
  end
663
444
 
664
445
  return nil if end_of_stream?
665
446
  end
666
447
 
667
- self.lineno += 1 unless hit
668
-
669
- # Replace a string of newlines with a single one
670
- self.lineno += matched.lines.to_a.size if scan(/\n+/)
671
-
672
448
  c = (lex_state =~ EXPR_BEG|EXPR_CLASS|EXPR_FNAME|EXPR_DOT &&
673
449
  lex_state !~ EXPR_LABELED)
674
- # TODO: figure out what token_seen is for
675
450
  if c || self.lex_state == EXPR_LAB then # yes, == EXPR_LAB
676
451
  # ignore if !fallthrough?
677
452
  if !c && parser.in_kwarg then
@@ -679,25 +454,29 @@ class RubyLexer
679
454
  self.command_start = true
680
455
  return result EXPR_BEG, :tNL, nil
681
456
  else
682
- return # skip
457
+ maybe_pop_stack
458
+ return # goto retry
683
459
  end
684
460
  end
685
461
 
686
- if scan(/([\ \t\r\f\v]*)(\.|&)/) then
687
- self.space_seen = true unless ss[1].empty?
462
+ if scan(/[\ \t\r\f\v]+/) then
463
+ self.space_seen = true
464
+ end
688
465
 
689
- ss.pos -= 1
690
- return unless check(/\.\./)
466
+ if check(/#/) then
467
+ return # goto retry
468
+ elsif check(/&\.|\.(?!\.)/) then # C version is a hellish obfuscated xnor
469
+ return # goto retry
691
470
  end
692
471
 
693
472
  self.command_start = true
694
473
 
695
- return result(EXPR_BEG, :tNL, nil)
474
+ result EXPR_BEG, :tNL, nil
696
475
  end
697
476
 
698
477
  def process_nthref text
699
478
  # TODO: can't do lineno hack w/ number
700
- result EXPR_END, :tNTH_REF, ss[1].to_i
479
+ result EXPR_END, :tNTH_REF, match[1].to_i
701
480
  end
702
481
 
703
482
  def process_paren text
@@ -725,13 +504,16 @@ class RubyLexer
725
504
  end
726
505
 
727
506
  def process_percent text
728
- return parse_quote if is_beg?
729
-
730
- return result EXPR_BEG, :tOP_ASGN, "%" if scan(/\=/)
731
-
732
- return parse_quote if is_space_arg?(check(/\s/)) || (lex_state =~ EXPR_FITEM && check(/s/))
733
-
734
- return result :arg_state, :tPERCENT, "%"
507
+ case
508
+ when is_beg? then
509
+ process_percent_quote
510
+ when scan(/\=/)
511
+ result EXPR_BEG, :tOP_ASGN, "%"
512
+ when is_space_arg?(check(/\s/)) || (lex_state =~ EXPR_FITEM && check(/s/))
513
+ process_percent_quote
514
+ else
515
+ result :arg_state, :tPERCENT, "%"
516
+ end
735
517
  end
736
518
 
737
519
  def process_plus_minus text
@@ -805,20 +587,21 @@ class RubyLexer
805
587
  end
806
588
 
807
589
  def process_simple_string text
808
- replacement = text[1..-2].gsub(ESC) {
809
- unescape($1).b.force_encoding Encoding::UTF_8
810
- }
590
+ orig_line = lineno
591
+ self.lineno += text.count("\n")
811
592
 
812
- replacement = replacement.b unless replacement.valid_encoding?
593
+ str = text[1..-2]
594
+ .gsub(ESC) { unescape($1).b.force_encoding Encoding::UTF_8 }
595
+ str = str.b unless str.valid_encoding?
813
596
 
814
- result EXPR_END, :tSTRING, replacement
597
+ result EXPR_END, :tSTRING, str, orig_line
815
598
  end
816
599
 
817
600
  def process_slash text
818
601
  if is_beg? then
819
- string STR_REGEXP
602
+ string STR_REGEXP, matched
820
603
 
821
- return result(nil, :tREGEXP_BEG, "/")
604
+ return result nil, :tREGEXP_BEG, "/"
822
605
  end
823
606
 
824
607
  if scan(/\=/) then
@@ -833,7 +616,7 @@ class RubyLexer
833
616
  end
834
617
  end
835
618
 
836
- return result(:arg_state, :tDIVIDE, "/")
619
+ result :arg_state, :tDIVIDE, "/"
837
620
  end
838
621
 
839
622
  def process_square_bracket text
@@ -865,34 +648,6 @@ class RubyLexer
865
648
  result EXPR_PAR, token, text
866
649
  end
867
650
 
868
- def process_string # TODO: rewrite / remove
869
- # matches top of parser_yylex in compare/parse23.y:8113
870
- token = if lex_strterm[0] == :heredoc then
871
- self.heredoc lex_strterm
872
- else
873
- self.parse_string lex_strterm
874
- end
875
-
876
- token_type, c = token
877
-
878
- # matches parser_string_term from 2.3, but way off from 2.5
879
- if ruby22plus? && token_type == :tSTRING_END && ["'", '"'].include?(c) then
880
- if ((lex_state =~ EXPR_BEG|EXPR_ENDFN &&
881
- !cond.is_in_state) || is_arg?) &&
882
- is_label_suffix? then
883
- scan(/:/)
884
- token_type = token[0] = :tLABEL_END
885
- end
886
- end
887
-
888
- if [:tSTRING_END, :tREGEXP_END, :tLABEL_END].include? token_type then
889
- self.lex_strterm = nil
890
- self.lex_state = (token_type == :tLABEL_END) ? EXPR_PAR : EXPR_LIT
891
- end
892
-
893
- return token
894
- end
895
-
896
651
  def process_symbol text
897
652
  symbol = possibly_escape_string text, /^:\"/ # stupid emacs
898
653
 
@@ -901,7 +656,6 @@ class RubyLexer
901
656
 
902
657
  def process_token text
903
658
  # matching: parse_ident in compare/parse23.y:7989
904
- # TODO: make this always return [token, lineno]
905
659
  # FIX: remove: self.last_state = lex_state
906
660
 
907
661
  token = self.token = text
@@ -924,8 +678,7 @@ class RubyLexer
924
678
 
925
679
  if is_label_possible? and is_label_suffix? then
926
680
  scan(/:/)
927
- # TODO: propagate the lineno to ALL results
928
- return result EXPR_LAB, :tLABEL, [token, self.lineno]
681
+ return result EXPR_LAB, :tLABEL, token
929
682
  end
930
683
 
931
684
  # TODO: mb == ENC_CODERANGE_7BIT && lex_state !~ EXPR_DOT
@@ -936,14 +689,15 @@ class RubyLexer
936
689
  return process_token_keyword keyword if keyword
937
690
  end
938
691
 
939
- # matching: compare/parse23.y:8079
940
- state = if is_beg? or is_arg? or lex_state =~ EXPR_DOT then
692
+ # matching: compare/parse30.y:9039
693
+ state = if lex_state =~ EXPR_BEG_ANY|EXPR_ARG_ANY|EXPR_DOT then
941
694
  cmd_state ? EXPR_CMDARG : EXPR_ARG
942
695
  elsif lex_state =~ EXPR_FNAME then
943
696
  EXPR_ENDFN
944
697
  else
945
698
  EXPR_END
946
699
  end
700
+ self.lex_state = state
947
701
 
948
702
  tok_id = :tIDENTIFIER if tok_id == :tCONSTANT && is_local_id(token)
949
703
 
@@ -953,20 +707,16 @@ class RubyLexer
953
707
  state = EXPR_END|EXPR_LABEL
954
708
  end
955
709
 
956
- token.lineno = self.lineno # yes, on a string. I know... I know...
957
-
958
- return result(state, tok_id, token)
710
+ result state, tok_id, token
959
711
  end
960
712
 
961
713
  def process_token_keyword keyword
962
714
  # matching MIDDLE of parse_ident in compare/parse23.y:8046
963
715
  state = lex_state
964
- self.lex_state = keyword.state
965
-
966
- value = [token, self.lineno]
967
716
 
968
- return result(lex_state, keyword.id0, value) if state =~ EXPR_FNAME
717
+ return result(EXPR_ENDFN, keyword.id0, token) if lex_state =~ EXPR_FNAME
969
718
 
719
+ self.lex_state = keyword.state
970
720
  self.command_start = true if lex_state =~ EXPR_BEG
971
721
 
972
722
  case
@@ -975,27 +725,28 @@ class RubyLexer
975
725
  when lambda_beginning? then
976
726
  self.lpar_beg = nil # lambda_beginning? == FALSE in the body of "-> do ... end"
977
727
  self.paren_nest -= 1 # TODO: question this?
978
- result lex_state, :kDO_LAMBDA, value
728
+ result lex_state, :kDO_LAMBDA, token
979
729
  when cond.is_in_state then
980
- result lex_state, :kDO_COND, value
730
+ result lex_state, :kDO_COND, token
981
731
  when cmdarg.is_in_state && state != EXPR_CMDARG then
982
- result lex_state, :kDO_BLOCK, value
732
+ result lex_state, :kDO_BLOCK, token
983
733
  else
984
- result lex_state, :kDO, value
734
+ result lex_state, :kDO, token
985
735
  end
986
736
  when state =~ EXPR_PAD then
987
- result lex_state, keyword.id0, value
737
+ result lex_state, keyword.id0, token
988
738
  when keyword.id0 != keyword.id1 then
989
- result EXPR_PAR, keyword.id1, value
739
+ result EXPR_PAR, keyword.id1, token
990
740
  else
991
- result lex_state, keyword.id1, value
741
+ result lex_state, keyword.id1, token
992
742
  end
993
743
  end
994
744
 
995
745
  def process_underscore text
996
- ss.unscan # put back "_"
746
+ self.unscan # put back "_"
997
747
 
998
748
  if beginning_of_line? && scan(/\__END__(\r?\n|\Z)/) then
749
+ ss.terminate
999
750
  [RubyLexer::EOF, RubyLexer::EOF]
1000
751
  elsif scan(/#{IDENT_CHAR}+/) then
1001
752
  process_token matched
@@ -1003,121 +754,35 @@ class RubyLexer
1003
754
  end
1004
755
 
1005
756
  def rb_compile_error msg
1006
- msg += ". near line #{self.lineno}: #{ss.rest[/^.*/].inspect}"
757
+ msg += ". near line #{self.lineno}: #{self.rest[/^.*/].inspect}"
1007
758
  raise RubyParser::SyntaxError, msg
1008
759
  end
1009
760
 
1010
- def read_escape # TODO: remove / rewrite
1011
- case
1012
- when scan(/\\/) then # Backslash
1013
- '\\'
1014
- when scan(/n/) then # newline
1015
- self.extra_lineno -= 1
1016
- "\n"
1017
- when scan(/t/) then # horizontal tab
1018
- "\t"
1019
- when scan(/r/) then # carriage-return
1020
- "\r"
1021
- when scan(/f/) then # form-feed
1022
- "\f"
1023
- when scan(/v/) then # vertical tab
1024
- "\13"
1025
- when scan(/a/) then # alarm(bell)
1026
- "\007"
1027
- when scan(/e/) then # escape
1028
- "\033"
1029
- when scan(/b/) then # backspace
1030
- "\010"
1031
- when scan(/s/) then # space
1032
- " "
1033
- when scan(/[0-7]{1,3}/) then # octal constant
1034
- (matched.to_i(8) & 0xFF).chr.force_encoding Encoding::UTF_8
1035
- when scan(/x([0-9a-fA-F]{1,2})/) then # hex constant
1036
- # TODO: force encode everything to UTF-8?
1037
- ss[1].to_i(16).chr.force_encoding Encoding::UTF_8
1038
- when check(/M-\\./) then
1039
- scan(/M-\\/) # eat it
1040
- c = self.read_escape
1041
- c[0] = (c[0].ord | 0x80).chr
1042
- c
1043
- when scan(/M-(.)/) then
1044
- c = ss[1]
1045
- c[0] = (c[0].ord | 0x80).chr
1046
- c
1047
- when check(/(C-|c)\\[\\MCc]/) then
1048
- scan(/(C-|c)\\/) # eat it
1049
- c = self.read_escape
1050
- c[0] = (c[0].ord & 0x9f).chr
1051
- c
1052
- when check(/(C-|c)\\(?!u|\\)/) then
1053
- scan(/(C-|c)\\/) # eat it
1054
- c = read_escape
1055
- c[0] = (c[0].ord & 0x9f).chr
1056
- c
1057
- when scan(/C-\?|c\?/) then
1058
- 127.chr
1059
- when scan(/(C-|c)(.)/) then
1060
- c = ss[2]
1061
- c[0] = (c[0].ord & 0x9f).chr
1062
- c
1063
- when scan(/^[89]/i) then # bad octal or hex... MRI ignores them :(
1064
- matched
1065
- when scan(/u(\h{4})/) then
1066
- [ss[1].to_i(16)].pack("U")
1067
- when scan(/u(\h{1,3})/) then
1068
- rb_compile_error "Invalid escape character syntax"
1069
- when scan(/u\{(\h+(?:\s+\h+)*)\}/) then
1070
- ss[1].split.map { |s| s.to_i(16) }.pack("U*")
1071
- when scan(/[McCx0-9]/) || end_of_stream? then
1072
- rb_compile_error("Invalid escape character syntax")
1073
- else
1074
- getch
1075
- end.dup
1076
- end
1077
-
1078
- def getch
1079
- c = ss.getch
1080
- c = ss.getch if c == "\r" && ss.peek(1) == "\n"
1081
- c
1082
- end
1083
-
1084
- def regx_options # TODO: rewrite / remove
1085
- good, bad = [], []
1086
-
1087
- if scan(/[a-z]+/) then
1088
- good, bad = matched.split(//).partition { |s| s =~ /^[ixmonesu]$/ }
1089
- end
1090
-
1091
- unless bad.empty? then
1092
- rb_compile_error("unknown regexp option%s - %s" %
1093
- [(bad.size > 1 ? "s" : ""), bad.join.inspect])
1094
- end
1095
-
1096
- return good.join
1097
- end
1098
-
1099
761
  def reset
762
+ self.lineno = 1
1100
763
  self.brace_nest = 0
1101
764
  self.command_start = true
1102
765
  self.comments = []
1103
766
  self.lex_state = EXPR_NONE
1104
767
  self.lex_strterm = nil
1105
- self.lineno = 1
1106
768
  self.lpar_beg = nil
1107
769
  self.paren_nest = 0
1108
770
  self.space_seen = false
1109
771
  self.string_nest = 0
1110
772
  self.token = nil
1111
- self.extra_lineno = 0
773
+ self.string_buffer = []
774
+ self.old_ss = nil
775
+ self.old_lineno = nil
1112
776
 
1113
777
  self.cond.reset
1114
778
  self.cmdarg.reset
1115
779
  end
1116
780
 
1117
- def result new_state, token, text # :nodoc:
781
+ def result new_state, token, text, line = self.lineno # :nodoc:
1118
782
  new_state = self.arg_state if new_state == :arg_state
1119
783
  self.lex_state = new_state if new_state
1120
- [token, text]
784
+
785
+ [token, [text, line]]
1121
786
  end
1122
787
 
1123
788
  def ruby22_label?
@@ -1136,12 +801,8 @@ class RubyLexer
1136
801
  parser.class.version <= 24
1137
802
  end
1138
803
 
1139
- def scan re
1140
- ss.scan re
1141
- end
1142
-
1143
- def scanner_class # TODO: design this out of oedipus_lex. or something.
1144
- RPStringScanner
804
+ def ruby27plus?
805
+ parser.class.version >= 27
1145
806
  end
1146
807
 
1147
808
  def space_vs_beginning space_type, beg_type, fallback
@@ -1156,139 +817,9 @@ class RubyLexer
1156
817
  end
1157
818
  end
1158
819
 
1159
- def string type, beg = matched, nnd = "\0"
1160
- self.lex_strterm = [:strterm, type, beg, nnd]
1161
- end
1162
-
1163
- def tokadd_escape term # TODO: rewrite / remove
1164
- case
1165
- when scan(/\\\n/) then
1166
- # just ignore
1167
- when scan(/\\([0-7]{1,3}|x[0-9a-fA-F]{1,2})/) then
1168
- self.string_buffer << matched
1169
- when scan(/\\([MC]-|c)(?=\\)/) then
1170
- self.string_buffer << matched
1171
- self.tokadd_escape term
1172
- when scan(/\\([MC]-|c)(.)/) then
1173
- self.string_buffer << matched
1174
- when scan(/\\[McCx]/) then
1175
- rb_compile_error "Invalid escape character syntax"
1176
- when scan(/\\(.)/m) then
1177
- chr = ss[1]
1178
- prev = self.string_buffer.last
1179
- if term == chr && prev && prev.end_with?("(?") then
1180
- self.string_buffer << chr
1181
- elsif term == chr || chr.ascii_only? then
1182
- self.string_buffer << matched # dunno why we keep them for ascii
1183
- else
1184
- self.string_buffer << chr # HACK? this is such a rat's nest
1185
- end
1186
- else
1187
- rb_compile_error "Invalid escape character syntax"
1188
- end
1189
- end
1190
-
1191
- def tokadd_string(func, term, paren) # TODO: rewrite / remove
1192
- qwords = func =~ STR_FUNC_QWORDS
1193
- escape = func =~ STR_FUNC_ESCAPE
1194
- expand = func =~ STR_FUNC_EXPAND
1195
- regexp = func =~ STR_FUNC_REGEXP
1196
- symbol = func =~ STR_FUNC_SYMBOL
1197
-
1198
- paren_re = @@regexp_cache[paren]
1199
- term_re = if term == "\n"
1200
- /#{Regexp.escape "\r"}?#{Regexp.escape "\n"}/
1201
- else
1202
- @@regexp_cache[term]
1203
- end
1204
-
1205
- until end_of_stream? do
1206
- c = nil
1207
- handled = true
1208
-
1209
- case
1210
- when scan(term_re) then
1211
- if self.string_nest == 0 then
1212
- ss.pos -= 1
1213
- break
1214
- else
1215
- self.string_nest -= 1
1216
- end
1217
- when paren_re && scan(paren_re) then
1218
- self.string_nest += 1
1219
- when expand && scan(/#(?=[\$\@\{])/) then # TODO: this seems wrong
1220
- ss.pos -= 1
1221
- break
1222
- when qwords && scan(/\s/) then
1223
- ss.pos -= 1
1224
- break
1225
- when expand && scan(/#(?!\n)/) then
1226
- # do nothing
1227
- when check(/\\/) then
1228
- case
1229
- when qwords && scan(/\\\n/) then
1230
- string_buffer << "\n"
1231
- next
1232
- when qwords && scan(/\\\s/) then
1233
- c = " "
1234
- when expand && scan(/\\\n/) then
1235
- next
1236
- when regexp && check(/\\/) then
1237
- self.tokadd_escape term
1238
- next
1239
- when expand && scan(/\\/) then
1240
- c = self.read_escape
1241
- when scan(/\\\n/) then
1242
- # do nothing
1243
- when scan(/\\\\/) then
1244
- string_buffer << '\\' if escape
1245
- c = '\\'
1246
- when scan(/\\/) then
1247
- unless scan(term_re) || paren.nil? || scan(paren_re) then
1248
- string_buffer << "\\"
1249
- end
1250
- else
1251
- handled = false
1252
- end # inner /\\/ case
1253
- else
1254
- handled = false
1255
- end # top case
1256
-
1257
- unless handled then
1258
- t = if term == "\n"
1259
- Regexp.escape "\r\n"
1260
- else
1261
- Regexp.escape term
1262
- end
1263
- x = Regexp.escape paren if paren && paren != "\000"
1264
- re = if qwords then
1265
- /[^#{t}#{x}\#\\\s]+|./ # |. to pick up whatever
1266
- else
1267
- /[^#{t}#{x}\#\\]+|./
1268
- end
1269
-
1270
- scan re
1271
- c = matched
1272
-
1273
- rb_compile_error "symbol cannot contain '\\0'" if symbol && c =~ /\0/
1274
- end # unless handled
1275
-
1276
- c ||= matched
1277
- string_buffer << c
1278
- end # until
1279
-
1280
- c ||= matched
1281
- c = RubyLexer::EOF if end_of_stream?
1282
-
1283
- return c
1284
- end
1285
-
1286
820
  def unescape s
1287
821
  r = ESCAPES[s]
1288
822
 
1289
- self.extra_lineno += 1 if s == "\n" # eg backslash newline strings
1290
- self.extra_lineno -= 1 if r && s == "n" # literal \n, not newline
1291
-
1292
823
  return r if r
1293
824
 
1294
825
  x = case s
@@ -1309,7 +840,7 @@ class RubyLexer
1309
840
  when /u(\h{1,3})/ then
1310
841
  rb_compile_error("Invalid escape character syntax")
1311
842
  when /u\{(\h+(?:\s+\h+)*)\}/ then
1312
- $1.split.map { |s| s.to_i(16) }.pack("U*")
843
+ $1.split.map { |cp| cp.to_i(16) }.pack("U*")
1313
844
  else
1314
845
  s
1315
846
  end
@@ -1422,7 +953,7 @@ class RubyLexer
1422
953
  STR_FUNC_LABEL = State.new 0x40, str_func_names
1423
954
  STR_FUNC_LIST = State.new 0x4000, str_func_names
1424
955
  STR_FUNC_TERM = State.new 0x8000, str_func_names
1425
- STR_FUNC_ICNTNT = State.new 0x10000, str_func_names # <<~HEREDOC -- TODO: remove?
956
+ STR_FUNC_DEDENT = State.new 0x10000, str_func_names # <<~HEREDOC
1426
957
 
1427
958
  # TODO: check parser25.y on how they do STR_FUNC_INDENT
1428
959
 
@@ -1434,6 +965,7 @@ class RubyLexer
1434
965
  STR_DWORD = STR_FUNC_QWORDS | STR_FUNC_EXPAND | STR_FUNC_LIST
1435
966
  STR_SSYM = STR_FUNC_SYMBOL
1436
967
  STR_DSYM = STR_FUNC_SYMBOL | STR_FUNC_EXPAND
968
+ STR_LABEL = STR_FUNC_LABEL
1437
969
 
1438
970
  str_func_names.merge!(STR_FUNC_ESCAPE => "STR_FUNC_ESCAPE",
1439
971
  STR_FUNC_EXPAND => "STR_FUNC_EXPAND",
@@ -1444,7 +976,7 @@ class RubyLexer
1444
976
  STR_FUNC_LABEL => "STR_FUNC_LABEL",
1445
977
  STR_FUNC_LIST => "STR_FUNC_LIST",
1446
978
  STR_FUNC_TERM => "STR_FUNC_TERM",
1447
- STR_FUNC_ICNTNT => "STR_FUNC_ICNTNT",
979
+ STR_FUNC_DEDENT => "STR_FUNC_DEDENT",
1448
980
  STR_SQUOTE => "STR_SQUOTE")
1449
981
  end
1450
982
 
@@ -1454,7 +986,145 @@ class RubyLexer
1454
986
  include State::Values
1455
987
  end
1456
988
 
1457
- require "ruby_lexer.rex"
989
+ class RubyLexer # reopened to mix in the scanner-delegation helpers defined below
990
+ module SSWrapper # thin delegators from the lexer to its current scanner object, ss
991
+ def string= s # hand s to the current scanner via ss.string=
992
+ ss.string= s
993
+ end
994
+
995
+ def beginning_of_line? # delegates to ss.bol?
996
+ ss.bol?
997
+ end
998
+
999
+ alias bol? beginning_of_line? # to make .rex file more readable
1000
+
1001
+ def check re # runs maybe_pop_stack first, then returns ss.check(re)
1002
+ maybe_pop_stack
1003
+
1004
+ ss.check re
1005
+ end
1006
+
1007
+ def end_of_stream? # delegates to ss.eos? for the current scanner only
1008
+ ss.eos?
1009
+ end
1010
+
1011
+ alias eos? end_of_stream?
1012
+
1013
+ def getch # one ss.getch; if that was a carriage return and a line feed is next, read once more and return that instead
1014
+ c = ss.getch
1015
+ c = ss.getch if c == "\r" && ss.peek(1) == "\n"
1016
+ c
1017
+ end
1018
+
1019
+ def match # returns the scanner object itself
1020
+ ss
1021
+ end
1022
+
1023
+ def matched # delegates to ss.matched
1024
+ ss.matched
1025
+ end
1026
+
1027
+ def in_heredoc? # true exactly when old_ss holds a saved scanner (old_ss is truthy)
1028
+ !!self.old_ss
1029
+ end
1030
+
1031
+ def maybe_pop_stack # once the current scanner is exhausted while a scanner is saved, restore it and the saved lineno
1032
+ if ss.eos? && in_heredoc? then
1033
+ self.ss_pop
1034
+ self.lineno_pop
1035
+ end
1036
+ end
1037
+
1038
+ def pos # delegates to ss.pos
1039
+ ss.pos
1040
+ end
1041
+
1042
+ def pos= n # delegates to ss.pos=
1043
+ ss.pos = n
1044
+ end
1045
+
1046
+ def rest # delegates to ss.rest
1047
+ ss.rest
1048
+ end
1049
+
1050
+ def scan re # runs maybe_pop_stack first, then returns ss.scan(re)
1051
+ maybe_pop_stack
1052
+
1053
+ ss.scan re
1054
+ end
1055
+
1056
+ def scanner_class # TODO: design this out of oedipus_lex. or something.
1057
+ RPStringScanner
1058
+ end
1059
+
1060
+ def ss_string # delegates to ss.string
1061
+ ss.string
1062
+ end
1063
+
1064
+ def ss_string= s # always raises; NOTE(review): the assignment after the raise is unreachable, presumably kept on purpose
1065
+ raise "Probably not"
1066
+ ss.string = s
1067
+ end
1068
+
1069
+ def unscan # delegates to ss.unscan
1070
+ ss.unscan
1071
+ end
1072
+ end
1073
+
1074
+ include SSWrapper
1075
+ end
1076
+
1077
+ class RubyLexer # reopened to add save/restore of the scanner and line number
1078
+ module SSStackish # keeps a single saved slot each for the scanner (@old_ss) and the line number (old_lineno)
1079
+ def lineno_push new_lineno # stash the current lineno in old_lineno, then switch to new_lineno
1080
+ self.old_lineno = self.lineno
1081
+ self.lineno = new_lineno
1082
+ end
1083
+
1084
+ def lineno_pop # restore lineno from old_lineno and clear the saved value
1085
+ self.lineno = self.old_lineno
1086
+ self.old_lineno = nil
1087
+ end
1088
+
1089
+ def ss= o # refuses to replace the scanner while one is saved; otherwise clears @old_ss and defers to super
1090
+ raise "Clearing ss while in heredoc!?!" if in_heredoc?
1091
+ @old_ss = nil
1092
+ super
1093
+ end
1094
+
1095
+ def ss_push new_ss # save the current scanner in @old_ss and install new_ss directly in @ss, bypassing ss=
1096
+ @old_ss = self.ss
1097
+ @ss = new_ss
1098
+ end
1099
+
1100
+ def ss_pop # make the saved scanner current again and empty the saved slot
1101
+ @ss = self.old_ss
1102
+ @old_ss = nil
1103
+ end
1104
+ end
1105
+
1106
+ prepend SSStackish # prepended so SSStackish#ss= is found first and its super reaches the ss= RubyLexer already has
1107
+ end
1108
+
1109
+ if ENV["RP_STRTERM_DEBUG"] then # opt-in: only when this environment variable is set is the tracing below installed
1110
+ class RubyLexer
1111
+ def d o # debug helper: write o.inspect to $stderr
1112
+ $stderr.puts o.inspect
1113
+ end
1114
+
1115
+ alias old_lex_strterm= lex_strterm= # keep the original writer reachable under a new name
1116
+
1117
+ def lex_strterm= o # do the real assignment, then log the value with the first two colon-separated fields of the caller entry
1118
+ self.old_lex_strterm= o
1119
+ where = caller.first.split(/:/).first(2).join(":")
1120
+ $stderr.puts # blank line ahead of the dump
1121
+ d :lex_strterm => [o, where]
1122
+ end
1123
+ end
1124
+ end
1125
+
1126
+ require_relative "./ruby_lexer.rex.rb"
1127
+ require_relative "./ruby_lexer_strings.rb"
1458
1128
 
1459
1129
  if ENV["RP_LINENO_DEBUG"] then
1460
1130
  class RubyLexer
@@ -1467,7 +1137,8 @@ if ENV["RP_LINENO_DEBUG"] then
1467
1137
  def lineno= n
1468
1138
  self.old_lineno= n
1469
1139
  where = caller.first.split(/:/).first(2).join(":")
1470
- d :lineno => [n, where, ss && ss.rest[0, 40]]
1140
+ $stderr.puts
1141
+ d :lineno => [n, where]
1471
1142
  end
1472
1143
  end
1473
1144
  end