fabiokung-ruby_parser 2.0.2

Sign up to get free protection for your applications and to get access to all the features.
data/lib/ruby_lexer.rb ADDED
@@ -0,0 +1,1331 @@
1
+ $: << File.expand_path("~/Work/p4/zss/src/ParseTree/dev/lib") # for me, not you.
2
+ require 'sexp'
3
+ require 'ruby_parser_extras'
4
+
5
+ class RubyLexer
6
+ attr_accessor :command_start
7
+ attr_accessor :cmdarg
8
+ attr_accessor :cond
9
+ attr_accessor :nest
10
+
11
+ ESC_RE = /\\([0-7]{1,3}|x[0-9a-fA-F]{1,2}|M-.|(C-|c)\?|(C-|c).|[^0-7xMCc])/
12
+
13
+ # Additional context surrounding tokens that both the lexer and
14
+ # grammar use.
15
+ attr_reader :lex_state
16
+
17
+ attr_accessor :lex_strterm
18
+
19
+ attr_accessor :parser # HACK for very end of lexer... *sigh*
20
+
21
+ # Stream of data that yylex examines.
22
+ attr_reader :src
23
+
24
+ # Last token read via yylex.
25
+ attr_accessor :token
26
+
27
+ attr_accessor :string_buffer
28
+
29
+ # Value of last token which had a value associated with it.
30
+ attr_accessor :yacc_value
31
+
32
+ # What handles warnings
33
+ attr_accessor :warnings
34
+
35
+ EOF = :eof_haha!
36
+
37
+ # ruby constants for strings (should this be moved somewhere else?)
38
+ STR_FUNC_BORING = 0x00
39
+ STR_FUNC_ESCAPE = 0x01 # TODO: remove and replace with REGEXP
40
+ STR_FUNC_EXPAND = 0x02
41
+ STR_FUNC_REGEXP = 0x04
42
+ STR_FUNC_AWORDS = 0x08
43
+ STR_FUNC_SYMBOL = 0x10
44
+ STR_FUNC_INDENT = 0x20 # <<-HEREDOC
45
+
46
+ STR_SQUOTE = STR_FUNC_BORING
47
+ STR_DQUOTE = STR_FUNC_BORING | STR_FUNC_EXPAND
48
+ STR_XQUOTE = STR_FUNC_BORING | STR_FUNC_EXPAND
49
+ STR_REGEXP = STR_FUNC_REGEXP | STR_FUNC_ESCAPE | STR_FUNC_EXPAND
50
+ STR_SSYM = STR_FUNC_SYMBOL
51
+ STR_DSYM = STR_FUNC_SYMBOL | STR_FUNC_EXPAND
52
+
53
+ # How the parser advances to the next token.
54
+ #
55
+ # @return true if not at end of file (EOF).
56
+
57
def advance
  # Read one token and record it as the current token before validating it.
  self.token = (tok = yylex)

  raise "yylex returned nil" unless tok

  # true while more input remains; false once the EOF marker is produced.
  !(RubyLexer::EOF == tok)
end
65
+
66
# Reports, via #warning, that the first argument of a call is ambiguous.
def arg_ambiguous
  message = "Ambiguous first argument. make sure."
  self.warning message
end
69
+
70
# Returns every comment fragment collected so far joined into one string,
# and empties the collection as a side effect.
def comments
  @comments.join.tap { @comments.clear }
end
75
+
76
# Pushes +false+ onto both the cond and cmdarg stacks, switches the lexer to
# :expr_beg, and stores +val+ as the current token value.
def expr_beg_push val
  [cond, cmdarg].each { |stack| stack.push false }

  self.lex_state  = :expr_beg
  self.yacc_value = val
end
82
+
83
# Moves the lexer to :expr_arg when it is currently in :expr_fname or
# :expr_dot, and to :expr_beg in every other state.
def fix_arg_lex_state
  after_name = [:expr_fname, :expr_dot].include?(lex_state)
  self.lex_state = after_name ? :expr_arg : :expr_beg
end
90
+
91
# Lexes one step of a here document body.
#
# +here+ is the lex_strterm array built by heredoc_identifier:
# [:heredoc, terminator, string-function flags, rest of the opening line].
#
# Returns :tSTRING_END when the terminator line is reached, :tSTRING_DVAR or
# :tSTRING_DBEG when an interpolation starts, and :tSTRING_CONTENT otherwise
# (with the text stored in yacc_value). Calls rb_compile_error if the input
# ends before the terminator is found.
def heredoc here # 63 lines
  _, eos, func, last_line = here

  indent  = (func & STR_FUNC_INDENT) != 0
  expand  = (func & STR_FUNC_EXPAND) != 0

  # The terminator is literal text. A quoted identifier may contain regexp
  # metacharacters (e.g. <<"E.S"), so escape it before building the pattern;
  # otherwise a content line such as "EXS" would end the heredoc.
  eos_pat = Regexp.escape eos
  eos_re  = indent ? /[ \t]*#{eos_pat}(\r?\n|\z)/ : /#{eos_pat}(\r?\n|\z)/
  err_msg = "can't match #{eos_re.inspect} anywhere in "

  rb_compile_error err_msg if src.eos?

  if src.beginning_of_line? && src.scan(eos_re) then
    src.unread_many last_line # TODO: figure out how to remove this
    self.yacc_value = eos
    return :tSTRING_END
  end

  self.string_buffer = []

  if expand then
    case
    when src.scan(/#[$@]/) then
      src.pos -= 1 # FIX omg stupid
      self.yacc_value = src.matched
      return :tSTRING_DVAR
    when src.scan(/#[{]/) then
      self.yacc_value = src.matched
      return :tSTRING_DBEG
    when src.scan(/#/) then
      string_buffer << '#'
    end

    until src.scan(eos_re) do
      c = tokadd_string func, "\n", nil

      rb_compile_error err_msg if c == RubyLexer::EOF

      if c != "\n" then
        # tokadd_string stopped before the end of the line (interpolation):
        # hand back what has been collected so far.
        self.yacc_value = string_buffer.join.delete("\r")
        return :tSTRING_CONTENT
      else
        string_buffer << src.scan(/\n/)
      end

      rb_compile_error err_msg if src.eos?
    end

    # tack on a NL after the heredoc token - FIX NL should not be needed
    src.unread_many(eos + "\n") # TODO: remove this... stupid stupid stupid
  else
    # Non-expanding heredoc: copy whole lines verbatim up to the terminator.
    until src.check(eos_re) do
      string_buffer << src.scan(/.*(\n|\z)/)
      rb_compile_error err_msg if src.eos?
    end
  end

  self.lex_strterm = [:heredoc, eos, func, last_line]
  self.yacc_value = string_buffer.join.delete("\r")

  return :tSTRING_CONTENT
end
155
+
156
# Scans the identifier that follows "<<" and prepares heredoc lexing.
#
# On success the remainder of the current line is cut out of the source
# string (replaced by a single newline) and remembered in lex_strterm,
# yacc_value is set to the opening quote, and :tXSTRING_BEG (backtick form)
# or :tSTRING_BEG is returned. Returns nil when no identifier follows.
def heredoc_identifier # 51 lines
  quote, flags = nil, STR_FUNC_BORING
  self.string_buffer = []

  if src.scan(/(-?)(['"`])(.*?)\2/) then
    # Quoted identifier: the quote character selects the string type.
    quote  = src[2]
    flags |= STR_FUNC_INDENT unless src[1].empty?
    flags |= { "\'" => STR_SQUOTE, '"' => STR_DQUOTE }.fetch(quote, STR_XQUOTE)
    string_buffer << src[3]
  elsif src.scan(/-?(['"`])(?!\1*\Z)/) then
    rb_compile_error "unterminated here document identifier"
  elsif src.scan(/(-?)(\w+)/) then
    # Bare identifier behaves like a double-quoted one.
    quote  = '"'
    flags |= STR_DQUOTE
    flags |= STR_FUNC_INDENT unless src[1].empty?
    string_buffer << src[2]
  else
    return nil
  end

  rest_of_line = nil
  if src.check(/.*\n/) then
    # TODO: think about storing off the char range instead
    start, length = src.pos, src.matched_size
    rest_of_line = src.string[start, length]
    src.string[start, length] = "\n"
    src.pos += 1
  end

  self.lex_strterm = [:heredoc, string_buffer.join, flags, rest_of_line]

  backtick = quote == '`'
  self.yacc_value = backtick ? "`" : "\""
  backtick ? :tXSTRING_BEG : :tSTRING_BEG
end
207
+
208
# Creates the comment buffer, the nesting counter and the two StackState
# stacks (:cond and :cmdarg), then calls reset for the remaining state.
def initialize
  @comments   = []
  self.nest   = 0
  self.cond   = StackState.new(:cond)
  self.cmdarg = StackState.new(:cmdarg)

  reset
end
216
+
217
# Converts the text last matched by src into an Integer of the given +base+,
# stores it in yacc_value and returns :tINTEGER. A doubled underscore in the
# matched text is reported through rb_compile_error.
def int_with_base base
  digits = src.matched
  rb_compile_error "Invalid numeric format" if digits =~ /__/

  self.yacc_value = digits.to_i(base)
  :tINTEGER
end
222
+
223
# Sets the lexer state. Only Symbols are accepted; anything else raises.
def lex_state= o
  case o
  when Symbol then @lex_state = o
  else raise "wtf?"
  end
end
227
+
228
+ attr_writer :lineno
229
# Current line number. The value is read from src.lineno the first time it
# is needed and cached until the writer stores nil (or false) again.
def lineno
  @lineno = src.lineno unless @lineno
  @lineno
end
232
+
233
+ ##
234
+ # Parse a number from the input stream.
235
+ #
236
+ # Takes no arguments; the number is read from src at the current scan position.
237
+ # @return A symbol (:tINTEGER or :tFLOAT) which represents the token.
238
+
239
def parse_number
  self.lex_state = :expr_end

  # Ordered rule table: the first pattern that scans wins. The action is a
  # radix (Integer) for integer literals, an error message (String) for
  # malformed input, or :float for floating point literals.
  rules = [
    [/[+-]?0[xbd]\b/,       "Invalid numeric format"],
    [/[+-]?0x[a-f0-9_]+/i,  16],
    [/[+-]?0b[01_]+/,       2],
    [/[+-]?0d[0-9_]+/,      10],
    [/[+-]?0o?[0-7_]*[89]/, "Illegal octal digit."],
    [/[+-]?0o?[0-7_]+|0o/,  8],
    [/[+-]?[\d_]+_(e|\.)/,  "Trailing '_' in number."],
    [/[+-]?[\d_]+\.[\d_]+(e[+-]?[\d_]+)?\b|[+-]?[\d_]+e[+-]?[\d_]+\b/i, :float],
    [/[+-]?0\b/,            10],
    [/[+-]?[\d_]+\b/,       10],
  ]

  _, action = rules.find { |pattern, _| src.scan(pattern) }

  case action
  when Integer then
    int_with_base(action)
  when String then
    rb_compile_error action
  when :float then
    number = src.matched
    rb_compile_error "Invalid numeric format" if number =~ /__/
    self.yacc_value = number.to_f
    :tFLOAT
  else
    rb_compile_error "Bad number format"
  end
end
272
+
273
# Lexes the introducer of a %-literal (the "%" itself has already been
# consumed): reads the optional type letter and the opening delimiter, sets
# yacc_value and lex_strterm, and returns the matching *_BEG token.
def parse_quote # 58 lines
  closers = { "(" => ")", "[" => "]", "{" => "}", "<" => ">" }

  if src.scan(/[a-z0-9]{1,2}/i) then # Long-hand (e.g. %Q{}).
    rb_compile_error "unknown type of %string" if src.matched_size == 2
    kind       = src.matched
    opener     = src.getch
    short_hand = false
  else # Short-hand (e.g. %{, %., %!, etc)
    kind       = 'Q'
    opener     = src.getch
    short_hand = true
  end

  if src.eos? || kind == RubyLexer::EOF || opener == RubyLexer::EOF then
    rb_compile_error "unterminated quoted string meets end of file"
  end

  # A bracket-like opener has a distinct closer and may nest. Any other
  # delimiter closes itself, which is signalled by an opener of "\0".
  closer = closers[opener]
  closer, opener = opener, "\0" if closer.nil?

  self.yacc_value = "%#{kind}#{opener}"

  # Type-specific side effects first, then a plain table lookup.
  case kind
  when 'Q' then
    self.yacc_value = "%#{short_hand ? closer : kind + opener}"
  when 'W', 'w' then
    src.scan(/\s*/)
  when 's' then
    self.lex_state = :expr_fname
  end

  token_type, string_type = {
    'Q' => [:tSTRING_BEG,  STR_DQUOTE],
    'q' => [:tSTRING_BEG,  STR_SQUOTE],
    'W' => [:tWORDS_BEG,   STR_DQUOTE | STR_FUNC_AWORDS],
    'w' => [:tAWORDS_BEG,  STR_SQUOTE | STR_FUNC_AWORDS],
    'x' => [:tXSTRING_BEG, STR_XQUOTE],
    'r' => [:tREGEXP_BEG,  STR_REGEXP],
    's' => [:tSYMBEG,      STR_SSYM],
  }[kind]

  rb_compile_error "Bad %string type. Expected [Qqwxr\W], found '#{kind}'." if
    token_type.nil?

  self.lex_strterm = [:strterm, string_type, closer, opener]

  return token_type
end
321
+
322
# Lexes one step inside a string-like literal described by +quote+, a
# lex_strterm array of the form [:strterm, flags, terminator, opener].
#
# Returns :tSTRING_END / :tREGEXP_END at the terminator, :tSPACE between
# the words of a word list, :tSTRING_DVAR / :tSTRING_DBEG at the start of
# an interpolation, and :tSTRING_CONTENT (text in yacc_value) otherwise.
def parse_string(quote) # 65 lines
  _, func, term, paren = quote
  term_re = Regexp.escape term

  # The flags slot is cleared (see the word-list branch below) to request a
  # closing :tSTRING_END on the following call.
  unless func then
    self.lineno = nil
    return :tSTRING_END
  end

  awords = (func & STR_FUNC_AWORDS) != 0
  regexp = (func & STR_FUNC_REGEXP) != 0
  expand = (func & STR_FUNC_EXPAND) != 0

  saw_space = !!(awords && src.scan(/\s+/))

  if self.nest == 0 && src.scan(/#{term_re}/) then
    if awords then
      quote[1] = nil
      return :tSPACE
    end

    self.yacc_value = regexp ? self.regx_options : term
    self.lineno = nil
    return regexp ? :tREGEXP_END : :tSTRING_END
  end

  return :tSPACE if saw_space

  self.string_buffer = []

  if expand
    if src.scan(/#(?=[$@])/) then
      return :tSTRING_DVAR
    elsif src.scan(/#[{]/) then
      return :tSTRING_DBEG
    elsif src.scan(/#/) then
      string_buffer << '#'
    end
  end

  if tokadd_string(func, term, paren) == RubyLexer::EOF then
    rb_compile_error "unterminated string meets end of file"
  end

  self.yacc_value = string_buffer.join

  return :tSTRING_CONTENT
end
382
+
383
# Raises SyntaxError with +msg+ extended by the current line number and the
# remainder of the line at the scan position.
def rb_compile_error msg
  line    = self.lineno
  context = src.rest[/^.*/].inspect

  raise SyntaxError, msg + ". near line #{line}: #{context}"
end
387
+
388
# Reads the body of a backslash escape at the scan position (the backslash
# itself has already been consumed) and returns the resulting string.
# Malformed \M, \C, \c, \x or digit escapes, and end of input, are reported
# through rb_compile_error; any other character stands for itself.
def read_escape # 51 lines
  # Single-letter escapes and the text each one produces.
  simple = {
    '\\' => '\\',   # Backslash
    'n'  => "\n",   # newline
    't'  => "\t",   # horizontal tab
    'r'  => "\r",   # carriage-return
    'f'  => "\f",   # form-feed
    'v'  => "\13",  # vertical tab
    'a'  => "\007", # alarm(bell)
    'e'  => "\033", # escape
    'b'  => "\010", # backspace
    's'  => " ",    # space
  }

  if src.scan(/[\\ntrfvaebs]/) then
    simple[src.matched]
  elsif src.scan(/[0-7]{1,3}/) then # octal constant
    src.matched.to_i(8).chr
  elsif src.scan(/x([0-9a-fA-F]{1,2})/) then # hex constant
    src[1].to_i(16).chr
  elsif src.scan(/M-\\/) then
    # Meta applied to a nested escape: set the high bit of its first char.
    inner = self.read_escape
    inner[0] = (inner[0].ord | 0x80).chr
    inner
  elsif src.scan(/M-(.)/) then
    char = src[1]
    char[0] = (char[0].ord | 0x80).chr
    char
  elsif src.scan(/C-\\|c\\/) then
    # Control applied to a nested escape: mask its first char with 0x9f.
    inner = self.read_escape
    inner[0] = (inner[0].ord & 0x9f).chr
    inner
  elsif src.scan(/C-\?|c\?/) then
    0177.chr
  elsif src.scan(/(C-|c)(.)/) then
    char = src[2]
    char[0] = (char[0].ord & 0x9f).chr
    char
  elsif src.scan(/[McCx0-9]/) || src.eos? then
    rb_compile_error("Invalid escape character syntax")
  else
    src.getch
  end
end
438
+
439
# Scans the lowercase option letters that follow a regexp terminator and
# returns the recognised ones (any of i x m o n e s u) as a string. Any
# other letter is reported through rb_compile_error.
def regx_options # 15 lines
  letters = src.scan(/[a-z]+/).to_s.split(//)
  good, bad = letters.partition { |letter| "ixmonesu".include?(letter) }

  unless bad.empty? then
    plural = bad.size > 1 ? "s" : ""
    rb_compile_error("unknown regexp option%s - %s" % [plural, bad.join.inspect])
  end

  good.join
end
453
+
454
# Returns the lexer to its starting state: no source, no lex state, no
# pending string terminator or token, and command_start switched on.
def reset
  self.command_start = true
  self.lex_strterm = self.token = self.yacc_value = nil

  @src = @lex_state = nil
end
463
+
464
# Installs new input. +src+ must be a String; it is wrapped in an
# RPStringScanner. Any other object raises.
def src= src
  case src
  when String then @src = RPStringScanner.new(src)
  else raise "bad src: #{src.inspect}"
  end
end
468
+
469
# Copies one backslash escape from src into string_buffer without
# interpreting it (the backslash is kept). A backslash-newline pair is
# dropped; malformed \M, \C, \c and \x escapes are reported through
# rb_compile_error.
def tokadd_escape term # 20 lines
  if src.scan(/\\\n/) then
    # just ignore
  elsif src.scan(/\\([0-7]{1,3}|x[0-9a-fA-F]{1,2})/) then
    self.string_buffer << src.matched
  elsif src.scan(/\\([MC]-|c)(?=\\)/) then
    # Meta/control prefix followed by another escape: copy the prefix, then
    # copy the nested escape.
    self.string_buffer << src.matched
    self.tokadd_escape term
  elsif src.scan(/\\([MC]-|c)(.)/) then
    self.string_buffer << src.matched
  elsif src.scan(/\\[McCx]/) then
    rb_compile_error "Invalid escape character syntax"
  elsif src.scan(/\\(.)/m) then
    self.string_buffer << src.matched
  else
    rb_compile_error "Invalid escape character syntax"
  end
end
488
+
489
+ def tokadd_string(func, term, paren) # 105 lines -- appends string content to string_buffer until a stop condition; returns the last chunk seen, or RubyLexer::EOF when input is exhausted
489
+ awords = (func & STR_FUNC_AWORDS) != 0
490
+ escape = (func & STR_FUNC_ESCAPE) != 0
491
+ expand = (func & STR_FUNC_EXPAND) != 0
492
+ regexp = (func & STR_FUNC_REGEXP) != 0
493
+ symbol = (func & STR_FUNC_SYMBOL) != 0
494
+ # Literal-match regexps for the opening bracket (only when paren is given) and for the terminator.
495
+ paren_re = paren.nil? ? nil : Regexp.new(Regexp.escape(paren))
496
+ term_re = Regexp.new(Regexp.escape(term))
497
+ # Each pass handles one piece of input; `break` stops before a terminator, word separator or interpolation.
498
+ until src.eos? do
499
+ c = nil
500
+ handled = true
501
+ case
502
+ when self.nest == 0 && src.scan(term_re) then # terminator outside any nesting: back up one position and stop
503
+ src.pos -= 1
504
+ break
505
+ when paren_re && src.scan(paren_re) then # opening bracket: one nesting level deeper
506
+ self.nest += 1
507
+ when src.scan(term_re) then # terminator while nested: one nesting level shallower
508
+ self.nest -= 1
509
+ when awords && src.scan(/\s/) then # whitespace in a word list: back up one position and stop
510
+ src.pos -= 1
511
+ break
512
+ when expand && src.scan(/#(?=[\$\@\{])/) then # '#' followed by $, @ or {: back up one position and stop
513
+ src.pos -= 1
514
+ break
515
+ when expand && src.scan(/#(?!\n)/) then
516
+ # do nothing
517
+ when src.check(/\\/) then # a backslash is next; the inner case decides how it is consumed
518
+ case
519
+ when awords && src.scan(/\\\n/) then
520
+ string_buffer << "\n"
521
+ next
522
+ when awords && src.scan(/\\\s/) then
523
+ c = ' '
524
+ when expand && src.scan(/\\\n/) then
525
+ next
526
+ when regexp && src.check(/\\/) then # regexp escapes are copied uninterpreted by tokadd_escape
527
+ self.tokadd_escape term
528
+ next
529
+ when expand && src.scan(/\\/) then # expanding strings get the escape interpreted by read_escape
530
+ c = self.read_escape
531
+ when src.scan(/\\\n/) then
532
+ # do nothing
533
+ when src.scan(/\\\\/) then
534
+ string_buffer << '\\' if escape
535
+ c = '\\'
536
+ when src.scan(/\\/) then
537
+ unless src.scan(term_re) || paren.nil? || src.scan(paren_re) then
538
+ string_buffer << "\\"
539
+ end
540
+ else
541
+ handled = false
542
+ end
543
+ else
544
+ handled = false
545
+ end # case
546
+ # No branch above consumed input: scan a run of ordinary characters, or failing that a single character.
547
+ unless handled then
548
+
549
+ t = Regexp.escape term
550
+ x = Regexp.escape(paren) if paren && paren != "\000"
551
+ re = if awords then
552
+ /[^#{t}#{x}\#\0\\\n\ ]+|./ # |. to pick up whatever
553
+ else
554
+ /[^#{t}#{x}\#\0\\]+|./
555
+ end
556
+
557
+ src.scan re
558
+ c = src.matched
559
+
560
+ rb_compile_error "symbol cannot contain '\\0'" if symbol && c =~ /\0/
561
+ end # unless handled
562
+ # Branches that scanned without setting c contribute the text they matched.
563
+ c ||= src.matched
564
+ string_buffer << c
565
+ end # until
566
+ # Result: EOF marker when the input ran out, otherwise the last chunk (or last matched text).
567
+ c ||= src.matched
568
+ c = RubyLexer::EOF if src.eos?
569
+
570
+
571
+ return c
572
+ end
574
+
575
# Translates the text of a backslash escape (everything after the
# backslash) into the string it denotes. Unknown escapes are returned
# unchanged; malformed \M, \C, \c, \x or digit escapes are reported through
# rb_compile_error.
def unescape s
  fixed = {
    "a"    => "\007",
    "b"    => "\010",
    "e"    => "\033",
    "f"    => "\f",
    "n"    => "\n",
    "r"    => "\r",
    "s"    => " ",
    "t"    => "\t",
    "v"    => "\13",
    "\\"   => '\\',
    "\n"   => "",
    "C-\?" => 0177.chr,
    "c\?"  => 0177.chr,
  }

  return fixed[s] if fixed.key?(s)

  if (m = /^[0-7]{1,3}/.match(s)) then
    m[0].to_i(8).chr
  elsif (m = /^x([0-9a-fA-F]{1,2})/.match(s)) then
    m[1].to_i(16).chr
  elsif (m = /^M-(.)/.match(s)) then
    (m[1][0].ord | 0x80).chr
  elsif (m = /^(C-|c)(.)/.match(s)) then
    (m[2][0].ord & 0x9f).chr
  elsif /^[McCx0-9]/.match(s) then
    rb_compile_error("Invalid escape character syntax")
  else
    s
  end
end
610
+
611
# Receives lexer warnings. The message is currently discarded.
def warning s
  nil # do nothing for now
end
614
+
615
+ ##
616
+ # Returns the next token. Also sets yacc_value if needed.
617
+ #
618
+ # @return The token type (usually a symbol such as :tNL), or RubyLexer::EOF at end of input.
619
+
620
+ def yylex # 826 lines
621
+ # Scans self.src for the next token, stores its value in yacc_value and returns its type.
622
+ c = ''
623
+ space_seen = false
624
+ command_state = false
625
+ src = self.src
626
+
627
+ self.token = nil
628
+ self.yacc_value = nil
629
+ # A pending string terminator means string lexing takes over (yylex_string is not defined in this part of the file).
630
+ return yylex_string if lex_strterm
631
+ # Remember whether this token starts a command, then clear the flag for the next call.
632
+ command_state = self.command_start
633
+ self.command_start = false
634
+ # lex_state as it was on entry; consulted by the $-variable branches below.
635
+ last_state = lex_state
636
+ # Loop so that skipped input (whitespace, comments, continuation lines) restarts the scan via `next`.
637
+ loop do # START OF CASE
638
+ if src.scan(/\ |\t|\r|\f|\13/) then # white spaces, 13 = '\v
639
+ space_seen = true
640
+ next
641
+ elsif src.check(/[^a-zA-Z]/) then
642
+ if src.scan(/\n|#/) then
643
+ self.lineno = nil
644
+ c = src.matched
645
+ if c == '#' then
646
+ src.unread c # ok
647
+ # Collect this comment and any comment lines directly following it into @comments.
648
+ while src.scan(/\s*#.*(\n+|\z)/) do
649
+ @comments << src.matched.gsub(/^ +#/, '#').gsub(/^ +$/, '')
650
+ end
651
+
652
+ if src.eos? then
653
+ return RubyLexer::EOF
654
+ end
655
+ end
656
+
657
+ # Replace a string of newlines with a single one
658
+ src.scan(/\n+/)
659
+ # In these states the newline is skipped rather than reported as :tNL.
660
+ if [:expr_beg, :expr_fname,
661
+ :expr_dot, :expr_class].include? lex_state then
662
+ next
663
+ end
664
+
665
+ self.command_start = true
666
+ self.lex_state = :expr_beg
667
+ return :tNL
668
+ elsif src.scan(/[\]\)\}]/) then
669
+ cond.lexpop
670
+ cmdarg.lexpop
671
+ self.lex_state = :expr_end
672
+ self.yacc_value = src.matched
673
+ result = {
674
+ ")" => :tRPAREN,
675
+ "]" => :tRBRACK,
676
+ "}" => :tRCURLY
677
+ }[src.matched]
678
+ return result
679
+ elsif src.check(/\./) then
680
+ if src.scan(/\.\.\./) then
681
+ self.lex_state = :expr_beg
682
+ self.yacc_value = "..."
683
+ return :tDOT3
684
+ elsif src.scan(/\.\./) then
685
+ self.lex_state = :expr_beg
686
+ self.yacc_value = ".."
687
+ return :tDOT2
688
+ elsif src.scan(/\.\d/) then
689
+ rb_compile_error "no .<digit> floating literal anymore put 0 before dot"
690
+ elsif src.scan(/\./) then
691
+ self.lex_state = :expr_dot
692
+ self.yacc_value = "."
693
+ return :tDOT
694
+ end
695
+ elsif src.scan(/\,/) then
696
+ self.lex_state = :expr_beg
697
+ self.yacc_value = ","
698
+ return :tCOMMA
699
+ elsif src.scan(/\(/) then
700
+ result = :tLPAREN2
701
+ self.command_start = true
702
+ if lex_state == :expr_beg || lex_state == :expr_mid then
703
+ result = :tLPAREN
704
+ elsif space_seen then
705
+ if lex_state == :expr_cmdarg then
706
+ result = :tLPAREN_ARG
707
+ elsif lex_state == :expr_arg then
708
+ warning("don't put space before argument parentheses")
709
+ result = :tLPAREN2
710
+ end
711
+ end
712
+
713
+ self.expr_beg_push "("
714
+
715
+ return result
716
+ elsif src.check(/\=/) then
717
+ if src.scan(/\=\=\=/) then
718
+ self.fix_arg_lex_state
719
+ self.yacc_value = "==="
720
+ return :tEQQ
721
+ elsif src.scan(/\=\=/) then
722
+ self.fix_arg_lex_state
723
+ self.yacc_value = "=="
724
+ return :tEQ
725
+ elsif src.scan(/\=~/) then
726
+ self.fix_arg_lex_state
727
+ self.yacc_value = "=~"
728
+ return :tMATCH
729
+ elsif src.scan(/\=>/) then
730
+ self.fix_arg_lex_state
731
+ self.yacc_value = "=>"
732
+ return :tASSOC
733
+ elsif src.scan(/\=/) then
734
+ if src.was_begin_of_line and src.scan(/begin(?=\s)/) then
735
+ @comments << '=' << src.matched
736
+ # =begin ... =end block: its text is kept in @comments and scanning continues.
737
+ unless src.scan(/.*?\n=end\s*(\n|\z)/m) then
738
+ @comments.clear
739
+ rb_compile_error("embedded document meets end of file")
740
+ end
741
+
742
+ @comments << src.matched
743
+
744
+ next
745
+ else
746
+ self.fix_arg_lex_state
747
+ self.yacc_value = '='
748
+ return :tEQL
749
+ end
750
+ end
751
+ elsif src.scan(/\"(#{ESC_RE}|#(#{ESC_RE}|[^\{\#\@\$\"\\])|[^\"\\\#])*\"/o) then
752
+ self.yacc_value = src.matched[1..-2].gsub(ESC_RE) { unescape $1 }
753
+ self.lex_state = :expr_end
754
+ return :tSTRING
755
+ elsif src.scan(/\"/) then # FALLBACK
756
+ self.lex_strterm = [:strterm, STR_DQUOTE, '"', "\0"] # TODO: question this
757
+ self.yacc_value = "\""
758
+ return :tSTRING_BEG
759
+ elsif src.scan(/\@\@?\w*/) then
760
+ self.token = src.matched
761
+
762
+ rb_compile_error "`#{token}` is not allowed as a variable name" if
763
+ token =~ /\@\d/
764
+
765
+ return process_token(command_state)
766
+ elsif src.scan(/\:\:/) then
767
+ if (lex_state == :expr_beg ||
768
+ lex_state == :expr_mid ||
769
+ lex_state == :expr_class ||
770
+ (lex_state.is_argument && space_seen)) then
771
+ self.lex_state = :expr_beg
772
+ self.yacc_value = "::"
773
+ return :tCOLON3
774
+ end
775
+
776
+ self.lex_state = :expr_dot
777
+ self.yacc_value = "::"
778
+ return :tCOLON2
779
+ elsif lex_state != :expr_end && lex_state != :expr_endarg && src.scan(/:([a-zA-Z_]\w*(?:[?!]|=(?!>))?)/) then
780
+ self.yacc_value = src[1]
781
+ self.lex_state = :expr_end
782
+ return :tSYMBOL
783
+ elsif src.scan(/\:/) then
784
+ # ?: / then / when
785
+ if (lex_state == :expr_end || lex_state == :expr_endarg||
786
+ src.check(/\s/)) then
787
+ self.lex_state = :expr_beg
788
+ self.yacc_value = ":"
789
+ return :tCOLON
790
+ end
791
+ # Quoted symbol (:'...' or :"..."): set up the string terminator for its body.
792
+ case
793
+ when src.scan(/\'/) then
794
+ self.lex_strterm = [:strterm, STR_SSYM, src.matched, "\0"]
795
+ when src.scan(/\"/) then
796
+ self.lex_strterm = [:strterm, STR_DSYM, src.matched, "\0"]
797
+ end
798
+
799
+ self.lex_state = :expr_fname
800
+ self.yacc_value = ":"
801
+ return :tSYMBEG
802
+ elsif src.check(/[0-9]/) then
803
+ return parse_number
804
+ elsif src.scan(/\[/) then
805
+ result = src.matched
806
+
807
+ if lex_state == :expr_fname || lex_state == :expr_dot then
808
+ self.lex_state = :expr_arg
809
+ case
810
+ when src.scan(/\]\=/) then
811
+ self.yacc_value = "[]="
812
+ return :tASET
813
+ when src.scan(/\]/) then
814
+ self.yacc_value = "[]"
815
+ return :tAREF
816
+ else
817
+ rb_compile_error "unexpected '['"
818
+ end
819
+ elsif lex_state == :expr_beg || lex_state == :expr_mid then
820
+ result = :tLBRACK
821
+ elsif lex_state.is_argument && space_seen then
822
+ result = :tLBRACK
823
+ end
824
+
825
+ self.expr_beg_push "["
826
+
827
+ return result
828
+ elsif src.scan(/\'(\\.|[^\'])*\'/) then
829
+ self.yacc_value = src.matched[1..-2].gsub(/\\\\/, "\\").gsub(/\\'/, "'")
830
+ self.lex_state = :expr_end
831
+ return :tSTRING
832
+ elsif src.check(/\|/) then
833
+ if src.scan(/\|\|\=/) then
834
+ self.lex_state = :expr_beg
835
+ self.yacc_value = "||"
836
+ return :tOP_ASGN
837
+ elsif src.scan(/\|\|/) then
838
+ self.lex_state = :expr_beg
839
+ self.yacc_value = "||"
840
+ return :tOROP
841
+ elsif src.scan(/\|\=/) then
842
+ self.lex_state = :expr_beg
843
+ self.yacc_value = "|"
844
+ return :tOP_ASGN
845
+ elsif src.scan(/\|/) then
846
+ self.fix_arg_lex_state
847
+ self.yacc_value = "|"
848
+ return :tPIPE
849
+ end
850
+ elsif src.scan(/\{/) then
851
+ result = if lex_state.is_argument || lex_state == :expr_end then
852
+ :tLCURLY # block (primary)
853
+ elsif lex_state == :expr_endarg then
854
+ :tLBRACE_ARG # block (expr)
855
+ else
856
+ :tLBRACE # hash
857
+ end
858
+
859
+ self.expr_beg_push "{"
860
+
861
+ return result
862
+ elsif src.scan(/[+-]/) then
863
+ sign = src.matched
864
+ utype, type = if sign == "+" then
865
+ [:tUPLUS, :tPLUS]
866
+ else
867
+ [:tUMINUS, :tMINUS]
868
+ end
869
+
870
+ if lex_state == :expr_fname || lex_state == :expr_dot then
871
+ self.lex_state = :expr_arg
872
+ if src.scan(/@/) then
873
+ self.yacc_value = "#{sign}@"
874
+ return utype
875
+ else
876
+ self.yacc_value = sign
877
+ return type
878
+ end
879
+ end
880
+
881
+ if src.scan(/\=/) then
882
+ self.lex_state = :expr_beg
883
+ self.yacc_value = sign
884
+ return :tOP_ASGN
885
+ end
886
+ # Unary sign: in :expr_beg/:expr_mid, or in an argument state after a space and not before whitespace.
887
+ if (lex_state == :expr_beg || lex_state == :expr_mid ||
888
+ (lex_state.is_argument && space_seen && !src.check(/\s/))) then
889
+ if lex_state.is_argument then
890
+ arg_ambiguous
891
+ end
892
+
893
+ self.lex_state = :expr_beg
894
+ self.yacc_value = sign
895
+
896
+ if src.check(/\d/) then
897
+ if utype == :tUPLUS then
898
+ return self.parse_number
899
+ else
900
+ return :tUMINUS_NUM
901
+ end
902
+ end
903
+
904
+ return utype
905
+ end
906
+
907
+ self.lex_state = :expr_beg
908
+ self.yacc_value = sign
909
+ return type
910
+ elsif src.check(/\*/) then
911
+ if src.scan(/\*\*=/) then
912
+ self.lex_state = :expr_beg
913
+ self.yacc_value = "**"
914
+ return :tOP_ASGN
915
+ elsif src.scan(/\*\*/) then
916
+ self.yacc_value = "**"
917
+ self.fix_arg_lex_state
918
+ return :tPOW
919
+ elsif src.scan(/\*\=/) then
920
+ self.lex_state = :expr_beg
921
+ self.yacc_value = "*"
922
+ return :tOP_ASGN
923
+ elsif src.scan(/\*/) then
924
+ result = if lex_state.is_argument && space_seen && src.check(/\S/) then
925
+ warning("`*' interpreted as argument prefix")
926
+ :tSTAR
927
+ elsif lex_state == :expr_beg || lex_state == :expr_mid then
928
+ :tSTAR
929
+ else
930
+ :tSTAR2
931
+ end
932
+ self.yacc_value = "*"
933
+ self.fix_arg_lex_state
934
+
935
+ return result
936
+ end
937
+ elsif src.check(/\!/) then
938
+ if src.scan(/\!\=/) then
939
+ self.lex_state = :expr_beg
940
+ self.yacc_value = "!="
941
+ return :tNEQ
942
+ elsif src.scan(/\!~/) then
943
+ self.lex_state = :expr_beg
944
+ self.yacc_value = "!~"
945
+ return :tNMATCH
946
+ elsif src.scan(/\!/) then
947
+ self.lex_state = :expr_beg
948
+ self.yacc_value = "!"
949
+ return :tBANG
950
+ end
951
+ elsif src.check(/\</) then
952
+ if src.scan(/\<\=\>/) then
953
+ self.fix_arg_lex_state
954
+ self.yacc_value = "<=>"
955
+ return :tCMP
956
+ elsif src.scan(/\<\=/) then
957
+ self.fix_arg_lex_state
958
+ self.yacc_value = "<="
959
+ return :tLEQ
960
+ elsif src.scan(/\<\<\=/) then
961
+ self.fix_arg_lex_state
962
+ self.lex_state = :expr_beg
963
+ self.yacc_value = "\<\<"
964
+ return :tOP_ASGN
965
+ elsif src.scan(/\<\</) then
966
+ if (! [:expr_end, :expr_dot,
967
+ :expr_endarg, :expr_class].include?(lex_state) &&
968
+ (!lex_state.is_argument || space_seen)) then
969
+ tok = self.heredoc_identifier
970
+ if tok then
971
+ return tok
972
+ end
973
+ end
974
+
975
+ self.fix_arg_lex_state
976
+ self.yacc_value = "\<\<"
977
+ return :tLSHFT
978
+ elsif src.scan(/\</) then
979
+ self.fix_arg_lex_state
980
+ self.yacc_value = "<"
981
+ return :tLT
982
+ end
983
+ elsif src.check(/\>/) then
984
+ if src.scan(/\>\=/) then
985
+ self.fix_arg_lex_state
986
+ self.yacc_value = ">="
987
+ return :tGEQ
988
+ elsif src.scan(/\>\>=/) then
989
+ self.fix_arg_lex_state
990
+ self.lex_state = :expr_beg
991
+ self.yacc_value = ">>"
992
+ return :tOP_ASGN
993
+ elsif src.scan(/\>\>/) then
994
+ self.fix_arg_lex_state
995
+ self.yacc_value = ">>"
996
+ return :tRSHFT
997
+ elsif src.scan(/\>/) then
998
+ self.fix_arg_lex_state
999
+ self.yacc_value = ">"
1000
+ return :tGT
1001
+ end
1002
+ elsif src.scan(/\`/) then
1003
+ self.yacc_value = "`"
1004
+ case lex_state
1005
+ when :expr_fname then
1006
+ self.lex_state = :expr_end
1007
+ return :tBACK_REF2
1008
+ when :expr_dot then
1009
+ self.lex_state = if command_state then
1010
+ :expr_cmdarg
1011
+ else
1012
+ :expr_arg
1013
+ end
1014
+ return :tBACK_REF2
1015
+ end
1016
+ self.lex_strterm = [:strterm, STR_XQUOTE, '`', "\0"]
1017
+ return :tXSTRING_BEG
1018
+ elsif src.scan(/\?/) then
1019
+ if lex_state == :expr_end || lex_state == :expr_endarg then
1020
+ self.lex_state = :expr_beg
1021
+ self.yacc_value = "?"
1022
+ return :tEH
1023
+ end
1024
+
1025
+ if src.eos? then
1026
+ rb_compile_error "incomplete character syntax"
1027
+ end
1028
+
1029
+ if src.check(/\s|\v/) then
1030
+ unless lex_state.is_argument then
1031
+ c2 = { " " => 's',
1032
+ "\n" => 'n',
1033
+ "\t" => 't',
1034
+ "\v" => 'v',
1035
+ "\r" => 'r',
1036
+ "\f" => 'f' }[src.matched]
1037
+
1038
+ if c2 then
1039
+ warning("invalid character syntax; use ?\\" + c2)
1040
+ end
1041
+ end
1042
+
1043
+ # ternary
1044
+ self.lex_state = :expr_beg
1045
+ self.yacc_value = "?"
1046
+ return :tEH
1047
+ elsif src.check(/\w(?=\w)/) then # ternary, also
1048
+ self.lex_state = :expr_beg
1049
+ self.yacc_value = "?"
1050
+ return :tEH
1051
+ end
1052
+ # Character literal: the token value is the first character's ordinal masked to 8 bits.
1053
+ c = if src.scan(/\\/) then
1054
+ self.read_escape
1055
+ else
1056
+ src.getch
1057
+ end
1058
+ self.lex_state = :expr_end
1059
+ self.yacc_value = c[0].ord & 0xff
1060
+ return :tINTEGER
1061
+ elsif src.check(/\&/) then
1062
+ if src.scan(/\&\&\=/) then
1063
+ self.yacc_value = "&&"
1064
+ self.lex_state = :expr_beg
1065
+ return :tOP_ASGN
1066
+ elsif src.scan(/\&\&/) then
1067
+ self.lex_state = :expr_beg
1068
+ self.yacc_value = "&&"
1069
+ return :tANDOP
1070
+ elsif src.scan(/\&\=/) then
1071
+ self.yacc_value = "&"
1072
+ self.lex_state = :expr_beg
1073
+ return :tOP_ASGN
1074
+ elsif src.scan(/&/) then
1075
+ result = if lex_state.is_argument && space_seen &&
1076
+ !src.check(/\s/) then
1077
+ warning("`&' interpreted as argument prefix")
1078
+ :tAMPER
1079
+ elsif lex_state == :expr_beg || lex_state == :expr_mid then
1080
+ :tAMPER
1081
+ else
1082
+ :tAMPER2
1083
+ end
1084
+
1085
+ self.fix_arg_lex_state
1086
+ self.yacc_value = "&"
1087
+ return result
1088
+ end
1089
+ elsif src.scan(/\//) then
1090
+ if lex_state == :expr_beg || lex_state == :expr_mid then
1091
+ self.lex_strterm = [:strterm, STR_REGEXP, '/', "\0"]
1092
+ self.yacc_value = "/"
1093
+ return :tREGEXP_BEG
1094
+ end
1095
+
1096
+ if src.scan(/\=/) then
1097
+ self.yacc_value = "/"
1098
+ self.lex_state = :expr_beg
1099
+ return :tOP_ASGN
1100
+ end
1101
+ # In an argument state after a space, '/' not followed by whitespace starts a regexp.
1102
+ if lex_state.is_argument && space_seen then
1103
+ unless src.scan(/\s/) then
1104
+ arg_ambiguous
1105
+ self.lex_strterm = [:strterm, STR_REGEXP, '/', "\0"]
1106
+ self.yacc_value = "/"
1107
+ return :tREGEXP_BEG
1108
+ end
1109
+ end
1110
+
1111
+ self.fix_arg_lex_state
1112
+ self.yacc_value = "/"
1113
+
1114
+ return :tDIVIDE
1115
+ elsif src.scan(/\^=/) then
1116
+ self.lex_state = :expr_beg
1117
+ self.yacc_value = "^"
1118
+ return :tOP_ASGN
1119
+ elsif src.scan(/\^/) then
1120
+ self.fix_arg_lex_state
1121
+ self.yacc_value = "^"
1122
+ return :tCARET
1123
+ elsif src.scan(/\;/) then
1124
+ self.command_start = true
1125
+ self.lex_state = :expr_beg
1126
+ self.yacc_value = ";"
1127
+ return :tSEMI
1128
+ elsif src.scan(/\~/) then
1129
+ if lex_state == :expr_fname || lex_state == :expr_dot then
1130
+ src.scan(/@/)
1131
+ end
1132
+
1133
+ self.fix_arg_lex_state
1134
+ self.yacc_value = "~"
1135
+
1136
+ return :tTILDE
1137
+ elsif src.scan(/\\/) then
1138
+ if src.scan(/\n/) then
1139
+ self.lineno = nil
1140
+ space_seen = true
1141
+ next
1142
+ end
1143
+ rb_compile_error "bare backslash only allowed before newline"
1144
+ elsif src.scan(/\%/) then
1145
+ if lex_state == :expr_beg || lex_state == :expr_mid then
1146
+ return parse_quote
1147
+ end
1148
+
1149
+ if src.scan(/\=/) then
1150
+ self.lex_state = :expr_beg
1151
+ self.yacc_value = "%"
1152
+ return :tOP_ASGN
1153
+ end
1154
+
1155
+ if lex_state.is_argument && space_seen && ! src.check(/\s/) then
1156
+ return parse_quote
1157
+ end
1158
+
1159
+ self.fix_arg_lex_state
1160
+ self.yacc_value = "%"
1161
+
1162
+ return :tPERCENT
1163
+ elsif src.check(/\$/) then
1164
+ if src.scan(/(\$_)(\w+)/) then
1165
+ self.lex_state = :expr_end
1166
+ self.token = src.matched
1167
+ return process_token(command_state)
1168
+ elsif src.scan(/\$_/) then
1169
+ self.lex_state = :expr_end
1170
+ self.token = src.matched
1171
+ self.yacc_value = src.matched
1172
+ return :tGVAR
1173
+ elsif src.scan(/\$[~*$?!@\/\\;,.=:<>\"]|\$-\w?/) then
1174
+ self.lex_state = :expr_end
1175
+ self.yacc_value = src.matched
1176
+ return :tGVAR
1177
+ elsif src.scan(/\$([\&\`\'\+])/) then
1178
+ self.lex_state = :expr_end
1179
+ # Explicit reference to these vars as symbols...
1180
+ if last_state == :expr_fname then
1181
+ self.yacc_value = src.matched
1182
+ return :tGVAR
1183
+ else
1184
+ self.yacc_value = src[1].to_sym
1185
+ return :tBACK_REF
1186
+ end
1187
+ elsif src.scan(/\$([1-9]\d*)/) then
1188
+ self.lex_state = :expr_end
1189
+ if last_state == :expr_fname then
1190
+ self.yacc_value = src.matched
1191
+ return :tGVAR
1192
+ else
1193
+ self.yacc_value = src[1].to_i
1194
+ return :tNTH_REF
1195
+ end
1196
+ elsif src.scan(/\$0/) then
1197
+ self.lex_state = :expr_end
1198
+ self.token = src.matched
1199
+ return process_token(command_state)
1200
+ elsif src.scan(/\$\W|\$\z/) then # TODO: remove?
1201
+ self.lex_state = :expr_end
1202
+ self.yacc_value = "$"
1203
+ return "$"
1204
+ elsif src.scan(/\$\w+/)
1205
+ self.lex_state = :expr_end
1206
+ self.token = src.matched
1207
+ return process_token(command_state)
1208
+ end
1209
+ elsif src.check(/\_/) then
1210
+ if src.beginning_of_line? && src.scan(/\__END__(\n|\Z)/) then
1211
+ self.lineno = nil
1212
+ return RubyLexer::EOF
1213
+ elsif src.scan(/\_\w*/) then
1214
+ self.token = src.matched
1215
+ return process_token(command_state)
1216
+ end
1217
+ end
1218
+ end # END OF CASE
1219
+ # ^D, ^Z or NUL characters, or running out of input, end the token stream.
1220
+ if src.scan(/\004|\032|\000/) || src.eos? then # ^D, ^Z, EOF
1221
+ return RubyLexer::EOF
1222
+ else # alpha check
1223
+ if src.scan(/\W/) then
1224
+ rb_compile_error "Invalid char #{src.matched.inspect} in expression"
1225
+ end
1226
+ end
1227
+ # Read a run of word characters as the token text; process_token determines the token type.
1228
+ self.token = src.matched if self.src.scan(/\w+/)
1229
+
1230
+ return process_token(command_state)
1231
+ end
1232
+ end
1233
+
1234
# Classifies the identifier-like text held in +token+ and returns the
# token type for it, updating +lex_state+ and +yacc_value+ on the way.
#
# +token+ may be extended in place: a trailing "!" or "?" is absorbed
# from +src+, and, when lexing a method name (:expr_fname), a trailing
# "=" as well.
#
# command_state:: when truthy, a plain identifier seen in a
#                 beg/mid/dot/arg/cmdarg state moves the lexer to
#                 :expr_cmdarg instead of :expr_arg.
#
# Returns :tGVAR, :tCVAR, :tIVAR, :tFID, :tCONSTANT, :tIDENTIFIER, or
# one of the ids supplied by the matching Keyword entry (with "do"
# refined to :kDO_COND / :kDO_BLOCK / :kDO).
def process_token(command_state)
  # Absorb a method-name suffix ("!" or "?") unless it is immediately
  # followed by "=", so that "a!=b" still lexes as "a != b".
  token << src.matched if token =~ /^\w/ && src.scan(/[\!\?](?!=)/)

  result     = nil
  last_state = lex_state # state on entry; lex_state is rewritten below

  case token
  when /^\$/ then
    self.lex_state, result = :expr_end, :tGVAR
  when /^@@/ then
    self.lex_state, result = :expr_end, :tCVAR
  when /^@/ then
    self.lex_state, result = :expr_end, :tIVAR
  else
    if token =~ /[!?]$/ then
      result = :tFID
    else
      # In a method-name position a trailing "=" belongs to the name
      # (ident=), unless it starts "=~", "=>" or "==". The one exception
      # is "==>", which is "ident=" followed by "=>".
      # TODO test lexing of a=>b vs a==>b
      if lex_state == :expr_fname && src.scan(/=(?:(?![~>=])|(?==>))/) then
        result = :tIDENTIFIER
        token << src.matched
      end

      result ||= (token =~ /^[A-Z]/) ? :tCONSTANT : :tIDENTIFIER
    end

    # Right after a "." the word is a method name, never a reserved word.
    unless lex_state == :expr_dot then
      keyword = Keyword.keyword token

      if keyword then
        # lex_state has not been touched on this path, so last_state is
        # still the state the keyword was seen in.
        self.lex_state  = keyword.state
        self.yacc_value = token

        if keyword.id0 == :kDO then
          self.command_start = true
          return :kDO_COND  if cond.is_in_state
          return :kDO_BLOCK if cmdarg.is_in_state && last_state != :expr_cmdarg
          return :kDO_BLOCK if last_state == :expr_endarg
          return :kDO
        end

        return keyword.id0 if last_state == :expr_beg

        # id1 differs from id0 only for keywords with a second form;
        # returning that form puts the lexer back in :expr_beg.
        self.lex_state = :expr_beg if keyword.id0 != keyword.id1

        return keyword.id1
      end
    end

    self.lex_state =
      case lex_state
      when :expr_beg, :expr_mid, :expr_dot, :expr_arg, :expr_cmdarg then
        command_state ? :expr_cmdarg : :expr_arg
      else
        :expr_end
      end
  end

  self.yacc_value = token

  # A name the parser already knows as a local variable ends the
  # expression, unless it was read as a method name right after ".".
  self.lex_state = :expr_end if
    last_state != :expr_dot && self.parser.env[token.to_sym] == :lvar

  return result
end
1315
+
1316
# Lexes the next piece of the string-like literal described by
# +lex_strterm+, dispatching to +heredoc+ when its first element is
# :heredoc and to +parse_string+ otherwise.
#
# When the piece is the terminator (:tSTRING_END or :tREGEXP_END) the
# literal is finished: +lex_strterm+ and +lineno+ are reset to nil and
# +lex_state+ becomes :expr_end.
#
# Returns the token type produced by +heredoc+ / +parse_string+.
def yylex_string
  # Named string_token so the local does not shadow the +token+ accessor.
  string_token = if lex_strterm[0] == :heredoc then
                   self.heredoc lex_strterm
                 else
                   self.parse_string lex_strterm
                 end

  if string_token == :tSTRING_END || string_token == :tREGEXP_END then
    self.lineno      = nil
    self.lex_strterm = nil
    self.lex_state   = :expr_end
  end

  return string_token
end
1331
+ end