ruby_parser 3.13.1 → 3.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. checksums.yaml +4 -4
  2. checksums.yaml.gz.sig +0 -0
  3. data/.autotest +18 -29
  4. data/History.rdoc +312 -0
  5. data/Manifest.txt +16 -15
  6. data/README.rdoc +13 -9
  7. data/Rakefile +237 -106
  8. data/bin/ruby_parse +3 -1
  9. data/bin/ruby_parse_extract_error +9 -4
  10. data/compare/normalize.rb +54 -6
  11. data/debugging.md +172 -0
  12. data/gauntlet.md +107 -0
  13. data/lib/rp_extensions.rb +15 -36
  14. data/lib/rp_stringscanner.rb +20 -51
  15. data/lib/ruby_lexer.rb +515 -812
  16. data/lib/ruby_lexer.rex +33 -27
  17. data/lib/ruby_lexer.rex.rb +64 -31
  18. data/lib/ruby_lexer_strings.rb +638 -0
  19. data/lib/ruby_parser.rb +46 -36
  20. data/lib/{ruby_parser.yy → ruby_parser2.yy} +1400 -488
  21. data/lib/ruby_parser20.rb +10953 -0
  22. data/lib/ruby_parser21.rb +10978 -0
  23. data/lib/ruby_parser22.rb +11119 -0
  24. data/lib/ruby_parser23.rb +11160 -0
  25. data/lib/ruby_parser24.rb +11209 -0
  26. data/lib/ruby_parser25.rb +11209 -0
  27. data/lib/ruby_parser26.rb +11231 -0
  28. data/lib/ruby_parser27.rb +12960 -0
  29. data/lib/{ruby26_parser.y → ruby_parser3.yy} +1652 -521
  30. data/lib/ruby_parser30.rb +13292 -0
  31. data/lib/ruby_parser31.rb +13625 -0
  32. data/lib/ruby_parser32.rb +13577 -0
  33. data/lib/ruby_parser33.rb +13577 -0
  34. data/lib/ruby_parser_extras.rb +988 -474
  35. data/test/test_ruby_lexer.rb +1339 -1155
  36. data/test/test_ruby_parser.rb +4255 -2103
  37. data/test/test_ruby_parser_extras.rb +39 -4
  38. data/tools/munge.rb +52 -13
  39. data/tools/ripper.rb +24 -6
  40. data.tar.gz.sig +0 -0
  41. metadata +73 -56
  42. metadata.gz.sig +0 -0
  43. data/lib/ruby20_parser.rb +0 -6869
  44. data/lib/ruby20_parser.y +0 -2431
  45. data/lib/ruby21_parser.rb +0 -6944
  46. data/lib/ruby21_parser.y +0 -2449
  47. data/lib/ruby22_parser.rb +0 -6968
  48. data/lib/ruby22_parser.y +0 -2458
  49. data/lib/ruby23_parser.rb +0 -6987
  50. data/lib/ruby23_parser.y +0 -2460
  51. data/lib/ruby24_parser.rb +0 -6994
  52. data/lib/ruby24_parser.y +0 -2466
  53. data/lib/ruby25_parser.rb +0 -6994
  54. data/lib/ruby25_parser.y +0 -2466
  55. data/lib/ruby26_parser.rb +0 -7012
data/lib/ruby_lexer.rb CHANGED
@@ -4,135 +4,9 @@
4
4
  $DEBUG = true if ENV["DEBUG"]
5
5
 
6
6
  class RubyLexer
7
-
8
7
  # :stopdoc:
9
- HAS_ENC = "".respond_to? :encoding
10
-
11
- IDENT_CHAR = if HAS_ENC then
12
- /[\w\u0080-\u{10ffff}]/u
13
- else
14
- /[\w\x80-\xFF]/n
15
- end
16
-
17
8
  EOF = :eof_haha!
18
9
 
19
- # ruby constants for strings (should this be moved somewhere else?)
20
-
21
- STR_FUNC_BORING = 0x00
22
- STR_FUNC_ESCAPE = 0x01 # TODO: remove and replace with REGEXP
23
- STR_FUNC_EXPAND = 0x02
24
- STR_FUNC_REGEXP = 0x04
25
- STR_FUNC_QWORDS = 0x08
26
- STR_FUNC_SYMBOL = 0x10
27
- STR_FUNC_INDENT = 0x20 # <<-HEREDOC
28
- STR_FUNC_ICNTNT = 0x40 # <<~HEREDOC
29
-
30
- STR_SQUOTE = STR_FUNC_BORING
31
- STR_DQUOTE = STR_FUNC_BORING | STR_FUNC_EXPAND
32
- STR_XQUOTE = STR_FUNC_BORING | STR_FUNC_EXPAND
33
- STR_REGEXP = STR_FUNC_REGEXP | STR_FUNC_ESCAPE | STR_FUNC_EXPAND
34
- STR_SSYM = STR_FUNC_SYMBOL
35
- STR_DSYM = STR_FUNC_SYMBOL | STR_FUNC_EXPAND
36
-
37
- class State
38
- attr_accessor :n
39
-
40
- def initialize o
41
- raise ArgumentError, "bad state: %p" % [o] unless Integer === o # TODO: remove
42
-
43
- self.n = o
44
- end
45
-
46
- def == o
47
- o.class == self.class && o.n == self.n
48
- end
49
-
50
- def =~ v
51
- (self.n & v.n) != 0
52
- end
53
-
54
- def | v
55
- self.class.new(self.n | v.n)
56
- end
57
-
58
- def inspect
59
- return "EXPR_NONE" if n.zero?
60
- NAMES.map { |v,k| k if self =~ v }.compact.join "|"
61
- end
62
-
63
- module Values
64
- EXPR_NONE = State.new 0x0
65
- EXPR_BEG = State.new 0x1
66
- EXPR_END = State.new 0x2
67
- EXPR_ENDARG = State.new 0x4
68
- EXPR_ENDFN = State.new 0x8
69
- EXPR_ARG = State.new 0x10
70
- EXPR_CMDARG = State.new 0x20
71
- EXPR_MID = State.new 0x40
72
- EXPR_FNAME = State.new 0x80
73
- EXPR_DOT = State.new 0x100
74
- EXPR_CLASS = State.new 0x200
75
- EXPR_LABEL = State.new 0x400
76
- EXPR_LABELED = State.new 0x800
77
- EXPR_FITEM = State.new 0x1000
78
-
79
- EXPR_BEG_ANY = EXPR_BEG | EXPR_MID | EXPR_CLASS
80
- EXPR_ARG_ANY = EXPR_ARG | EXPR_CMDARG
81
- EXPR_END_ANY = EXPR_END | EXPR_ENDARG | EXPR_ENDFN
82
-
83
- # extra fake lex_state names to make things a bit cleaner
84
-
85
- EXPR_LAB = EXPR_ARG|EXPR_LABELED
86
- EXPR_NUM = EXPR_END|EXPR_ENDARG
87
- EXPR_PAR = EXPR_BEG|EXPR_LABEL
88
- EXPR_PAD = EXPR_BEG|EXPR_LABELED
89
- end
90
-
91
- include Values
92
-
93
- NAMES = {
94
- EXPR_NONE => "EXPR_NONE",
95
- EXPR_BEG => "EXPR_BEG",
96
- EXPR_END => "EXPR_END",
97
- EXPR_ENDARG => "EXPR_ENDARG",
98
- EXPR_ENDFN => "EXPR_ENDFN",
99
- EXPR_ARG => "EXPR_ARG",
100
- EXPR_CMDARG => "EXPR_CMDARG",
101
- EXPR_MID => "EXPR_MID",
102
- EXPR_FNAME => "EXPR_FNAME",
103
- EXPR_DOT => "EXPR_DOT",
104
- EXPR_CLASS => "EXPR_CLASS",
105
- EXPR_LABEL => "EXPR_LABEL",
106
- EXPR_LABELED => "EXPR_LABELED",
107
- EXPR_FITEM => "EXPR_FITEM",
108
- }
109
- end
110
-
111
- include State::Values
112
-
113
- if $DEBUG then
114
- def lex_state= o
115
- return if @lex_state == o
116
- raise ArgumentError, "bad state: %p" % [o] unless State === o
117
- if ENV["V"] then
118
- c = caller[0]
119
- c = caller[1] if c =~ /\b(expr_)?result\b/
120
- c = caller[2] if c =~ /\b(expr_)?result\b/
121
- warn "lex_state: %p -> %p from %s" % [lex_state, o, c.clean_caller]
122
- else
123
- warn "lex_state: %p -> %p" % [lex_state, o]
124
- end
125
- @lex_state = o
126
- end
127
- else
128
- def lex_state= o
129
- raise ArgumentError, "bad state: %p" % [o] unless State === o
130
- @lex_state = o
131
- end
132
- end
133
-
134
- attr_reader :lex_state
135
-
136
10
  ESCAPES = {
137
11
  "a" => "\007",
138
12
  "b" => "\010",
@@ -149,10 +23,17 @@ class RubyLexer
149
23
  "c\?" => 127.chr,
150
24
  }
151
25
 
26
+ HAS_ENC = "".respond_to? :encoding
27
+
28
+ BTOKENS = {
29
+ ".." => :tBDOT2,
30
+ "..." => :tBDOT3,
31
+ }
32
+
152
33
  TOKENS = {
153
34
  "!" => :tBANG,
154
35
  "!=" => :tNEQ,
155
- # "!@" => :tUBANG,
36
+ "!@" => :tBANG,
156
37
  "!~" => :tNMATCH,
157
38
  "," => :tCOMMA,
158
39
  ".." => :tDOT2,
@@ -165,21 +46,57 @@ class RubyLexer
165
46
  "->" => :tLAMBDA,
166
47
  }
167
48
 
168
- TAB_WIDTH = 8
49
+ PERCENT_END = {
50
+ "(" => ")",
51
+ "[" => "]",
52
+ "{" => "}",
53
+ "<" => ">",
54
+ }
169
55
 
170
- @@regexp_cache = Hash.new { |h,k| h[k] = Regexp.new(Regexp.escape(k)) }
56
+ SIMPLE_RE_META = /[\$\*\+\.\?\^\|\)\]\}\>]/
57
+
58
+ @@regexp_cache = Hash.new { |h, k| h[k] = Regexp.new(Regexp.escape(k)) }
171
59
  @@regexp_cache[nil] = nil
172
60
 
61
+ def regexp_cache
62
+ @@regexp_cache
63
+ end
64
+
65
+ if $DEBUG then
66
+ attr_reader :lex_state
67
+
68
+ def lex_state= o
69
+ return if @lex_state == o
70
+
71
+ from = ""
72
+ if ENV["VERBOSE"]
73
+ path = caller[0]
74
+ path = caller[1] if path =~ /result/
75
+ path, line, *_ = path.split(/:/)
76
+ path.delete_prefix! File.dirname File.dirname __FILE__
77
+ from = " at .%s:%s" % [path, line]
78
+ end
79
+
80
+ warn "lex_state: %p -> %p%s" % [lex_state, o, from]
81
+
82
+ @lex_state = o
83
+ end
84
+ end
85
+
173
86
  # :startdoc:
174
87
 
175
- attr_accessor :lineno # we're bypassing oedipus' lineno handling.
88
+ attr_accessor :lex_state unless $DEBUG
89
+
176
90
  attr_accessor :brace_nest
177
91
  attr_accessor :cmdarg
178
92
  attr_accessor :command_start
179
93
  attr_accessor :cmd_state # temporary--ivar to avoid passing everywhere
180
94
  attr_accessor :last_state
181
95
  attr_accessor :cond
182
- attr_accessor :extra_lineno
96
+ attr_accessor :old_ss
97
+ attr_accessor :old_lineno
98
+
99
+ # these are generated via ruby_lexer.rex: ss, lineno
183
100
 
184
101
  ##
185
102
  # Additional context surrounding tokens that both the lexer and
@@ -196,39 +113,30 @@ class RubyLexer
196
113
  # Last token read via next_token.
197
114
  attr_accessor :token
198
115
 
199
- attr_writer :comments
116
+ # Last comment lexed, or nil
117
+ attr_accessor :comment
200
118
 
201
119
  def initialize _ = nil
202
120
  @lex_state = nil # remove one warning under $DEBUG
203
- self.lex_state = EXPR_NONE
121
+ @lex_state = EXPR_NONE
204
122
 
205
123
  self.cond = RubyParserStuff::StackState.new(:cond, $DEBUG)
206
124
  self.cmdarg = RubyParserStuff::StackState.new(:cmdarg, $DEBUG)
125
+ self.ss = RPStringScanner.new ""
207
126
 
208
127
  reset
209
128
  end
210
129
 
211
130
  def arg_ambiguous
212
- self.warning("Ambiguous first argument. make sure.")
131
+ self.warning "Ambiguous first argument. make sure."
213
132
  end
214
133
 
215
134
  def arg_state
216
135
  is_after_operator? ? EXPR_ARG : EXPR_BEG
217
136
  end
218
137
 
219
- def beginning_of_line?
220
- ss.bol?
221
- end
222
- alias :bol? :beginning_of_line? # to make .rex file more readable
223
-
224
- def comments # TODO: remove this... maybe comment_string + attr_accessor
225
- c = @comments.join
226
- @comments.clear
227
- c
228
- end
229
-
230
- def end_of_stream?
231
- ss.eos?
138
+ def debug n
139
+ raise "debug #{n}"
232
140
  end
233
141
 
234
142
  def expr_dot?
@@ -245,185 +153,30 @@ class RubyLexer
245
153
  result EXPR_BEG, token, text
246
154
  end
247
155
 
248
- def heredoc here # TODO: rewrite / remove
249
- _, eos, func, last_line = here
250
-
251
- indent = (func & STR_FUNC_INDENT) != 0 ? "[ \t]*" : nil
252
- content_indent = (func & STR_FUNC_ICNTNT) != 0
253
- expand = (func & STR_FUNC_EXPAND) != 0
254
- eos_re = /#{indent}#{Regexp.escape eos}(\r*\n|\z)/
255
- err_msg = "can't match #{eos_re.inspect} anywhere in "
256
-
257
- rb_compile_error err_msg if end_of_stream?
258
-
259
- if beginning_of_line? && scan(eos_re) then
260
- self.lineno += 1
261
- ss.unread_many last_line # TODO: figure out how to remove this
262
- return :tSTRING_END, eos
263
- end
264
-
265
- self.string_buffer = []
266
-
267
- if expand then
268
- case
269
- when scan(/#[$@]/) then
270
- ss.pos -= 1 # FIX omg stupid
271
- return :tSTRING_DVAR, matched
272
- when scan(/#[{]/) then
273
- return :tSTRING_DBEG, matched
274
- when scan(/#/) then
275
- string_buffer << '#'
276
- end
277
-
278
- begin
279
- c = tokadd_string func, "\n", nil
280
-
281
- rb_compile_error err_msg if
282
- c == RubyLexer::EOF
283
-
284
- if c != "\n" then
285
- return :tSTRING_CONTENT, string_buffer.join.delete("\r")
286
- else
287
- string_buffer << scan(/\n/)
288
- end
289
-
290
- rb_compile_error err_msg if end_of_stream?
291
- end until check(eos_re)
292
- else
293
- until check(eos_re) do
294
- string_buffer << scan(/.*(\n|\z)/)
295
- rb_compile_error err_msg if end_of_stream?
296
- end
297
- end
298
-
299
- self.lex_strterm = [:heredoc, eos, func, last_line]
300
-
301
- string_content = begin
302
- s = string_buffer.join
303
- s.delete "\r"
304
- rescue ArgumentError
305
- s.b.delete("\r").force_encoding Encoding::UTF_8
306
- end
307
-
308
- string_content = heredoc_dedent(string_content) if content_indent && ruby23plus?
309
-
310
- return :tSTRING_CONTENT, string_content
311
- end
312
-
313
- def heredoc_dedent(string_content)
314
- width = string_content.scan(/^[ \t]*(?=\S)/).map do |whitespace|
315
- heredoc_whitespace_indent_size whitespace
316
- end.min || 0
317
-
318
- string_content.split("\n", -1).map do |line|
319
- dedent_string line, width
320
- end.join "\n"
321
- end
322
-
323
- def dedent_string(string, width)
324
- characters_skipped = 0
325
- indentation_skipped = 0
326
-
327
- string.chars.each do |char|
328
- break if indentation_skipped >= width
329
- if char == ' '
330
- characters_skipped += 1
331
- indentation_skipped += 1
332
- elsif char == "\t"
333
- proposed = TAB_WIDTH * (indentation_skipped / TAB_WIDTH + 1)
334
- break if (proposed > width)
335
- characters_skipped += 1
336
- indentation_skipped = proposed
337
- end
338
- end
339
- string[characters_skipped..-1]
340
- end
341
-
342
- def heredoc_whitespace_indent_size(whitespace)
343
- whitespace.chars.inject 0 do |size, char|
344
- if char == "\t"
345
- size + TAB_WIDTH
346
- else
347
- size + 1
348
- end
349
- end
350
- end
351
-
352
- def heredoc_identifier # TODO: remove / rewrite
353
- term, func = nil, STR_FUNC_BORING
354
- self.string_buffer = []
355
-
356
- heredoc_indent_mods = '-'
357
- heredoc_indent_mods += '\~' if ruby23plus?
358
-
359
- case
360
- when scan(/([#{heredoc_indent_mods}]?)([\'\"\`])(.*?)\2/) then
361
- term = ss[2]
362
- func |= STR_FUNC_INDENT unless ss[1].empty?
363
- func |= STR_FUNC_ICNTNT if ss[1] == '~'
364
- func |= case term
365
- when "\'" then
366
- STR_SQUOTE
367
- when '"' then
368
- STR_DQUOTE
369
- else
370
- STR_XQUOTE
371
- end
372
- string_buffer << ss[3]
373
- when scan(/[#{heredoc_indent_mods}]?([\'\"\`])(?!\1*\Z)/) then
374
- rb_compile_error "unterminated here document identifier"
375
- when scan(/([#{heredoc_indent_mods}]?)(#{IDENT_CHAR}+)/) then
376
- term = '"'
377
- func |= STR_DQUOTE
378
- unless ss[1].empty? then
379
- func |= STR_FUNC_INDENT
380
- func |= STR_FUNC_ICNTNT if ss[1] == '~'
381
- end
382
- string_buffer << ss[2]
383
- else
384
- return nil
385
- end
386
-
387
- if scan(/.*\n/) then
388
- # TODO: think about storing off the char range instead
389
- line = matched
390
- else
391
- line = nil
392
- end
393
-
394
- self.lex_strterm = [:heredoc, string_buffer.join, func, line]
395
-
396
- if term == '`' then
397
- result nil, :tXSTRING_BEG, "`"
398
- else
399
- result nil, :tSTRING_BEG, "\""
400
- end
401
- end
402
-
403
156
  def in_fname? # REFACTOR
404
157
  lex_state =~ EXPR_FNAME
405
158
  end
406
159
 
407
- def is_after_operator?
408
- lex_state =~ EXPR_FNAME|EXPR_DOT
409
- end
410
-
411
160
  def int_with_base base
412
161
  rb_compile_error "Invalid numeric format" if matched =~ /__/
413
162
 
414
163
  text = matched
415
164
  case
416
- when text.end_with?('ri')
417
- return result(EXPR_NUM, :tIMAGINARY, Complex(0, Rational(text.chop.chop.to_i(base))))
418
- when text.end_with?('r')
419
- return result(EXPR_NUM, :tRATIONAL, Rational(text.chop.to_i(base)))
420
- when text.end_with?('i')
421
- return result(EXPR_NUM, :tIMAGINARY, Complex(0, text.chop.to_i(base)))
165
+ when text.end_with?("ri")
166
+ result EXPR_NUM, :tIMAGINARY, Complex(0, Rational(text.chop.chop.to_i(base)))
167
+ when text.end_with?("r")
168
+ result EXPR_NUM, :tRATIONAL, Rational(text.chop.to_i(base))
169
+ when text.end_with?("i")
170
+ result EXPR_NUM, :tIMAGINARY, Complex(0, text.chop.to_i(base))
422
171
  else
423
- return result(EXPR_NUM, :tINTEGER, text.to_i(base))
172
+ result EXPR_NUM, :tINTEGER, text.to_i(base)
424
173
  end
425
174
  end
426
175
 
176
+ def is_after_operator?
177
+ lex_state =~ EXPR_FNAME|EXPR_DOT
178
+ end
179
+
427
180
  def is_arg?
428
181
  lex_state =~ EXPR_ARG_ANY
429
182
  end
@@ -436,15 +189,6 @@ class RubyLexer
436
189
  lex_state =~ EXPR_END_ANY
437
190
  end
438
191
 
439
- def lvar_defined? id
440
- # TODO: (dyna_in_block? && dvar_defined?(id)) || local_id?(id)
441
- self.parser.env[id.to_sym] == :lvar
442
- end
443
-
444
- def ruby22_label?
445
- ruby22plus? and is_label_possible?
446
- end
447
-
448
192
  def is_label_possible?
449
193
  (lex_state =~ EXPR_LABEL|EXPR_ENDFN && !cmd_state) || is_arg?
450
194
  end
@@ -461,14 +205,30 @@ class RubyLexer
461
205
  lpar_beg && lpar_beg == paren_nest
462
206
  end
463
207
 
464
- def matched
465
- ss.matched
208
+ def is_local_id id
209
+ # maybe just make this false for now
210
+ self.parser.env[id.to_sym] == :lvar # HACK: this isn't remotely right
211
+ end
212
+
213
+ def lvar_defined? id
214
+ # TODO: (dyna_in_block? && dvar_defined?(id)) || local_id?(id)
215
+ self.parser.env[id.to_sym] == :lvar
466
216
  end
467
217
 
468
218
  def not_end?
469
219
  not is_end?
470
220
  end
471
221
 
222
+ def possibly_escape_string text, check
223
+ content = match[1]
224
+
225
+ if text =~ check then
226
+ unescape_string content
227
+ else
228
+ content.gsub(/\\\\/, "\\").gsub(/\\\'/, "'")
229
+ end
230
+ end
231
+
472
232
  def process_amper text
473
233
  token = if is_arg? && space_seen && !check(/\s/) then
474
234
  warning("`&' interpreted as argument prefix")
@@ -479,44 +239,50 @@ class RubyLexer
479
239
  :tAMPER2
480
240
  end
481
241
 
482
- return result(:arg_state, token, "&")
242
+ result :arg_state, token, "&"
483
243
  end
484
244
 
485
245
  def process_backref text
486
- token = ss[1].to_sym
246
+ token = match[1].to_sym
487
247
  # TODO: can't do lineno hack w/ symbol
488
248
  result EXPR_END, :tBACK_REF, token
489
249
  end
490
250
 
491
251
  def process_begin text
492
- @comments << matched
252
+ self.comment ||= +""
253
+ self.comment << matched
493
254
 
494
255
  unless scan(/.*?\n=end( |\t|\f)*[^\n]*(\n|\z)/m) then
495
- @comments.clear
256
+ self.comment = nil
496
257
  rb_compile_error("embedded document meets end of file")
497
258
  end
498
259
 
499
- @comments << matched
500
- self.lineno += matched.count("\n")
260
+ self.comment << matched
261
+ self.lineno += matched.count("\n") # HACK?
501
262
 
502
263
  nil # TODO
503
264
  end
504
265
 
505
- def process_brace_close text
506
- # matching compare/parse23.y:8561
507
- cond.lexpop
508
- cmdarg.lexpop
266
+ # TODO: make all tXXXX terminals include lexer.lineno ... enforce it somehow?
509
267
 
268
+ def process_brace_close text
510
269
  case matched
511
270
  when "}" then
512
271
  self.brace_nest -= 1
513
- self.lex_state = EXPR_ENDARG # TODO: EXPR_END ? Look at 2.6
514
-
515
272
  return :tSTRING_DEND, matched if brace_nest < 0
273
+ end
274
+
275
+ # matching compare/parse26.y:8099
276
+ cond.pop
277
+ cmdarg.pop
278
+
279
+ case matched
280
+ when "}" then
281
+ self.lex_state = ruby24minus? ? EXPR_ENDARG : EXPR_END
516
282
  return :tRCURLY, matched
517
283
  when "]" then
518
284
  self.paren_nest -= 1
519
- self.lex_state = EXPR_ENDARG
285
+ self.lex_state = ruby24minus? ? EXPR_ENDARG : EXPR_END
520
286
  return :tRBRACK, matched
521
287
  when ")" then
522
288
  self.paren_nest -= 1
@@ -527,30 +293,6 @@ class RubyLexer
527
293
  end
528
294
  end
529
295
 
530
- def process_colon1 text
531
- # ?: / then / when
532
- if is_end? || check(/\s/) then
533
- return result EXPR_BEG, :tCOLON, text
534
- end
535
-
536
- case
537
- when scan(/\'/) then
538
- string STR_SSYM
539
- when scan(/\"/) then
540
- string STR_DSYM
541
- end
542
-
543
- result EXPR_FNAME, :tSYMBEG, text
544
- end
545
-
546
- def process_colon2 text
547
- if is_beg? || lex_state =~ EXPR_CLASS || is_space_arg? then
548
- result EXPR_BEG, :tCOLON3, text
549
- else
550
- result EXPR_DOT, :tCOLON2, text
551
- end
552
- end
553
-
554
296
  def process_brace_open text
555
297
  # matching compare/parse23.y:8694
556
298
  self.brace_nest += 1
@@ -566,7 +308,7 @@ class RubyLexer
566
308
  when lex_state =~ EXPR_LABELED then
567
309
  :tLBRACE # hash
568
310
  when lex_state =~ EXPR_ARG_ANY|EXPR_END|EXPR_ENDFN then
569
- :tLCURLY # block (primary) '{' in parse.y
311
+ :tLCURLY # block (primary) "{" in parse.y
570
312
  when lex_state =~ EXPR_ENDARG then
571
313
  :tLBRACE_ARG # block (expr)
572
314
  else
@@ -581,37 +323,96 @@ class RubyLexer
581
323
  result state, token, text
582
324
  end
583
325
 
326
+ def process_colon1 text
327
+ # ?: / then / when
328
+ if is_end? || check(/\s/) then
329
+ return result EXPR_BEG, :tCOLON, text
330
+ end
331
+
332
+ case
333
+ when scan(/\'/) then
334
+ string STR_SSYM, matched
335
+ when scan(/\"/) then
336
+ string STR_DSYM, matched
337
+ end
338
+
339
+ result EXPR_FNAME, :tSYMBEG, text
340
+ end
341
+
342
+ def process_colon2 text
343
+ if is_beg? || lex_state =~ EXPR_CLASS || is_space_arg? then
344
+ result EXPR_BEG, :tCOLON3, text
345
+ else
346
+ result EXPR_DOT, :tCOLON2, text
347
+ end
348
+ end
349
+
350
+ def process_dots text # parse32.y:10216
351
+ is_beg = self.is_beg?
352
+ self.lex_state = EXPR_BEG
353
+
354
+ return result EXPR_ENDARG, :tBDOT3, text if
355
+ parser.in_argdef && text == "..." # TODO: version check?
356
+
357
+ tokens = ruby27plus? && is_beg ? BTOKENS : TOKENS
358
+
359
+ result EXPR_BEG, tokens[text], text
360
+ end
361
+
584
362
  def process_float text
585
363
  rb_compile_error "Invalid numeric format" if text =~ /__/
586
364
 
587
365
  case
588
- when text.end_with?('ri')
589
- return result EXPR_NUM, :tIMAGINARY, Complex(0, Rational(text.chop.chop))
590
- when text.end_with?('i')
591
- return result EXPR_NUM, :tIMAGINARY, Complex(0, text.chop.to_f)
592
- when text.end_with?('r')
593
- return result EXPR_NUM, :tRATIONAL, Rational(text.chop)
366
+ when text.end_with?("ri")
367
+ result EXPR_NUM, :tIMAGINARY, Complex(0, Rational(text.chop.chop))
368
+ when text.end_with?("i")
369
+ result EXPR_NUM, :tIMAGINARY, Complex(0, text.chop.to_f)
370
+ when text.end_with?("r")
371
+ result EXPR_NUM, :tRATIONAL, Rational(text.chop)
594
372
  else
595
- return result EXPR_NUM, :tFLOAT, text.to_f
373
+ result EXPR_NUM, :tFLOAT, text.to_f
596
374
  end
597
375
  end
598
376
 
599
377
  def process_gvar text
600
- text.lineno = self.lineno
378
+ if parser.class.version > 20 && text == "$-" then
379
+ rb_compile_error "unexpected $undefined"
380
+ end
381
+
601
382
  result EXPR_END, :tGVAR, text
602
383
  end
603
384
 
604
385
  def process_gvar_oddity text
605
- return result EXPR_END, "$", "$" if text == "$" # TODO: wtf is this?
606
386
  rb_compile_error "#{text.inspect} is not allowed as a global variable name"
607
387
  end
608
388
 
609
389
  def process_ivar text
610
390
  tok_id = text =~ /^@@/ ? :tCVAR : :tIVAR
611
- text.lineno = self.lineno
612
391
  result EXPR_END, tok_id, text
613
392
  end
614
393
 
394
+ def process_label text
395
+ symbol = possibly_escape_string text, /^\"/
396
+
397
+ result EXPR_LAB, :tLABEL, symbol
398
+ end
399
+
400
+ def process_label_or_string text
401
+ if @was_label && text =~ /:\Z/ then
402
+ @was_label = nil
403
+ return process_label text
404
+ elsif text =~ /:\Z/ then
405
+ self.pos -= 1 # put back ":"
406
+ text = text[0..-2]
407
+ end
408
+
409
+ orig_line = lineno
410
+ str = text[1..-2].gsub(/\\\\/, "\\").gsub(/\\\'/, "\'")
411
+ self.lineno += str.count("\n")
412
+
413
+ result EXPR_END, :tSTRING, str, orig_line
414
+ end
415
+
615
416
  def process_lchevron text
616
417
  if (lex_state !~ EXPR_DOT|EXPR_CLASS &&
617
418
  !is_end? &&
@@ -627,34 +428,26 @@ class RubyLexer
627
428
  self.lex_state = EXPR_BEG
628
429
  end
629
430
 
630
- return result(lex_state, :tLSHFT, "\<\<")
431
+ result lex_state, :tLSHFT, "\<\<"
631
432
  end
632
433
 
633
- def process_newline_or_comment text
434
+ def process_newline_or_comment text # ../compare/parse30.y:9126 ish
634
435
  c = matched
635
- hit = false
636
436
 
637
- if c == '#' then
638
- ss.pos -= 1
437
+ if c == "#" then
438
+ self.pos -= 1
639
439
 
640
- # TODO: handle magic comments
641
440
  while scan(/\s*\#.*(\n+|\z)/) do
642
- hit = true
643
- self.lineno += matched.lines.to_a.size
644
- @comments << matched.gsub(/^ +#/, '#').gsub(/^ +$/, '')
441
+ self.lineno += matched.count "\n"
442
+ self.comment ||= +""
443
+ self.comment << matched.gsub(/^ +#/, "#").gsub(/^ +$/, "")
645
444
  end
646
445
 
647
446
  return nil if end_of_stream?
648
447
  end
649
448
 
650
- self.lineno += 1 unless hit
651
-
652
- # Replace a string of newlines with a single one
653
- self.lineno += matched.lines.to_a.size if scan(/\n+/)
654
-
655
449
  c = (lex_state =~ EXPR_BEG|EXPR_CLASS|EXPR_FNAME|EXPR_DOT &&
656
450
  lex_state !~ EXPR_LABELED)
657
- # TODO: figure out what token_seen is for
658
451
  if c || self.lex_state == EXPR_LAB then # yes, == EXPR_LAB
659
452
  # ignore if !fallthrough?
660
453
  if !c && parser.in_kwarg then
@@ -662,25 +455,29 @@ class RubyLexer
662
455
  self.command_start = true
663
456
  return result EXPR_BEG, :tNL, nil
664
457
  else
665
- return # skip
458
+ maybe_pop_stack
459
+ return # goto retry
666
460
  end
667
461
  end
668
462
 
669
- if scan(/([\ \t\r\f\v]*)(\.|&)/) then
670
- self.space_seen = true unless ss[1].empty?
463
+ if scan(/[\ \t\r\f\v]+/) then
464
+ self.space_seen = true
465
+ end
671
466
 
672
- ss.pos -= 1
673
- return unless check(/\.\./)
467
+ if check(/#/) then
468
+ return # goto retry
469
+ elsif check(/&\.|\.(?!\.)/) then # C version is a hellish obfuscated xnor
470
+ return # goto retry
674
471
  end
675
472
 
676
473
  self.command_start = true
677
474
 
678
- return result(EXPR_BEG, :tNL, nil)
475
+ result EXPR_BEG, :tNL, nil
679
476
  end
680
477
 
681
478
  def process_nthref text
682
479
  # TODO: can't do lineno hack w/ number
683
- result EXPR_END, :tNTH_REF, ss[1].to_i
480
+ result EXPR_END, :tNTH_REF, match[1].to_i
684
481
  end
685
482
 
686
483
  def process_paren text
@@ -697,7 +494,7 @@ class RubyLexer
697
494
  # "an argument list, not a decomposed argument")
698
495
  :tLPAREN2
699
496
  else
700
- :tLPAREN2 # plain '(' in parse.y
497
+ :tLPAREN2 # plain "(" in parse.y
701
498
  end
702
499
 
703
500
  self.paren_nest += 1
@@ -708,13 +505,16 @@ class RubyLexer
708
505
  end
709
506
 
710
507
  def process_percent text
711
- return parse_quote if is_beg?
712
-
713
- return result EXPR_BEG, :tOP_ASGN, "%" if scan(/\=/)
714
-
715
- return parse_quote if is_space_arg?(check(/\s/)) || (lex_state =~ EXPR_FITEM && check(/s/))
716
-
717
- return result :arg_state, :tPERCENT, "%"
508
+ case
509
+ when is_beg? then
510
+ process_percent_quote
511
+ when scan(/\=/)
512
+ result EXPR_BEG, :tOP_ASGN, "%"
513
+ when is_space_arg?(check(/\s/)) || (lex_state =~ EXPR_FITEM && check(/s/))
514
+ process_percent_quote
515
+ else
516
+ result :arg_state, :tPERCENT, "%"
517
+ end
718
518
  end
719
519
 
720
520
  def process_plus_minus text
@@ -735,7 +535,7 @@ class RubyLexer
735
535
 
736
536
  return result(EXPR_BEG, :tOP_ASGN, sign) if scan(/\=/)
737
537
 
738
- if (is_beg? || (is_arg? && space_seen && !check(/\s/))) then
538
+ if is_beg? || (is_arg? && space_seen && !check(/\s/)) then
739
539
  arg_ambiguous if is_arg?
740
540
 
741
541
  if check(/\d/) then
@@ -760,12 +560,12 @@ class RubyLexer
760
560
 
761
561
  if check(/\s|\v/) then
762
562
  unless is_arg? then
763
- c2 = { " " => 's',
764
- "\n" => 'n',
765
- "\t" => 't',
766
- "\v" => 'v',
767
- "\r" => 'r',
768
- "\f" => 'f' }[matched]
563
+ c2 = { " " => "s",
564
+ "\n" => "n",
565
+ "\t" => "t",
566
+ "\v" => "v",
567
+ "\r" => "r",
568
+ "\f" => "f" }[matched]
769
569
 
770
570
  if c2 then
771
571
  warning("invalid character syntax; use ?\\" + c2)
@@ -781,17 +581,26 @@ class RubyLexer
781
581
  c = if scan(/\\/) then
782
582
  self.read_escape
783
583
  else
784
- ss.getch
584
+ getch
785
585
  end
786
586
 
787
587
  result EXPR_END, :tSTRING, c
788
588
  end
789
589
 
590
+ def process_simple_string text
591
+ orig_line = lineno
592
+ self.lineno += text.count("\n")
593
+
594
+ str = unescape_string text[1..-2]
595
+
596
+ result EXPR_END, :tSTRING, str, orig_line
597
+ end
598
+
790
599
  def process_slash text
791
600
  if is_beg? then
792
- string STR_REGEXP
601
+ string STR_REGEXP, matched
793
602
 
794
- return result(nil, :tREGEXP_BEG, "/")
603
+ return result nil, :tREGEXP_BEG, "/"
795
604
  end
796
605
 
797
606
  if scan(/\=/) then
@@ -806,7 +615,7 @@ class RubyLexer
806
615
  end
807
616
  end
808
617
 
809
- return result(:arg_state, :tDIVIDE, "/")
618
+ result :arg_state, :tDIVIDE, "/"
810
619
  end
811
620
 
812
621
  def process_square_bracket text
@@ -838,48 +647,14 @@ class RubyLexer
838
647
  result EXPR_PAR, token, text
839
648
  end
840
649
 
841
- def possibly_escape_string text, check
842
- content = match[1]
843
-
844
- if text =~ check then
845
- content.gsub(ESC) { unescape $1 }
846
- else
847
- content.gsub(/\\\\/, "\\").gsub(/\\'/, "'")
848
- end
849
- end
850
-
851
650
  def process_symbol text
852
- symbol = possibly_escape_string text, /^:"/
853
-
854
- result EXPR_END, :tSYMBOL, symbol
855
- end
856
-
857
- def was_label?
858
- @was_label = ruby22_label?
859
- true
860
- end
861
-
862
- def process_label_or_string text
863
- if @was_label && text =~ /:\Z/ then
864
- @was_label = nil
865
- return process_label text
866
- elsif text =~ /:\Z/ then
867
- ss.pos -= 1 # put back ":"
868
- text = text[0..-2]
869
- end
651
+ symbol = possibly_escape_string text, /^:\"/ # stupid emacs
870
652
 
871
- result EXPR_END, :tSTRING, text[1..-2].gsub(/\\\\/, "\\").gsub(/\\'/, "'")
872
- end
873
-
874
- def process_label text
875
- symbol = possibly_escape_string text, /^"/
876
-
877
- result EXPR_LAB, :tLABEL, [symbol, self.lineno]
653
+ result EXPR_LIT, :tSYMBOL, symbol
878
654
  end
879
655
 
880
656
  def process_token text
881
657
  # matching: parse_ident in compare/parse23.y:7989
882
- # TODO: make this always return [token, lineno]
883
658
  # FIX: remove: self.last_state = lex_state
884
659
 
885
660
  token = self.token = text
@@ -902,7 +677,7 @@ class RubyLexer
902
677
 
903
678
  if is_label_possible? and is_label_suffix? then
904
679
  scan(/:/)
905
- return result EXPR_LAB, :tLABEL, [token, self.lineno]
680
+ return result EXPR_LAB, :tLABEL, token
906
681
  end
907
682
 
908
683
  # TODO: mb == ENC_CODERANGE_7BIT && lex_state !~ EXPR_DOT
@@ -913,14 +688,17 @@ class RubyLexer
913
688
  return process_token_keyword keyword if keyword
914
689
  end
915
690
 
916
- # matching: compare/parse23.y:8079
917
- state = if is_beg? or is_arg? or lex_state =~ EXPR_DOT then
691
+ # matching: compare/parse32.y:9031
692
+ state = if lex_state =~ EXPR_BEG_ANY|EXPR_ARG_ANY|EXPR_DOT then
918
693
  cmd_state ? EXPR_CMDARG : EXPR_ARG
919
694
  elsif lex_state =~ EXPR_FNAME then
920
695
  EXPR_ENDFN
921
696
  else
922
697
  EXPR_END
923
698
  end
699
+ self.lex_state = state
700
+
701
+ tok_id = :tIDENTIFIER if tok_id == :tCONSTANT && is_local_id(token)
924
702
 
925
703
  if last_state !~ EXPR_DOT|EXPR_FNAME and
926
704
  (tok_id == :tIDENTIFIER) and # not EXPR_FNAME, not attrasgn
@@ -928,183 +706,102 @@ class RubyLexer
928
706
  state = EXPR_END|EXPR_LABEL
929
707
  end
930
708
 
931
- token.lineno = self.lineno # yes, on a string. I know... I know...
932
-
933
- return result(state, tok_id, token)
709
+ result state, tok_id, token
934
710
  end
935
711
 
936
712
  def process_token_keyword keyword
937
- # matching MIDDLE of parse_ident in compare/parse23.y:8046
713
+ # matching MIDDLE of parse_ident in compare/parse32.y:9695
938
714
  state = lex_state
939
- self.lex_state = keyword.state
940
715
 
941
- value = [token, self.lineno]
942
-
943
- return result(lex_state, keyword.id0, value) if state =~ EXPR_FNAME
716
+ return result(EXPR_ENDFN, keyword.id0, token) if lex_state =~ EXPR_FNAME
944
717
 
718
+ self.lex_state = keyword.state
945
719
  self.command_start = true if lex_state =~ EXPR_BEG
946
720
 
947
721
  case
948
- when keyword.id0 == :kDO then
722
+ when keyword.id0 == :kDO then # parse32.y line 9712
949
723
  case
950
724
  when lambda_beginning? then
951
725
  self.lpar_beg = nil # lambda_beginning? == FALSE in the body of "-> do ... end"
952
- self.paren_nest -= 1
953
- result lex_state, :kDO_LAMBDA, value
726
+ self.paren_nest -= 1 # TODO: question this?
727
+ result lex_state, :kDO_LAMBDA, token
954
728
  when cond.is_in_state then
955
- result lex_state, :kDO_COND, value
729
+ result lex_state, :kDO_COND, token
956
730
  when cmdarg.is_in_state && state != EXPR_CMDARG then
957
- result lex_state, :kDO_BLOCK, value
958
- when state =~ EXPR_BEG|EXPR_ENDARG then
959
- result lex_state, :kDO_BLOCK, value
731
+ result lex_state, :kDO_BLOCK, token
960
732
  else
961
- result lex_state, :kDO, value
733
+ result lex_state, :kDO, token
962
734
  end
963
735
  when state =~ EXPR_PAD then
964
- result lex_state, keyword.id0, value
736
+ result lex_state, keyword.id0, token
965
737
  when keyword.id0 != keyword.id1 then
966
- result EXPR_PAR, keyword.id1, value
738
+ result EXPR_PAR, keyword.id1, token
967
739
  else
968
- result lex_state, keyword.id1, value
740
+ result lex_state, keyword.id1, token
969
741
  end
970
742
  end
971
743
 
972
744
  def process_underscore text
973
- ss.unscan # put back "_"
745
+ self.unscan # put back "_"
974
746
 
975
747
  if beginning_of_line? && scan(/\__END__(\r?\n|\Z)/) then
976
- return [RubyLexer::EOF, RubyLexer::EOF]
977
- elsif scan(/\_\w*/) then
978
- return process_token matched
748
+ ss.terminate
749
+ [RubyLexer::EOF, RubyLexer::EOF]
750
+ elsif scan(/#{IDENT_CHAR}+/) then
751
+ process_token matched
979
752
  end
980
753
  end
981
754
 
982
755
  def rb_compile_error msg
983
- msg += ". near line #{self.lineno}: #{ss.rest[/^.*/].inspect}"
756
+ msg += ". near line #{self.lineno}: #{self.rest[/^.*/].inspect}"
984
757
  raise RubyParser::SyntaxError, msg
985
758
  end
986
759
 
987
- def read_escape # TODO: remove / rewrite
988
- case
989
- when scan(/\\/) then # Backslash
990
- '\\'
991
- when scan(/n/) then # newline
992
- self.extra_lineno -= 1
993
- "\n"
994
- when scan(/t/) then # horizontal tab
995
- "\t"
996
- when scan(/r/) then # carriage-return
997
- "\r"
998
- when scan(/f/) then # form-feed
999
- "\f"
1000
- when scan(/v/) then # vertical tab
1001
- "\13"
1002
- when scan(/a/) then # alarm(bell)
1003
- "\007"
1004
- when scan(/e/) then # escape
1005
- "\033"
1006
- when scan(/b/) then # backspace
1007
- "\010"
1008
- when scan(/s/) then # space
1009
- " "
1010
- when scan(/[0-7]{1,3}/) then # octal constant
1011
- (matched.to_i(8) & 0xFF).chr.force_encoding Encoding::UTF_8
1012
- when scan(/x([0-9a-fA-F]{1,2})/) then # hex constant
1013
- # TODO: force encode everything to UTF-8?
1014
- ss[1].to_i(16).chr.force_encoding Encoding::UTF_8
1015
- when check(/M-\\[\\MCc]/) then
1016
- scan(/M-\\/) # eat it
1017
- c = self.read_escape
1018
- c[0] = (c[0].ord | 0x80).chr
1019
- c
1020
- when scan(/M-(.)/) then
1021
- c = ss[1]
1022
- c[0] = (c[0].ord | 0x80).chr
1023
- c
1024
- when check(/(C-|c)\\[\\MCc]/) then
1025
- scan(/(C-|c)\\/) # eat it
1026
- c = self.read_escape
1027
- c[0] = (c[0].ord & 0x9f).chr
1028
- c
1029
- when scan(/C-\?|c\?/) then
1030
- 127.chr
1031
- when scan(/(C-|c)(.)/) then
1032
- c = ss[2]
1033
- c[0] = (c[0].ord & 0x9f).chr
1034
- c
1035
- when scan(/^[89]/i) then # bad octal or hex... MRI ignores them :(
1036
- matched
1037
- when scan(/u([0-9a-fA-F]{4}|\{[0-9a-fA-F]{2,6}\})/) then
1038
- [ss[1].delete("{}").to_i(16)].pack("U")
1039
- when scan(/u([0-9a-fA-F]{1,3})/) then
1040
- rb_compile_error "Invalid escape character syntax"
1041
- when scan(/[McCx0-9]/) || end_of_stream? then
1042
- rb_compile_error("Invalid escape character syntax")
1043
- else
1044
- ss.getch
1045
- end.dup
1046
- end
1047
-
1048
- def regx_options # TODO: rewrite / remove
1049
- good, bad = [], []
1050
-
1051
- if scan(/[a-z]+/) then
1052
- good, bad = matched.split(//).partition { |s| s =~ /^[ixmonesu]$/ }
1053
- end
1054
-
1055
- unless bad.empty? then
1056
- rb_compile_error("unknown regexp option%s - %s" %
1057
- [(bad.size > 1 ? "s" : ""), bad.join.inspect])
1058
- end
1059
-
1060
- return good.join
1061
- end
1062
-
1063
760
  def reset
761
+ self.lineno = 1
1064
762
  self.brace_nest = 0
1065
763
  self.command_start = true
1066
- self.comments = []
764
+ self.comment = nil
1067
765
  self.lex_state = EXPR_NONE
1068
766
  self.lex_strterm = nil
1069
- self.lineno = 1
1070
767
  self.lpar_beg = nil
1071
768
  self.paren_nest = 0
1072
769
  self.space_seen = false
1073
770
  self.string_nest = 0
1074
771
  self.token = nil
1075
- self.extra_lineno = 0
772
+ self.string_buffer = []
773
+ self.old_ss = nil
774
+ self.old_lineno = nil
1076
775
 
1077
776
  self.cond.reset
1078
777
  self.cmdarg.reset
1079
778
  end
1080
779
 
1081
- def result new_state, token, text # :nodoc:
780
+ def result new_state, token, text, line = self.lineno # :nodoc:
1082
781
  new_state = self.arg_state if new_state == :arg_state
1083
782
  self.lex_state = new_state if new_state
1084
- [token, text]
783
+
784
+ [token, [text, line]]
1085
785
  end
1086
786
 
1087
- def scan re
1088
- ss.scan re
787
+ def ruby22_label?
788
+ ruby22plus? and is_label_possible?
1089
789
  end
1090
790
 
1091
- def check re
1092
- ss.check re
791
+ def ruby22plus?
792
+ parser.class.version >= 22
1093
793
  end
1094
794
 
1095
- def eat_whitespace
1096
- r = scan(/\s+/)
1097
- self.extra_lineno += r.count("\n") if r
1098
- r
795
+ def ruby23plus?
796
+ parser.class.version >= 23
1099
797
  end
1100
798
 
1101
- def fixup_lineno extra = 0
1102
- self.lineno += self.extra_lineno + extra
1103
- self.extra_lineno = 0
799
+ def ruby24minus?
800
+ parser.class.version <= 24
1104
801
  end
1105
802
 
1106
- def scanner_class # TODO: design this out of oedipus_lex. or something.
1107
- RPStringScanner
803
+ def ruby27plus?
804
+ parser.class.version >= 27
1108
805
  end
1109
806
 
1110
807
  def space_vs_beginning space_type, beg_type, fallback
@@ -1119,137 +816,18 @@ class RubyLexer
1119
816
  end
1120
817
  end
1121
818
 
1122
- def string type, beg = matched, nnd = "\0"
1123
- self.lex_strterm = [:strterm, type, beg, nnd]
1124
- end
1125
-
1126
- # TODO: consider
1127
- # def src= src
1128
- # raise "bad src: #{src.inspect}" unless String === src
1129
- # @src = RPStringScanner.new(src)
1130
- # end
1131
-
1132
- def tokadd_escape term # TODO: rewrite / remove
1133
- case
1134
- when scan(/\\\n/) then
1135
- # just ignore
1136
- when scan(/\\([0-7]{1,3}|x[0-9a-fA-F]{1,2})/) then
1137
- self.string_buffer << matched
1138
- when scan(/\\([MC]-|c)(?=\\)/) then
1139
- self.string_buffer << matched
1140
- self.tokadd_escape term
1141
- when scan(/\\([MC]-|c)(.)/) then
1142
- self.string_buffer << matched
1143
- when scan(/\\[McCx]/) then
1144
- rb_compile_error "Invalid escape character syntax"
1145
- when scan(/\\(.)/m) then
1146
- chr = ss[1]
1147
- prev = self.string_buffer.last
1148
- if term == chr && prev && prev.end_with?("(?") then
1149
- self.string_buffer << chr
1150
- elsif term == chr || chr.ascii_only? then
1151
- self.string_buffer << matched # dunno why we keep them for ascii
1152
- else
1153
- self.string_buffer << chr # HACK? this is such a rat's nest
1154
- end
819
+ def unescape_string str
820
+ str = str.gsub(ESC) { unescape($1).b.force_encoding Encoding::UTF_8 }
821
+ if str.valid_encoding?
822
+ str
1155
823
  else
1156
- rb_compile_error "Invalid escape character syntax"
824
+ str.b
1157
825
  end
1158
826
  end
1159
827
 
1160
- def tokadd_string(func, term, paren) # TODO: rewrite / remove
1161
- qwords = (func & STR_FUNC_QWORDS) != 0
1162
- escape = (func & STR_FUNC_ESCAPE) != 0
1163
- expand = (func & STR_FUNC_EXPAND) != 0
1164
- regexp = (func & STR_FUNC_REGEXP) != 0
1165
- symbol = (func & STR_FUNC_SYMBOL) != 0
1166
-
1167
- paren_re = @@regexp_cache[paren]
1168
- term_re = @@regexp_cache[term]
1169
-
1170
- until end_of_stream? do
1171
- c = nil
1172
- handled = true
1173
-
1174
- case
1175
- when paren_re && scan(paren_re) then
1176
- self.string_nest += 1
1177
- when scan(term_re) then
1178
- if self.string_nest == 0 then
1179
- ss.pos -= 1
1180
- break
1181
- else
1182
- self.string_nest -= 1
1183
- end
1184
- when expand && scan(/#(?=[\$\@\{])/) then # TODO: this seems wrong
1185
- ss.pos -= 1
1186
- break
1187
- when qwords && scan(/\s/) then
1188
- ss.pos -= 1
1189
- break
1190
- when expand && scan(/#(?!\n)/) then
1191
- # do nothing
1192
- when check(/\\/) then
1193
- case
1194
- when qwords && scan(/\\\n/) then
1195
- string_buffer << "\n"
1196
- next
1197
- when qwords && scan(/\\\s/) then
1198
- c = ' '
1199
- when expand && scan(/\\\n/) then
1200
- next
1201
- when regexp && check(/\\/) then
1202
- self.tokadd_escape term
1203
- next
1204
- when expand && scan(/\\/) then
1205
- c = self.read_escape
1206
- when scan(/\\\n/) then
1207
- # do nothing
1208
- when scan(/\\\\/) then
1209
- string_buffer << '\\' if escape
1210
- c = '\\'
1211
- when scan(/\\/) then
1212
- unless scan(term_re) || paren.nil? || scan(paren_re) then
1213
- string_buffer << "\\"
1214
- end
1215
- else
1216
- handled = false
1217
- end # inner /\\/ case
1218
- else
1219
- handled = false
1220
- end # top case
1221
-
1222
- unless handled then
1223
- t = Regexp.escape term
1224
- x = Regexp.escape(paren) if paren && paren != "\000"
1225
- re = if qwords then
1226
- /[^#{t}#{x}\#\0\\\s]+|./ # |. to pick up whatever
1227
- else
1228
- /[^#{t}#{x}\#\0\\]+|./
1229
- end
1230
-
1231
- scan re
1232
- c = matched
1233
-
1234
- rb_compile_error "symbol cannot contain '\\0'" if symbol && c =~ /\0/
1235
- end # unless handled
1236
-
1237
- c ||= matched
1238
- string_buffer << c
1239
- end # until
1240
-
1241
- c ||= matched
1242
- c = RubyLexer::EOF if end_of_stream?
1243
-
1244
- return c
1245
- end
1246
-
1247
828
  def unescape s
1248
829
  r = ESCAPES[s]
1249
830
 
1250
- self.extra_lineno += 1 if s == "\n" # eg backslash newline strings
1251
- self.extra_lineno -= 1 if r && s == "n" # literal \n, not newline
1252
-
1253
831
  return r if r
1254
832
 
1255
833
  x = case s
@@ -1265,10 +843,12 @@ class RubyLexer
1265
843
  s
1266
844
  when /^[McCx0-9]/ then
1267
845
  rb_compile_error("Invalid escape character syntax")
1268
- when /u([0-9a-fA-F]{4}|\{[0-9a-fA-F]{2,6}\})/ then
846
+ when /u(\h{4})/ then
1269
847
  [$1.delete("{}").to_i(16)].pack("U")
1270
- when /u([0-9a-fA-F]{1,3})/ then
848
+ when /u(\h{1,3})/ then
1271
849
  rb_compile_error("Invalid escape character syntax")
850
+ when /u\{(\h+(?:\s+\h+)*)\}/ then
851
+ $1.split.map { |cp| cp.to_i(16) }.pack("U*")
1272
852
  else
1273
853
  s
1274
854
  end
@@ -1279,171 +859,294 @@ class RubyLexer
1279
859
  # do nothing for now
1280
860
  end
1281
861
 
1282
- def ruby22plus?
1283
- parser.class.version >= 22
862
+ def was_label?
863
+ @was_label = ruby22_label?
864
+ true
1284
865
  end
1285
866
 
1286
- def ruby23plus?
1287
- parser.class.version >= 23
1288
- end
867
+ class State
868
+ attr_accessor :n
869
+ attr_accessor :names
1289
870
 
1290
- def process_string # TODO: rewrite / remove
1291
- # matches top of parser_yylex in compare/parse23.y:8113
1292
- token = if lex_strterm[0] == :heredoc then
1293
- self.heredoc lex_strterm
1294
- else
1295
- self.parse_string lex_strterm
1296
- end
871
+ # TODO: take a shared hash of strings for inspect/to_s
872
+ def initialize o, names
873
+ raise ArgumentError, "bad state: %p" % [o] unless Integer === o # TODO: remove
1297
874
 
1298
- token_type, c = token
875
+ self.n = o
876
+ self.names = names
877
+ end
1299
878
 
1300
- # matches parser_string_term
1301
- if ruby22plus? && token_type == :tSTRING_END && ["'", '"'].include?(c) then
1302
- if ((lex_state =~ EXPR_BEG|EXPR_ENDFN &&
1303
- !cond.is_in_state) || is_arg?) &&
1304
- is_label_suffix? then
1305
- scan(/:/)
1306
- token_type = token[0] = :tLABEL_END
1307
- end
879
+ def == o
880
+ self.equal?(o) || (o.class == self.class && o.n == self.n)
1308
881
  end
1309
882
 
1310
- if [:tSTRING_END, :tREGEXP_END, :tLABEL_END].include? token_type then
1311
- self.lex_strterm = nil
1312
- self.lex_state = (token_type == :tLABEL_END) ? EXPR_PAR : EXPR_END
883
+ def =~ v
884
+ (self.n & v.n) != 0
1313
885
  end
1314
886
 
1315
- return token
1316
- end
887
+ def | v
888
+ raise ArgumentError, "Incompatible State: %p vs %p" % [self, v] unless
889
+ self.names == v.names
890
+ self.class.new(self.n | v.n, self.names)
891
+ end
1317
892
 
1318
- def parse_quote # TODO: remove / rewrite
1319
- beg, nnd, short_hand, c = nil, nil, false, nil
893
+ def inspect
894
+ return "EXPR_NONE" if n.zero? # HACK?
1320
895
 
1321
- if scan(/[a-z0-9]{1,2}/i) then # Long-hand (e.g. %Q{}).
1322
- rb_compile_error "unknown type of %string" if ss.matched_size == 2
1323
- c, beg, short_hand = matched, ss.getch, false
1324
- else # Short-hand (e.g. %{, %., %!, etc)
1325
- c, beg, short_hand = 'Q', ss.getch, true
896
+ names.map { |v, k| k if self =~ v }.
897
+ compact.
898
+ join("|").
899
+ gsub(/(?:EXPR_|STR_(?:FUNC_)?)/, "")
1326
900
  end
1327
901
 
1328
- if end_of_stream? or c == RubyLexer::EOF or beg == RubyLexer::EOF then
1329
- rb_compile_error "unterminated quoted string meets end of file"
1330
- end
902
+ alias to_s inspect
1331
903
 
1332
- # Figure nnd-char. "\0" is special to indicate beg=nnd and that no nesting?
1333
- nnd = { "(" => ")", "[" => "]", "{" => "}", "<" => ">" }[beg]
1334
- nnd, beg = beg, "\0" if nnd.nil?
904
+ module Values
905
+ expr_names = {}
906
+
907
+ EXPR_NONE = State.new 0x0, expr_names
908
+ EXPR_BEG = State.new 0x1, expr_names
909
+ EXPR_END = State.new 0x2, expr_names
910
+ EXPR_ENDARG = State.new 0x4, expr_names
911
+ EXPR_ENDFN = State.new 0x8, expr_names
912
+ EXPR_ARG = State.new 0x10, expr_names
913
+ EXPR_CMDARG = State.new 0x20, expr_names
914
+ EXPR_MID = State.new 0x40, expr_names
915
+ EXPR_FNAME = State.new 0x80, expr_names
916
+ EXPR_DOT = State.new 0x100, expr_names
917
+ EXPR_CLASS = State.new 0x200, expr_names
918
+ EXPR_LABEL = State.new 0x400, expr_names
919
+ EXPR_LABELED = State.new 0x800, expr_names
920
+ EXPR_FITEM = State.new 0x1000, expr_names
1335
921
 
1336
- token_type, text = nil, "%#{c}#{beg}"
1337
- token_type, string_type = case c
1338
- when 'Q' then
1339
- ch = short_hand ? nnd : c + beg
1340
- text = "%#{ch}"
1341
- [:tSTRING_BEG, STR_DQUOTE]
1342
- when 'q' then
1343
- [:tSTRING_BEG, STR_SQUOTE]
1344
- when 'W' then
1345
- eat_whitespace
1346
- [:tWORDS_BEG, STR_DQUOTE | STR_FUNC_QWORDS]
1347
- when 'w' then
1348
- eat_whitespace
1349
- [:tQWORDS_BEG, STR_SQUOTE | STR_FUNC_QWORDS]
1350
- when 'x' then
1351
- [:tXSTRING_BEG, STR_XQUOTE]
1352
- when 'r' then
1353
- [:tREGEXP_BEG, STR_REGEXP]
1354
- when 's' then
1355
- self.lex_state = EXPR_FNAME
1356
- [:tSYMBEG, STR_SSYM]
1357
- when 'I' then
1358
- eat_whitespace
1359
- [:tSYMBOLS_BEG, STR_DQUOTE | STR_FUNC_QWORDS]
1360
- when 'i' then
1361
- eat_whitespace
1362
- [:tQSYMBOLS_BEG, STR_SQUOTE | STR_FUNC_QWORDS]
1363
- end
922
+ EXPR_BEG_ANY = EXPR_BEG | EXPR_MID | EXPR_CLASS
923
+ EXPR_ARG_ANY = EXPR_ARG | EXPR_CMDARG
924
+ EXPR_END_ANY = EXPR_END | EXPR_ENDARG | EXPR_ENDFN
1364
925
 
1365
- rb_compile_error "Bad %string type. Expected [QqWwIixrs], found '#{c}'." if
1366
- token_type.nil?
926
+ # extra fake lex_state names to make things a bit cleaner
1367
927
 
1368
- raise "huh" unless string_type
928
+ EXPR_LAB = EXPR_ARG|EXPR_LABELED
929
+ EXPR_LIT = EXPR_END|EXPR_ENDARG
930
+ EXPR_PAR = EXPR_BEG|EXPR_LABEL
931
+ EXPR_PAD = EXPR_BEG|EXPR_LABELED
1369
932
 
1370
- string string_type, nnd, beg
933
+ EXPR_NUM = EXPR_LIT
934
+
935
+ expr_names.merge!(EXPR_NONE => "EXPR_NONE",
936
+ EXPR_BEG => "EXPR_BEG",
937
+ EXPR_END => "EXPR_END",
938
+ EXPR_ENDARG => "EXPR_ENDARG",
939
+ EXPR_ENDFN => "EXPR_ENDFN",
940
+ EXPR_ARG => "EXPR_ARG",
941
+ EXPR_CMDARG => "EXPR_CMDARG",
942
+ EXPR_MID => "EXPR_MID",
943
+ EXPR_FNAME => "EXPR_FNAME",
944
+ EXPR_DOT => "EXPR_DOT",
945
+ EXPR_CLASS => "EXPR_CLASS",
946
+ EXPR_LABEL => "EXPR_LABEL",
947
+ EXPR_LABELED => "EXPR_LABELED",
948
+ EXPR_FITEM => "EXPR_FITEM")
949
+
950
+ # ruby constants for strings
951
+
952
+ str_func_names = {}
953
+
954
+ STR_FUNC_BORING = State.new 0x00, str_func_names
955
+ STR_FUNC_ESCAPE = State.new 0x01, str_func_names
956
+ STR_FUNC_EXPAND = State.new 0x02, str_func_names
957
+ STR_FUNC_REGEXP = State.new 0x04, str_func_names
958
+ STR_FUNC_QWORDS = State.new 0x08, str_func_names
959
+ STR_FUNC_SYMBOL = State.new 0x10, str_func_names
960
+ STR_FUNC_INDENT = State.new 0x20, str_func_names # <<-HEREDOC
961
+ STR_FUNC_LABEL = State.new 0x40, str_func_names
962
+ STR_FUNC_LIST = State.new 0x4000, str_func_names
963
+ STR_FUNC_TERM = State.new 0x8000, str_func_names
964
+ STR_FUNC_DEDENT = State.new 0x10000, str_func_names # <<~HEREDOC
965
+
966
+ # TODO: check parser25.y on how they do STR_FUNC_INDENT
967
+
968
+ STR_SQUOTE = STR_FUNC_BORING
969
+ STR_DQUOTE = STR_FUNC_EXPAND
970
+ STR_XQUOTE = STR_FUNC_EXPAND
971
+ STR_REGEXP = STR_FUNC_REGEXP | STR_FUNC_ESCAPE | STR_FUNC_EXPAND
972
+ STR_SWORD = STR_FUNC_QWORDS | STR_FUNC_LIST
973
+ STR_DWORD = STR_FUNC_QWORDS | STR_FUNC_EXPAND | STR_FUNC_LIST
974
+ STR_SSYM = STR_FUNC_SYMBOL
975
+ STR_DSYM = STR_FUNC_SYMBOL | STR_FUNC_EXPAND
976
+ STR_LABEL = STR_FUNC_LABEL
977
+
978
+ str_func_names.merge!(STR_FUNC_ESCAPE => "STR_FUNC_ESCAPE",
979
+ STR_FUNC_EXPAND => "STR_FUNC_EXPAND",
980
+ STR_FUNC_REGEXP => "STR_FUNC_REGEXP",
981
+ STR_FUNC_QWORDS => "STR_FUNC_QWORDS",
982
+ STR_FUNC_SYMBOL => "STR_FUNC_SYMBOL",
983
+ STR_FUNC_INDENT => "STR_FUNC_INDENT",
984
+ STR_FUNC_LABEL => "STR_FUNC_LABEL",
985
+ STR_FUNC_LIST => "STR_FUNC_LIST",
986
+ STR_FUNC_TERM => "STR_FUNC_TERM",
987
+ STR_FUNC_DEDENT => "STR_FUNC_DEDENT",
988
+ STR_SQUOTE => "STR_SQUOTE")
989
+ end
1371
990
 
1372
- return token_type, text
991
+ include Values
1373
992
  end
1374
993
 
1375
- def parse_string quote # TODO: rewrite / remove
1376
- _, string_type, term, open = quote
994
+ include State::Values
995
+ end
996
+
997
+ class RubyLexer
998
+ module SSWrapper
999
+ def string= s
1000
+ ss.string= s
1001
+ end
1002
+
1003
+ def beginning_of_line?
1004
+ ss.bol?
1005
+ end
1006
+
1007
+ alias bol? beginning_of_line? # to make .rex file more readable
1377
1008
 
1378
- space = false # FIX: remove these
1379
- func = string_type
1380
- paren = open
1381
- term_re = @@regexp_cache[term]
1009
+ def check re
1010
+ maybe_pop_stack
1382
1011
 
1383
- qwords = (func & STR_FUNC_QWORDS) != 0
1384
- regexp = (func & STR_FUNC_REGEXP) != 0
1385
- expand = (func & STR_FUNC_EXPAND) != 0
1012
+ ss.check re
1013
+ end
1386
1014
 
1387
- unless func then # nil'ed from qwords below. *sigh*
1388
- return :tSTRING_END, nil
1015
+ def end_of_stream?
1016
+ ss.eos?
1389
1017
  end
1390
1018
 
1391
- space = true if qwords and eat_whitespace
1019
+ alias eos? end_of_stream?
1392
1020
 
1393
- if self.string_nest == 0 && scan(/#{term_re}/) then
1394
- if qwords then
1395
- quote[1] = nil
1396
- return :tSPACE, nil
1397
- elsif regexp then
1398
- return :tREGEXP_END, self.regx_options
1399
- else
1400
- return :tSTRING_END, term
1401
- end
1021
+ def getch
1022
+ c = ss.getch
1023
+ c = ss.getch if c == "\r" && ss.peek(1) == "\n"
1024
+ c
1402
1025
  end
1403
1026
 
1404
- return :tSPACE, nil if space
1027
+ def match
1028
+ ss
1029
+ end
1405
1030
 
1406
- self.string_buffer = []
1031
+ def matched
1032
+ ss.matched
1033
+ end
1407
1034
 
1408
- if expand
1409
- case
1410
- when scan(/#(?=\$(-.|[a-zA-Z_0-9~\*\$\?!@\/\\;,\.=:<>\"\&\`\'+]))/) then
1411
- # TODO: !ISASCII
1412
- # ?! see parser_peek_variable_name
1413
- return :tSTRING_DVAR, nil
1414
- when scan(/#(?=\@\@?[a-zA-Z_])/) then
1415
- # TODO: !ISASCII
1416
- return :tSTRING_DVAR, nil
1417
- when scan(/#[{]/) then
1418
- self.command_start = true
1419
- return :tSTRING_DBEG, nil
1420
- when scan(/#/) then
1421
- string_buffer << '#'
1035
+ def in_heredoc?
1036
+ !!self.old_ss
1037
+ end
1038
+
1039
+ def maybe_pop_stack
1040
+ if ss.eos? && in_heredoc? then
1041
+ self.ss_pop
1042
+ self.lineno_pop
1422
1043
  end
1423
1044
  end
1424
1045
 
1425
- if tokadd_string(func, term, paren) == RubyLexer::EOF then
1426
- rb_compile_error "unterminated string meets end of file"
1046
+ def pos
1047
+ ss.pos
1048
+ end
1049
+
1050
+ def pos= n
1051
+ ss.pos = n
1052
+ end
1053
+
1054
+ def rest
1055
+ ss.rest
1056
+ end
1057
+
1058
+ def scan re
1059
+ maybe_pop_stack
1060
+
1061
+ ss.scan re
1062
+ end
1063
+
1064
+ def scanner_class # TODO: design this out of oedipus_lex. or something.
1065
+ RPStringScanner
1066
+ end
1067
+
1068
+ def ss_string
1069
+ ss.string
1070
+ end
1071
+
1072
+ def ss_string= s
1073
+ raise "Probably not"
1074
+ ss.string = s
1075
+ end
1076
+
1077
+ def unscan
1078
+ ss.unscan
1079
+ end
1080
+ end
1081
+
1082
+ include SSWrapper
1083
+ end
1084
+
1085
+ class RubyLexer
1086
+ module SSStackish
1087
+ def lineno_push new_lineno
1088
+ self.old_lineno = self.lineno
1089
+ self.lineno = new_lineno
1090
+ end
1091
+
1092
+ def lineno_pop
1093
+ self.lineno = self.old_lineno
1094
+ self.old_lineno = nil
1095
+ end
1096
+
1097
+ def ss= o
1098
+ raise "Clearing ss while in heredoc!?!" if in_heredoc?
1099
+ @old_ss = nil
1100
+ super
1427
1101
  end
1428
1102
 
1429
- return :tSTRING_CONTENT, string_buffer.join
1103
+ def ss_push new_ss
1104
+ @old_ss = self.ss
1105
+ @ss = new_ss
1106
+ end
1107
+
1108
+ def ss_pop
1109
+ @ss = self.old_ss
1110
+ @old_ss = nil
1111
+ end
1430
1112
  end
1113
+
1114
+ prepend SSStackish
1431
1115
  end
1432
1116
 
1433
- require "ruby_lexer.rex"
1117
+ if ENV["RP_STRTERM_DEBUG"] then
1118
+ class RubyLexer
1119
+ def d o
1120
+ $stderr.puts o.inspect
1121
+ end
1122
+
1123
+ alias old_lex_strterm= lex_strterm=
1124
+
1125
+ def lex_strterm= o
1126
+ self.old_lex_strterm= o
1127
+ where = caller.first.split(/:/).first(2).join(":")
1128
+ $stderr.puts
1129
+ d :lex_strterm => [o, where]
1130
+ end
1131
+ end
1132
+ end
1133
+
1134
+ require_relative "./ruby_lexer.rex.rb"
1135
+ require_relative "./ruby_lexer_strings.rb"
1434
1136
 
1435
1137
  if ENV["RP_LINENO_DEBUG"] then
1436
1138
  class RubyLexer
1437
- alias :old_lineno= :lineno=
1438
-
1439
1139
  def d o
1440
1140
  $stderr.puts o.inspect
1441
1141
  end
1442
1142
 
1143
+ alias old_lineno= lineno=
1144
+
1443
1145
  def lineno= n
1444
1146
  self.old_lineno= n
1445
1147
  where = caller.first.split(/:/).first(2).join(":")
1446
- d :lineno => [n, where, ss && ss.rest[0,40]]
1148
+ $stderr.puts
1149
+ d :lineno => [n, where]
1447
1150
  end
1448
1151
  end
1449
1152
  end