rubylexer 0.7.3 → 0.7.4
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +16 -0
- data/Manifest.txt +3 -1
- data/README.txt +12 -19
- data/Rakefile +2 -2
- data/lib/rubylexer.rb +214 -86
- data/lib/rubylexer/context.rb +17 -6
- data/lib/rubylexer/lextable.rb +202 -0
- data/lib/rubylexer/rulexer.rb +61 -9
- data/lib/rubylexer/test/illegal_oneliners.rb +4 -0
- data/lib/rubylexer/test/stanzas.rb +2 -0
- data/lib/rubylexer/test/testcases.rb +6 -1
- data/lib/rubylexer/token.rb +4 -1
- data/lib/rubylexer/version.rb +1 -1
- data/test/code/regression.rb +1 -1
- data/test/code/rubylexervsruby.rb +23 -6
- data/test/data/1.rb +729 -0
- data/test/data/heart.rb +43 -2
- data/test/data/pleac.rb +6282 -0
- data/testing.txt +1 -1
- metadata +7 -4
data/History.txt
CHANGED
@@ -1,3 +1,19 @@
|
|
1
|
+
=== 0.7.4/5-20-2009
|
2
|
+
* 2 Major Enhancements:
|
3
|
+
* preliminary support for ruby 1.9
|
4
|
+
* utf8 inputs should now work... more or less
|
5
|
+
|
6
|
+
* 5 Minor Enhancements:
|
7
|
+
* better detection of illegal escapes and interpolations in strings
|
8
|
+
* indicate error on unterminated here body
|
9
|
+
* fixed pattern of keywords that can't start a param list (ignores ?,! now)
|
10
|
+
* in is_var_name?, check for global/instance vars first
|
11
|
+
* comma and star in a true lhs should be correctly marked as such, now
|
12
|
+
|
13
|
+
* 2 Bugfixes:
|
14
|
+
* added tag field to Token; I hope many flags can be coalesced into tag.
|
15
|
+
* record the line numbers on which each string (and here doc) starts and ends
|
16
|
+
|
1
17
|
=== 0.7.3/4-19-2009
|
2
18
|
* 9 Bugfixes:
|
3
19
|
* remember whether comma was seen in paren context
|
data/Manifest.txt
CHANGED
@@ -15,6 +15,7 @@ lib/rubylexer/version.rb
|
|
15
15
|
lib/rubylexer/rulexer.rb
|
16
16
|
lib/rubylexer/tokenprinter.rb
|
17
17
|
lib/rubylexer/charset.rb
|
18
|
+
lib/rubylexer/lextable.rb
|
18
19
|
lib/rubylexer/symboltable.rb
|
19
20
|
lib/rubylexer/charhandler.rb
|
20
21
|
lib/assert.rb
|
@@ -43,11 +44,13 @@ test/data/23.rb
|
|
43
44
|
test/data/lbrack.rb
|
44
45
|
test/data/untitled1.rb
|
45
46
|
test/data/rescue.rb
|
47
|
+
test/data/pleac.rb
|
46
48
|
test/data/pleac.rb.broken
|
47
49
|
test/data/heart.rb
|
48
50
|
test/data/s.rb
|
49
51
|
test/data/wsdlDriver.rb
|
50
52
|
test/data/p-op.rb
|
53
|
+
test/data/1.rb
|
51
54
|
test/data/1.rb.broken
|
52
55
|
test/data/untermed_here.rb.broken
|
53
56
|
test/data/newsyntax.rb
|
@@ -72,7 +75,6 @@ test/code/regression.rb
|
|
72
75
|
test/code/strgen.rb
|
73
76
|
test/code/tarball.rb
|
74
77
|
lib/rubylexer/test/testcases.rb
|
75
|
-
test/data/chunky.plain.rb
|
76
78
|
test/data/cvtesc.rb
|
77
79
|
test/data/__eof2.rb
|
78
80
|
test/data/__eof5.rb
|
data/README.txt
CHANGED
@@ -1,8 +1,7 @@
|
|
1
1
|
= RubyLexer
|
2
2
|
|
3
|
-
*
|
4
|
-
*
|
5
|
-
*
|
3
|
+
* rubyforge.net/projects/rubylexer
|
4
|
+
* github.com/coatl/rubylexer
|
6
5
|
|
7
6
|
=== DESCRIPTION:
|
8
7
|
|
@@ -48,13 +47,9 @@ end
|
|
48
47
|
== Status
|
49
48
|
RubyLexer can correctly lex all legal Ruby 1.8 code that I've been able to
|
50
49
|
find on my Debian system. It can also handle (most of) my catalog of nasty
|
51
|
-
test cases (
|
52
|
-
|
53
|
-
|
54
|
-
about and plan to fix, but it seems that Ruby coders don't write code complex
|
55
|
-
enough to trigger them very often. Although incomplete, RubyLexer can
|
56
|
-
correctly distinguish these ambiguous uses of the following operator and
|
57
|
-
keywords, depending on context:
|
50
|
+
test cases (see below for known problems). Modulo some very obscure bugs,
|
51
|
+
RubyLexer can correctly distinguish these ambiguous uses of the following
|
52
|
+
operators, depending on context:
|
58
53
|
% can be modulus operator or start of fancy string
|
59
54
|
/ can be division operator or start of regex
|
60
55
|
* & + - :: can be unary or binary operator
|
@@ -83,18 +78,16 @@ emit advisory tokens when local var defined/goes out of scope (or hidden/unhidde
|
|
83
78
|
token pruning in dumptokens...
|
84
79
|
|
85
80
|
== known issues: (and planned fix release)
|
86
|
-
context not really preserved when entering or leaving string inclusions. this
|
87
|
-
a number or problems
|
88
|
-
|
89
|
-
string tokenization sometimes a little different from ruby around newlines
|
90
|
-
(htree/template.rb) (0.8)
|
81
|
+
context not really preserved when entering or leaving string inclusions. this caused
|
82
|
+
-a number of problems, which had to be hacked around. it would be better to avoid
|
83
|
+
-tokens within tokens. (0.8)
|
91
84
|
string contents might not be correctly translated in a few cases (0.8?)
|
92
85
|
symbols which contain string interpolations are flattened into one token. eg :"foo#{bar}" (0.8)
|
93
86
|
'\r' whitespace sometimes seen in dos-formatted output.. shouldn't be (eg pre.rb) (0.7)
|
94
87
|
windows newline in source is likely to cause problems in obscure cases (need test case)
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
regression test currently shows
|
88
|
+
ruby 1.9 incompletely supported (0.9)
|
89
|
+
current character set is always forced to ascii-8bit. however, this mode should be
|
90
|
+
-compatible with texts written in regular ascii, utf-8, and euc. (among others?) (1.0)
|
91
|
+
regression test currently shows a few errors with differences in exact token ordering
|
99
92
|
-around string inclusions. these errors are much less serious than they seem.
|
100
93
|
offset of AssignmentRhsListEndToken appears to be off by 1
|
data/Rakefile
CHANGED
@@ -25,13 +25,13 @@ require 'lib/rubylexer/version.rb'
|
|
25
25
|
hoe=Hoe.new("rubylexer", RubyLexer::VERSION) do |_|
|
26
26
|
_.author = "Caleb Clausen"
|
27
27
|
_.email = "rubylexer-owner @at@ inforadical .dot. net"
|
28
|
-
_.url = ["http://
|
28
|
+
_.url = ["http://github.com/coatl/rubylexer/", "http://rubyforge.org/projects/rubylexer/"]
|
29
29
|
_.extra_deps << ['sequence', '>= 0.2.0']
|
30
30
|
_.test_globs=["test/code/regression.rb"]
|
31
31
|
_.description=desc
|
32
32
|
_.summary=desc[/\A[^.]+\./]
|
33
33
|
_.spec_extras={:bindir=>'',:rdoc_options=>'-x lib/rubylexer/test'}
|
34
|
-
_.rdoc_pattern=/\A(howtouse\.txt|testing\.txt|README\.txt|lib\/[^\/]*\.rb|lib\/rubylexer\/[^\d][^\/]*\.rb)\Z/
|
34
|
+
#_.rdoc_pattern=/\A(howtouse\.txt|testing\.txt|README\.txt|lib\/[^\/]*\.rb|lib\/rubylexer\/[^\d][^\/]*\.rb)\Z/
|
35
35
|
end
|
36
36
|
|
37
37
|
|
data/lib/rubylexer.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
=begin
|
1
|
+
=begin
|
2
2
|
rubylexer - a ruby lexer written in ruby
|
3
3
|
Copyright (C) 2004,2005,2008 Caleb Clausen
|
4
4
|
|
@@ -60,9 +60,6 @@ class RubyLexer
|
|
60
60
|
INNERBOUNDINGWORDS="(#{INNERBOUNDINGWORDLIST.join '|'})"
|
61
61
|
BINOPWORDLIST=%w"and or"
|
62
62
|
BINOPWORDS="(#{BINOPWORDLIST.join '|'})"
|
63
|
-
NEVERSTARTPARAMLISTWORDS=/\A(#{OPORBEGINWORDS}|#{INNERBOUNDINGWORDS}|#{BINOPWORDS}|end)([^a-zA-Z0-9_!?=]|\Z)/o
|
64
|
-
NEVERSTARTPARAMLISTFIRST=CharSet['aoeitrwu'] #chars that begin NEVERSTARTPARAMLIST
|
65
|
-
NEVERSTARTPARAMLISTMAXLEN=7 #max len of a NEVERSTARTPARAMLIST
|
66
63
|
|
67
64
|
RUBYKEYWORDS=%r{
|
68
65
|
^(alias|#{BINOPWORDS}|defined\?|not|undef|end|
|
@@ -72,6 +69,11 @@ class RubyLexer
|
|
72
69
|
}xo
|
73
70
|
#__END__ should not be in this set... its handled in start_of_line_directives
|
74
71
|
|
72
|
+
HIGHASCII=?\x80..?\xFF
|
73
|
+
NONASCII=HIGHASCII
|
74
|
+
#NONASCII=?\x80..?xFFFFFFFF #or is it 10FFFF, whatever the highest conceivable code point
|
75
|
+
|
76
|
+
|
75
77
|
CHARMAPPINGS = {
|
76
78
|
?$ => :dollar_identifier,
|
77
79
|
?@ => :at_identifier,
|
@@ -115,14 +117,43 @@ class RubyLexer
|
|
115
117
|
"])}" => :close_brace,
|
116
118
|
|
117
119
|
|
118
|
-
?# => :comment
|
120
|
+
?# => :comment,
|
121
|
+
|
122
|
+
NONASCII => :identifier,
|
119
123
|
}
|
120
124
|
|
121
125
|
attr_reader :incomplete_here_tokens, :parsestack, :last_token_maybe_implicit
|
122
126
|
|
127
|
+
UCLETTER=@@UCLETTER="[A-Z]"
|
128
|
+
|
129
|
+
#cheaters way, treats utf chars as always 1 byte wide
|
130
|
+
#all high-bit chars are lowercase letters
|
131
|
+
#works, but strings compare with strict binary identity, not unicode collation
|
132
|
+
#works for euc too, I think
|
133
|
+
#(the ruby spec for utf8 support permits this interpretation)
|
134
|
+
LCLETTER=@@LCLETTER="[a-z_\x80-\xFF]"
|
135
|
+
LETTER=@@LETTER="[A-Za-z_\x80-\xFF]"
|
136
|
+
LETTER_DIGIT=@@LETTER_DIGIT="[A-Za-z_0-9\x80-\xFF]"
|
137
|
+
eval %w[UCLETTER LCLETTER LETTER LETTER_DIGIT].map{|n| "
|
138
|
+
def #{n}; #{n}; end
|
139
|
+
def self.#{n}; @@#{n}; end
|
140
|
+
"
|
141
|
+
}.to_s
|
142
|
+
|
143
|
+
NEVERSTARTPARAMLISTWORDS=/\A(#{OPORBEGINWORDS}|#{INNERBOUNDINGWORDS}|#{BINOPWORDS}|end)((?:(?!#@@LETTER_DIGIT).)|\Z)/om
|
144
|
+
NEVERSTARTPARAMLISTFIRST=CharSet['aoeitrwu'] #chars that begin NEVERSTARTPARAMLIST
|
145
|
+
NEVERSTARTPARAMLISTMAXLEN=7 #max len of a NEVERSTARTPARAMLIST
|
146
|
+
|
147
|
+
=begin
|
148
|
+
require 'jcode'
|
149
|
+
utf8=String::PATTERN_UTF8 #or euc, or sjis...
|
150
|
+
LCLETTER_U="(?>[a-z_]|#{utf8})"
|
151
|
+
LETTER_U="(?>[A-Za-z_]|#{utf8})"
|
152
|
+
IDENTCHAR_U="(?>[A-Za-z_0-9]|#{utf8})"
|
153
|
+
=end
|
123
154
|
|
124
155
|
#-----------------------------------
|
125
|
-
def initialize(filename,file,linenum=1,offset_adjust=0)
|
156
|
+
def initialize(filename,file,linenum=1,offset_adjust=0,options={:rubyversion=>1.8})
|
126
157
|
@offset_adjust=0 #set again in next line
|
127
158
|
super(filename,file, linenum,offset_adjust)
|
128
159
|
@start_linenum=linenum
|
@@ -137,13 +168,61 @@ class RubyLexer
|
|
137
168
|
@enable_macro=nil
|
138
169
|
@base_file=nil
|
139
170
|
@progress_thread=nil
|
171
|
+
@rubyversion=options[:rubyversion]
|
172
|
+
@encoding=options[:encoding]||:detect
|
140
173
|
|
141
174
|
@toptable=CharHandler.new(self, :illegal_char, CHARMAPPINGS)
|
142
175
|
|
176
|
+
read_leading_encoding
|
143
177
|
start_of_line_directives
|
144
178
|
progress_printer
|
145
179
|
end
|
146
180
|
|
181
|
+
ENCODING_ALIASES={
|
182
|
+
'utf-8'=>'utf8',
|
183
|
+
|
184
|
+
'ascii-8bit'=>'binary',
|
185
|
+
'ascii-7bit'=>'ascii',
|
186
|
+
'euc-jp'=>'euc',
|
187
|
+
|
188
|
+
'ascii8bit'=>'binary',
|
189
|
+
'ascii7bit'=>'ascii',
|
190
|
+
'eucjp'=>'euc',
|
191
|
+
|
192
|
+
'us-ascii'=>'ascii',
|
193
|
+
'shift-jis'=>'sjis',
|
194
|
+
|
195
|
+
'autodetect'=>'detect',
|
196
|
+
}
|
197
|
+
ENCODINGS=%w[ascii binary utf8 euc sjis]
|
198
|
+
def read_leading_encoding
|
199
|
+
return unless @encoding==:detect
|
200
|
+
@encoding=:ascii
|
201
|
+
@encoding=:utf8 if @file.skip( /\xEF\xBB\xBF/ ) #bom
|
202
|
+
if @file.skip( /\A#!/ )
|
203
|
+
loop do
|
204
|
+
til_charset( /[\s\v]/ )
|
205
|
+
break if @file.skip( / ([^-\s\v]|--[\s\v])/,4 )
|
206
|
+
if @file.skip( /.-K(.)/ )
|
207
|
+
case $1
|
208
|
+
when 'u'; @encoding=:utf8
|
209
|
+
when 'e'; @encoding=:euc
|
210
|
+
when 's'; @encoding=:sjis
|
211
|
+
end
|
212
|
+
end
|
213
|
+
end
|
214
|
+
til_charset( /[\n]/ )
|
215
|
+
end
|
216
|
+
if @rubyversion>=1.9 and @file.skip(
|
217
|
+
/\A#[\x00-\x7F]*?(?:en)?coding[\s\v]*[:=][\s\v]*([a-z0-9_-]+)[\x00-\x7F]*\n/i
|
218
|
+
)
|
219
|
+
name=$1
|
220
|
+
name.downcase!
|
221
|
+
name=ENCODING_ALIASES[name] if ENCODING_ALIASES[name]
|
222
|
+
@encoding=name.to_sym if ENCODINGS.include? name
|
223
|
+
end
|
224
|
+
end
|
225
|
+
|
147
226
|
def progress_printer
|
148
227
|
return unless ENV['RL_PROGRESS']
|
149
228
|
$stderr.puts 'printing progresses'
|
@@ -163,6 +242,7 @@ class RubyLexer
|
|
163
242
|
attr :localvars_stack
|
164
243
|
attr :offset_adjust
|
165
244
|
attr_writer :pending_here_bodies
|
245
|
+
attr :rubyversion
|
166
246
|
|
167
247
|
#-----------------------------------
|
168
248
|
def set_last_token(tok)
|
@@ -361,7 +441,7 @@ private
|
|
361
441
|
result = ((
|
362
442
|
#order matters here, but it shouldn't
|
363
443
|
#(but til_charset must be last)
|
364
|
-
eat_if(
|
444
|
+
eat_if(/-#@@LETTER_DIGIT/o,2) or
|
365
445
|
eat_next_if(/[!@&+`'=~\-\/\\,.;<>*"$?:]/) or
|
366
446
|
(?0..?9)===nextchar ? til_charset(/[^\d]/) : nil
|
367
447
|
))
|
@@ -376,21 +456,25 @@ private
|
|
376
456
|
#or if in a non-bare context
|
377
457
|
#just asserts because those contexts are never encountered.
|
378
458
|
#control goes through symbol(<...>,nil)
|
379
|
-
assert(
|
459
|
+
assert( /^#@@LETTER$/o===context)
|
380
460
|
assert MethNameToken===@last_operative_token || !(@last_operative_token===/^(\.|::|(un)?def|alias)$/)
|
381
461
|
|
382
|
-
@
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
462
|
+
if @parsestack.last.wantarrow and @rubyversion>=1.9 and @file.skip ":"
|
463
|
+
@moretokens.push SymbolToken.new(str,oldpos), KeywordToken.new("=>",input_position-1)
|
464
|
+
else
|
465
|
+
@moretokens.unshift(*parse_keywords(str,oldpos) do |tok|
|
466
|
+
#if not a keyword, decide if it should be var or method
|
467
|
+
case str
|
468
|
+
when FUNCLIKE_KEYWORDS; except=tok
|
469
|
+
when VARLIKE_KEYWORDS,RUBYKEYWORDS; raise "shouldnt see keywords here, now"
|
470
|
+
end
|
471
|
+
was_last=@last_operative_token
|
472
|
+
@last_operative_token=tok if tok
|
473
|
+
normally=safe_recurse { |a| var_or_meth_name(str,was_last,oldpos,after_nonid_op?{true}) }
|
474
|
+
(Array===normally ? normally[0]=except : normally=except) if except
|
475
|
+
normally
|
476
|
+
end)
|
477
|
+
end
|
394
478
|
return @moretokens.shift
|
395
479
|
end
|
396
480
|
|
@@ -399,7 +483,7 @@ private
|
|
399
483
|
def identifier_as_string(context)
|
400
484
|
#must begin w/ letter or underscore
|
401
485
|
#char class needs changing here for utf8 support
|
402
|
-
/
|
486
|
+
/#@@LETTER/o===nextchar.chr or return
|
403
487
|
|
404
488
|
#equals, question mark, and exclamation mark
|
405
489
|
#might be allowed at the end in some contexts.
|
@@ -418,7 +502,7 @@ private
|
|
418
502
|
end
|
419
503
|
@in_def_name||context==?: and trailers<<"|=(?![=~>])"
|
420
504
|
|
421
|
-
@file.scan(IDENTREX[trailers]||=/^(
|
505
|
+
@file.scan(IDENTREX[trailers]||=/^(?>#@@LETTER#@@LETTER_DIGIT*(?:#{trailers})?)/)
|
422
506
|
end
|
423
507
|
|
424
508
|
#-----------------------------------
|
@@ -447,8 +531,8 @@ private
|
|
447
531
|
def comma_in_lvalue_list?
|
448
532
|
@parsestack.last.lhs=
|
449
533
|
case l=@parsestack.last
|
450
|
-
when ListContext
|
451
|
-
when DefContext
|
534
|
+
when ListContext;
|
535
|
+
when DefContext; l.in_body
|
452
536
|
else true
|
453
537
|
end
|
454
538
|
end
|
@@ -459,7 +543,7 @@ private
|
|
459
543
|
@defining_lvar or case ctx=@parsestack.last
|
460
544
|
#when ForSMContext; ctx.state==:for
|
461
545
|
when RescueSMContext
|
462
|
-
lasttok.ident=="=>" and @file.match?( /\A[\s\v]*([:;#\n]|then
|
546
|
+
lasttok.ident=="=>" and @file.match?( /\A[\s\v]*([:;#\n]|then(?!#@@LETTER_DIGIT))/om )
|
463
547
|
#when BlockParamListLhsContext; true
|
464
548
|
end
|
465
549
|
end
|
@@ -487,13 +571,13 @@ private
|
|
487
571
|
was_in_lvar_define_state=in_lvar_define_state(lasttok)
|
488
572
|
#maybe_local really means 'maybe local or constant'
|
489
573
|
maybe_local=case name
|
490
|
-
when /
|
491
|
-
when
|
574
|
+
when /(?!#@@LETTER_DIGIT).$/o #do nothing
|
575
|
+
when /^#@@LCLETTER/o
|
492
576
|
(localvars===name or
|
493
577
|
VARLIKE_KEYWORDS===name or
|
494
578
|
was_in_lvar_define_state
|
495
579
|
) and not lasttok===/^(\.|::)$/
|
496
|
-
when
|
580
|
+
when /^#@@UCLETTER/o
|
497
581
|
is_const=true
|
498
582
|
not lasttok==='.' #this is the right algorithm for constants...
|
499
583
|
end
|
@@ -509,7 +593,7 @@ private
|
|
509
593
|
result=ws_toks=ignored_tokens(true) {|nl| sawnl=true }
|
510
594
|
if sawnl || eof?
|
511
595
|
if was_in_lvar_define_state
|
512
|
-
if
|
596
|
+
if /^#@@LCLETTER#@@LETTER_DIGIT*$/o===name
|
513
597
|
assert !(lasttok===/^(\.|::)$/)
|
514
598
|
localvars[name]=true
|
515
599
|
end
|
@@ -531,7 +615,7 @@ private
|
|
531
615
|
when ?=; not /^=[>=~]$/===readahead(2)
|
532
616
|
when ?,; comma_in_lvalue_list?
|
533
617
|
when ?); last_context_not_implicit.lhs
|
534
|
-
when ?i; /^in
|
618
|
+
when ?i; /^in(?!#@@LETTER_DIGIT)/o===readahead(3) and
|
535
619
|
ForSMContext===last_context_not_implicit
|
536
620
|
when ?>,?<; /^(.)\1=$/===readahead(3)
|
537
621
|
when ?*,?&; /^(.)\1?=/===readahead(3)
|
@@ -543,8 +627,8 @@ private
|
|
543
627
|
end
|
544
628
|
if (assignment_coming && !(lasttok===/^(\.|::)$/) or was_in_lvar_define_state)
|
545
629
|
tok=assign_lvar_type! VarNameToken.new(name,pos)
|
546
|
-
if /
|
547
|
-
elsif
|
630
|
+
if /(?!#@@LETTER_DIGIT).$/o===name
|
631
|
+
elsif /^#@@LCLETTER/o===name and !(lasttok===/^(\.|::)$/)
|
548
632
|
localvars[name]=true
|
549
633
|
end
|
550
634
|
return result.unshift(tok)
|
@@ -559,7 +643,7 @@ private
|
|
559
643
|
when nil: 2
|
560
644
|
when ?!; /^![=~]$/===readahead(2) ? 2 : 1
|
561
645
|
when ?d;
|
562
|
-
if /^do(
|
646
|
+
if /^do((?!#@@LETTER_DIGIT)|$)/o===readahead(3)
|
563
647
|
if maybe_local and expecting_do?
|
564
648
|
ty=VarNameToken
|
565
649
|
0
|
@@ -572,7 +656,7 @@ private
|
|
572
656
|
end
|
573
657
|
when NEVERSTARTPARAMLISTFIRST
|
574
658
|
(NEVERSTARTPARAMLISTWORDS===readahead(NEVERSTARTPARAMLISTMAXLEN)) ? 2 : 1
|
575
|
-
when ?",?',?`,?a..?z,?A..?Z,?0..?9,?_
|
659
|
+
when ?",?',?`,?a..?z,?A..?Z,?0..?9,?_,?@,?$,?~,NONASCII; 1 #"
|
576
660
|
when ?{
|
577
661
|
maybe_local=false
|
578
662
|
1
|
@@ -633,10 +717,12 @@ private
|
|
633
717
|
else
|
634
718
|
3
|
635
719
|
end
|
636
|
-
when ??; next3=readahead(3)
|
637
|
-
|
720
|
+
when ??; next3=readahead(3)
|
721
|
+
#? never begins a char constant if immediately followed
|
722
|
+
#by 2 or more letters or digits
|
723
|
+
/^\?([#{WHSPLF}]|#@@LETTER_DIGIT{2})/o===next3 ? 2 : 3
|
638
724
|
# when ?:,??; (readahead(2)[/^.[#{WHSPLF}]/o]) ? 2 : 3
|
639
|
-
when ?<; (!ws_toks.empty? && readahead(4)[/^<<-?["'`
|
725
|
+
when ?<; (!ws_toks.empty? && readahead(4)[/^<<-?(?:["'`]|#@@LETTER_DIGIT)/o]) ? 3 : 2
|
640
726
|
when ?[;
|
641
727
|
if ws_toks.empty?
|
642
728
|
(KeywordToken===oldlast and /^(return|break|next)$/===oldlast.ident) ? 3 : 2
|
@@ -707,7 +793,7 @@ private
|
|
707
793
|
break false
|
708
794
|
elsif ','==tok.to_s and @parsestack.size==basesize+1
|
709
795
|
break true
|
710
|
-
elsif OperatorToken===tok and /^[&*]$/===tok.ident and tok.
|
796
|
+
elsif OperatorToken===tok and /^[&*]$/===tok.ident and tok.tag and @parsestack.size==basesize+1
|
711
797
|
break true
|
712
798
|
elsif EoiToken===tok
|
713
799
|
lexerror tok, "unexpected eof in parameter list"
|
@@ -890,7 +976,7 @@ private
|
|
890
976
|
@moretokens.push KeywordToken.new('::',offset+md.end(0)-2) if dc
|
891
977
|
loop do
|
892
978
|
offset=input_position
|
893
|
-
@file.scan(/\A(#@@WSTOKS)?(
|
979
|
+
@file.scan(/\A(#@@WSTOKS)?(#@@UCLETTER#@@LETTER_DIGIT*)(::)?/o)
|
894
980
|
#this regexp---^ will need to change in order to support utf8 properly.
|
895
981
|
md=@file.last_match
|
896
982
|
all,ws,name,dc=*md
|
@@ -1013,11 +1099,11 @@ private
|
|
1013
1099
|
|
1014
1100
|
#maybe_local really means 'maybe local or constant'
|
1015
1101
|
maybe_local=case name
|
1016
|
-
when /
|
1102
|
+
when /(?!#@@LETTER_DIGIT).$/o; #do nothing
|
1017
1103
|
when /^[@$]/; true
|
1018
1104
|
when VARLIKE_KEYWORDS,FUNCLIKE_KEYWORDS; ty=KeywordToken
|
1019
|
-
when
|
1020
|
-
when
|
1105
|
+
when /^#@@LCLETTER/o; localvars===name
|
1106
|
+
when /^#@@UCLETTER/o; is_const=true #this is the right algorithm for constants...
|
1021
1107
|
end
|
1022
1108
|
result.push( *ignored_tokens(false,false) )
|
1023
1109
|
nc=nextchar
|
@@ -1059,7 +1145,7 @@ private
|
|
1059
1145
|
|
1060
1146
|
#look for start of parameter list
|
1061
1147
|
nc=(@moretokens.empty? ? nextchar.chr : @moretokens.first.to_s[0,1])
|
1062
|
-
if state==:expect_op and /^[
|
1148
|
+
if state==:expect_op and /^(?:#@@LETTER|[(&*])/o===nc
|
1063
1149
|
ctx.state=:def_param_list
|
1064
1150
|
list,listend=def_param_list
|
1065
1151
|
result.concat list
|
@@ -1080,7 +1166,7 @@ private
|
|
1080
1166
|
when EoiToken
|
1081
1167
|
lexerror tok,'unexpected eof in def header'
|
1082
1168
|
when StillIgnoreToken
|
1083
|
-
when MethNameToken ,VarNameToken #
|
1169
|
+
when MethNameToken ,VarNameToken # /^#@@LETTER/o.token_pat
|
1084
1170
|
lexerror tok,'expected . or ::' unless state==:expect_name
|
1085
1171
|
state=:expect_op
|
1086
1172
|
when /^(\.|::)$/.token_pat
|
@@ -1416,7 +1502,7 @@ end
|
|
1416
1502
|
#result.concat ignored_tokens
|
1417
1503
|
if expect_name
|
1418
1504
|
case tok
|
1419
|
-
when IgnoreToken #,
|
1505
|
+
when IgnoreToken #, /^#@@UCLETTER/o #do nothing
|
1420
1506
|
when /^,$/.token_pat #hack
|
1421
1507
|
|
1422
1508
|
when VarNameToken
|
@@ -1498,12 +1584,20 @@ end
|
|
1498
1584
|
if want_unary
|
1499
1585
|
#readahead(2)[1..1][/[\s\v#\\]/] or #not needed?
|
1500
1586
|
assert OperatorToken===result
|
1501
|
-
result.unary
|
1587
|
+
result.tag=:unary #result should distinguish unary+binary *&
|
1502
1588
|
WHSPLF[nextchar.chr] or
|
1503
1589
|
@moretokens << NoWsToken.new(input_position)
|
1504
|
-
comma_in_lvalue_list?
|
1590
|
+
cill=comma_in_lvalue_list?
|
1505
1591
|
if ch=='*'
|
1506
1592
|
@parsestack.last.see self, :splat
|
1593
|
+
case @parsestack[-1]
|
1594
|
+
when AssignmentRhsContext; result.tag= :rhs
|
1595
|
+
when ParamListContext,ParamListContextNoParen; #:call
|
1596
|
+
when ListImmedContext; #:array
|
1597
|
+
when BlockParamListLhsContext; #:block
|
1598
|
+
when KnownNestedLhsParenContext; #:nested
|
1599
|
+
else result.tag= :lhs if cill
|
1600
|
+
end
|
1507
1601
|
end
|
1508
1602
|
end
|
1509
1603
|
result
|
@@ -1553,10 +1647,10 @@ end
|
|
1553
1647
|
|
1554
1648
|
s=tok.to_s
|
1555
1649
|
case s
|
1556
|
-
when /[^a-z_0-9]$/i; false
|
1557
|
-
# when /^[a-z_]/; localvars===s or VARLIKE_KEYWORDS===s
|
1558
|
-
when /^[A-Z_]/i; VarNameToken===tok
|
1559
1650
|
when /^[@$<]/; true
|
1651
|
+
when /(?!#@@LETTER_DIGIT).$/o; false
|
1652
|
+
# when /^#@@LCLETTER/o; localvars===s or VARLIKE_KEYWORDS===s
|
1653
|
+
when /^#@@LETTER/o; VarNameToken===tok
|
1560
1654
|
else raise "not var or method name: #{s}"
|
1561
1655
|
end
|
1562
1656
|
end
|
@@ -1573,7 +1667,7 @@ end
|
|
1573
1667
|
if ch==':'
|
1574
1668
|
not TernaryContext===@parsestack.last
|
1575
1669
|
else
|
1576
|
-
!readahead(3)[
|
1670
|
+
!readahead(3)[/^\?#@@LETTER_DIGIT{2}/o]
|
1577
1671
|
end
|
1578
1672
|
}
|
1579
1673
|
end
|
@@ -1603,21 +1697,25 @@ end
|
|
1603
1697
|
@moretokens.push tok=KeywordToken.new(':',startpos)
|
1604
1698
|
|
1605
1699
|
case @parsestack.last
|
1606
|
-
when TernaryContext
|
1700
|
+
when TernaryContext
|
1607
1701
|
tok.ternary=true
|
1608
1702
|
@parsestack.pop #should be in the context's see handler
|
1609
|
-
when ExpectDoOrNlContext
|
1610
|
-
@
|
1611
|
-
|
1612
|
-
|
1613
|
-
|
1614
|
-
|
1615
|
-
|
1616
|
-
|
1617
|
-
|
1703
|
+
when ExpectDoOrNlContext #should be in the context's see handler
|
1704
|
+
if @rubyversion<1.9
|
1705
|
+
@parsestack.pop
|
1706
|
+
assert @parsestack.last.starter[/^(while|until|for)$/]
|
1707
|
+
tok.as=";"
|
1708
|
+
end
|
1709
|
+
when ExpectThenOrNlContext,WhenParamListContext
|
1710
|
+
if @rubyversion<1.9
|
1711
|
+
#should be in the context's see handler
|
1712
|
+
@parsestack.pop
|
1713
|
+
tok.as="then"
|
1714
|
+
end
|
1715
|
+
when RescueSMContext
|
1618
1716
|
tok.as=";"
|
1619
|
-
|
1620
|
-
|
1717
|
+
end or
|
1718
|
+
fail ": not expected in #{@parsestack.last.class}->#{@parsestack.last.starter}"
|
1621
1719
|
|
1622
1720
|
#end ternary context, if any
|
1623
1721
|
@parsestack.last.see self,:colon
|
@@ -1631,7 +1729,7 @@ end
|
|
1631
1729
|
lasttok=@last_operative_token
|
1632
1730
|
assert !(String===lasttok)
|
1633
1731
|
if (VarNameToken===lasttok or MethNameToken===lasttok) and
|
1634
|
-
lasttok===/^[$@
|
1732
|
+
lasttok===/^(?:[$@]|#@@LETTER)/o and !WHSPCHARS[lastchar]
|
1635
1733
|
then
|
1636
1734
|
@moretokens << colon2
|
1637
1735
|
result= NoWsToken.new(startpos)
|
@@ -1664,12 +1762,12 @@ end
|
|
1664
1762
|
when ?` then read(1) #`
|
1665
1763
|
when ?@ then at_identifier.to_s
|
1666
1764
|
when ?$ then dollar_identifier.to_s
|
1667
|
-
when ?_,?a..?z then identifier_as_string(?:)
|
1765
|
+
when ?_,?a..?z,NONASCII then identifier_as_string(?:)
|
1668
1766
|
when ?A..?Z then
|
1669
1767
|
result=identifier_as_string(?:)
|
1670
1768
|
if @last_operative_token==='::'
|
1671
1769
|
assert klass==MethNameToken
|
1672
|
-
|
1770
|
+
/#@@LETTER_DIGIT$/o===result and klass=VarNameToken
|
1673
1771
|
end
|
1674
1772
|
result
|
1675
1773
|
else
|
@@ -1696,7 +1794,7 @@ end
|
|
1696
1794
|
return [opmatches ? read(opmatches.size) :
|
1697
1795
|
case nc=nextchar
|
1698
1796
|
when ?` then read(1) #`
|
1699
|
-
when ?_,?a..?z,?A..?Z then
|
1797
|
+
when ?_,?a..?z,?A..?Z,NONASCII then
|
1700
1798
|
context=merge_assignment_op_in_setter_callsites? ? ?: : nc
|
1701
1799
|
identifier_as_string(context)
|
1702
1800
|
else
|
@@ -1720,7 +1818,7 @@ end
|
|
1720
1818
|
quote_real=true
|
1721
1819
|
else
|
1722
1820
|
quote='"'
|
1723
|
-
ender
|
1821
|
+
ender=@file.scan(/#@@LETTER_DIGIT+/o)
|
1724
1822
|
ender.length >= 1 or
|
1725
1823
|
return lexerror(HerePlaceholderToken.new( dash, quote, ender, nil ), "invalid here header")
|
1726
1824
|
end
|
@@ -1739,6 +1837,7 @@ if true
|
|
1739
1837
|
|
1740
1838
|
nl=readnl or return lexerror(res, "here header without body (at eof)")
|
1741
1839
|
|
1840
|
+
res.string.startline=linenum
|
1742
1841
|
@moretokens<< res
|
1743
1842
|
bodystart=input_position
|
1744
1843
|
@offset_adjust = @min_offset_adjust+procrastinated.size
|
@@ -1748,6 +1847,8 @@ if true
|
|
1748
1847
|
@offset_adjust = @min_offset_adjust
|
1749
1848
|
#was: @offset_adjust -= procrastinated.size
|
1750
1849
|
bodysize=input_position-bodystart
|
1850
|
+
res.string.line=linenum-1
|
1851
|
+
lexerror res,res.string.error
|
1751
1852
|
|
1752
1853
|
#one or two already read characters are overwritten here,
|
1753
1854
|
#in order to keep offsets correct in the long term
|
@@ -1814,7 +1915,7 @@ end
|
|
1814
1915
|
#-----------------------------------
|
1815
1916
|
def lessthan(ch) #match quadriop('<') or here doc or spaceship op
|
1816
1917
|
case readahead(3)
|
1817
|
-
when /^<<['"`\-
|
1918
|
+
when /^<<(?:['"`\-]|#@@LETTER_DIGIT)$/o #'
|
1818
1919
|
if quote_expected?(ch) and not @last_operative_token==='class'
|
1819
1920
|
here_header
|
1820
1921
|
else
|
@@ -1901,7 +2002,11 @@ end
|
|
1901
2002
|
if tofill.dash
|
1902
2003
|
close+=til_charset(/[^#{WHSP}]/o)
|
1903
2004
|
end
|
1904
|
-
|
2005
|
+
if eof? #this is an error, should be handled better
|
2006
|
+
lexerror tofill, "unterminated here body"
|
2007
|
+
lexerror tofill.string, "unterminated here body"
|
2008
|
+
break
|
2009
|
+
end
|
1905
2010
|
if read(tofill.ender.size)==tofill.ender
|
1906
2011
|
crs=til_charset(/[^\r]/)||''
|
1907
2012
|
if nl=readnl
|
@@ -1917,6 +2022,8 @@ end
|
|
1917
2022
|
line=til_charset(/[\n]/)
|
1918
2023
|
unless nl=readnl
|
1919
2024
|
assert eof?
|
2025
|
+
lexerror tofill, "unterminated here body"
|
2026
|
+
lexerror tofill.string, "unterminated here body"
|
1920
2027
|
break #this is an error, should be handled better
|
1921
2028
|
end
|
1922
2029
|
line.chomp!("\r")
|
@@ -2118,7 +2225,7 @@ end
|
|
2118
2225
|
#used to resolve the ambiguity of
|
2119
2226
|
# unary ops (+, -, *, &, ~ !) in ruby
|
2120
2227
|
#returns whether current token is to be the start of a literal
|
2121
|
-
IDBEGINCHAR=/^[
|
2228
|
+
IDBEGINCHAR=/^(?:#@@LETTER|[$@])/o
|
2122
2229
|
def unary_op_expected?(ch) #yukko hack
|
2123
2230
|
'*&='[readahead(2)[1..1]] and return false
|
2124
2231
|
|
@@ -2139,8 +2246,8 @@ end
|
|
2139
2246
|
def quote_expected?(ch) #yukko hack
|
2140
2247
|
case ch[0]
|
2141
2248
|
when ?? then readahead(2)[/^\?[#{WHSPLF}]$/o] #not needed?
|
2142
|
-
when ?% then readahead(3)[/^%([a-pt-vyzA-PR-VX-Z]|[QqrswWx]
|
2143
|
-
when ?< then !readahead(4)[/^<<-?['"`
|
2249
|
+
when ?% then readahead(3)[/^%([a-pt-vyzA-PR-VX-Z]|[QqrswWx]#{@@LETTER_DIGIT.gsub('_','')})/o]
|
2250
|
+
when ?< then !readahead(4)[/^<<-?(?:['"`]|#@@LETTER_DIGIT)/o]
|
2144
2251
|
else raise 'unexpected ch (#{ch}) in quote_expected?'
|
2145
2252
|
# when ?+,?-,?&,?*,?~,?! then '*&='[readahead(2)[1..1]]
|
2146
2253
|
end and return false
|
@@ -2322,17 +2429,26 @@ end
|
|
2322
2429
|
str << c
|
2323
2430
|
result= operator_or_methname_token( str,offset)
|
2324
2431
|
case c
|
2325
|
-
when '='
|
2432
|
+
when '=' #===,==
|
2326
2433
|
str<< (eat_next_if(?=)or'')
|
2327
2434
|
|
2328
|
-
when '>'
|
2435
|
+
when '>' #=>
|
2329
2436
|
unless ParamListContextNoParen===@parsestack.last
|
2330
2437
|
@moretokens.unshift result
|
2331
2438
|
@moretokens.unshift( *abort_noparens!("=>"))
|
2332
2439
|
result=@moretokens.shift
|
2333
2440
|
end
|
2334
2441
|
@parsestack.last.see self,:arrow
|
2335
|
-
when ''
|
2442
|
+
when '~' # =~... after regex, maybe?
|
2443
|
+
last=last_operative_token
|
2444
|
+
|
2445
|
+
if @rubyversion>=1.9 and StringToken===last and last.lvars
|
2446
|
+
#ruby delays adding lvars from regexps to known lvars table
|
2447
|
+
#for several tokens in some cases. not sure why or if on purpose
|
2448
|
+
#i'm just going to add them right away
|
2449
|
+
localvars.concat last.lvars
|
2450
|
+
end
|
2451
|
+
when '' #plain assignment: record local variable definitions
|
2336
2452
|
last_context_not_implicit.lhs=false
|
2337
2453
|
@moretokens.push( *ignored_tokens(true).map{|x|
|
2338
2454
|
NewlineToken===x ? EscNlToken.new(@filename,@linenum,x.ident,x.offset) : x
|
@@ -2340,7 +2456,7 @@ end
|
|
2340
2456
|
@parsestack.push AssignmentRhsContext.new(@linenum)
|
2341
2457
|
if eat_next_if ?*
|
2342
2458
|
tok=OperatorToken.new('*', input_position-1)
|
2343
|
-
tok.unary
|
2459
|
+
tok.tag=:unary
|
2344
2460
|
@moretokens.push tok
|
2345
2461
|
WHSPLF[nextchar.chr] or
|
2346
2462
|
@moretokens << NoWsToken.new(input_position)
|
@@ -2450,14 +2566,15 @@ end
|
|
2450
2566
|
tokch.set_infix! unless after_nonid_op?{WHSPLF[lastchar]}
|
2451
2567
|
@parsestack.push ListImmedContext.new(ch,@linenum)
|
2452
2568
|
lasttok=last_operative_token
|
2453
|
-
#could be: lasttok
|
2454
|
-
if (VarNameToken===lasttok or ImplicitParamListEndToken===lasttok or
|
2569
|
+
#could be: lasttok===/^#@@LETTER/o
|
2570
|
+
if (VarNameToken===lasttok or ImplicitParamListEndToken===lasttok or
|
2571
|
+
MethNameToken===lasttok or lasttok===FUNCLIKE_KEYWORDS) and !WHSPCHARS[lastchar]
|
2455
2572
|
@moretokens << (tokch)
|
2456
2573
|
tokch= NoWsToken.new(input_position-1)
|
2457
2574
|
end
|
2458
2575
|
when '('
|
2459
2576
|
lasttok=last_token_maybe_implicit #last_operative_token
|
2460
|
-
#could be: lasttok
|
2577
|
+
#could be: lasttok===/^#@@LETTER/o
|
2461
2578
|
if (VarNameToken===lasttok or MethNameToken===lasttok or
|
2462
2579
|
lasttok===FUNCLIKE_KEYWORDS)
|
2463
2580
|
unless WHSPCHARS[lastchar]
|
@@ -2466,7 +2583,17 @@ end
|
|
2466
2583
|
end
|
2467
2584
|
@parsestack.push ParamListContext.new(@linenum)
|
2468
2585
|
else
|
2469
|
-
|
2586
|
+
ctx=@parsestack.last
|
2587
|
+
lasttok=last_operative_token
|
2588
|
+
maybe_def=DefContext===ctx && !ctx.in_body &&
|
2589
|
+
!(KeywordToken===lasttok && lasttok.ident=="def")
|
2590
|
+
if maybe_def or
|
2591
|
+
BlockParamListLhsContext===ctx or
|
2592
|
+
ParenContext===ctx && ctx.lhs
|
2593
|
+
@parsestack.push KnownNestedLhsParenContext.new(@linenum)
|
2594
|
+
else
|
2595
|
+
@parsestack.push ParenContext.new(@linenum)
|
2596
|
+
end
|
2470
2597
|
end
|
2471
2598
|
|
2472
2599
|
when '{'
|
@@ -2574,13 +2701,14 @@ end
|
|
2574
2701
|
@parsestack.pop
|
2575
2702
|
@moretokens.unshift AssignmentRhsListEndToken.new(input_position)
|
2576
2703
|
end
|
2577
|
-
token.comma_type=
|
2578
2704
|
case @parsestack[-1]
|
2579
|
-
when AssignmentRhsContext;
|
2580
|
-
when ParamListContext,ParamListContextNoParen;
|
2581
|
-
when ListImmedContext;
|
2705
|
+
when AssignmentRhsContext; token.tag=:rhs
|
2706
|
+
when ParamListContext,ParamListContextNoParen; #:call
|
2707
|
+
when ListImmedContext; #:array
|
2708
|
+
when BlockParamListLhsContext; #:block
|
2709
|
+
when KnownNestedLhsParenContext; #:nested
|
2582
2710
|
else
|
2583
|
-
|
2711
|
+
token.tag=:lhs if comma_in_lvalue_list?
|
2584
2712
|
end
|
2585
2713
|
@parsestack.last.see self,:comma
|
2586
2714
|
return @moretokens.shift
|