rubylexer 0.7.3 → 0.7.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +16 -0
- data/Manifest.txt +3 -1
- data/README.txt +12 -19
- data/Rakefile +2 -2
- data/lib/rubylexer.rb +214 -86
- data/lib/rubylexer/context.rb +17 -6
- data/lib/rubylexer/lextable.rb +202 -0
- data/lib/rubylexer/rulexer.rb +61 -9
- data/lib/rubylexer/test/illegal_oneliners.rb +4 -0
- data/lib/rubylexer/test/stanzas.rb +2 -0
- data/lib/rubylexer/test/testcases.rb +6 -1
- data/lib/rubylexer/token.rb +4 -1
- data/lib/rubylexer/version.rb +1 -1
- data/test/code/regression.rb +1 -1
- data/test/code/rubylexervsruby.rb +23 -6
- data/test/data/1.rb +729 -0
- data/test/data/heart.rb +43 -2
- data/test/data/pleac.rb +6282 -0
- data/testing.txt +1 -1
- metadata +7 -4
data/History.txt
CHANGED
@@ -1,3 +1,19 @@
|
|
1
|
+
=== 0.7.4/5-20-2009
|
2
|
+
* 2 Major Enhancements:
|
3
|
+
* preliminary support for ruby 1.9
|
4
|
+
* utf8 inputs should now work... more or less
|
5
|
+
|
6
|
+
* 5 Minor Enhancements:
|
7
|
+
* better detection of illegal escapes and interpolations in strings
|
8
|
+
* indicate error on unterminated here body
|
9
|
+
* fixed pattern of keywords that can't start a param list (ignores ?,! now)
|
10
|
+
* in is_var_name?, check for global/instance vars first
|
11
|
+
* comma and star in a true lhs should be correctly marked as such, now
|
12
|
+
|
13
|
+
* 2 Bugfixes:
|
14
|
+
* added tag field to Token; I hope many flags can be coalesced into tag.
|
15
|
+
* record the line numbers on which each string (and here doc) starts and ends
|
16
|
+
|
1
17
|
=== 0.7.3/4-19-2009
|
2
18
|
* 9 Bugfixes:
|
3
19
|
* remember whether comma was seen in paren context
|
data/Manifest.txt
CHANGED
@@ -15,6 +15,7 @@ lib/rubylexer/version.rb
|
|
15
15
|
lib/rubylexer/rulexer.rb
|
16
16
|
lib/rubylexer/tokenprinter.rb
|
17
17
|
lib/rubylexer/charset.rb
|
18
|
+
lib/rubylexer/lextable.rb
|
18
19
|
lib/rubylexer/symboltable.rb
|
19
20
|
lib/rubylexer/charhandler.rb
|
20
21
|
lib/assert.rb
|
@@ -43,11 +44,13 @@ test/data/23.rb
|
|
43
44
|
test/data/lbrack.rb
|
44
45
|
test/data/untitled1.rb
|
45
46
|
test/data/rescue.rb
|
47
|
+
test/data/pleac.rb
|
46
48
|
test/data/pleac.rb.broken
|
47
49
|
test/data/heart.rb
|
48
50
|
test/data/s.rb
|
49
51
|
test/data/wsdlDriver.rb
|
50
52
|
test/data/p-op.rb
|
53
|
+
test/data/1.rb
|
51
54
|
test/data/1.rb.broken
|
52
55
|
test/data/untermed_here.rb.broken
|
53
56
|
test/data/newsyntax.rb
|
@@ -72,7 +75,6 @@ test/code/regression.rb
|
|
72
75
|
test/code/strgen.rb
|
73
76
|
test/code/tarball.rb
|
74
77
|
lib/rubylexer/test/testcases.rb
|
75
|
-
test/data/chunky.plain.rb
|
76
78
|
test/data/cvtesc.rb
|
77
79
|
test/data/__eof2.rb
|
78
80
|
test/data/__eof5.rb
|
data/README.txt
CHANGED
@@ -1,8 +1,7 @@
|
|
1
1
|
= RubyLexer
|
2
2
|
|
3
|
-
*
|
4
|
-
*
|
5
|
-
*
|
3
|
+
* rubyforge.net/projects/rubylexer
|
4
|
+
* github.com/coatl/rubylexer
|
6
5
|
|
7
6
|
=== DESCRIPTION:
|
8
7
|
|
@@ -48,13 +47,9 @@ end
|
|
48
47
|
== Status
|
49
48
|
RubyLexer can correctly lex all legal Ruby 1.8 code that I've been able to
|
50
49
|
find on my Debian system. It can also handle (most of) my catalog of nasty
|
51
|
-
test cases (
|
52
|
-
|
53
|
-
|
54
|
-
about and plan to fix, but it seems that Ruby coders don't write code complex
|
55
|
-
enough to trigger them very often. Although incomplete, RubyLexer can
|
56
|
-
correctly distinguish these ambiguous uses of the following operator and
|
57
|
-
keywords, depending on context:
|
50
|
+
test cases (see below for known problems). Modulo some very obscure bugs,
|
51
|
+
RubyLexer can correctly distinguish these ambiguous uses of the following
|
52
|
+
operators, depending on context:
|
58
53
|
% can be modulus operator or start of fancy string
|
59
54
|
/ can be division operator or start of regex
|
60
55
|
* & + - :: can be unary or binary operator
|
@@ -83,18 +78,16 @@ emit advisory tokens when local var defined/goes out of scope (or hidden/unhidde
|
|
83
78
|
token pruning in dumptokens...
|
84
79
|
|
85
80
|
== known issues: (and planned fix release)
|
86
|
-
context not really preserved when entering or leaving string inclusions. this
|
87
|
-
a number or problems
|
88
|
-
|
89
|
-
string tokenization sometimes a little different from ruby around newlines
|
90
|
-
(htree/template.rb) (0.8)
|
81
|
+
context not really preserved when entering or leaving string inclusions. this caused
|
82
|
+
-a number of problems, which had to be hacked around. it would be better to avoid
|
83
|
+
-tokens within tokens. (0.8)
|
91
84
|
string contents might not be correctly translated in a few cases (0.8?)
|
92
85
|
symbols which contain string interpolations are flattened into one token. eg :"foo#{bar}" (0.8)
|
93
86
|
'\r' whitespace sometimes seen in dos-formatted output.. shouldn't be (eg pre.rb) (0.7)
|
94
87
|
windows newline in source is likely to cause problems in obscure cases (need test case)
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
regression test currently shows
|
88
|
+
ruby 1.9 incompletely supported (0.9)
|
89
|
+
current character set is always forced to ascii-8bit. however, this mode should be
|
90
|
+
-compatible with texts written in regular ascii, utf-8, and euc. (among others?) (1.0)
|
91
|
+
regression test currently shows a few errors with differences in exact token ordering
|
99
92
|
-around string inclusions. these errors are much less serious than they seem.
|
100
93
|
offset of AssignmentRhsListEndToken appears to be off by 1
|
data/Rakefile
CHANGED
@@ -25,13 +25,13 @@ require 'lib/rubylexer/version.rb'
|
|
25
25
|
hoe=Hoe.new("rubylexer", RubyLexer::VERSION) do |_|
|
26
26
|
_.author = "Caleb Clausen"
|
27
27
|
_.email = "rubylexer-owner @at@ inforadical .dot. net"
|
28
|
-
_.url = ["http://
|
28
|
+
_.url = ["http://github.com/coatl/rubylexer/", "http://rubyforge.org/projects/rubylexer/"]
|
29
29
|
_.extra_deps << ['sequence', '>= 0.2.0']
|
30
30
|
_.test_globs=["test/code/regression.rb"]
|
31
31
|
_.description=desc
|
32
32
|
_.summary=desc[/\A[^.]+\./]
|
33
33
|
_.spec_extras={:bindir=>'',:rdoc_options=>'-x lib/rubylexer/test'}
|
34
|
-
_.rdoc_pattern=/\A(howtouse\.txt|testing\.txt|README\.txt|lib\/[^\/]*\.rb|lib\/rubylexer\/[^\d][^\/]*\.rb)\Z/
|
34
|
+
#_.rdoc_pattern=/\A(howtouse\.txt|testing\.txt|README\.txt|lib\/[^\/]*\.rb|lib\/rubylexer\/[^\d][^\/]*\.rb)\Z/
|
35
35
|
end
|
36
36
|
|
37
37
|
|
data/lib/rubylexer.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
=begin
|
1
|
+
=begin
|
2
2
|
rubylexer - a ruby lexer written in ruby
|
3
3
|
Copyright (C) 2004,2005,2008 Caleb Clausen
|
4
4
|
|
@@ -60,9 +60,6 @@ class RubyLexer
|
|
60
60
|
INNERBOUNDINGWORDS="(#{INNERBOUNDINGWORDLIST.join '|'})"
|
61
61
|
BINOPWORDLIST=%w"and or"
|
62
62
|
BINOPWORDS="(#{BINOPWORDLIST.join '|'})"
|
63
|
-
NEVERSTARTPARAMLISTWORDS=/\A(#{OPORBEGINWORDS}|#{INNERBOUNDINGWORDS}|#{BINOPWORDS}|end)([^a-zA-Z0-9_!?=]|\Z)/o
|
64
|
-
NEVERSTARTPARAMLISTFIRST=CharSet['aoeitrwu'] #chars that begin NEVERSTARTPARAMLIST
|
65
|
-
NEVERSTARTPARAMLISTMAXLEN=7 #max len of a NEVERSTARTPARAMLIST
|
66
63
|
|
67
64
|
RUBYKEYWORDS=%r{
|
68
65
|
^(alias|#{BINOPWORDS}|defined\?|not|undef|end|
|
@@ -72,6 +69,11 @@ class RubyLexer
|
|
72
69
|
}xo
|
73
70
|
#__END__ should not be in this set... its handled in start_of_line_directives
|
74
71
|
|
72
|
+
HIGHASCII=?\x80..?\xFF
|
73
|
+
NONASCII=HIGHASCII
|
74
|
+
#NONASCII=?\x80..?xFFFFFFFF #or is it 10FFFF, whatever the highest conceivable code point
|
75
|
+
|
76
|
+
|
75
77
|
CHARMAPPINGS = {
|
76
78
|
?$ => :dollar_identifier,
|
77
79
|
?@ => :at_identifier,
|
@@ -115,14 +117,43 @@ class RubyLexer
|
|
115
117
|
"])}" => :close_brace,
|
116
118
|
|
117
119
|
|
118
|
-
?# => :comment
|
120
|
+
?# => :comment,
|
121
|
+
|
122
|
+
NONASCII => :identifier,
|
119
123
|
}
|
120
124
|
|
121
125
|
attr_reader :incomplete_here_tokens, :parsestack, :last_token_maybe_implicit
|
122
126
|
|
127
|
+
UCLETTER=@@UCLETTER="[A-Z]"
|
128
|
+
|
129
|
+
#cheaters way, treats utf chars as always 1 byte wide
|
130
|
+
#all high-bit chars are lowercase letters
|
131
|
+
#works, but strings compare with strict binary identity, not unicode collation
|
132
|
+
#works for euc too, I think
|
133
|
+
#(the ruby spec for utf8 support permits this interpretation)
|
134
|
+
LCLETTER=@@LCLETTER="[a-z_\x80-\xFF]"
|
135
|
+
LETTER=@@LETTER="[A-Za-z_\x80-\xFF]"
|
136
|
+
LETTER_DIGIT=@@LETTER_DIGIT="[A-Za-z_0-9\x80-\xFF]"
|
137
|
+
eval %w[UCLETTER LCLETTER LETTER LETTER_DIGIT].map{|n| "
|
138
|
+
def #{n}; #{n}; end
|
139
|
+
def self.#{n}; @@#{n}; end
|
140
|
+
"
|
141
|
+
}.to_s
|
142
|
+
|
143
|
+
NEVERSTARTPARAMLISTWORDS=/\A(#{OPORBEGINWORDS}|#{INNERBOUNDINGWORDS}|#{BINOPWORDS}|end)((?:(?!#@@LETTER_DIGIT).)|\Z)/om
|
144
|
+
NEVERSTARTPARAMLISTFIRST=CharSet['aoeitrwu'] #chars that begin NEVERSTARTPARAMLIST
|
145
|
+
NEVERSTARTPARAMLISTMAXLEN=7 #max len of a NEVERSTARTPARAMLIST
|
146
|
+
|
147
|
+
=begin
|
148
|
+
require 'jcode'
|
149
|
+
utf8=String::PATTERN_UTF8 #or euc, or sjis...
|
150
|
+
LCLETTER_U="(?>[a-z_]|#{utf8})"
|
151
|
+
LETTER_U="(?>[A-Za-z_]|#{utf8})"
|
152
|
+
IDENTCHAR_U="(?>[A-Za-z_0-9]|#{utf8})"
|
153
|
+
=end
|
123
154
|
|
124
155
|
#-----------------------------------
|
125
|
-
def initialize(filename,file,linenum=1,offset_adjust=0)
|
156
|
+
def initialize(filename,file,linenum=1,offset_adjust=0,options={:rubyversion=>1.8})
|
126
157
|
@offset_adjust=0 #set again in next line
|
127
158
|
super(filename,file, linenum,offset_adjust)
|
128
159
|
@start_linenum=linenum
|
@@ -137,13 +168,61 @@ class RubyLexer
|
|
137
168
|
@enable_macro=nil
|
138
169
|
@base_file=nil
|
139
170
|
@progress_thread=nil
|
171
|
+
@rubyversion=options[:rubyversion]
|
172
|
+
@encoding=options[:encoding]||:detect
|
140
173
|
|
141
174
|
@toptable=CharHandler.new(self, :illegal_char, CHARMAPPINGS)
|
142
175
|
|
176
|
+
read_leading_encoding
|
143
177
|
start_of_line_directives
|
144
178
|
progress_printer
|
145
179
|
end
|
146
180
|
|
181
|
+
ENCODING_ALIASES={
|
182
|
+
'utf-8'=>'utf8',
|
183
|
+
|
184
|
+
'ascii-8bit'=>'binary',
|
185
|
+
'ascii-7bit'=>'ascii',
|
186
|
+
'euc-jp'=>'euc',
|
187
|
+
|
188
|
+
'ascii8bit'=>'binary',
|
189
|
+
'ascii7bit'=>'ascii',
|
190
|
+
'eucjp'=>'euc',
|
191
|
+
|
192
|
+
'us-ascii'=>'ascii',
|
193
|
+
'shift-jis'=>'sjis',
|
194
|
+
|
195
|
+
'autodetect'=>'detect',
|
196
|
+
}
|
197
|
+
ENCODINGS=%w[ascii binary utf8 euc sjis]
|
198
|
+
def read_leading_encoding
|
199
|
+
return unless @encoding==:detect
|
200
|
+
@encoding=:ascii
|
201
|
+
@encoding=:utf8 if @file.skip( /\xEF\xBB\xBF/ ) #bom
|
202
|
+
if @file.skip( /\A#!/ )
|
203
|
+
loop do
|
204
|
+
til_charset( /[\s\v]/ )
|
205
|
+
break if @file.skip( / ([^-\s\v]|--[\s\v])/,4 )
|
206
|
+
if @file.skip( /.-K(.)/ )
|
207
|
+
case $1
|
208
|
+
when 'u'; @encoding=:utf8
|
209
|
+
when 'e'; @encoding=:euc
|
210
|
+
when 's'; @encoding=:sjis
|
211
|
+
end
|
212
|
+
end
|
213
|
+
end
|
214
|
+
til_charset( /[\n]/ )
|
215
|
+
end
|
216
|
+
if @rubyversion>=1.9 and @file.skip(
|
217
|
+
/\A#[\x00-\x7F]*?(?:en)?coding[\s\v]*[:=][\s\v]*([a-z0-9_-]+)[\x00-\x7F]*\n/i
|
218
|
+
)
|
219
|
+
name=$1
|
220
|
+
name.downcase!
|
221
|
+
name=ENCODING_ALIASES[name] if ENCODING_ALIASES[name]
|
222
|
+
@encoding=name.to_sym if ENCODINGS.include? name
|
223
|
+
end
|
224
|
+
end
|
225
|
+
|
147
226
|
def progress_printer
|
148
227
|
return unless ENV['RL_PROGRESS']
|
149
228
|
$stderr.puts 'printing progresses'
|
@@ -163,6 +242,7 @@ class RubyLexer
|
|
163
242
|
attr :localvars_stack
|
164
243
|
attr :offset_adjust
|
165
244
|
attr_writer :pending_here_bodies
|
245
|
+
attr :rubyversion
|
166
246
|
|
167
247
|
#-----------------------------------
|
168
248
|
def set_last_token(tok)
|
@@ -361,7 +441,7 @@ private
|
|
361
441
|
result = ((
|
362
442
|
#order matters here, but it shouldn't
|
363
443
|
#(but til_charset must be last)
|
364
|
-
eat_if(
|
444
|
+
eat_if(/-#@@LETTER_DIGIT/o,2) or
|
365
445
|
eat_next_if(/[!@&+`'=~\-\/\\,.;<>*"$?:]/) or
|
366
446
|
(?0..?9)===nextchar ? til_charset(/[^\d]/) : nil
|
367
447
|
))
|
@@ -376,21 +456,25 @@ private
|
|
376
456
|
#or if in a non-bare context
|
377
457
|
#just asserts because those contexts are never encountered.
|
378
458
|
#control goes through symbol(<...>,nil)
|
379
|
-
assert(
|
459
|
+
assert( /^#@@LETTER$/o===context)
|
380
460
|
assert MethNameToken===@last_operative_token || !(@last_operative_token===/^(\.|::|(un)?def|alias)$/)
|
381
461
|
|
382
|
-
@
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
462
|
+
if @parsestack.last.wantarrow and @rubyversion>=1.9 and @file.skip ":"
|
463
|
+
@moretokens.push SymbolToken.new(str,oldpos), KeywordToken.new("=>",input_position-1)
|
464
|
+
else
|
465
|
+
@moretokens.unshift(*parse_keywords(str,oldpos) do |tok|
|
466
|
+
#if not a keyword, decide if it should be var or method
|
467
|
+
case str
|
468
|
+
when FUNCLIKE_KEYWORDS; except=tok
|
469
|
+
when VARLIKE_KEYWORDS,RUBYKEYWORDS; raise "shouldnt see keywords here, now"
|
470
|
+
end
|
471
|
+
was_last=@last_operative_token
|
472
|
+
@last_operative_token=tok if tok
|
473
|
+
normally=safe_recurse { |a| var_or_meth_name(str,was_last,oldpos,after_nonid_op?{true}) }
|
474
|
+
(Array===normally ? normally[0]=except : normally=except) if except
|
475
|
+
normally
|
476
|
+
end)
|
477
|
+
end
|
394
478
|
return @moretokens.shift
|
395
479
|
end
|
396
480
|
|
@@ -399,7 +483,7 @@ private
|
|
399
483
|
def identifier_as_string(context)
|
400
484
|
#must begin w/ letter or underscore
|
401
485
|
#char class needs changing here for utf8 support
|
402
|
-
/
|
486
|
+
/#@@LETTER/o===nextchar.chr or return
|
403
487
|
|
404
488
|
#equals, question mark, and exclamation mark
|
405
489
|
#might be allowed at the end in some contexts.
|
@@ -418,7 +502,7 @@ private
|
|
418
502
|
end
|
419
503
|
@in_def_name||context==?: and trailers<<"|=(?![=~>])"
|
420
504
|
|
421
|
-
@file.scan(IDENTREX[trailers]||=/^(
|
505
|
+
@file.scan(IDENTREX[trailers]||=/^(?>#@@LETTER#@@LETTER_DIGIT*(?:#{trailers})?)/)
|
422
506
|
end
|
423
507
|
|
424
508
|
#-----------------------------------
|
@@ -447,8 +531,8 @@ private
|
|
447
531
|
def comma_in_lvalue_list?
|
448
532
|
@parsestack.last.lhs=
|
449
533
|
case l=@parsestack.last
|
450
|
-
when ListContext
|
451
|
-
when DefContext
|
534
|
+
when ListContext;
|
535
|
+
when DefContext; l.in_body
|
452
536
|
else true
|
453
537
|
end
|
454
538
|
end
|
@@ -459,7 +543,7 @@ private
|
|
459
543
|
@defining_lvar or case ctx=@parsestack.last
|
460
544
|
#when ForSMContext; ctx.state==:for
|
461
545
|
when RescueSMContext
|
462
|
-
lasttok.ident=="=>" and @file.match?( /\A[\s\v]*([:;#\n]|then
|
546
|
+
lasttok.ident=="=>" and @file.match?( /\A[\s\v]*([:;#\n]|then(?!#@@LETTER_DIGIT))/om )
|
463
547
|
#when BlockParamListLhsContext; true
|
464
548
|
end
|
465
549
|
end
|
@@ -487,13 +571,13 @@ private
|
|
487
571
|
was_in_lvar_define_state=in_lvar_define_state(lasttok)
|
488
572
|
#maybe_local really means 'maybe local or constant'
|
489
573
|
maybe_local=case name
|
490
|
-
when /
|
491
|
-
when
|
574
|
+
when /(?!#@@LETTER_DIGIT).$/o #do nothing
|
575
|
+
when /^#@@LCLETTER/o
|
492
576
|
(localvars===name or
|
493
577
|
VARLIKE_KEYWORDS===name or
|
494
578
|
was_in_lvar_define_state
|
495
579
|
) and not lasttok===/^(\.|::)$/
|
496
|
-
when
|
580
|
+
when /^#@@UCLETTER/o
|
497
581
|
is_const=true
|
498
582
|
not lasttok==='.' #this is the right algorithm for constants...
|
499
583
|
end
|
@@ -509,7 +593,7 @@ private
|
|
509
593
|
result=ws_toks=ignored_tokens(true) {|nl| sawnl=true }
|
510
594
|
if sawnl || eof?
|
511
595
|
if was_in_lvar_define_state
|
512
|
-
if
|
596
|
+
if /^#@@LCLETTER#@@LETTER_DIGIT*$/o===name
|
513
597
|
assert !(lasttok===/^(\.|::)$/)
|
514
598
|
localvars[name]=true
|
515
599
|
end
|
@@ -531,7 +615,7 @@ private
|
|
531
615
|
when ?=; not /^=[>=~]$/===readahead(2)
|
532
616
|
when ?,; comma_in_lvalue_list?
|
533
617
|
when ?); last_context_not_implicit.lhs
|
534
|
-
when ?i; /^in
|
618
|
+
when ?i; /^in(?!#@@LETTER_DIGIT)/o===readahead(3) and
|
535
619
|
ForSMContext===last_context_not_implicit
|
536
620
|
when ?>,?<; /^(.)\1=$/===readahead(3)
|
537
621
|
when ?*,?&; /^(.)\1?=/===readahead(3)
|
@@ -543,8 +627,8 @@ private
|
|
543
627
|
end
|
544
628
|
if (assignment_coming && !(lasttok===/^(\.|::)$/) or was_in_lvar_define_state)
|
545
629
|
tok=assign_lvar_type! VarNameToken.new(name,pos)
|
546
|
-
if /
|
547
|
-
elsif
|
630
|
+
if /(?!#@@LETTER_DIGIT).$/o===name
|
631
|
+
elsif /^#@@LCLETTER/o===name and !(lasttok===/^(\.|::)$/)
|
548
632
|
localvars[name]=true
|
549
633
|
end
|
550
634
|
return result.unshift(tok)
|
@@ -559,7 +643,7 @@ private
|
|
559
643
|
when nil: 2
|
560
644
|
when ?!; /^![=~]$/===readahead(2) ? 2 : 1
|
561
645
|
when ?d;
|
562
|
-
if /^do(
|
646
|
+
if /^do((?!#@@LETTER_DIGIT)|$)/o===readahead(3)
|
563
647
|
if maybe_local and expecting_do?
|
564
648
|
ty=VarNameToken
|
565
649
|
0
|
@@ -572,7 +656,7 @@ private
|
|
572
656
|
end
|
573
657
|
when NEVERSTARTPARAMLISTFIRST
|
574
658
|
(NEVERSTARTPARAMLISTWORDS===readahead(NEVERSTARTPARAMLISTMAXLEN)) ? 2 : 1
|
575
|
-
when ?",?',?`,?a..?z,?A..?Z,?0..?9,?_
|
659
|
+
when ?",?',?`,?a..?z,?A..?Z,?0..?9,?_,?@,?$,?~,NONASCII; 1 #"
|
576
660
|
when ?{
|
577
661
|
maybe_local=false
|
578
662
|
1
|
@@ -633,10 +717,12 @@ private
|
|
633
717
|
else
|
634
718
|
3
|
635
719
|
end
|
636
|
-
when ??; next3=readahead(3)
|
637
|
-
|
720
|
+
when ??; next3=readahead(3)
|
721
|
+
#? never begins a char constant if immediately followed
|
722
|
+
#by 2 or more letters or digits
|
723
|
+
/^\?([#{WHSPLF}]|#@@LETTER_DIGIT{2})/o===next3 ? 2 : 3
|
638
724
|
# when ?:,??; (readahead(2)[/^.[#{WHSPLF}]/o]) ? 2 : 3
|
639
|
-
when ?<; (!ws_toks.empty? && readahead(4)[/^<<-?["'`
|
725
|
+
when ?<; (!ws_toks.empty? && readahead(4)[/^<<-?(?:["'`]|#@@LETTER_DIGIT)/o]) ? 3 : 2
|
640
726
|
when ?[;
|
641
727
|
if ws_toks.empty?
|
642
728
|
(KeywordToken===oldlast and /^(return|break|next)$/===oldlast.ident) ? 3 : 2
|
@@ -707,7 +793,7 @@ private
|
|
707
793
|
break false
|
708
794
|
elsif ','==tok.to_s and @parsestack.size==basesize+1
|
709
795
|
break true
|
710
|
-
elsif OperatorToken===tok and /^[&*]$/===tok.ident and tok.
|
796
|
+
elsif OperatorToken===tok and /^[&*]$/===tok.ident and tok.tag and @parsestack.size==basesize+1
|
711
797
|
break true
|
712
798
|
elsif EoiToken===tok
|
713
799
|
lexerror tok, "unexpected eof in parameter list"
|
@@ -890,7 +976,7 @@ private
|
|
890
976
|
@moretokens.push KeywordToken.new('::',offset+md.end(0)-2) if dc
|
891
977
|
loop do
|
892
978
|
offset=input_position
|
893
|
-
@file.scan(/\A(#@@WSTOKS)?(
|
979
|
+
@file.scan(/\A(#@@WSTOKS)?(#@@UCLETTER#@@LETTER_DIGIT*)(::)?/o)
|
894
980
|
#this regexp---^ will need to change in order to support utf8 properly.
|
895
981
|
md=@file.last_match
|
896
982
|
all,ws,name,dc=*md
|
@@ -1013,11 +1099,11 @@ private
|
|
1013
1099
|
|
1014
1100
|
#maybe_local really means 'maybe local or constant'
|
1015
1101
|
maybe_local=case name
|
1016
|
-
when /
|
1102
|
+
when /(?!#@@LETTER_DIGIT).$/o; #do nothing
|
1017
1103
|
when /^[@$]/; true
|
1018
1104
|
when VARLIKE_KEYWORDS,FUNCLIKE_KEYWORDS; ty=KeywordToken
|
1019
|
-
when
|
1020
|
-
when
|
1105
|
+
when /^#@@LCLETTER/o; localvars===name
|
1106
|
+
when /^#@@UCLETTER/o; is_const=true #this is the right algorithm for constants...
|
1021
1107
|
end
|
1022
1108
|
result.push( *ignored_tokens(false,false) )
|
1023
1109
|
nc=nextchar
|
@@ -1059,7 +1145,7 @@ private
|
|
1059
1145
|
|
1060
1146
|
#look for start of parameter list
|
1061
1147
|
nc=(@moretokens.empty? ? nextchar.chr : @moretokens.first.to_s[0,1])
|
1062
|
-
if state==:expect_op and /^[
|
1148
|
+
if state==:expect_op and /^(?:#@@LETTER|[(&*])/o===nc
|
1063
1149
|
ctx.state=:def_param_list
|
1064
1150
|
list,listend=def_param_list
|
1065
1151
|
result.concat list
|
@@ -1080,7 +1166,7 @@ private
|
|
1080
1166
|
when EoiToken
|
1081
1167
|
lexerror tok,'unexpected eof in def header'
|
1082
1168
|
when StillIgnoreToken
|
1083
|
-
when MethNameToken ,VarNameToken #
|
1169
|
+
when MethNameToken ,VarNameToken # /^#@@LETTER/o.token_pat
|
1084
1170
|
lexerror tok,'expected . or ::' unless state==:expect_name
|
1085
1171
|
state=:expect_op
|
1086
1172
|
when /^(\.|::)$/.token_pat
|
@@ -1416,7 +1502,7 @@ end
|
|
1416
1502
|
#result.concat ignored_tokens
|
1417
1503
|
if expect_name
|
1418
1504
|
case tok
|
1419
|
-
when IgnoreToken #,
|
1505
|
+
when IgnoreToken #, /^#@@UCLETTER/o #do nothing
|
1420
1506
|
when /^,$/.token_pat #hack
|
1421
1507
|
|
1422
1508
|
when VarNameToken
|
@@ -1498,12 +1584,20 @@ end
|
|
1498
1584
|
if want_unary
|
1499
1585
|
#readahead(2)[1..1][/[\s\v#\\]/] or #not needed?
|
1500
1586
|
assert OperatorToken===result
|
1501
|
-
result.unary
|
1587
|
+
result.tag=:unary #result should distinguish unary+binary *&
|
1502
1588
|
WHSPLF[nextchar.chr] or
|
1503
1589
|
@moretokens << NoWsToken.new(input_position)
|
1504
|
-
comma_in_lvalue_list?
|
1590
|
+
cill=comma_in_lvalue_list?
|
1505
1591
|
if ch=='*'
|
1506
1592
|
@parsestack.last.see self, :splat
|
1593
|
+
case @parsestack[-1]
|
1594
|
+
when AssignmentRhsContext; result.tag= :rhs
|
1595
|
+
when ParamListContext,ParamListContextNoParen; #:call
|
1596
|
+
when ListImmedContext; #:array
|
1597
|
+
when BlockParamListLhsContext; #:block
|
1598
|
+
when KnownNestedLhsParenContext; #:nested
|
1599
|
+
else result.tag= :lhs if cill
|
1600
|
+
end
|
1507
1601
|
end
|
1508
1602
|
end
|
1509
1603
|
result
|
@@ -1553,10 +1647,10 @@ end
|
|
1553
1647
|
|
1554
1648
|
s=tok.to_s
|
1555
1649
|
case s
|
1556
|
-
when /[^a-z_0-9]$/i; false
|
1557
|
-
# when /^[a-z_]/; localvars===s or VARLIKE_KEYWORDS===s
|
1558
|
-
when /^[A-Z_]/i; VarNameToken===tok
|
1559
1650
|
when /^[@$<]/; true
|
1651
|
+
when /(?!#@@LETTER_DIGIT).$/o; false
|
1652
|
+
# when /^#@@LCLETTER/o; localvars===s or VARLIKE_KEYWORDS===s
|
1653
|
+
when /^#@@LETTER/o; VarNameToken===tok
|
1560
1654
|
else raise "not var or method name: #{s}"
|
1561
1655
|
end
|
1562
1656
|
end
|
@@ -1573,7 +1667,7 @@ end
|
|
1573
1667
|
if ch==':'
|
1574
1668
|
not TernaryContext===@parsestack.last
|
1575
1669
|
else
|
1576
|
-
!readahead(3)[
|
1670
|
+
!readahead(3)[/^\?#@@LETTER_DIGIT{2}/o]
|
1577
1671
|
end
|
1578
1672
|
}
|
1579
1673
|
end
|
@@ -1603,21 +1697,25 @@ end
|
|
1603
1697
|
@moretokens.push tok=KeywordToken.new(':',startpos)
|
1604
1698
|
|
1605
1699
|
case @parsestack.last
|
1606
|
-
when TernaryContext
|
1700
|
+
when TernaryContext
|
1607
1701
|
tok.ternary=true
|
1608
1702
|
@parsestack.pop #should be in the context's see handler
|
1609
|
-
when ExpectDoOrNlContext
|
1610
|
-
@
|
1611
|
-
|
1612
|
-
|
1613
|
-
|
1614
|
-
|
1615
|
-
|
1616
|
-
|
1617
|
-
|
1703
|
+
when ExpectDoOrNlContext #should be in the context's see handler
|
1704
|
+
if @rubyversion<1.9
|
1705
|
+
@parsestack.pop
|
1706
|
+
assert @parsestack.last.starter[/^(while|until|for)$/]
|
1707
|
+
tok.as=";"
|
1708
|
+
end
|
1709
|
+
when ExpectThenOrNlContext,WhenParamListContext
|
1710
|
+
if @rubyversion<1.9
|
1711
|
+
#should be in the context's see handler
|
1712
|
+
@parsestack.pop
|
1713
|
+
tok.as="then"
|
1714
|
+
end
|
1715
|
+
when RescueSMContext
|
1618
1716
|
tok.as=";"
|
1619
|
-
|
1620
|
-
|
1717
|
+
end or
|
1718
|
+
fail ": not expected in #{@parsestack.last.class}->#{@parsestack.last.starter}"
|
1621
1719
|
|
1622
1720
|
#end ternary context, if any
|
1623
1721
|
@parsestack.last.see self,:colon
|
@@ -1631,7 +1729,7 @@ end
|
|
1631
1729
|
lasttok=@last_operative_token
|
1632
1730
|
assert !(String===lasttok)
|
1633
1731
|
if (VarNameToken===lasttok or MethNameToken===lasttok) and
|
1634
|
-
lasttok===/^[$@
|
1732
|
+
lasttok===/^(?:[$@]|#@@LETTER)/o and !WHSPCHARS[lastchar]
|
1635
1733
|
then
|
1636
1734
|
@moretokens << colon2
|
1637
1735
|
result= NoWsToken.new(startpos)
|
@@ -1664,12 +1762,12 @@ end
|
|
1664
1762
|
when ?` then read(1) #`
|
1665
1763
|
when ?@ then at_identifier.to_s
|
1666
1764
|
when ?$ then dollar_identifier.to_s
|
1667
|
-
when ?_,?a..?z then identifier_as_string(?:)
|
1765
|
+
when ?_,?a..?z,NONASCII then identifier_as_string(?:)
|
1668
1766
|
when ?A..?Z then
|
1669
1767
|
result=identifier_as_string(?:)
|
1670
1768
|
if @last_operative_token==='::'
|
1671
1769
|
assert klass==MethNameToken
|
1672
|
-
|
1770
|
+
/#@@LETTER_DIGIT$/o===result and klass=VarNameToken
|
1673
1771
|
end
|
1674
1772
|
result
|
1675
1773
|
else
|
@@ -1696,7 +1794,7 @@ end
|
|
1696
1794
|
return [opmatches ? read(opmatches.size) :
|
1697
1795
|
case nc=nextchar
|
1698
1796
|
when ?` then read(1) #`
|
1699
|
-
when ?_,?a..?z,?A..?Z then
|
1797
|
+
when ?_,?a..?z,?A..?Z,NONASCII then
|
1700
1798
|
context=merge_assignment_op_in_setter_callsites? ? ?: : nc
|
1701
1799
|
identifier_as_string(context)
|
1702
1800
|
else
|
@@ -1720,7 +1818,7 @@ end
|
|
1720
1818
|
quote_real=true
|
1721
1819
|
else
|
1722
1820
|
quote='"'
|
1723
|
-
ender
|
1821
|
+
ender=@file.scan(/#@@LETTER_DIGIT+/o)
|
1724
1822
|
ender.length >= 1 or
|
1725
1823
|
return lexerror(HerePlaceholderToken.new( dash, quote, ender, nil ), "invalid here header")
|
1726
1824
|
end
|
@@ -1739,6 +1837,7 @@ if true
|
|
1739
1837
|
|
1740
1838
|
nl=readnl or return lexerror(res, "here header without body (at eof)")
|
1741
1839
|
|
1840
|
+
res.string.startline=linenum
|
1742
1841
|
@moretokens<< res
|
1743
1842
|
bodystart=input_position
|
1744
1843
|
@offset_adjust = @min_offset_adjust+procrastinated.size
|
@@ -1748,6 +1847,8 @@ if true
|
|
1748
1847
|
@offset_adjust = @min_offset_adjust
|
1749
1848
|
#was: @offset_adjust -= procrastinated.size
|
1750
1849
|
bodysize=input_position-bodystart
|
1850
|
+
res.string.line=linenum-1
|
1851
|
+
lexerror res,res.string.error
|
1751
1852
|
|
1752
1853
|
#one or two already read characters are overwritten here,
|
1753
1854
|
#in order to keep offsets correct in the long term
|
@@ -1814,7 +1915,7 @@ end
|
|
1814
1915
|
#-----------------------------------
|
1815
1916
|
def lessthan(ch) #match quadriop('<') or here doc or spaceship op
|
1816
1917
|
case readahead(3)
|
1817
|
-
when /^<<['"`\-
|
1918
|
+
when /^<<(?:['"`\-]|#@@LETTER_DIGIT)$/o #'
|
1818
1919
|
if quote_expected?(ch) and not @last_operative_token==='class'
|
1819
1920
|
here_header
|
1820
1921
|
else
|
@@ -1901,7 +2002,11 @@ end
|
|
1901
2002
|
if tofill.dash
|
1902
2003
|
close+=til_charset(/[^#{WHSP}]/o)
|
1903
2004
|
end
|
1904
|
-
|
2005
|
+
if eof? #this is an error, should be handled better
|
2006
|
+
lexerror tofill, "unterminated here body"
|
2007
|
+
lexerror tofill.string, "unterminated here body"
|
2008
|
+
break
|
2009
|
+
end
|
1905
2010
|
if read(tofill.ender.size)==tofill.ender
|
1906
2011
|
crs=til_charset(/[^\r]/)||''
|
1907
2012
|
if nl=readnl
|
@@ -1917,6 +2022,8 @@ end
|
|
1917
2022
|
line=til_charset(/[\n]/)
|
1918
2023
|
unless nl=readnl
|
1919
2024
|
assert eof?
|
2025
|
+
lexerror tofill, "unterminated here body"
|
2026
|
+
lexerror tofill.string, "unterminated here body"
|
1920
2027
|
break #this is an error, should be handled better
|
1921
2028
|
end
|
1922
2029
|
line.chomp!("\r")
|
@@ -2118,7 +2225,7 @@ end
|
|
2118
2225
|
#used to resolve the ambiguity of
|
2119
2226
|
# unary ops (+, -, *, &, ~ !) in ruby
|
2120
2227
|
#returns whether current token is to be the start of a literal
|
2121
|
-
IDBEGINCHAR=/^[
|
2228
|
+
IDBEGINCHAR=/^(?:#@@LETTER|[$@])/o
|
2122
2229
|
def unary_op_expected?(ch) #yukko hack
|
2123
2230
|
'*&='[readahead(2)[1..1]] and return false
|
2124
2231
|
|
@@ -2139,8 +2246,8 @@ end
|
|
2139
2246
|
def quote_expected?(ch) #yukko hack
|
2140
2247
|
case ch[0]
|
2141
2248
|
when ?? then readahead(2)[/^\?[#{WHSPLF}]$/o] #not needed?
|
2142
|
-
when ?% then readahead(3)[/^%([a-pt-vyzA-PR-VX-Z]|[QqrswWx]
|
2143
|
-
when ?< then !readahead(4)[/^<<-?['"`
|
2249
|
+
when ?% then readahead(3)[/^%([a-pt-vyzA-PR-VX-Z]|[QqrswWx]#{@@LETTER_DIGIT.gsub('_','')})/o]
|
2250
|
+
when ?< then !readahead(4)[/^<<-?(?:['"`]|#@@LETTER_DIGIT)/o]
|
2144
2251
|
else raise 'unexpected ch (#{ch}) in quote_expected?'
|
2145
2252
|
# when ?+,?-,?&,?*,?~,?! then '*&='[readahead(2)[1..1]]
|
2146
2253
|
end and return false
|
@@ -2322,17 +2429,26 @@ end
|
|
2322
2429
|
str << c
|
2323
2430
|
result= operator_or_methname_token( str,offset)
|
2324
2431
|
case c
|
2325
|
-
when '='
|
2432
|
+
when '=' #===,==
|
2326
2433
|
str<< (eat_next_if(?=)or'')
|
2327
2434
|
|
2328
|
-
when '>'
|
2435
|
+
when '>' #=>
|
2329
2436
|
unless ParamListContextNoParen===@parsestack.last
|
2330
2437
|
@moretokens.unshift result
|
2331
2438
|
@moretokens.unshift( *abort_noparens!("=>"))
|
2332
2439
|
result=@moretokens.shift
|
2333
2440
|
end
|
2334
2441
|
@parsestack.last.see self,:arrow
|
2335
|
-
when ''
|
2442
|
+
when '~' # =~... after regex, maybe?
|
2443
|
+
last=last_operative_token
|
2444
|
+
|
2445
|
+
if @rubyversion>=1.9 and StringToken===last and last.lvars
|
2446
|
+
#ruby delays adding lvars from regexps to known lvars table
|
2447
|
+
#for several tokens in some cases. not sure why or if on purpose
|
2448
|
+
#i'm just going to add them right away
|
2449
|
+
localvars.concat last.lvars
|
2450
|
+
end
|
2451
|
+
when '' #plain assignment: record local variable definitions
|
2336
2452
|
last_context_not_implicit.lhs=false
|
2337
2453
|
@moretokens.push( *ignored_tokens(true).map{|x|
|
2338
2454
|
NewlineToken===x ? EscNlToken.new(@filename,@linenum,x.ident,x.offset) : x
|
@@ -2340,7 +2456,7 @@ end
|
|
2340
2456
|
@parsestack.push AssignmentRhsContext.new(@linenum)
|
2341
2457
|
if eat_next_if ?*
|
2342
2458
|
tok=OperatorToken.new('*', input_position-1)
|
2343
|
-
tok.unary
|
2459
|
+
tok.tag=:unary
|
2344
2460
|
@moretokens.push tok
|
2345
2461
|
WHSPLF[nextchar.chr] or
|
2346
2462
|
@moretokens << NoWsToken.new(input_position)
|
@@ -2450,14 +2566,15 @@ end
|
|
2450
2566
|
tokch.set_infix! unless after_nonid_op?{WHSPLF[lastchar]}
|
2451
2567
|
@parsestack.push ListImmedContext.new(ch,@linenum)
|
2452
2568
|
lasttok=last_operative_token
|
2453
|
-
#could be: lasttok
|
2454
|
-
if (VarNameToken===lasttok or ImplicitParamListEndToken===lasttok or
|
2569
|
+
#could be: lasttok===/^#@@LETTER/o
|
2570
|
+
if (VarNameToken===lasttok or ImplicitParamListEndToken===lasttok or
|
2571
|
+
MethNameToken===lasttok or lasttok===FUNCLIKE_KEYWORDS) and !WHSPCHARS[lastchar]
|
2455
2572
|
@moretokens << (tokch)
|
2456
2573
|
tokch= NoWsToken.new(input_position-1)
|
2457
2574
|
end
|
2458
2575
|
when '('
|
2459
2576
|
lasttok=last_token_maybe_implicit #last_operative_token
|
2460
|
-
#could be: lasttok
|
2577
|
+
#could be: lasttok===/^#@@LETTER/o
|
2461
2578
|
if (VarNameToken===lasttok or MethNameToken===lasttok or
|
2462
2579
|
lasttok===FUNCLIKE_KEYWORDS)
|
2463
2580
|
unless WHSPCHARS[lastchar]
|
@@ -2466,7 +2583,17 @@ end
|
|
2466
2583
|
end
|
2467
2584
|
@parsestack.push ParamListContext.new(@linenum)
|
2468
2585
|
else
|
2469
|
-
|
2586
|
+
ctx=@parsestack.last
|
2587
|
+
lasttok=last_operative_token
|
2588
|
+
maybe_def=DefContext===ctx && !ctx.in_body &&
|
2589
|
+
!(KeywordToken===lasttok && lasttok.ident=="def")
|
2590
|
+
if maybe_def or
|
2591
|
+
BlockParamListLhsContext===ctx or
|
2592
|
+
ParenContext===ctx && ctx.lhs
|
2593
|
+
@parsestack.push KnownNestedLhsParenContext.new(@linenum)
|
2594
|
+
else
|
2595
|
+
@parsestack.push ParenContext.new(@linenum)
|
2596
|
+
end
|
2470
2597
|
end
|
2471
2598
|
|
2472
2599
|
when '{'
|
@@ -2574,13 +2701,14 @@ end
|
|
2574
2701
|
@parsestack.pop
|
2575
2702
|
@moretokens.unshift AssignmentRhsListEndToken.new(input_position)
|
2576
2703
|
end
|
2577
|
-
token.comma_type=
|
2578
2704
|
case @parsestack[-1]
|
2579
|
-
when AssignmentRhsContext;
|
2580
|
-
when ParamListContext,ParamListContextNoParen;
|
2581
|
-
when ListImmedContext;
|
2705
|
+
when AssignmentRhsContext; token.tag=:rhs
|
2706
|
+
when ParamListContext,ParamListContextNoParen; #:call
|
2707
|
+
when ListImmedContext; #:array
|
2708
|
+
when BlockParamListLhsContext; #:block
|
2709
|
+
when KnownNestedLhsParenContext; #:nested
|
2582
2710
|
else
|
2583
|
-
|
2711
|
+
token.tag=:lhs if comma_in_lvalue_list?
|
2584
2712
|
end
|
2585
2713
|
@parsestack.last.see self,:comma
|
2586
2714
|
return @moretokens.shift
|