rubylexer 0.7.3 → 0.7.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -36,6 +36,7 @@ module NestedContexts
36
36
 
37
37
  def see lxr,msg; end
38
38
  def lhs=*x; end #do nothing
39
+ def wantarrow; false end
39
40
  end
40
41
 
41
42
  #contexts which expect to see commas,
@@ -48,18 +49,26 @@ module NestedContexts
48
49
  assert '{['[starter]
49
50
  super(starter, starter.tr('{[','}]') ,linenum)
50
51
  end
52
+ def wantarrow; true end
51
53
  end
52
54
 
53
55
  class ParenContext < NestedContext
54
56
  def initialize(linenum)
55
57
  super('(', ')' ,linenum)
56
58
  end
57
- attr_accessor :lhs,:saw_comma
59
+
60
+ attr_accessor :lhs
58
61
  def see(lxr,msg)
59
- @saw_comma=true if msg==:comma
62
+ @lhs=true if msg==:comma || msg==:splat
60
63
  end
61
64
  end
62
65
 
66
+ class KnownNestedLhsParenContext < ParenContext
67
+ # def lhs; true end
68
+ # def lhs=x; end
69
+ # def see(lxr,msg) end
70
+ end
71
+
63
72
  class BlockContext < NestedContext
64
73
  def initialize(linenum)
65
74
  super('{','}',linenum)
@@ -83,6 +92,7 @@ module NestedContexts
83
92
  super('(', ')',linenum)
84
93
  end
85
94
  def lhs; false end
95
+ def wantarrow; true end
86
96
  end
87
97
 
88
98
  class ImplicitLhsContext < NestedContext
@@ -107,6 +117,7 @@ module NestedContexts
107
117
  super(nil,nil,linenum)
108
118
  end
109
119
  def lhs; false end
120
+ def wantarrow; true end
110
121
  end
111
122
 
112
123
  class KWParamListContextNoParen < ParamListContextNoParen
@@ -224,17 +235,17 @@ module NestedContexts
224
235
  def see(lxr,msg)
225
236
  stack=lxr.parsestack
226
237
  case msg
227
- when :rescue:
238
+ when :rescue;
228
239
  WantsEndContext===stack.last or
229
240
  BlockContext===stack.last or
230
241
  ParenContext===stack.last or
231
242
  raise 'syntax error: rescue not expected at this time'
232
- when :arrow: #local var defined in this state
233
- when :then,:semi,:colon:
243
+ when :arrow; #local var defined in this state
244
+ when :then,:semi,:colon;
234
245
  msg=:then
235
246
  self.equal? stack.pop or raise 'syntax error: then not expected at this time'
236
247
  #pop self off owning context stack
237
- when :comma, :splat: return
248
+ when :comma, :splat; return
238
249
  else super
239
250
  end
240
251
  LEGAL_SUCCESSORS[@state].include? msg or raise "rescue syntax error: #{msg} unexpected in #@state"
@@ -0,0 +1,202 @@
1
+ class RubyLexer
2
+ class Rule
3
+ def initialize(lead,matcher,*actions)
4
+ fail unless String===lead or Fixnum===lead
5
+ @lead,@matcher,@actions=lead,matcher,actions
6
+ end
7
+ end
8
+
9
+ class Mode #set of Rules
10
+ def initialize(rules)
11
+ rules.map!{|r| Rule.new(*r) }
12
+ @rules=rules
13
+ fail if rules.size>255
14
+ rules.each_with_index{|r,i| all_chars_of(r.lead).each{|char|
15
+ @chartable[char]||=''
16
+ @chartable[char]<<i
17
+ }}
18
+ #should order of rules in @chartable[x] be tweaked?
19
+ end
20
+ end
21
+
22
+ lc_letters="a-z_" #this is correct for ascii, other charsets will need different char classes here
23
+ uc_letters="A-Z" #this is always A-Z, for all charsets
24
+ letters=lc_letters+uc_letters
25
+
26
+ num=/(0(x[_0-9a-f]+|
27
+ d[_0-9]+|
28
+ b[_01]+|
29
+ o?[_0-7]+)|
30
+ [1-9][_0-9]*
31
+ (\.[_0-9]+)?
32
+ (e[+-]?[_0-9]+)?
33
+ /ix
34
+ #this might allow leading and trailing _ where ruby does not
35
+
36
+ ws="\s\t\r\v\f"
37
+ eqbegin= /=begin(#{ws}.*)?\n((?!=end[#{ws}\n]).*\n)*=end(#{ws}.*)?$/
38
+ ews=/([#{ws}]+|\\$|\#.*$|\n#{eqbegin}?)*/
39
+ ews_no_nl=/([#{ws}]+|\\\n(#{eqbegin}\n)?)+/
40
+
41
+ var=/[#{letters}][#{letters}0-9]*/
42
+ civar=/@@?#{var}/
43
+ gs=/[^#{ws}\n\#\x0- -]|-[#{letters}0-9]?/
44
+ gvar=/$(#{var}|#{gs})/
45
+
46
+ method=/#{var}[?!]?/
47
+ method_eq=/#{var}[?!=]?/
48
+
49
+ loopers=/c|C-|m|M-/
50
+ simple_esc=/\\([^cCmMx0-7]|x[0-9a-fA-F]{1,2}|[0-7]{1,3})/
51
+ loop_esc= /(\\#{loopers}(
52
+ [^\\](?!\\#{loopers})|
53
+ #{simple_esc}(?!\\#{loopers})|
54
+ (?=\\#{loopers})
55
+ )+
56
+ /mx
57
+ esc=/#{simple_esc}|#{loop_esc}/
58
+
59
+ definately_val=/
60
+ [~!`@${(\['":]|
61
+ [%^\-+/](?!=)|
62
+ <<-?[`'"#{letters}]|
63
+ [0-9#{letters}]
64
+ /x
65
+
66
+ CommonMode=Mode.new(
67
+ [ws,/[#{ws}]+/,WhitespaceToken,:stay],
68
+ [?\\,EscNlToken,:stay],
69
+ [?#, /\#.*$/,CommentToken,:stay]
70
+ #[],
71
+ )
72
+
73
+ ValueMode=CommonMode|Mode.new(
74
+ [?$, gvar, VarNameToken],
75
+ [?@, civar, VarNameToken],
76
+ ["!~&*", /./, UnaryOpToken, ValueMode],
77
+ [?%, /%[qw][^#{lc_letter.sub'_',''}A-Z0-9]/, StringStartToken, :push_context, string_mode(?'){|ss| ss[-1]}],
78
+ [%['], /./, StringStartToken, :push_context, string_mode(?'){?'}],
79
+ [%["`/], /./, StringStartToken, :push_context, string_mode(?"){|ss| ss[-1]}],
80
+ #[?^,/./, UnaryOpToken, ValueMode],
81
+ #["&*", /./, UnaryOpToken, ValueMode], #unary
82
+ ["+-", /[+-]#{num}/, NumberToken], #numeric
83
+ ["+-", /[+-]/, UnaryOpToken, ValueMode], #unary
84
+ [?|, /./,KeywordToken, :block_params, :push_context, ValueMode], #goalpost
85
+ [?:, /:(?=['"])/, UnaryColonToken, ValueMode], #symbol
86
+ [?:, /:(#{gvar}|#{civar}|#{method_eq}|#{operator_method}|`|\[\]=?)/, SymbolToken], #symbol
87
+ [?{, /./, OperatorToken, :push_context, ValueMode], #hash lit
88
+ [?[, /./, OperatorToken, :push_context, ValueMode], #array lit
89
+ [?<, /<<-?#{var}|'[^']*'|"[^"]*"|`[^`]*`/, :here_doc, HereDocHeadToken], #here doc
90
+ [??, /\?([^\\#{ws}\n]|#{esc})/, CharToken], #char lit
91
+ ["0-9", num, NumberToken],
92
+ ["A-Z", method, :const_or_method], #use JustAfterMethodMode to figure out what to output/where to go
93
+ [lc_letters, method, :lvar_or_method],#use JustAfterMethodMode to figure out what to output/where to go
94
+ ["(",/./, KeywordToken, :push_context, ValueMode],
95
+
96
+ #indicates empty construct or trailing comma or semicolon (or nl)
97
+ [?}, /./, :pop_context, :pop_soft_scope?, :block_end?, huhToken, OpMode]
98
+ ['])', /./, :pop_context, huhToken, OpMode]
99
+
100
+ [?\\, "\\\n", :escnl, WhitespaceToken, ValueMode]
101
+ [?\n, :escnl, WhitespaceToken, ValueMode]
102
+ [".,", :error],
103
+ [:begin, :maybe_rescue_etc, :push_context,ValueMode]
104
+ [:def, :hard_scope, :maybe_rescue_etc, :push_context, :nasty_custom_parser_here, ValueMode]
105
+ [/if|unless/, :maybe_then, :maybe_else, :push_context,ValueMode]
106
+ [/while|until/, :maybe_do, :push_context,ValueMode]
107
+ [:for, :expect_in, :maybe_do, :push_context,ValueMode]
108
+ [:class, :push_hard_scope, :maybe_rescue_etc, :maybe_lt, :push_context,ValueMode]
109
+ [:module, :push_hard_scope, :maybe_rescue_etc, :maybe_include, :push_context,ValueMode]
110
+ [:end, :pop_hard_scope?, :pop_context, OpMode]
111
+ [/return|next|break/] #these are special in subtle ways I forget....
112
+ [huh FUNCLIKE_KEYWORDS, huh]
113
+ [huh VARLIKE_KEYWORDS, huh]
114
+ [:BEGIN, huh]
115
+ [:END, huh]
116
+ [:case]
117
+ [:when]
118
+ [:defined?]
119
+
120
+
121
+ {:others=>:error,
122
+ :default=>OpMode}
123
+ )
124
+
125
+ OpMode=CommonMode|Mode.new(
126
+ [";,", /./, OperatorToken]
127
+ ["%/^", /.=?/, :could_be_assign, OperatorToken]
128
+ ["&*+-", /(.)\1?=?/, :could_be_assign, OperatorToken],
129
+ [?|, /\|\|?=?/, :could_be_assign, :could_be_end_goalpost, OperatorToken],
130
+ [?<, /<<=?/, :could_be_assign, OperatorToken],
131
+ [?>, />>=?/, :could_be_assign, OperatorToken],
132
+ [?<, /<=?>?/, OperatorToken], #also gets <>
133
+ [?>, />=?/, OperatorToken],
134
+ [?=, /=(~|>|=?=?)/, :could_be_assign, OperatorToken]
135
+ ["0-9",huh,:error]
136
+ [letters,huh,:error]
137
+ [?:, /::(?=#{ews}[#{uc_letter}][#{letter}]*(?![?`~@${\(]|!([^=]|$)#{ews_no_nl}#{definately_val}))/, OperatorToken]
138
+ [?:, /::/, OperatorToken, MethodNameMode]
139
+ #constant if capitalized and not followed by (implicit or explicit) param list and not ending in ? or ! , else method
140
+ [?:, /:/, OperatorToken]
141
+ [?., /\.\.\.?/, OperatorToken]
142
+ [?., /\.(?!\.)/, OperatorToken, MethodNameMode]
143
+ [?{, /./, :push_context, :push_soft_scope, :block_start, :maybe_goalposts, huhToken]
144
+ [?}, /./, :pop_context, :pop_soft_scope?, :block_end?, huhToken, OpMode]
145
+ ['])', /./, :pop_context, huhToken, OpMode]
146
+ [/and|or|if|unless|while|until|rescue/, OperatorToken]
147
+ [:do, :must_be_after_rparen, :push_soft_scope, :maybe_goalposts, KeywordToken]
148
+ [:do, :if_allowed, KeywordToken]
149
+ [:end, :pop_hard_scope?, :pop_context, OpMode]
150
+ {:others=>:error,
151
+ :default=>ValueMode}
152
+ )
153
+ MethodNameMode=CommonMode|Mode.new(
154
+ [letters, method, MethodNameToken],
155
+ [?`,/./, huh, MethodNameToken],
156
+ [huh, operator_method, MethodNameToken]
157
+ [?[, /\[\]=?/, MethodNameToken]
158
+ #[?(] #in ruby 1.9
159
+ {:default=>JustAfterMethodMode}
160
+ )
161
+
162
+ JustAfterMethodMode=OpMode|Mode.new(
163
+ [ws, /[#{ws}]+/, WhitespaceToken, AfterMethodMode],
164
+ #[?\\] #hrm?
165
+ [?(,huh,:push_context, ParamListStartToken, ValueMode]
166
+ [?{,huh,:push_context, :push_soft_scope, :block_start, huhToken, ValueMode]
167
+ [huh nil, /(?= [^#{ws}({] )/x, :no_token, OpMode]
168
+ )
169
+ AfterMethodMode=Mode.new(
170
+ #these indicate implicit parens unless followed by ws
171
+ [?/, /./, StringStartToken, :iparen, :push_context, string_mode(?"){?/}],
172
+ ['+-*&',huh, :iparen, ValueMode]
173
+ #[?^]
174
+ [?%,huh,]
175
+ [?`,huh,]
176
+
177
+ [?:,huh,] #tricky... operator in ternary context, else always symbol
178
+
179
+ #these indicate implicit parens always
180
+ [?[, //, :iparen, ValueMode]
181
+ [lc_letters, //, :iparen, OpMode]
182
+ ["$@A-Z", //, :iparen, OpMode]
183
+ ["0-9", //, :iparen, OpMode]
184
+ [%[~!], //, :iparen, ValueMode]
185
+
186
+
187
+ [?<, /(?=<<-?['"#{lc_letters}])/i, :iparen, OpMode]
188
+ [?{, //, :iparens2, OpMode]
189
+ [?=, //, :iparens2, OpMode]
190
+ [?;, //, :iparens2, OpMode]
191
+
192
+ [?(] #tricky, need to look ahead for commas
193
+
194
+ [")]}",/./,:iparens2, OpMode]
195
+ []
196
+ {:default=>huh}
197
+ )
198
+
199
+ AfterNewline=Mode.new
200
+ StringInteriorMode=Mode.new
201
+
202
+ end
@@ -1,4 +1,4 @@
1
- =begin legal crap
1
+ =begin
2
2
  rubylexer - a ruby lexer written in ruby
3
3
  Copyright (C) 2004,2005,2008 Caleb Clausen
4
4
 
@@ -53,7 +53,7 @@ class RubyLexer
53
53
  WHSPLF=WHSP+"\n"
54
54
  #maybe \r should be in WHSPLF instead
55
55
 
56
- LEGALCHARS=/[ -~#{WHSPLF}]/
56
+ LEGALCHARS=/[ -~#{WHSPLF}\x80-\xFF]/
57
57
 
58
58
  PAIRS={ '{'=>'}', '['=>']', '('=>')', '<'=>'>'}
59
59
 
@@ -139,7 +139,50 @@ private
139
139
 
140
140
  #-----------------------------------
141
141
  def regex(ch=nil)
142
- result=RenderExactlyStringToken.new('/').append_token str=double_quote("/")
142
+ result=RenderExactlyStringToken.new('/').append_token double_quote("/")
143
+ if @rubyversion>=1.9
144
+ named_brs=[]
145
+ if result.elems.size==1 and String===result.elems.first
146
+ index=0
147
+ huh
148
+ while index=elem.index(/#{EVEN_BS_S}( \(\?[<'] | \(\?\# | \[ )/xo,index)
149
+ huh
150
+ case alt
151
+ when "(?<"; huh
152
+ index=elem.index(/\G...(#{LCLETTER}#{LETTER_DIGIT}+)>/o,index)
153
+ index or huh
154
+ index+=$1.size+4
155
+ named_brs<<$1
156
+ when "(?'"; huh
157
+ index=elem.index(/\G...(#{LCLETTER}#{LETTER_DIGIT}+)'/o,index)
158
+ index or huh
159
+ index+=$1.size+4
160
+ named_brs<<$1
161
+ when "(?#"; huh
162
+ index+=3
163
+ index=elem.index(/#{EVEN_BS_S}\)/,index)
164
+ index or huh
165
+ index+=1
166
+ when "["; huh
167
+ index+=1
168
+ paren_ctr=1
169
+ loop do
170
+ index=elem.index(/#{EVEN_BS_S}(&&\[\^|\])/o,index)
171
+ index or huh
172
+ index+=$&.size
173
+ unless $1[-1]==?]
174
+ paren_ctr+=1
175
+ else
176
+ paren_ctr-=1
177
+ break if paren_ctr==0
178
+ end
179
+ end
180
+
181
+ end
182
+ end
183
+ end
184
+ result.lvars= named_brs unless named_brs.empty?
185
+ end
143
186
  result.open=result.close="/"
144
187
  result.line=@linenum
145
188
  return result
@@ -175,7 +218,7 @@ private
175
218
  when 'x' then '`' #exec it
176
219
  when 's' then strlex=:single_quote; "'" #symbol
177
220
  #other letters, nums are illegal here
178
- when /^[a-z0-9]$/oi
221
+ when /^#{LCLETTER().gsub('_','')}$/o
179
222
  error= "unrecognized %string type: "+ch; '"'
180
223
  when ''
181
224
  result= lexerror( StringToken.new('', oldpos), "unexpected eof in %string")
@@ -236,10 +279,9 @@ end
236
279
  ($|[^\\])(c|[CM]-)|
237
280
  ($|[^CM])-
238
281
  )
239
- (\\(?:c|[CM]-)?\\)*
282
+ (\\(?:c|[CM]-)?){2}*
240
283
  /x
241
284
  ILLEGAL_ESCAPED=/#{EVEN_BS_S}(\\([CM][^-]|x[^a-fA-F0-9]))/o #whaddaya do with this?
242
- ILLEGAL_CRUNCH=/#{EVEN_BS_S}(\#@[^a-zA-Z_]|\#$[^a-zA-Z_0-9\-!@&+`'=~\/\\,.;<>*"$?:;])/o #and this?
243
285
  def all_quote(nester, type, delimiter, bs_handler=nil)
244
286
  if FASTER_STRING_ESCAPES
245
287
  #string must start with nester
@@ -354,8 +396,18 @@ if FASTER_STRING_ESCAPES
354
396
  break
355
397
  end
356
398
 
357
- #shouldn't tolerate ILLEGAL_ESCAPED in str (unless single quotish)....
358
- lexerror str, "illegal escape sequence" if !("['"[type]) and ILLEGAL_ESCAPED===b
399
+
400
+ unless ("['"[type])
401
+ @@ILLEGAL_CRUNCH||=/
402
+ #{EVEN_BS_S}(?:
403
+ \#@(?:(?!#{LETTER()})|[^@]) |
404
+ \#$(?:(?!#{LETTER_DIGIT()})|[^\-!@&+`'=~\/\\,.;<>*"$?:;])
405
+ )
406
+ /ox #and this?
407
+
408
+ #shouldn't tolerate ILLEGAL_ESCAPED in str (unless single quotish)....
409
+ lexerror str, "illegal escape sequence" if /#{@@ILLEGAL_CRUNCH}|#{ILLEGAL_ESCAPED}/===b
410
+ end
359
411
 
360
412
  str.append b
361
413
  }
@@ -651,7 +703,7 @@ end
651
703
  def ruby_code(ch='{')
652
704
  assert ch[/^[{(@$]$/]
653
705
  klass= RubyLexer===self ? self.class : RubyLexer
654
- rl=klass.new(@filename,@file,@linenum,offset_adjust())
706
+ rl=klass.new(@filename,@file,@linenum,offset_adjust(),:rubyversion=>@rubyversion)
655
707
  rl.extend RecursiveRubyLexer
656
708
  rl.enable_macros! if @enable_macro
657
709
  rl.in_def=true if inside_method_def?
@@ -1 +1,5 @@
1
+ def (z,*a=0).b; end
2
+ def (z,*a=0).b; a %(1) end
3
+ def (z,*a=0).b; b %(1) end
4
+ def (z,*a=0).b; z %(1) end
1
5
  "#{
@@ -651,6 +651,8 @@ x{
651
651
  a ?b:c
652
652
  p(a ? b:c)
653
653
  p(a ?b:c)
654
+ p(a ?:r:c)
655
+ p(a ? :r:c)
654
656
  }
655
657
 
656
658
  x{
@@ -6,6 +6,11 @@ module TestCases
6
6
  STANZAS.each{|stanza| stanza<<"\n" }
7
7
  ILLEGAL_ONELINERS=IO.readlines(rldir+'/rubylexer/test/illegal_oneliners.rb').map{|x| x.chomp}.grep(/\A\s*[^#\s\n]/).reverse
8
8
  ILLEGAL_STANZAS=IO.read(rldir+'/rubylexer/test/illegal_stanzas.rb').split("\n\n").grep(/./).reverse
9
- TESTCASES=ONELINERS+STANZAS
9
+
10
+ datadir=$:.find{|dir| File.exist? dir+'/../test/data/p.rb' }
11
+ FILENAMES=Dir[datadir+'/../test/data/*'].reject{|fn| File.directory? fn}
12
+ FILES=FILENAMES.map{|fn| File.read fn }
13
+
14
+ TESTCASES=ONELINERS+STANZAS+FILES
10
15
  ILLEGAL_TESTCASES=ILLEGAL_ONELINERS+ILLEGAL_STANZAS
11
16
  end
@@ -38,6 +38,8 @@ class Token
38
38
  def error; end
39
39
 
40
40
  def has_no_block?; false end
41
+
42
+ attr_accessor :tag
41
43
  end
42
44
 
43
45
  #-------------------------
@@ -83,7 +85,7 @@ class KeywordToken < WToken #also some operators
83
85
  self===RubyLexer::BEGINWORDS and @has_end||=nil
84
86
  end
85
87
 
86
- attr_accessor :comma_type, :ternary, :grouping
88
+ attr_accessor :ternary, :grouping
87
89
 
88
90
  def has_no_block!
89
91
  @has_no_block=true
@@ -204,6 +206,7 @@ class StringToken < Token
204
206
 
205
207
  attr_accessor :modifiers #for regex only
206
208
  attr_accessor :elems
209
+ attr_accessor :startline
207
210
  attr_accessor :line #line on which the string ENDS
208
211
  attr_accessor :bs_handler
209
212
 
@@ -1,3 +1,3 @@
1
1
  class RubyLexer
2
- VERSION='0.7.3'
2
+ VERSION='0.7.4'
3
3
  end