rubylexer 0.7.3 → 0.7.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -36,6 +36,7 @@ module NestedContexts
36
36
 
37
37
  def see lxr,msg; end
38
38
  def lhs=*x; end #do nothing
39
+ def wantarrow; false end #always false here. NOTE(review): presumably tells the lexer whether an arrow (=>) is expected in this context -- confirm against callers
39
40
  end
40
41
 
41
42
  #contexts which expect to see commas,
@@ -48,18 +49,26 @@ module NestedContexts
48
49
  assert '{['[starter]
49
50
  super(starter, starter.tr('{[','}]') ,linenum)
50
51
  end
52
+ def wantarrow; true end #always true for this context. NOTE(review): presumably tells the lexer whether an arrow (=>) is expected -- confirm against callers
51
53
  end
52
54
 
53
55
  class ParenContext < NestedContext #nested context for a parenthesized region
54
56
  def initialize(linenum) #hands the fixed pair ( and ) plus linenum to NestedContext
55
57
  super('(', ')' ,linenum)
56
58
  end
57
- attr_accessor :lhs,:saw_comma
59
+
60
+ attr_accessor :lhs #set to true by see below; also writable from outside
58
61
  def see(lxr,msg) #lxr is not used here; only the message matters
59
- @saw_comma=true if msg==:comma
62
+ @lhs=true if msg==:comma || msg==:splat #a comma or splat marks this context as an lhs
60
63
  end
61
64
  end
62
65
 
66
+ class KnownNestedLhsParenContext < ParenContext #adds nothing to ParenContext as written: every override below is commented out
67
+ # def lhs; true end
68
+ # def lhs=x; end
69
+ # def see(lxr,msg) end
70
+ end
71
+
63
72
  class BlockContext < NestedContext
64
73
  def initialize(linenum)
65
74
  super('{','}',linenum)
@@ -83,6 +92,7 @@ module NestedContexts
83
92
  super('(', ')',linenum)
84
93
  end
85
94
  def lhs; false end
95
+ def wantarrow; true end #always true for this context. NOTE(review): presumably tells the lexer whether an arrow (=>) is expected -- confirm against callers
86
96
  end
87
97
 
88
98
  class ImplicitLhsContext < NestedContext
@@ -107,6 +117,7 @@ module NestedContexts
107
117
  super(nil,nil,linenum)
108
118
  end
109
119
  def lhs; false end
120
+ def wantarrow; true end #always true for this context. NOTE(review): presumably tells the lexer whether an arrow (=>) is expected -- confirm against callers
110
121
  end
111
122
 
112
123
  class KWParamListContextNoParen < ParamListContextNoParen
@@ -224,17 +235,17 @@ module NestedContexts
224
235
  def see(lxr,msg)
225
236
  stack=lxr.parsestack
226
237
  case msg
227
- when :rescue:
238
+ when :rescue;
228
239
  WantsEndContext===stack.last or
229
240
  BlockContext===stack.last or
230
241
  ParenContext===stack.last or
231
242
  raise 'syntax error: rescue not expected at this time'
232
- when :arrow: #local var defined in this state
233
- when :then,:semi,:colon:
243
+ when :arrow; #local var defined in this state
244
+ when :then,:semi,:colon;
234
245
  msg=:then
235
246
  self.equal? stack.pop or raise 'syntax error: then not expected at this time'
236
247
  #pop self off owning context stack
237
- when :comma, :splat: return
248
+ when :comma, :splat; return
238
249
  else super
239
250
  end
240
251
  LEGAL_SUCCESSORS[@state].include? msg or raise "rescue syntax error: #{msg} unexpected in #@state"
@@ -0,0 +1,202 @@
1
+ class RubyLexer
2
+ class Rule #one entry of a Mode: a lead, a matcher, and any further arguments kept as actions
3
+ def initialize(lead,matcher,*actions) #stores all three as given; only lead is checked
4
+ fail unless String===lead or Fixnum===lead #NOTE(review): the rule tables later in this file also pass Symbol and Regexp leads (:begin, /if|unless/), which this rejects; Fixnum was also folded into Integer in Ruby 2.4 and removed in 3.2
5
+ @lead,@matcher,@actions=lead,matcher,actions
6
+ end
7
+ end
8
+
9
+ class Mode #set of Rules, with @chartable meant to map a character to the numbers of the rules it can start
10
+ def initialize(rules) #rules: array of argument lists, each splatted into Rule.new
11
+ rules.map!{|r| Rule.new(*r) } #in place: the caller's array ends up holding Rule objects
12
+ @rules=rules
13
+ fail if rules.size>255 #NOTE(review): presumably so that a rule number fits in one byte of the @chartable strings -- confirm
14
+ rules.each_with_index{|r,i| all_chars_of(r.lead).each{|char| #NOTE(review): all_chars_of is not defined in the visible file, and Rule declares no reader for lead
15
+ @chartable[char]||='' #NOTE(review): nothing in this class assigns @chartable first, so it is nil here and indexing it would raise NoMethodError
16
+ @chartable[char]<<i #append rule number i to the string kept for this char
17
+ }}
18
+ #should order of rules in @chartable[x] be tweaked?
19
+ end
20
+ end
21
+
22
+ lc_letters="a-z_" #this is correct for ascii, other charsets will need different char classes here
23
+ uc_letters="A-Z" #this is always A-Z, for all charsets
24
+ letters=lc_letters+uc_letters #both ranges in one string, for interpolation inside a [] char class
25
+
26
+ num=/(0(x[_0-9a-f]+|
27
+ d[_0-9]+|
28
+ b[_01]+|
29
+ o?[_0-7]+)|
30
+ [1-9][_0-9]*
31
+ (\.[_0-9]+)?
32
+ (e[+-]?[_0-9]+)?
33
+ /ix #NOTE(review): the very first ( of this pattern appears never to be closed, which would make the literal a syntax error -- confirm
34
+ #this might allow leading and trailing _ where ruby does not
35
+
36
+ ws="\s\t\r\v\f" #in a double-quoted string \s is a plain space, so this is space, tab, cr, vtab, formfeed (no newline)
37
+ eqbegin= /=begin(#{ws}.*)?\n((?!=end[#{ws}\n]).*\n)*=end(#{ws}.*)?$/ #NOTE(review): ws is interpolated outside of [] in the two (ws.*) groups, so they match all five chars in sequence rather than any one -- presumably a char class was meant
38
+ ews=/([#{ws}]+|\\$|\#.*$|\n#{eqbegin}?)*/
39
+ ews_no_nl=/([#{ws}]+|\\\n(#{eqbegin}\n)?)+/
40
+
41
+ var=/[#{letters}][#{letters}0-9]*/
42
+ civar=/@@?#{var}/
43
+ gs=/[^#{ws}\n\#\x0- -]|-[#{letters}0-9]?/
44
+ gvar=/$(#{var}|#{gs})/ #NOTE(review): the leading $ is unescaped, so it is an end-of-line anchor rather than a literal dollar sign -- presumably \$ was meant
45
+
46
+ method=/#{var}[?!]?/
47
+ method_eq=/#{var}[?!=]?/
48
+
49
+ loopers=/c|C-|m|M-/
50
+ simple_esc=/\\([^cCmMx0-7]|x[0-9a-fA-F]{1,2}|[0-7]{1,3})/ #backslash then: one char other than c C m M x or an octal digit, or x with 1-2 hex digits, or 1-3 octal digits
51
+ loop_esc= /(\\#{loopers}(
52
+ [^\\](?!\\#{loopers})|
53
+ #{simple_esc}(?!\\#{loopers})|
54
+ (?=\\#{loopers})
55
+ )+
56
+ /mx #NOTE(review): the first ( on the loop_esc= line appears never to be closed before this terminator -- confirm
57
+ esc=/#{simple_esc}|#{loop_esc}/ #either kind of escape
58
+
59
+ definately_val=/
60
+ [~!`@${(\['":]|
61
+ [%^\-+/](?!=)|
62
+ <<-?[`'"#{letters}]|
63
+ [0-9#{letters}]
64
+ /x
65
+
66
+ CommonMode=Mode.new( #rows for whitespace, backslash and comment; or-ed into ValueMode, OpMode and MethodNameMode in this file. NOTE(review): several arrays are passed here but Mode#initialize takes a single rules argument
67
+ [ws,/[#{ws}]+/,WhitespaceToken,:stay],
68
+ [?\\,EscNlToken,:stay],
69
+ [?#, /\#.*$/,CommentToken,:stay]
70
+ #[],
71
+ )
72
+
73
+ ValueMode=CommonMode|Mode.new( #NOTE(review): unfinished draft table -- rows after the first blank line lack separating commas, huh is a placeholder, ValueMode and OpMode are referenced before they are assigned, lc_letter and operator_method are not defined in the visible file, and Mode defines no | operator
74
+ [?$, gvar, VarNameToken],
75
+ [?@, civar, VarNameToken],
76
+ ["!~&*", /./, UnaryOpToken, ValueMode],
77
+ [?%, /%[qw][^#{lc_letter.sub'_',''}A-Z0-9]/, StringStartToken, :push_context, string_mode(?'){|ss| ss[-1]}],
78
+ [%['], /./, StringStartToken, :push_context, string_mode(?'){?'}],
79
+ [%["`/], /./, StringStartToken, :push_context, string_mode(?"){|ss| ss[-1]}],
80
+ #[?^,/./, UnaryOpToken, ValueMode],
81
+ #["&*", /./, UnaryOpToken, ValueMode], #unary
82
+ ["+-", /[+-]#{num}/, NumberToken], #numeric
83
+ ["+-", /[+-]/, UnaryOpToken, ValueMode], #unary
84
+ [?|, /./,KeywordToken, :block_params, :push_context, ValueMode], #goalpost
85
+ [?:, /:(?=['"])/, UnaryColonToken, ValueMode], #symbol
86
+ [?:, /:(#{gvar}|#{civar}|#{method_eq}|#{operator_method}|`|\[\]=?)/, SymbolToken], #symbol
87
+ [?{, /./, OperatorToken, :push_context, ValueMode], #hash lit
88
+ [?[, /./, OperatorToken, :push_context, ValueMode], #array lit
89
+ [?<, /<<-?#{var}|'[^']*'|"[^"]*"|`[^`]*`/, :here_doc, HereDocHeadToken], #here doc
90
+ [??, /\?([^\\#{ws}\n]|#{esc})/, CharToken], #char lit
91
+ ["0-9", num, NumberToken],
92
+ ["A-Z", method, :const_or_method], #use JustAfterMethodMode to figure out what to output/where to go
93
+ [lc_letters, method, :lvar_or_method],#use JustAfterMethodMode to figure out what to output/where to go
94
+ ["(",/./, KeywordToken, :push_context, ValueMode],
95
+
96
+ #indicates empty construct or trailing comma or semicolon (or nl)
97
+ [?}, /./, :pop_context, :pop_soft_scope?, :block_end?, huhToken, OpMode]
98
+ ['])', /./, :pop_context, huhToken, OpMode]
99
+
100
+ [?\\, "\\\n", :escnl, WhitespaceToken, ValueMode]
101
+ [?\n, :escnl, WhitespaceToken, ValueMode]
102
+ [".,", :error],
103
+ [:begin, :maybe_rescue_etc, :push_context,ValueMode]
104
+ [:def, :hard_scope, :maybe_rescue_etc, :push_context, :nasty_custom_parser_here, ValueMode]
105
+ [/if|unless/, :maybe_then, :maybe_else, :push_context,ValueMode]
106
+ [/while|until/, :maybe_do, :push_context,ValueMode]
107
+ [:for, :expect_in, :maybe_do, :push_context,ValueMode]
108
+ [:class, :push_hard_scope, :maybe_rescue_etc, :maybe_lt, :push_context,ValueMode]
109
+ [:module, :push_hard_scope, :maybe_rescue_etc, :maybe_include, :push_context,ValueMode]
110
+ [:end, :pop_hard_scope?, :pop_context, OpMode]
111
+ [/return|next|break/] #these are special in subtle ways I forget....
112
+ [huh FUNCLIKE_KEYWORDS, huh]
113
+ [huh VARLIKE_KEYWORDS, huh]
114
+ [:BEGIN, huh]
115
+ [:END, huh]
116
+ [:case]
117
+ [:when]
118
+ [:defined?]
119
+
120
+
121
+ {:others=>:error,
122
+ :default=>OpMode}
123
+ )
124
+
125
+ OpMode=CommonMode|Mode.new( #NOTE(review): unfinished draft table -- most rows lack separating commas, huh is a placeholder, uc_letter and letter in the :: row are not defined in the visible file, and Mode defines no | operator
126
+ [";,", /./, OperatorToken]
127
+ ["%/^", /.=?/, :could_be_assign, OperatorToken]
128
+ ["&*+-", /(.)\1?=?/, :could_be_assign, OperatorToken],
129
+ [?|, /\|\|?=?/, :could_be_assign, :could_be_end_goalpost, OperatorToken],
130
+ [?<, /<<=?/, :could_be_assign, OperatorToken],
131
+ [?>, />>=?/, :could_be_assign, OperatorToken],
132
+ [?<, /<=?>?/, OperatorToken], #also gets <>
133
+ [?>, />=?/, OperatorToken],
134
+ [?=, /=(~|>|=?=?)/, :could_be_assign, OperatorToken]
135
+ ["0-9",huh,:error]
136
+ [letters,huh,:error]
137
+ [?:, /::(?=#{ews}[#{uc_letter}][#{letter}]*(?![?`~@${\(]|!([^=]|$)#{ews_no_nl}#{definately_val}))/, OperatorToken]
138
+ [?:, /::/, OperatorToken, MethodNameMode]
139
+ #constant if capitalized and not followed by (implicit or explicit) param list and not ending in ? or ! , else method
140
+ [?:, /:/, OperatorToken]
141
+ [?., /\.\.\.?/, OperatorToken]
142
+ [?., /\.(?!\.)/, OperatorToken, MethodNameMode]
143
+ [?{, /./, :push_context, :push_soft_scope, :block_start, :maybe_goalposts, huhToken]
144
+ [?}, /./, :pop_context, :pop_soft_scope?, :block_end?, huhToken, OpMode]
145
+ ['])', /./, :pop_context, huhToken, OpMode]
146
+ [/and|or|if|unless|while|until|rescue/, OperatorToken]
147
+ [:do, :must_be_after_rparen, :push_soft_scope, :maybe_goalposts, KeywordToken]
148
+ [:do, :if_allowed, KeywordToken]
149
+ [:end, :pop_hard_scope?, :pop_context, OpMode]
150
+ {:others=>:error,
151
+ :default=>ValueMode}
152
+ )
153
+ MethodNameMode=CommonMode|Mode.new( #every row yields a MethodNameToken. NOTE(review): draft -- rows lack separating commas, huh is a placeholder, operator_method is not defined in the visible file, JustAfterMethodMode is assigned only later
154
+ [letters, method, MethodNameToken],
155
+ [?`,/./, huh, MethodNameToken],
156
+ [huh, operator_method, MethodNameToken]
157
+ [?[, /\[\]=?/, MethodNameToken]
158
+ #[?(] #in ruby 1.9
159
+ {:default=>JustAfterMethodMode}
160
+ )
161
+
162
+ JustAfterMethodMode=OpMode|Mode.new( #built on OpMode rather than CommonMode. NOTE(review): draft -- rows lack separating commas, huh is a placeholder, AfterMethodMode is assigned only later
163
+ [ws, /[#{ws}]+/, WhitespaceToken, AfterMethodMode],
164
+ #[?\\] #hrm?
165
+ [?(,huh,:push_context, ParamListStartToken, ValueMode]
166
+ [?{,huh,:push_context, :push_soft_scope, :block_start, huhToken, ValueMode]
167
+ [huh nil, /(?= [^#{ws}({] )/x, :no_token, OpMode]
168
+ )
169
+ AfterMethodMode=Mode.new( #rows that tag the next character with :iparen or :iparens2. NOTE(review): draft -- several rows are incomplete, commas between rows are missing, huh is a placeholder
170
+ #these indicate implicit parens unless followed by ws
171
+ [?/, /./, StringStartToken, :iparen, :push_context, string_mode(?"){?/}],
172
+ ['+-*&',huh, :iparen, ValueMode]
173
+ #[?^]
174
+ [?%,huh,]
175
+ [?`,huh,]
176
+
177
+ [?:,huh,] #tricky... operator in ternary context, else always symbol
178
+
179
+ #these indicate implicit parens always
180
+ [?[, //, :iparen, ValueMode]
181
+ [lc_letters, //, :iparen, OpMode]
182
+ ["$@A-Z", //, :iparen, OpMode]
183
+ ["0-9", //, :iparen, OpMode]
184
+ [%[~!], //, :iparen, ValueMode]
185
+
186
+
187
+ [?<, /(?=<<-?['"#{lc_letters}])/i, :iparen, OpMode]
188
+ [?{, //, :iparens2, OpMode]
189
+ [?=, //, :iparens2, OpMode]
190
+ [?;, //, :iparens2, OpMode]
191
+
192
+ [?(] #tricky, need to look ahead for commas
193
+
194
+ [")]}",/./,:iparens2, OpMode]
195
+ []
196
+ {:default=>huh}
197
+ )
198
+
199
+ AfterNewline=Mode.new #NOTE(review): Mode#initialize in this file requires a rules argument, so a bare Mode.new would raise ArgumentError
200
+ StringInteriorMode=Mode.new #same bare call, same caveat
201
+
202
+ end
@@ -1,4 +1,4 @@
1
- =begin legal crap
1
+ =begin
2
2
  rubylexer - a ruby lexer written in ruby
3
3
  Copyright (C) 2004,2005,2008 Caleb Clausen
4
4
 
@@ -53,7 +53,7 @@ class RubyLexer
53
53
  WHSPLF=WHSP+"\n"
54
54
  #maybe \r should be in WHSPLF instead
55
55
 
56
- LEGALCHARS=/[ -~#{WHSPLF}]/
56
+ LEGALCHARS=/[ -~#{WHSPLF}\x80-\xFF]/ #space through ~, the WHSPLF chars, and bytes \x80-\xFF
57
57
 
58
58
  PAIRS={ '{'=>'}', '['=>']', '('=>')', '<'=>'>'}
59
59
 
@@ -139,7 +139,50 @@ private
139
139
 
140
140
  #-----------------------------------
141
141
  def regex(ch=nil)
142
- result=RenderExactlyStringToken.new('/').append_token str=double_quote("/")
142
+ result=RenderExactlyStringToken.new('/').append_token double_quote("/")
143
+ if @rubyversion>=1.9
144
+ named_brs=[]
145
+ if result.elems.size==1 and String===result.elems.first
146
+ index=0
147
+ huh
148
+ while index=elem.index(/#{EVEN_BS_S}( \(\?[<'] | \(\?\# | \[ )/xo,index)
149
+ huh
150
+ case alt
151
+ when "(?<"; huh
152
+ index=elem.index(/\G...(#{LCLETTER}#{LETTER_DIGIT}+)>/o,index)
153
+ index or huh
154
+ index+=$1.size+4
155
+ named_brs<<$1
156
+ when "(?'"; huh
157
+ index=elem.index(/\G...(#{LCLETTER}#{LETTER_DIGIT}+)'/o,index)
158
+ index or huh
159
+ index+=$1.size+4
160
+ named_brs<<$1
161
+ when "(?#"; huh
162
+ index+=3
163
+ index=elem.index(/#{EVEN_BS_S}\)/,index)
164
+ index or huh
165
+ index+=1
166
+ when "["; huh
167
+ index+=1
168
+ paren_ctr=1
169
+ loop do
170
+ index=elem.index(/#{EVEN_BS_S}(&&\[\^|\])/o,index)
171
+ index or huh
172
+ index+=$&.size
173
+ unless $1[-1]==?]
174
+ paren_ctr+=1
175
+ else
176
+ paren_ctr-=1
177
+ break if paren_ctr==0
178
+ end
179
+ end
180
+
181
+ end
182
+ end
183
+ end
184
+ result.lvars= named_brs unless named_brs.empty?
185
+ end
143
186
  result.open=result.close="/"
144
187
  result.line=@linenum
145
188
  return result
@@ -175,7 +218,7 @@ private
175
218
  when 'x' then '`' #exec it
176
219
  when 's' then strlex=:single_quote; "'" #symbol
177
220
  #other letters, nums are illegal here
178
- when /^[a-z0-9]$/oi
221
+ when /^#{LCLETTER().gsub('_','')}$/o
179
222
  error= "unrecognized %string type: "+ch; '"'
180
223
  when ''
181
224
  result= lexerror( StringToken.new('', oldpos), "unexpected eof in %string")
@@ -236,10 +279,9 @@ end
236
279
  ($|[^\\])(c|[CM]-)|
237
280
  ($|[^CM])-
238
281
  )
239
- (\\(?:c|[CM]-)?\\)*
282
+ (\\(?:c|[CM]-)?){2}*
240
283
  /x
241
284
  ILLEGAL_ESCAPED=/#{EVEN_BS_S}(\\([CM][^-]|x[^a-fA-F0-9]))/o #whaddaya do with this?
242
- ILLEGAL_CRUNCH=/#{EVEN_BS_S}(\#@[^a-zA-Z_]|\#$[^a-zA-Z_0-9\-!@&+`'=~\/\\,.;<>*"$?:;])/o #and this?
243
285
  def all_quote(nester, type, delimiter, bs_handler=nil)
244
286
  if FASTER_STRING_ESCAPES
245
287
  #string must start with nester
@@ -354,8 +396,18 @@ if FASTER_STRING_ESCAPES
354
396
  break
355
397
  end
356
398
 
357
- #shouldn't tolerate ILLEGAL_ESCAPED in str (unless single quotish)....
358
- lexerror str, "illegal escape sequence" if !("['"[type]) and ILLEGAL_ESCAPED===b
399
+
400
+ unless ("['"[type])
401
+ @@ILLEGAL_CRUNCH||=/
402
+ #{EVEN_BS_S}(?:
403
+ \#@(?:(?!#{LETTER()})|[^@]) |
404
+ \#$(?:(?!#{LETTER_DIGIT()})|[^\-!@&+`'=~\/\\,.;<>*"$?:;])
405
+ )
406
+ /ox #and this?
407
+
408
+ #shouldn't tolerate ILLEGAL_ESCAPED in str (unless single quotish)....
409
+ lexerror str, "illegal escape sequence" if /#{@@ILLEGAL_CRUNCH}|#{ILLEGAL_ESCAPED}/===b
410
+ end
359
411
 
360
412
  str.append b
361
413
  }
@@ -651,7 +703,7 @@ end
651
703
  def ruby_code(ch='{')
652
704
  assert ch[/^[{(@$]$/]
653
705
  klass= RubyLexer===self ? self.class : RubyLexer
654
- rl=klass.new(@filename,@file,@linenum,offset_adjust())
706
+ rl=klass.new(@filename,@file,@linenum,offset_adjust(),:rubyversion=>@rubyversion)
655
707
  rl.extend RecursiveRubyLexer
656
708
  rl.enable_macros! if @enable_macro
657
709
  rl.in_def=true if inside_method_def?
@@ -1 +1,5 @@
1
+ def (z,*a=0).b; end
2
+ def (z,*a=0).b; a %(1) end
3
+ def (z,*a=0).b; b %(1) end
4
+ def (z,*a=0).b; z %(1) end
1
5
  "#{
@@ -651,6 +651,8 @@ x{
651
651
  a ?b:c
652
652
  p(a ? b:c)
653
653
  p(a ?b:c)
654
+ p(a ?:r:c)
655
+ p(a ? :r:c)
654
656
  }
655
657
 
656
658
  x{
@@ -6,6 +6,11 @@ module TestCases
6
6
  STANZAS.each{|stanza| stanza<<"\n" }
7
7
  ILLEGAL_ONELINERS=IO.readlines(rldir+'/rubylexer/test/illegal_oneliners.rb').map{|x| x.chomp}.grep(/\A\s*[^#\s\n]/).reverse
8
8
  ILLEGAL_STANZAS=IO.read(rldir+'/rubylexer/test/illegal_stanzas.rb').split("\n\n").grep(/./).reverse
9
- TESTCASES=ONELINERS+STANZAS
9
+
10
+ datadir=$:.find{|dir| File.exist? dir+'/../test/data/p.rb' } #first load-path entry with ../test/data/p.rb beside it; nil when none has it
11
+ FILENAMES=Dir[datadir+'/../test/data/*'].reject{|fn| File.directory? fn} #every non-directory entry of that data dir. NOTE(review): raises NoMethodError when datadir is nil -- confirm no fallback is needed
12
+ FILES=FILENAMES.map{|fn| File.read fn } #whole contents of each data file
13
+
14
+ TESTCASES=ONELINERS+STANZAS+FILES #test cases now include the whole data files as well
10
15
  ILLEGAL_TESTCASES=ILLEGAL_ONELINERS+ILLEGAL_STANZAS
11
16
  end
@@ -38,6 +38,8 @@ class Token
38
38
  def error; end
39
39
 
40
40
  def has_no_block?; false end
41
+
42
+ attr_accessor :tag
41
43
  end
42
44
 
43
45
  #-------------------------
@@ -83,7 +85,7 @@ class KeywordToken < WToken #also some operators
83
85
  self===RubyLexer::BEGINWORDS and @has_end||=nil
84
86
  end
85
87
 
86
- attr_accessor :comma_type, :ternary, :grouping
88
+ attr_accessor :ternary, :grouping
87
89
 
88
90
  def has_no_block!
89
91
  @has_no_block=true
@@ -204,6 +206,7 @@ class StringToken < Token
204
206
 
205
207
  attr_accessor :modifiers #for regex only
206
208
  attr_accessor :elems
209
+ attr_accessor :startline
207
210
  attr_accessor :line #line on which the string ENDS
208
211
  attr_accessor :bs_handler
209
212
 
@@ -1,3 +1,3 @@
1
1
  class RubyLexer
2
- VERSION='0.7.3'
2
+ VERSION='0.7.4'
3
3
  end