RubyGems - rubylexer - Versions diffs - 0.7.3 → 0.7.4 - Mend

rubylexer 0.7.3 → 0.7.4

Files changed (20) hide show

data/History.txt +16 -0
data/Manifest.txt +3 -1
data/README.txt +12 -19
data/Rakefile +2 -2
data/lib/rubylexer.rb +214 -86
data/lib/rubylexer/context.rb +17 -6
data/lib/rubylexer/lextable.rb +202 -0
data/lib/rubylexer/rulexer.rb +61 -9
data/lib/rubylexer/test/illegal_oneliners.rb +4 -0
data/lib/rubylexer/test/stanzas.rb +2 -0
data/lib/rubylexer/test/testcases.rb +6 -1
data/lib/rubylexer/token.rb +4 -1
data/lib/rubylexer/version.rb +1 -1
data/test/code/regression.rb +1 -1
data/test/code/rubylexervsruby.rb +23 -6
data/test/data/1.rb +729 -0
data/test/data/heart.rb +43 -2
data/test/data/pleac.rb +6282 -0
data/testing.txt +1 -1
metadata +7 -4

@@ -36,6 +36,7 @@ module NestedContexts
     def see lxr,msg; end
     def lhs=*x; end #do nothing
+    def wantarrow; false end
   end
   #contexts which expect to see commas,
@@ -48,18 +49,26 @@ module NestedContexts
       assert '{['[starter]
       super(starter, starter.tr('{[','}]') ,linenum)
     end
+    def wantarrow; true end
   end
   class ParenContext < NestedContext
     def initialize(linenum)
       super('(', ')' ,linenum)
     end
-    attr_accessor :lhs,:saw_comma
+    attr_accessor :lhs
     def see(lxr,msg)
-      @saw_comma=true if msg==:comma
+      @lhs=true if msg==:comma || msg==:splat
     end
   end
+  class KnownNestedLhsParenContext < ParenContext  # adds nothing of its own; every override below is commented out
+#    def lhs; true end
+#    def lhs=x; end
+#    def see(lxr,msg) end
+  end
   class BlockContext  < NestedContext
     def initialize(linenum)
       super('{','}',linenum)
@@ -83,6 +92,7 @@ module NestedContexts
       super('(', ')',linenum)
     end
     def lhs; false end
+    def wantarrow; true end
   end
   class ImplicitLhsContext < NestedContext
@@ -107,6 +117,7 @@ module NestedContexts
       super(nil,nil,linenum)
     end
     def lhs; false end
+    def wantarrow; true end
   end
   class KWParamListContextNoParen < ParamListContextNoParen
@@ -224,17 +235,17 @@ module NestedContexts
     def see(lxr,msg)
       stack=lxr.parsestack
       case msg
-      when :rescue:
+      when :rescue;
         WantsEndContext===stack.last or
           BlockContext===stack.last or
           ParenContext===stack.last or
           raise 'syntax error: rescue not expected at this time'
-      when :arrow: #local var defined in this state
-      when :then,:semi,:colon:
+      when :arrow; #local var defined in this state
+      when :then,:semi,:colon;
         msg=:then
         self.equal? stack.pop or raise 'syntax error: then not expected at this time'
                   #pop self off owning context stack
-      when :comma, :splat: return
+      when :comma, :splat; return
       else super
       end
       LEGAL_SUCCESSORS[@state].include? msg or raise "rescue syntax error: #{msg} unexpected in #@state"

data/lib/rubylexer/lextable.rb ADDED

@@ -0,0 +1,202 @@
+class RubyLexer
+  class Rule  # one table entry: a lead, a matcher, and zero or more actions
+    def initialize(lead,matcher,*actions)  # lead must be a String or Fixnum; matcher and actions are stored unchecked
+      fail unless String===lead or Fixnum===lead  # bare fail raises RuntimeError; NOTE(review): Fixnum no longer exists in Ruby >= 3.2 - confirm target version
+      @lead,@matcher,@actions=lead,matcher,actions  # NOTE(review): no readers are declared in this class, yet Mode#initialize calls r.lead
+    end
+  end
+  class Mode #set of Rules, plus a per-character table of the rule indexes that apply to that character
+    def initialize(rules)  # rules: array of argument lists, each one splatted into Rule.new
+      rules.map!{|r| Rule.new(*r) }  # map! replaces the entries of the caller's array in place
+      @rules=rules
+      fail if rules.size>255  # presumably because each rule index is stored as one character of a String below - TODO confirm
+      rules.each_with_index{|r,i| all_chars_of(r.lead).each{|char|  # all_chars_of is not defined in the visible code
+        @chartable[char]||=''  # NOTE(review): @chartable is never assigned in this block, so this indexes nil unless it is set elsewhere
+        @chartable[char]<<i  # String#<< with an Integer appends the character with that codepoint
+      }}
+      #should order of rules in @chartable[x] be tweaked?
+    end
+  end
+  lc_letters="a-z_"  #this is correct for ascii, other charsets will need different char classes here
+  uc_letters="A-Z"   #this is always A-Z, for all charsets
+  letters=lc_letters+uc_letters  # both ranges plus underscore, meant for use inside a [...] character class
+  num=/(0(x[_0-9a-f]+|
+          d[_0-9]+|
+          b[_01]+|
+          o?[_0-7]+)|
+        [1-9][_0-9]*
+        (\.[_0-9]+)?
+        (e[+-]?[_0-9]+)?
+      /ix  # NOTE(review): the first opening paren of this literal is never closed - confirm the regex compiles
+  #this might allow leading and trailing _ where ruby does not
+  ws="\s\t\r\v\f"  # space, tab, CR, VT, FF; newline is not included
+  eqbegin= /=begin(#{ws}.*)?\n((?!=end[#{ws}\n]).*\n)*=end(#{ws}.*)?$/  # =begin ... =end block; NOTE(review): ws is interpolated outside [] twice, which matches the literal five-character sequence
+  ews=/([#{ws}]+|\\$|\#.*$|\n#{eqbegin}?)*/  # any run of: blanks, backslash before end of line, # comment, newline optionally followed by an =begin block
+  ews_no_nl=/([#{ws}]+|\\\n(#{eqbegin}\n)?)+/  # blanks or backslash-newline continuations; a bare newline does not match
+  var=/[#{letters}][#{letters}0-9]*/  # a letter or underscore, then letters, underscores or digits
+  civar=/@@?#{var}/  # one or two @ followed by var
+  gs=/[^#{ws}\n\#\x0- -]|-[#{letters}0-9]?/
+  gvar=/$(#{var}|#{gs})/  # NOTE(review): the leading $ is unescaped, so it is an end-of-line anchor rather than a literal dollar sign
+  method=/#{var}[?!]?/  # var with an optional trailing ? or !
+  method_eq=/#{var}[?!=]?/  # var with an optional trailing ?, ! or =
+  loopers=/c|C-|m|M-/  # the prefixes c, C-, m and M-
+  simple_esc=/\\([^cCmMx0-7]|x[0-9a-fA-F]{1,2}|[0-7]{1,3})/  # backslash + one char outside cCmMx0-7, or x + 1-2 hex digits, or 1-3 octal digits
+  loop_esc= /(\\#{loopers}(
+              [^\\](?!\\#{loopers})|
+              #{simple_esc}(?!\\#{loopers})|
+              (?=\\#{loopers})
+             )+
+            /mx  # NOTE(review): the outermost opening paren is never closed - confirm the regex compiles
+  esc=/#{simple_esc}|#{loop_esc}/  # either escape form
+  definately_val=/
+    [~!`@${(\['":]|
+    [%^\-+/](?!=)|
+    <<-?[`'"#{letters}]|
+    [0-9#{letters}]
+  /x
+  CommonMode=Mode.new(  # rows for whitespace, backslash and # comments; NOTE(review): Mode#initialize takes a single rules parameter but several arrays are passed here
+    [ws,/[#{ws}]+/,WhitespaceToken,:stay],
+    [?\\,EscNlToken,:stay],  # no matcher element in this row, unlike its neighbours
+    [?#, /\#.*$/,CommentToken,:stay]
+    #[],
+  )
+  ValueMode=CommonMode|Mode.new(  # presumably the rows used where a value may start - confirm; NOTE(review): Mode defines no | in the visible code, and ValueMode and OpMode are referenced here before they are assigned
+    [?$, gvar, VarNameToken],  # NOTE(review): the %-string row below reads lc_letter, but the local defined earlier is lc_letters
+    [?@, civar, VarNameToken],  # NOTE(review): many rows further down (from the closing-brace row on) have no separating comma
+    ["!~&*", /./, UnaryOpToken, ValueMode],
+    [?%, /%[qw][^#{lc_letter.sub'_',''}A-Z0-9]/, StringStartToken, :push_context, string_mode(?'){|ss| ss[-1]}],
+    [%['], /./, StringStartToken, :push_context, string_mode(?'){?'}],
+    [%["`/], /./, StringStartToken, :push_context, string_mode(?"){|ss| ss[-1]}],
+    #[?^,/./, UnaryOpToken, ValueMode],
+    #["&*", /./, UnaryOpToken, ValueMode], #unary
+    ["+-", /[+-]#{num}/, NumberToken], #numeric
+    ["+-", /[+-]/, UnaryOpToken, ValueMode], #unary
+    [?|, /./,KeywordToken, :block_params, :push_context, ValueMode], #goalpost
+    [?:, /:(?=['"])/, UnaryColonToken, ValueMode], #symbol
+    [?:, /:(#{gvar}|#{civar}|#{method_eq}|#{operator_method}|`|\[\]=?)/, SymbolToken], #symbol
+    [?{, /./, OperatorToken, :push_context, ValueMode], #hash lit
+    [?[, /./, OperatorToken, :push_context, ValueMode], #array lit
+    [?<, /<<-?#{var}|'[^']*'|"[^"]*"|`[^`]*`/, :here_doc, HereDocHeadToken], #here doc
+    [??, /\?([^\\#{ws}\n]|#{esc})/, CharToken], #char lit
+    ["0-9", num, NumberToken],
+    ["A-Z", method, :const_or_method], #use JustAfterMethodMode to figure out what to output/where to go
+    [lc_letters, method, :lvar_or_method],#use JustAfterMethodMode to figure out what to output/where to go
+    ["(",/./, KeywordToken, :push_context, ValueMode],
+    #indicates empty construct or trailing comma or semicolon (or nl)
+    [?}, /./, :pop_context, :pop_soft_scope?, :block_end?, huhToken, OpMode]
+    ['])', /./, :pop_context, huhToken, OpMode]
+    [?\\, "\\\n", :escnl, WhitespaceToken, ValueMode]
+    [?\n, :escnl, WhitespaceToken, ValueMode]
+    [".,", :error],
+    [:begin, :maybe_rescue_etc, :push_context,ValueMode]
+    [:def, :hard_scope, :maybe_rescue_etc, :push_context, :nasty_custom_parser_here, ValueMode]
+    [/if|unless/, :maybe_then, :maybe_else, :push_context,ValueMode]
+    [/while|until/, :maybe_do, :push_context,ValueMode]
+    [:for, :expect_in, :maybe_do, :push_context,ValueMode]
+    [:class, :push_hard_scope, :maybe_rescue_etc, :maybe_lt, :push_context,ValueMode]
+    [:module, :push_hard_scope, :maybe_rescue_etc, :maybe_include, :push_context,ValueMode]
+    [:end, :pop_hard_scope?, :pop_context, OpMode]
+    [/return|next|break/] #these are special in subtle ways I forget....
+    [huh FUNCLIKE_KEYWORDS, huh]
+    [huh VARLIKE_KEYWORDS, huh]
+    [:BEGIN, huh]
+    [:END, huh]
+    [:case]
+    [:when]
+    [:defined?]
+    {:others=>:error,
+    :default=>OpMode}
+  )
+  OpMode=CommonMode|Mode.new(  # presumably the rows used where an operator may follow - confirm; NOTE(review): most rows below have no separating comma
+    [";,", /./, OperatorToken]
+    ["%/^", /.=?/, :could_be_assign, OperatorToken]
+    ["&*+-", /(.)\1?=?/, :could_be_assign, OperatorToken],
+    [?|, /\|\|?=?/, :could_be_assign, :could_be_end_goalpost, OperatorToken],
+    [?<, /<<=?/, :could_be_assign, OperatorToken],
+    [?>, />>=?/, :could_be_assign, OperatorToken],
+    [?<, /<=?>?/, OperatorToken], #also gets <>
+    [?>, />=?/, OperatorToken],
+    [?=, /=(~|>|=?=?)/, :could_be_assign, OperatorToken]
+    ["0-9",huh,:error]
+    [letters,huh,:error]  # NOTE(review): the next row interpolates uc_letter and letter; only uc_letters and letters are defined in the visible code
+    [?:, /::(?=#{ews}[#{uc_letter}][#{letter}]*(?![?`~@${\(]|!([^=]|$)#{ews_no_nl}#{definately_val}))/, OperatorToken]
+    [?:, /::/, OperatorToken, MethodNameMode]
+    #constant if capitalized and not followed by (implicit or explicit) param list and not ending in ? or ! , else method
+    [?:, /:/, OperatorToken]
+    [?., /\.\.\.?/, OperatorToken]
+    [?., /\.(?!\.)/, OperatorToken, MethodNameMode]
+    [?{, /./, :push_context, :push_soft_scope, :block_start, :maybe_goalposts, huhToken]
+    [?}, /./, :pop_context, :pop_soft_scope?, :block_end?, huhToken, OpMode]
+    ['])', /./, :pop_context, huhToken, OpMode]
+    [/and|or|if|unless|while|until|rescue/, OperatorToken]
+    [:do, :must_be_after_rparen, :push_soft_scope, :maybe_goalposts, KeywordToken]
+    [:do, :if_allowed, KeywordToken]
+    [:end, :pop_hard_scope?, :pop_context, OpMode]
+    {:others=>:error,
+    :default=>ValueMode}
+  )
+  MethodNameMode=CommonMode|Mode.new(  # named as the last element of the :: and single-dot rows in OpMode; presumably lexes the method name that follows - confirm
+    [letters, method, MethodNameToken],  # NOTE(review): operator_method, used two rows down, is not defined in the visible code
+    [?`,/./, huh, MethodNameToken],
+    [huh, operator_method, MethodNameToken]
+    [?[, /\[\]=?/, MethodNameToken]
+    #[?(] #in ruby 1.9
+    {:default=>JustAfterMethodMode}
+  )
+  JustAfterMethodMode=OpMode|Mode.new(  # OpMode combined with rows for whitespace, ( and { ; it is the :default of MethodNameMode
+    [ws, /[#{ws}]+/, WhitespaceToken, AfterMethodMode],  # last element presumably names the mode entered next - confirm; AfterMethodMode is only assigned further down
+    #[?\\] #hrm?
+    [?(,huh,:push_context, ParamListStartToken, ValueMode]
+    [?{,huh,:push_context, :push_soft_scope, :block_start, huhToken, ValueMode]
+    [huh nil, /(?= [^#{ws}({] )/x, :no_token, OpMode]  # zero-width lookahead for any character other than ws, ( or {
+  )
+  AfterMethodMode=Mode.new(  # named by the whitespace row of JustAfterMethodMode; unfinished: several rows below are empty, hold only huh, or lack separating commas
+    #these indicate implicit parens unless followed by ws
+    [?/, /./, StringStartToken, :iparen, :push_context, string_mode(?"){?/}],
+    ['+-*&',huh, :iparen, ValueMode]
+    #[?^]
+    [?%,huh,]
+    [?`,huh,]
+    [?:,huh,] #tricky... operator in ternary context, else always symbol
+    #these indicate implicit parens always
+    [?[, //, :iparen, ValueMode]
+    [lc_letters, //, :iparen, OpMode]
+    ["$@A-Z", //, :iparen, OpMode]
+    ["0-9", //, :iparen, OpMode]
+    [%[~!], //, :iparen, ValueMode]
+    [?<, /(?=<<-?['"#{lc_letters}])/i, :iparen, OpMode]
+    [?{, //, :iparens2, OpMode]
+    [?=, //, :iparens2, OpMode]
+    [?;, //, :iparens2, OpMode]
+    [?(] #tricky, need to look ahead for commas
+    [")]}",/./,:iparens2, OpMode]
+    []
+    {:default=>huh}
+  )
+  AfterNewline=Mode.new  # placeholder with no rows; NOTE(review): Mode#initialize as written requires a rules argument
+  StringInteriorMode=Mode.new  # placeholder with no rows; same missing-argument caveat as AfterNewline
+end

data/lib/rubylexer/rulexer.rb CHANGED

@@ -1,4 +1,4 @@
-=begin legal crap
+=begin
     rubylexer - a ruby lexer written in ruby
     Copyright (C) 2004,2005,2008  Caleb Clausen
@@ -53,7 +53,7 @@ class RubyLexer
    WHSPLF=WHSP+"\n"
    #maybe \r should be in WHSPLF instead
-   LEGALCHARS=/[ -~#{WHSPLF}]/
+   LEGALCHARS=/[ -~#{WHSPLF}\x80-\xFF]/
    PAIRS={ '{'=>'}', '['=>']', '('=>')', '<'=>'>'}
@@ -139,7 +139,50 @@ private
    #-----------------------------------
    def regex(ch=nil)
-      result=RenderExactlyStringToken.new('/').append_token str=double_quote("/")
+      result=RenderExactlyStringToken.new('/').append_token double_quote("/")
+      if @rubyversion>=1.9
+        named_brs=[]
+        if result.elems.size==1 and String===result.elems.first
+            index=0
+            huh
+            while index=elem.index(/#{EVEN_BS_S}( \(\?[<'] | \(\?\# | \[ )/xo,index)
+              huh
+              case alt
+              when "(?<"; huh
+                index=elem.index(/\G...(#{LCLETTER}#{LETTER_DIGIT}+)>/o,index)
+                index or huh
+                index+=$1.size+4
+                named_brs<<$1
+              when "(?'"; huh
+                index=elem.index(/\G...(#{LCLETTER}#{LETTER_DIGIT}+)'/o,index)
+                index or huh
+                index+=$1.size+4
+                named_brs<<$1
+              when "(?#"; huh
+                index+=3
+                index=elem.index(/#{EVEN_BS_S}\)/,index)
+                index or huh
+                index+=1
+              when "["; huh
+                index+=1
+                paren_ctr=1
+                loop do
+                  index=elem.index(/#{EVEN_BS_S}(&&\[\^|\])/o,index)
+                  index or huh
+                  index+=$&.size
+                  unless $1[-1]==?]
+                    paren_ctr+=1
+                  else
+                    paren_ctr-=1
+                    break if paren_ctr==0
+                  end
+                end
+              end
+            end
+        end
+        result.lvars= named_brs unless named_brs.empty?
+      end
       result.open=result.close="/"
       result.line=@linenum
       return result
@@ -175,7 +218,7 @@ private
          when 'x' then '`' #exec it
          when 's' then strlex=:single_quote; "'" #symbol
          #other letters, nums are illegal here
-         when /^[a-z0-9]$/oi
+         when /^#{LCLETTER().gsub('_','')}$/o
             error= "unrecognized %string type: "+ch; '"'
          when ''
             result= lexerror( StringToken.new('', oldpos), "unexpected eof in %string")
@@ -236,10 +279,9 @@ end
       ($|[^\\])(c|[CM]-)|
       ($|[^CM])-
      )
-     (\\(?:c|[CM]-)?\\)*
+     (\\(?:c|[CM]-)?){2}*
    /x
    ILLEGAL_ESCAPED=/#{EVEN_BS_S}(\\([CM][^-]|x[^a-fA-F0-9]))/o #whaddaya do with this?
-   ILLEGAL_CRUNCH=/#{EVEN_BS_S}(\#@[^a-zA-Z_]|\#$[^a-zA-Z_0-9\-!@&+`'=~\/\\,.;<>*"$?:;])/o #and this?
    def all_quote(nester, type, delimiter, bs_handler=nil)
 if FASTER_STRING_ESCAPES
       #string must start with nester
@@ -354,8 +396,18 @@ if FASTER_STRING_ESCAPES
                break
          end
-         #shouldn't tolerate ILLEGAL_ESCAPED in str (unless single quotish)....
-         lexerror str, "illegal escape sequence" if !("['"[type]) and ILLEGAL_ESCAPED===b
+         unless ("['"[type])
+           @@ILLEGAL_CRUNCH||=/
+             #{EVEN_BS_S}(?:
+               \#@(?:(?!#{LETTER()})|[^@]) |
+               \#$(?:(?!#{LETTER_DIGIT()})|[^\-!@&+`'=~\/\\,.;<>*"$?:;])
+             )
+           /ox #and this?
+           #shouldn't tolerate ILLEGAL_ESCAPED in str (unless single quotish)....
+           lexerror str, "illegal escape sequence" if /#{@@ILLEGAL_CRUNCH}|#{ILLEGAL_ESCAPED}/===b
+         end
          str.append b
       }
@@ -651,7 +703,7 @@ end
    def ruby_code(ch='{')
       assert ch[/^[{(@$]$/]
       klass= RubyLexer===self ? self.class : RubyLexer
-      rl=klass.new(@filename,@file,@linenum,offset_adjust())
+      rl=klass.new(@filename,@file,@linenum,offset_adjust(),:rubyversion=>@rubyversion)
       rl.extend RecursiveRubyLexer
       rl.enable_macros! if @enable_macro
       rl.in_def=true if inside_method_def?

data/lib/rubylexer/test/illegal_oneliners.rb CHANGED

@@ -1 +1,5 @@
+def (z,*a=0).b; end
+def (z,*a=0).b; a %(1) end
+def (z,*a=0).b; b %(1) end
+def (z,*a=0).b; z %(1) end
 "#{

data/lib/rubylexer/test/stanzas.rb CHANGED

@@ -651,6 +651,8 @@ x{
   a ?b:c
   p(a ? b:c)
   p(a ?b:c)
+  p(a ?:r:c)
+  p(a ? :r:c)
 }
 x{

data/lib/rubylexer/test/testcases.rb CHANGED

@@ -6,6 +6,11 @@ module TestCases
   STANZAS.each{|stanza| stanza<<"\n" }
   ILLEGAL_ONELINERS=IO.readlines(rldir+'/rubylexer/test/illegal_oneliners.rb').map{|x| x.chomp}.grep(/\A\s*[^#\s\n]/).reverse
   ILLEGAL_STANZAS=IO.read(rldir+'/rubylexer/test/illegal_stanzas.rb').split("\n\n").grep(/./).reverse
-  TESTCASES=ONELINERS+STANZAS
+  datadir=$:.find{|dir| File.exist? dir+'/../test/data/p.rb' }
+  FILENAMES=Dir[datadir+'/../test/data/*'].reject{|fn| File.directory? fn}
+  FILES=FILENAMES.map{|fn| File.read fn }
+  TESTCASES=ONELINERS+STANZAS+FILES
   ILLEGAL_TESTCASES=ILLEGAL_ONELINERS+ILLEGAL_STANZAS
 end

data/lib/rubylexer/token.rb CHANGED

@@ -38,6 +38,8 @@ class Token
    def error; end
    def has_no_block?; false end
+   attr_accessor :tag
 end
 #-------------------------
@@ -83,7 +85,7 @@ class KeywordToken < WToken   #also some operators
     self===RubyLexer::BEGINWORDS and @has_end||=nil
   end
-  attr_accessor :comma_type, :ternary, :grouping
+  attr_accessor :ternary, :grouping
   def has_no_block!
      @has_no_block=true
@@ -204,6 +206,7 @@ class StringToken < Token
    attr_accessor :modifiers    #for regex only
    attr_accessor :elems
+   attr_accessor :startline
    attr_accessor :line  #line on which the string ENDS
    attr_accessor :bs_handler

data/lib/rubylexer/version.rb CHANGED

@@ -1,3 +1,3 @@
 class RubyLexer
-  VERSION='0.7.3'
+  VERSION='0.7.4'
 end