syntax 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/syntax.rb +31 -0
- data/lib/syntax/common.rb +118 -0
- data/lib/syntax/convertors/html.rb +50 -0
- data/lib/syntax/ruby.rb +239 -0
- data/lib/syntax/version.rb +9 -0
- data/lib/syntax/xml.rb +108 -0
- data/lib/syntax/yaml.rb +105 -0
- data/test/ALL-TESTS.rb +5 -0
- data/test/syntax/tc_ruby.rb +518 -0
- data/test/syntax/tc_xml.rb +202 -0
- data/test/syntax/tc_yaml.rb +228 -0
- metadata +51 -0
data/lib/syntax.rb
ADDED
@@ -0,0 +1,31 @@
+require 'syntax/common'
+
+module Syntax
+
+  # A default tokenizer for handling syntaxes that are not explicitly handled
+  # elsewhere. It simply yields the given text as a single token.
+  class Default
+
+    # Yield the given text as a single token.
+    def tokenize( text )
+      yield Token.new( text, :normal )
+    end
+
+  end
+
+  # A hash for registering syntax implementations.
+  SYNTAX = Hash.new( Default )
+
+  # Load the implementation of the requested syntax. If the syntax cannot be
+  # found, or if it cannot be loaded for whatever reason, the Default syntax
+  # handler will be returned.
+  def load( syntax )
+    begin
+      require "syntax/#{syntax}"
+    rescue LoadError
+    end
+    SYNTAX[ syntax ].new
+  end
+  module_function :load
+
+end
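For reference, a minimal usage sketch (illustrative, not part of the package contents): Syntax.load requires "syntax/<name>", then instantiates whatever tokenizer registered itself in SYNTAX, falling back to Default when the require fails.

    require 'syntax'

    # A registered syntax (e.g. "ruby") returns its tokenizer instance.
    tokenizer = Syntax.load( "ruby" )

    # An unknown syntax falls back to Default, which yields the whole text
    # as a single :normal token.
    Syntax.load( "no-such-syntax" ).tokenize( "plain text" ) do |token|
      puts "#{token.group}: #{token}"   # => normal: plain text
    end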
data/lib/syntax/common.rb
ADDED
@@ -0,0 +1,118 @@
+require 'strscan'
+
+module Syntax
+
+  # A single token extracted by a tokenizer. It is simply the lexeme
+  # itself, decorated with a 'group' attribute to identify the type of the
+  # lexeme.
+  class Token < String
+
+    # the type of the lexeme that was extracted.
+    attr_reader :group
+
+    # Create a new Token representing the given text, and belonging to the
+    # given group.
+    def initialize( text, group )
+      super text
+      @group = group
+    end
+
+  end
+
+  # The base class of all tokenizers. It sets up the scanner and manages the
+  # looping until all tokens have been extracted. It also provides convenience
+  # methods to make sure adjacent tokens of identical groups are returned as
+  # a single token.
+  class Tokenizer
+
+    # Start tokenizing. This sets up the state in preparation for tokenization,
+    # such as creating a new scanner for the text and saving the callback block.
+    # The block will be invoked for each token extracted.
+    def start( text, &block )
+      @chunk = ""
+      @group = :normal
+      @callback = block
+      @text = StringScanner.new( text )
+      setup
+    end
+
+    # Subclasses may override this method to provide implementation-specific
+    # setup logic.
+    def setup
+    end
+
+    # Finish tokenizing. This flushes the buffer, yielding any remaining text
+    # to the client.
+    def finish
+      start_group nil
+      teardown
+    end
+
+    # Subclasses may override this method to provide implementation-specific
+    # teardown logic.
+    def teardown
+    end
+
+    # Subclasses must implement this method, which is called for each iteration
+    # of the tokenization process. This method may extract multiple tokens.
+    def step
+      raise NotImplementedError, "subclasses must implement #step"
+    end
+
+    # Begins tokenizing the given text, calling #step until the text has been
+    # exhausted.
+    def tokenize( text, &block )
+      start text, &block
+      step until @text.eos?
+      finish
+    end
+
+    private
+
+    # A convenience for delegating method calls to the scanner.
+    def self.delegate( sym )
+      define_method( sym ) { |*a| @text.__send__( sym, *a ) }
+    end
+
+    delegate :bol?
+    delegate :eos?
+    delegate :scan
+    delegate :scan_until
+    delegate :check
+    delegate :check_until
+    delegate :getch
+    delegate :matched
+    delegate :pre_match
+    delegate :peek
+    delegate :pos
+
+    # Access the n-th subgroup from the most recent match.
+    def subgroup(n)
+      @text[n]
+    end
+
+    # Append the given data to the currently active chunk.
+    def append( data )
+      @chunk << data
+    end
+
+    # Request that a new group be started. If the current group is the same
+    # as the group being requested, a new group will not be created. If a new
+    # group is created and the current chunk is not empty, the chunk's
+    # contents will be yielded to the client as a token, and then cleared.
+    #
+    # After the new group is started, if +data+ is non-nil it will be appended
+    # to the chunk.
+    def start_group( gr, data=nil )
+      if gr != @group && !@chunk.empty?
+        @callback.call( Token.new( @chunk, @group ) )
+        @chunk = ""
+      end
+
+      @group = gr
+      @chunk << data if data
+    end
+
+  end
+
+end
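To show how this base class is meant to be used, here is a hypothetical subclass (not part of the gem) that implements #step with the private start_group/scan helpers; adjacent chunks of the same group are coalesced into a single Token before the callback fires.

    require 'syntax'

    # Hypothetical example: highlight runs of digits, pass everything else through.
    class Digits < Syntax::Tokenizer
      def step
        if digits = scan( /\d+/ )
          start_group :number, digits          # flushes the previous chunk when the group changes
        else
          start_group :normal, scan( /\D+/ )   # non-digits accumulate in the :normal chunk
        end
      end
    end

    Syntax::SYNTAX["digits"] = Digits          # registered tokenizers are found by Syntax.load

    Digits.new.tokenize( "abc 123" ) { |tok| p [ tok.group, tok.to_s ] }
    # => [:normal, "abc "] then [:number, "123"]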
data/lib/syntax/convertors/html.rb
ADDED
@@ -0,0 +1,50 @@
+require 'syntax'
+
+module Syntax
+  module Convertors
+
+    # A simple class for converting a text into HTML.
+    class HTML
+
+      # A convenience method for instantiating a new HTML convertor for a
+      # specific syntax.
+      def self.for_syntax( syntax )
+        new( Syntax.load( syntax ) )
+      end
+
+      # Creates a new HTML convertor that uses the given tokenizer.
+      def initialize( tokenizer )
+        @tokenizer = tokenizer
+      end
+
+      # Converts the given text to HTML, using spans to represent token groups
+      # of any type but <tt>:normal</tt> (which is always unhighlighted). If
+      # +pre+ is +true+, the html is automatically wrapped in pre tags.
+      def convert( text, pre=true )
+        html = ""
+        html << "<pre>" if pre
+        @tokenizer.tokenize( text ) do |tok|
+          if tok.group == :normal
+            html << html_escape( tok )
+          else
+            html << "<span class=\"#{tok.group}\">#{html_escape(tok)}</span>"
+          end
+        end
+        html << "</pre>" if pre
+        html
+      end
+
+      private
+
+      # Replaces some characters with their corresponding HTML entities.
+      def html_escape( string )
+        string.gsub( /&/, "&amp;" ).
+               gsub( /</, "&lt;" ).
+               gsub( />/, "&gt;" ).
+               gsub( /"/, "&quot;" )
+      end
+
+    end
+
+  end
+end
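An end-to-end sketch of the convertor (illustrative, not part of the diff): for_syntax loads the tokenizer by name, and convert wraps every non-:normal token in a span keyed by its group.

    require 'syntax/convertors/html'

    convertor = Syntax::Convertors::HTML.for_syntax( "ruby" )
    puts convertor.convert( 'puts "hi" # greet' )
    # => <pre>...</pre> with markup roughly like
    #    <span class="ident">puts</span> <span class="punct">&quot;</span>...<span class="comment"># greet</span>
    puts convertor.convert( 'x = 1', false )   # pass false to omit the <pre> wrapper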
data/lib/syntax/ruby.rb
ADDED
@@ -0,0 +1,239 @@
+require 'syntax'
+
+module Syntax
+
+  # A tokenizer for the Ruby language. It recognizes all common syntax
+  # (and some less common syntax) but because it is not a true lexer, it
+  # will make mistakes on some ambiguous cases.
+  class Ruby < Tokenizer
+
+    # The list of all identifiers recognized as keywords.
+    KEYWORDS =
+      %w{if then elsif else end begin do rescue ensure while for
+         class module def yield raise until unless and or not when
+         case super undef break next redo retry in return alias
+         defined?}
+
+    # Perform ruby-specific setup
+    def setup
+      @selector = false
+    end
+
+    # Step through a single iteration of the tokenization process.
+    def step
+      case
+      when bol? && check( /=begin/ )
+        start_group( :comment, scan_until( /^=end$/ ) )
+      when bol? && check( /__END__$/ )
+        start_group( :comment, scan_until( /\Z/ ) )
+      else
+        case
+        when check( /def\s+/ )
+          start_group :keyword, scan( /def\s+/ )
+          start_group :method, scan_until( /$|(?=[;(\s])/ )
+        when check( /class\s+/ )
+          start_group :keyword, scan( /class\s+/ )
+          start_group :class, scan_until( /$|(?=[;\s<])/ )
+        when check( /module\s+/ )
+          start_group :keyword, scan( /module\s+/ )
+          start_group :module, scan_until( /$|(?=[;\s])/ )
+        when check( /::/ )
+          start_group :punct, scan(/::/)
+        when check( /:"/ )
+          start_group :symbol, scan(/:/)
+          scan_delimited_region :symbol, :symbol, "", true
+        when check( /:'/ )
+          start_group :symbol, scan(/:/)
+          scan_delimited_region :symbol, :symbol, "", false
+        when check( /:\w/ )
+          start_group :symbol, scan(/:\w+[!?]?/)
+        when check( /\?\\?./ )
+          start_group :char, scan(/\?\\?./)
+        when check( /(__FILE__|__LINE__|true|false|nil|self)[?!]?/ )
+          if @selector || matched[-1] == ?? || matched[-1] == ?!
+            start_group :ident,
+              scan(/(__FILE__|__LINE__|true|false|nil|self)[?!]?/)
+          else
+            start_group :constant,
+              scan(/(__FILE__|__LINE__|true|false|nil|self)/)
+          end
+          @selector = false
+        else
+          case peek(2)
+          when "%r"
+            scan_delimited_region :punct, :regex, scan( /../ ), true
+          when "%w", "%q"
+            scan_delimited_region :punct, :string, scan( /../ ), false
+          when "%s"
+            scan_delimited_region :punct, :symbol, scan( /../ ), false
+          when "%W", "%Q", "%x"
+            scan_delimited_region :punct, :string, scan( /../ ), true
+          when /%[^\sa-zA-Z0-9]/
+            scan_delimited_region :punct, :string, scan( /./ ), true
+          when "<<"
+            start_group :punct, scan( /<</ )
+            float_right = scan( /-/ )
+            append "-" if float_right
+            if ( type = scan( /['"]/ ) )
+              append type
+              delim = scan_until( /(?=#{type})/ )
+              if delim.nil?
+                append scan_until( /\Z/ )
+                return
+              end
+            else
+              delim = scan( /\w+/ ) or return
+            end
+            start_group :constant, delim
+            start_group :punct, scan( /#{type}/ ) if type
+            scan_delimited_region :constant, :string, "", ( type != "'" ),
+              delim, true, float_right
+          else
+            case peek(1)
+            when /\s/
+              start_group :normal, scan( /\s+/ )
+            when "#"
+              start_group :comment, scan( /#.*$/ )
+            when /[A-Z]/
+              start_group :constant, scan( /\w+/ )
+            when /[a-z_]/
+              word = scan( /\w+[?!]?/ )
+              if !@selector && KEYWORDS.include?( word )
+                start_group :keyword, word
+              else
+                start_group :ident, word
+              end
+              @selector = false
+            when /\d/
+              start_group :number,
+                scan( /[\d_]+(\.[\d_]+)?([eE][\d_]+)?/ )
+            when '"'
+              scan_delimited_region :punct, :string, "", true
+            when '/'
+              scan_delimited_region :punct, :regex, "", true
+            when "'"
+              scan_delimited_region :punct, :string, "", false
+            when "."
+              dots = scan( /\.{1,3}/ )
+              start_group :punct, dots
+              @selector = ( dots.length == 1 )
+            when /[@]/
+              start_group :attribute, scan( /@{1,2}\w*/ )
+            when /[$]/
+              start_group :global, scan(/\$/)
+              start_group :global, scan( /\w+|./ ) if check(/./)
+            when /[-!?*\/+=<>()\[\]\{}:;,&|%]/
+              start_group :punct,
+                scan(/[-!?*\/+=<>()\[\]\{}:;,&|%]/)
+            else
+              # all else just falls through this, to prevent
+              # infinite loops...
+              append getch
+            end
+          end
+        end
+      end
+    end
+
+    private
+
+    # Scan a delimited region of text. This handles the simple cases (strings
+    # delimited with quotes) as well as the more complex cases of %-strings
+    # and here-documents.
+    def scan_delimited_region( delim_group, inner_group, starter, exprs,
+      delim=nil, delim_alone=false, float_right=false )
+      # begin
+      if !delim
+        start_group delim_group, starter
+        delim = scan( /./ )
+        append delim
+
+        delim = case delim
+          when '{' then '}'
+          when '(' then ')'
+          when '[' then ']'
+          else delim
+        end
+      end
+
+      start_group inner_group
+
+      items = "\\\\|"
+
+      if delim_alone
+        items << "(^"
+        items << '\s*' if float_right
+        items << "#{delim}$)"
+      else
+        items << "#{delim}"
+      end
+
+      items << "|#(\\$|@|\\{)" if exprs
+      items = Regexp.new( items )
+
+      loop do
+        p = pos
+        match = scan_until( items )
+        if match.nil?
+          start_group inner_group, scan_until( /\Z/ )
+          break
+        else
+          text = pre_match[p..-1]
+          start_group inner_group, text if text.length > 0
+          case matched.strip
+          when "\\"
+            unless exprs
+              case peek(1)
+              when "'"
+                scan(/./)
+                start_group :expr, "\\'"
+              when "\\"
+                scan(/./)
+                start_group :expr, "\\\\"
+              else
+                start_group inner_group, "\\"
+              end
+            else
+              start_group :expr, "\\"
+              c = getch
+              append c
+              case c
+              when 'x'
+                append scan( /[a-fA-F0-9]{1,2}/ )
+              when /[0-7]/
+                append scan( /[0-7]{0,2}/ )
+              end
+            end
+          when delim
+            start_group delim_group, matched
+            break
+          when /^#/
+            start_group :expr, matched
+            case matched[1]
+            when ?{
+              depth = 1
+              while depth > 0
+                p = pos
+                c = scan_until( /[\{}]/ )
+                if c.nil?
+                  append scan_until( /\Z/ )
+                  break
+                else
+                  depth += ( matched == "{" ? 1 : -1 )
+                  append pre_match[p..-1]
+                  append matched
+                end
+              end
+            when ?$, ?@
+              append scan( /\w+/ )
+            end
+          else raise "unexpected match on #{matched}"
+          end
+        end
+      end
+    end
+  end
+
+  SYNTAX["ruby"] = Ruby
+
+end
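A quick look at the token stream this tokenizer emits (illustrative; exact chunk boundaries depend on the rules above, since adjacent tokens of the same group are merged):

    require 'syntax/ruby'

    Syntax::Ruby.new.tokenize( "def add(a, b)\n  a + b\nend\n" ) do |tok|
      print "#{tok.group} " unless tok.group == :normal
    end
    # prints roughly: keyword method punct ident punct ident punct ident punct ident keyword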
|