RubyGems - mediacloth - Versions diffs - 0.0.3 → 0.5 - Mend

mediacloth 0.0.3 → 0.5

Files changed (115) hide show

data/README.md +36 -0
data/lib/mediacloth/mediawikiast.rb +58 -1
data/lib/mediacloth/mediawikihtmlgenerator.rb +229 -73
data/lib/mediacloth/mediawikilexer.rb +1030 -656
data/lib/mediacloth/mediawikilinkhandler.rb +89 -0
data/lib/mediacloth/mediawikiparams.rb +1 -10
data/lib/mediacloth/mediawikiparser.rb +939 -409
data/lib/mediacloth/mediawikiparser.tab.rb +1357 -0
data/lib/mediacloth/mediawikiparser.y +256 -52
data/lib/mediacloth/mediawikisignedwikigenerator.rb +42 -0
data/lib/mediacloth/mediawikitemplatehandler.rb +8 -0
data/lib/mediacloth/mediawikiwalker.rb +72 -1
data/lib/mediacloth.rb +33 -10
data/test/data/ast1 +68 -0
data/test/data/ast10 +196 -0
data/test/data/ast11 +34 -0
data/test/data/ast12 +39 -0
data/test/data/ast13 +25 -0
data/test/data/ast14 +13 -0
data/test/data/ast15 +25 -0
data/test/data/ast16 +17 -0
data/test/data/ast17 +9 -0
data/test/data/ast18 +21 -0
data/test/data/ast19 +32 -0
data/test/data/ast2 +4 -0
data/test/data/ast20 +10 -0
data/test/data/ast21 +27 -0
data/test/data/ast22 +22 -0
data/test/data/ast23 +5 -0
data/test/data/ast3 +6 -0
data/test/data/ast4 +122 -0
data/test/data/ast5 +122 -0
data/test/data/ast6 +22 -0
data/test/data/ast7 +143 -0
data/test/data/ast8 +3 -0
data/test/data/ast9 +11 -0
data/test/data/html1 +33 -5
data/test/data/html10 +31 -27
data/test/data/html11 +19 -0
data/test/data/html12 +32 -0
data/test/data/html13 +29 -0
data/test/data/html14 +4 -0
data/test/data/html15 +29 -0
data/test/data/html16 +28 -0
data/test/data/html17 +10 -0
data/test/data/html18 +8 -0
data/test/data/html19 +27 -0
data/test/data/html2 +1 -1
data/test/data/html20 +7 -0
data/test/data/html21 +5 -0
data/test/data/html22 +24 -0
data/test/data/html23 +7 -0
data/test/data/html3 +1 -1
data/test/data/html4 +60 -11
data/test/data/html5 +45 -6
data/test/data/html6 +5 -5
data/test/data/html7 +59 -1
data/test/data/html8 +1 -1
data/test/data/html9 +10 -2
data/test/data/input1 +4 -0
data/test/data/input11 +19 -0
data/test/data/input12 +32 -0
data/test/data/input13 +10 -0
data/test/data/input14 +8 -0
data/test/data/input15 +10 -0
data/test/data/input16 +28 -0
data/test/data/input17 +10 -0
data/test/data/input18 +16 -0
data/test/data/input19 +29 -0
data/test/data/input20 +8 -0
data/test/data/input21 +18 -0
data/test/data/input22 +20 -0
data/test/data/input23 +8 -0
data/test/data/input4 +13 -1
data/test/data/input5 +45 -4
data/test/data/input7 +25 -1
data/test/data/lex1 +17 -18
data/test/data/lex10 +57 -87
data/test/data/lex11 +18 -0
data/test/data/lex12 +32 -0
data/test/data/lex13 +3 -0
data/test/data/lex14 +1 -0
data/test/data/lex15 +3 -0
data/test/data/lex16 +27 -0
data/test/data/lex17 +9 -0
data/test/data/lex18 +4 -0
data/test/data/lex19 +27 -0
data/test/data/lex2 +2 -2
data/test/data/lex20 +7 -0
data/test/data/lex21 +4 -0
data/test/data/lex22 +3 -0
data/test/data/lex23 +7 -0
data/test/data/lex3 +1 -1
data/test/data/lex4 +35 -29
data/test/data/lex5 +57 -18
data/test/data/lex6 +7 -7
data/test/data/lex7 +42 -18
data/test/data/lex8 +1 -1
data/test/data/lex9 +6 -6
data/test/dataproducers/ast.rb +24 -0
data/test/dataproducers/html.rb +11 -12
data/test/dataproducers/lex.rb +9 -4
data/test/debugwalker.rb +25 -11
data/test/htmlgenerator.rb +170 -13
data/test/lexer.rb +626 -83
data/test/linkhandler.rb +39 -0
data/test/parser.rb +176 -9
data/test/signedwikigenerator.rb +113 -0
metadata +158 -79
data/README +0 -37
data/lib/mediacloth/mediawikilexer.rb~ +0 -491
data/lib/mediacloth/mediawikiparser.y~ +0 -210
data/test/data/result1 +0 -48
data/test/dataproducers/html.rb~ +0 -24
data/test/dataproducers/lex.rb~ +0 -15

data/lib/mediacloth/mediawikilexer.rb CHANGED Viewed

@@ -1,737 +1,1111 @@
-#The lexer for MediaWiki language.
-#
-#Standalone usage:
-# file = File.new("somefile", "r")
-# input = file.read
-# lexer = MediaWikiLexer.new
-# lexer.tokenize(input)
-#
-#Inside RACC-generated parser:
-# ...
-# ---- inner ----
-# attr_accessor :lexer
-# def parse(input)
-#     lexer.tokenize(input)
-#     return do_parse
-# end
-# def next_token
-#     return @lexer.lex
-# end
-# ...
-# parser = MediaWikiParser.new
-# parser.lexer = MediaWikiLexer.new
-# parser.parse(input)
-class MediaWikiLexer
-    #Initialized the lexer with a match table.
-    #
-    #The match table tells the lexer which method to invoke
-    #on given input char during "tokenize" phase.
-    def initialize
-        @position = 0
-        @pair_stack = [[false, false]] #stack of tokens for which a pair should be found
-        @list_stack = []
-        # Default lexer table
-        @lexer_table = Hash.new(method(:match_other))
-        @lexer_table["'"] = method(:match_italic_or_bold)
-        @lexer_table["="] = method(:match_section)
-        @lexer_table["["] = method(:match_link_start)
-        @lexer_table["]"] = method(:match_link_end)
-        @lexer_table["|"] = method(:match_link_sep_or_table_cell)
-        @lexer_table[" "] = method(:match_space)
-        @lexer_table["*"] = method(:match_list)
-        @lexer_table["#"] = method(:match_list)
-        @lexer_table[";"] = method(:match_list)
-        @lexer_table[":"] = method(:match_list)
-        @lexer_table["-"] = method(:match_line)
-        @lexer_table["~"] = method(:match_signature)
-        @lexer_table["h"] = method(:match_inline_link)
-        @lexer_table["\n"] = method(:match_newline)
-        @lexer_table["\r"] = method(:match_carriagereturn)
-        @lexer_table["<"] = method(:match_tag_start)
-        @lexer_table["{"] = method(:match_table)
-        @lexer_table["!"] = method(:match_table_head)
-        # Lexer table used when inside :match_tag_start ... :match_tag_end
-        @tag_lexer_table = Hash.new(method(:match_other))
-        @tag_lexer_table["<"] = method(:match_tag_end)
-        # Begin lexing in default state
-        @current_lexer_table = @lexer_table
-    end
-    #Transforms input stream (string) into the stream of tokens.
-    #Tokens are collected into an array of type [ [TOKEN_SYMBOL, TOKEN_VALUE], ..., [false, false] ].
-    #This array can be given as input token-by token to RACC based parser with no
-    #modification. The last token [false, false] inficates EOF.
-    def tokenize(input)
-        @tokens = []
-        start_para
-        @cursor = 0
-        @text = input
-        @next_token = []
-        #This tokenizer algorithm assumes that everything that is not
-        #matched by the lexer is going to be :TEXT token. Otherwise it's usual
-        #lexer algo which call methods from the match table to define next tokens.
-        while (@cursor < @text.length)
-            @current_token = [:TEXT, ''] unless @current_token
-            @token_start = @cursor
-            @char = @text[@cursor, 1]
-            if @current_lexer_table[@char].call == :TEXT
-                @current_token[1] += @text[@token_start, 1]
-            else
-                #skip empty :TEXT tokens
-                unless empty_text_token?
-                    @tokens << @current_token
-                    unless para_breaker?(@next_token[0]) or in_block?
-                        #if no paragraph was previously started
-                        #then we should start it
-                        start_para if !@para
-                    else
-                        #if we already have a paragraph this is the time to close it
-                        end_para if @para
-                    end
-                end
-                if para_breaker?(@next_token[0])
-                    if @tokens.last and @tokens.last[0] == :PARA_START
-                        #we need to remove para start token because no para end is possible
-                        @tokens.pop
-                        @para = false
-                    elsif @para
-                        end_para
-                    end
-                end
+require 'strscan'
+class String
+  def is_empty_token?
+    self.size == 0 or self == "\n" or self == "\r\n"
+  end
+end
-                @next_token[1] = @text[@token_start, @cursor - @token_start]
-                @tokens << @next_token
-                #hack to enable sub-lexing!
-                if @sub_tokens
-                    @tokens += @sub_tokens
-                    @sub_tokens = nil
-                end
-                #end of hack!
+# Class for storing text tokens data - index and text
+class TokenString < String
+    attr_reader :idx
+    def initialize(lexer, text = '')
+        @lexer = lexer
+        @idx = 0
+        super(text)
+    end
+    def <<(pending_text)
+        # If TokenString.length is 0 and we are pushing some text
+        # than in this moment we can retreive this tokes's index
+        if length == 0
+            @idx = @lexer.cursor
+        end
+        super(pending_text)
+    end
+end
-                #if the next token can start the paragraph, let's try that
-                start_para if @tokens.last and para_starter?(@tokens.last[0])
+class TokenArray < Array
+    def initialize(lexer)
+        @lexer = lexer
+    end
-                @current_token = nil
-                @next_token = []
-            end
+    def <<(token)
+        if @lexer.tokens.last && (@lexer.tokens.last[3].nil? || @lexer.tokens.last[3] == 0)
+            @lexer.tokens.last[3] = @lexer.cursor - @lexer.tokens.last[2]
         end
-        #add the last TEXT token if it exists
-        @tokens << @current_token if @current_token and not empty_text_token?
+        token[2] = @lexer.cursor
+        super(token)
+    end
-        #remove empty para start or finish the paragraph if necessary
-        if @tokens.last and @tokens.last[0] == :PARA_START
-            @tokens.pop
-            @para = false
-        else
-            end_para if @para
+    def append_pending(text)
+        if @lexer.tokens.last && @lexer.tokens.last[3].nil?
+            @lexer.tokens.last[3] = text.idx - @lexer.tokens.last[2]
         end
-        #RACC wants us to put this to indicate EOF
-        @tokens << [false, false]
-        @tokens
+        token = [:TEXT, text, text.idx, text.length]
+        push(token)
     end
-    #Returns the next token from the stream. Useful for RACC parsers.
-    def lex
-        token = @tokens[@position]
-        @position += 1
-        return token
+    def to_s
+        string_copy = ""
+        each do |token|
+            string_copy << "#{token[0..1]}[#{token[2]}, #{token[3]}]"
+        end
+        string_copy
     end
+end
-private
-    #Returns true if the token breaks the paragraph.
-    def para_breaker?(token)
-        [:SECTION_START, :SECTION_END,
-        :TABLE_START, :TABLE_END, :ROW_START, :ROW_END, :HEAD_START, :HEAD_END, :CELL_START, :CELL_END,
-        :UL_START, :UL_END, :OL_START, :OL_END,
-        :DL_START, :DL_END, :HLINE, :PRE].include?(token)
-    end
-    #Returns true if the paragraph can be started after the token
-    def para_starter?(token)
-        [:SECTION_END, :TABLE_END, :UL_END, :OL_END, :DL_END, :HLINE, :PRE].include?(token)
-    end
+class MediaWikiLexer
+  INLINE_ELEMENTS = [:LINK, :INTLINK, :BOLD, :ITALIC]
+  BLOCK_ELEMENTS = [:PARA, :PRE, :PREINDENT, :UL, :OL, :DL, :LI, :SECTION, :TABLE, :ROW, :CELL, :HEAD]
+  PARA_BREAK_ELEMENTS = [:UL, :OL, :DL, :PRE, :PREINDENT, :PASTE_START, :SECTION, :TABLE, :HLINE, :KEYWORD]
+  NAME_CHAR_TABLE = (0 .. 255).collect{|n| n.chr =~ /[a-zA-Z0-9_\-]/ ? true : false}
+  TOKEN_CHAR_TABLE = (0 .. 255).collect{|n| n.chr =~ /[a-zA-Z0-9_\-.;:?&@~=#%\/]/ ? true : false}
+  PUNCTUATION_CHAR_TABLE = (0 .. 255).collect{|n| n.chr =~ /[\.,;:\-?]/ ? true : false}
+  HTML_TAGS = %w{ a abbr acronym address applet area b base basefont bdo big blockquote body br
+    button caption center cite code col colgroup dd del dir div dfn dl dt em fieldset font form frame
+    frameset h1 h2 h3 h4 h5 h6 head hr html i iframe img input ins isindex kbd label legend li link map
+    menu meta noframes noscript object ol optgroup option p param pre q s samp script select small span
+    strike strong style sub sup table tbody td textarea tfoot th thead title tr tt u ul var xmp }
+  WIKI_TAGS = %w{ nowiki math paste }
+  TAGS_WITHOUT_CLOSE_TAG = %w{ br hr img }
+  attr_reader :cursor
+  attr_reader :tokens
+  def initialize
+    # Current position in token list
+    @position = 0
-    def in_block?
-      @pair_stack.select {|token| para_breaker?(token[0])}.size > 0 or
-        (@sub_tokens and @sub_tokens.select {|token| para_breaker?(token[0])}.size > 0)
-    end
+    # Lexer table of methods that handle only formatting, e.g. bold or italicized
+    # text; or spans of XHTML, or wiki-escape, markup
+    @formatting_lexer_table = {}
+    @formatting_lexer_table["'"] = method(:match_quote)
+    @formatting_lexer_table["<"] = method(:match_left_angle)
+    @formatting_lexer_table["&"] = method(:match_ampersand)
+    @formatting_lexer_table["{"] = method(:match_left_curly)
+    # Lexer table of methods that handle everything that may occur in-line in
+    # addition to formatting, i.e. links and signatures
+    @inline_lexer_table = @formatting_lexer_table.dup
+    @inline_lexer_table["["] = method(:match_left_square)
+    @inline_lexer_table["~"] = method(:match_tilde)
+    @inline_lexer_table["h"] = method(:match_h_char)
+    # Default lexer table, which includes all in-line formatting and links, plus
+    # methods that handle constructs that begin on a newline
+    @default_lexer_table = @inline_lexer_table.dup
+    @default_lexer_table[" "] = method(:match_space)
+    @default_lexer_table["="] = method(:match_equal)
+    @default_lexer_table["*"] = method(:match_star)
+    @default_lexer_table["#"] = method(:match_hash)
+    @default_lexer_table[":"] = method(:match_colon)
+    @default_lexer_table[";"] = method(:match_semicolon)
+    @default_lexer_table["-"] = method(:match_dash)
+    @default_lexer_table["_"] = method(:match_underscore)
+    @default_lexer_table["\n"] = method(:match_newline)
+    @default_lexer_table["\r"] = method(:match_newline)
+    # Lexer table used inside spans of markup, wherein spans of newlines are not
+    # automatically treated as paragraphs.
+    @markup_lexer_table = @default_lexer_table.dup
+    @markup_lexer_table["\n"] = nil
+    @markup_lexer_table["\r"] = nil
+    # Lexer table used inside of headings
+    @heading_lexer_table = @inline_lexer_table.dup
+    @heading_lexer_table["="] = method(:match_equal_in_heading)
+    @heading_lexer_table["\n"] = method(:match_newline_in_heading)
+    # Lexer table used inside the left half of an external link
+    @link_lexer_table = {}
+    @link_lexer_table["]"] = method(:match_right_square_in_link)
+    @link_lexer_table["\n"] = method(:match_newline_in_link)
+    @link_lexer_table["\r"] = method(:match_newline_in_link)
+    @link_lexer_table[" "] = method(:match_space_in_link)
+    # Lexer table used inside the right half of an external link, or the right
+    # half of an internal link
+    @link_opt_lexer_table = @inline_lexer_table.dup
+    @link_opt_lexer_table["]"] = method(:match_right_square_in_link)
+    @link_opt_lexer_table["\n"] = method(:match_newline_in_link)
+    @link_opt_lexer_table["\r"] = method(:match_newline_in_link)
+    # Lexer table used inside the left half of an internal link or internal
+    # resource link
+    @intlink_lexer_table = {}
+    @intlink_lexer_table["]"] = method(:match_right_square_in_intlink)
+    @intlink_lexer_table["\r"] = method(:match_newline_in_intlink)
+    @intlink_lexer_table["\n"] = method(:match_newline_in_intlink)
+    @intlink_lexer_table[":"] = method(:match_colon_in_intlink)
+    @intlink_lexer_table["|"] = method(:match_pipe_in_intlink)
+    @intlink_lexer_table["C"] = method(:match_c_char_in_intlink)
+    # Lexer table used inside the category name of the left half of an
+    # internal link
+    @intlink_cat_lexer_table = {}
+    @intlink_cat_lexer_table["]"] = method(:match_right_square_in_intlink)
+    @intlink_cat_lexer_table["\r"] = method(:match_newline_in_intlink)
+    @intlink_cat_lexer_table["\n"] = method(:match_newline_in_intlink)
+    @intlink_cat_lexer_table["|"] = method(:match_pipe_in_intlink)
+    # Lexer table used inside the right half of an internal link
+    @intlink_opt_lexer_table = @formatting_lexer_table.dup
+    @intlink_opt_lexer_table["]"] = method(:match_right_square_in_intlink)
+    @intlink_opt_lexer_table["\n"] = method(:match_newline_in_intlink)
+    @intlink_opt_lexer_table["\r"] = method(:match_newline_in_intlink)
+    # Lexer table used inside the right half of an internal resource link
+    @resourcelink_opt_lexer_table = @inline_lexer_table.dup
+    @resourcelink_opt_lexer_table["]"] = method(:match_right_square_in_intlink)
+    @resourcelink_opt_lexer_table["\n"] = method(:match_newline_in_intlink)
+    @resourcelink_opt_lexer_table["\r"] = method(:match_newline_in_intlink)
+    @resourcelink_opt_lexer_table["|"] = method(:match_pipe_in_intlink)
+    # Lexer table used to parse tables
+    @table_lexer_table = @inline_lexer_table.dup
+    @table_lexer_table["*"] = method(:match_star)
+    @table_lexer_table["#"] = method(:match_hash)
+    @table_lexer_table["|"] = method(:match_pipe_in_table)
+    @table_lexer_table["!"] = method(:match_bang_in_table)
+    @table_lexer_table["{"] = method(:match_left_curly)
+    @table_lexer_table[" "] = method(:match_space)
+    # Lexer table used to parse ordered and unordered list items (which may nest)
+    @items_lexer_table = @inline_lexer_table.dup
+    @items_lexer_table["\n"] = method(:match_newline_in_items)
+    # Lexer table used to parse entries in a definition list (which may not nest)
+    @entries_lexer_table = @inline_lexer_table.dup
+    @entries_lexer_table["\n"] = method(:match_newline_in_entries)
+    @entries_lexer_table[":"] = method(:match_colon_in_entries)
+    # Lexer table used inside spans of indented text
+    @indent_lexer_table = @inline_lexer_table.dup
+    @indent_lexer_table["\n"] = method(:match_newline_in_indent)
+    # Lexer table used inside spans of pre-formatted text
+    @pre_lexer_table = {}
+    @pre_lexer_table["<"] = method(:match_left_angle_in_pre)
+    # Lexer table used inside spans of <code>
+    @code_lexer_table = @inline_lexer_table.dup
+    @code_lexer_table[" "] = method(:match_space_in_code)
+    @code_lexer_table["<"] = method(:match_left_angle_in_code)
+    # Lexer table used when inside spans of wiki-escaped text
+    @nowiki_lexer_table = {}
+    @nowiki_lexer_table["<"] = method(:match_left_angle_in_nowiki)
-    #-- ================== Match methods ================== ++#
+    @paste_lexer_table = {}
+    @paste_lexer_table["<"] = method(:match_left_angle_in_paste)
+    @paste_lexer_table["\n"] = method(:match_newline_in_paste)
+    @paste_lexer_table["\r"] = method(:match_newline_in_paste)
-    #Matches anything that was not matched. Returns :TEXT to indicate
-    #that matched characters should go into :TEXT token.
-    def match_other
+    # Lexer table used when inside spans of math
+    @math_lexer_table = {}
+    @math_lexer_table["<"] = method(:match_left_angle_in_math)
+    # Lexer table used when inside a wiki template inclusion
+    @template_lexer_table = {}
+    @template_lexer_table["{"] = method(:match_left_curly_in_template)
+    @template_lexer_table["|"] = method(:match_pipe_in_template)
+    @template_lexer_table["}"] = method(:match_right_curly_in_template)
+    @template_param_lexer_table = {}
+    @template_param_lexer_table["{"] = method(:match_left_curly_in_template)
+    @template_param_lexer_table["}"] = method(:match_right_curly_in_template)
+    @template_param_lexer_table["|"] = method(:match_pipe_in_template)
+    # Begin lexing in default state
+    @lexer_table = LexerTable.new
+    @lexer_table.push(@default_lexer_table)
+  end
+  def tokenize(input)
+    @text = input
+    # Current position in the input text
+    @cursor = 0
+    # Tokens to be returned
+    @tokens = TokenArray.new(self)
+    # Stack of open token spans
+    @context = []
+    # Already lexed character data, not yet added to a TEXT token
+    @pending = TokenString.new(self)
+    # List symbols from the most recent line item of a list, e.g. '***'
+    @list = ''
+    start_span(:PARA)
+    while (@cursor < @text.length)
+      @char = @text[@cursor, 1]
+      if @lexer_table[@char]
+        @lexer_table[@char].call
+      else
+        @pending << @char
         @cursor += 1
-        return :TEXT
+      end
     end
-    #Matches italic or bold symbols:
-    # "'''"     { return :BOLD; }
-    # "''"      { return :ITALIC; }
-    def match_italic_or_bold
-        if @text[@cursor, 5] == "'''''"
-            if @pair_stack.last[0] == :BOLDSTART
-              matchBold
-              @cursor += 3
-            else
-              matchItalic
-              @cursor += 2
-            end
-            return
-        end
-        if @text[@cursor, 3] == "'''"
-            matchBold
-            @cursor += 3
-            return
-        end
-        if @text[@cursor, 2] == "''"
-            matchItalic
-            @cursor += 2
-            return
-        end
-        match_other
+    if @pending.is_empty_token?
+      if @context.size > 0 and @tokens.last[0] == :PARA_START
+        @context.pop
+        @tokens.pop
+      end
+    else
+      @tokens.append_pending(@pending)
+      @pending = TokenString.new(self)
     end
+    while(@context.size > 0) do
+      @tokens << [(@context.pop.to_s + '_END').to_sym, '']
+    end
+    @tokens << [false, false, 0, 0]
+    @tokens
+  end
-    def matchBold
-        if @pair_stack.last[0] == :BOLDSTART
-            @next_token[0] = :BOLDEND
-            @pair_stack.pop
-        else
-            @next_token[0] = :BOLDSTART
-            @pair_stack.push @next_token
-        end
+  #Returns the next token from the stream. Useful for RACC parsers.
+  def lex
+    token = @tokens[@position]
+    @position += 1
+    return token
+  end
+  private
+  def match_text
+    @pending << @char
+    @cursor += 1
+  end
+  def match_ampersand
+    i = @cursor + 1
+    i += 1 while i < @text.size and NAME_CHAR_TABLE[@text[i].ord]
+    if @text[i, 1] == ';'
+      append_to_tokens([:CHAR_ENT, @text[(@cursor + 1) ... i]])
+      @cursor = i + 1
+    else
+      match_text
+    end
+  end
+  def match_quote
+    if @text[@cursor, 5] == "'''''"
+      if @context.last == :BOLD
+        match_bold
+        @cursor += 3
+      else
+        match_italic
+        @cursor += 2
+      end
+    elsif @text[@cursor, 3] == "'''"
+      match_bold
+      @cursor += 3
+    elsif @text[@cursor, 2] == "''"
+      match_italic
+      @cursor += 2
+    else
+      match_text
     end
+  end
-    def matchItalic
-        if @pair_stack.last[0] == :ITALICSTART
-            @next_token[0] = :ITALICEND
-            @pair_stack.pop
-        else
-            @next_token[0] = :ITALICSTART
-            @pair_stack.push @next_token
-        end
+  def match_bold
+    if @context.last == :BOLD
+      end_span(:BOLD, "'''")
+    else
+      start_span(:BOLD, "'''")
     end
+  end
-    #Matches sections
-    def match_section
-        if at_start_of_line? or (@pair_stack.last[0] == :SECTION_START)
-            i = 0
-            i += 1 while @text[@cursor+i, 1] == "="
-            @cursor += i
+  def match_italic
+    if @context.last == :ITALIC
+      end_span(:ITALIC, "''")
+    else
+      start_span(:ITALIC, "''")
+    end
+  end
-            if @pair_stack.last[0] == :SECTION_START
-                @next_token[0] = :SECTION_END
-                @pair_stack.pop
+  def match_tilde
+    if @text[@cursor, 5] == "~~~~~"
+      empty_span(:SIGNATURE_DATE, "~~~~~", 5)
+    elsif @text[@cursor, 4] == "~~~~"
+      empty_span(:SIGNATURE_FULL, "~~~~", 4)
+    elsif @text[@cursor, 3] == "~~~"
+      empty_span(:SIGNATURE_NAME, "~~~", 3)
+    else
+      match_text
+    end
+  end
+  def match_left_angle
+    next_char = @text[@cursor + 1]
+    if !next_char
+      match_text
+    elsif next_char.ord == 47
+      # Might be an XHTML end tag
+      if @text[@cursor .. -1] =~ %r{</([a-zA-Z][a-zA-Z0-9\-_]*)(\s*)>} and @context.include?(:TAG)
+        # Found an XHTML end tag
+        tag_name = $1
+        end_span(:TAG, $1)
+        @lexer_table.pop
+        @cursor += $1.length + $2.length + 3
+      else
+        match_text
+      end
+    elsif next_char.ord > 64 and next_char.ord < 123
+      # Might be an XHTML open or empty tag
+      scanner = StringScanner.new(@text[@cursor .. -1])
+      if scanner.scan(%r{<([a-zA-Z][a-zA-Z0-9\-_]*)}) and (HTML_TAGS.include?(scanner[1]) or WIKI_TAGS.include?(scanner[1]))
+        # Sequence begins with a valid tag name, so check for attributes
+        tag_name = scanner[1]
+        attrs = {}
+        while scanner.scan(%r{\s+([a-zA-Z][a-zA-Z0-9\-_]*)\s*=\s*('([^']+)'|"([^"]+)"|([^>\s]+))}) do
+          attrs[scanner[1]] = scanner[3] ? scanner[3] : (scanner[4] ? scanner[4] : scanner[5])
+        end
+        scanner.scan(%r{\s*})
+        if ((c = scanner.get_byte) == '>' or (c == '/' and scanner.get_byte == '>'))
+          # Found an XHTML start or empty tag
+          if tag_name == 'nowiki'
+            @lexer_table.push(@nowiki_lexer_table) unless c == '/'
+          elsif tag_name == 'paste'
+            unless c == '/'
+                maybe_close_para(:PASTE_START, true)
+                append_to_tokens([:PASTE_START, ''])
+                @cursor += scanner.pos
+                @lexer_table.push(@paste_lexer_table)
+                #eat newline after <paste> if if exists because otherwise
+                #it will be transformed into <br/>
+                if @text[@cursor, 1] == "\n"
+                    @cursor += 1
+                elsif @text[@cursor, 2] == "\r\n"
+                    @cursor += 2
+                end
+                return
+            end
+          else
+            if tag_name == 'pre'
+              table = @pre_lexer_table
+            elsif tag_name == 'code'
+              table = @code_lexer_table
+            elsif tag_name == 'math'
+              table = @math_lexer_table
+            else
+              table = @markup_lexer_table
+            end
+            start_span(:TAG, tag_name)
+            attrs.collect do |(name, value)|
+              append_to_tokens([:ATTR_NAME, name])
+              append_to_tokens([:ATTR_VALUE, value]) if value
+            end
+            if c == '/' or TAGS_WITHOUT_CLOSE_TAG.include? tag_name
+              end_span(:TAG, tag_name)
             else
-                @next_token[0] = :SECTION_START
-                @pair_stack.push @next_token
+              @lexer_table.push(table)
             end
+          end
+          @cursor += scanner.pos #FIXME: will break xhtml attribute length calculation
         else
-            match_other
+          match_text
         end
+      else
+        match_text
+      end
+    else
+      match_text
     end
+  end
-    #Matches start of the hyperlinks
-    # "[["      { return INTLINKSTART; }
-    # "["       { return LINKSTART; }
-    def match_link_start
-        if @text[@cursor, 2] == "[[" and @text[@cursor+2, @text.length - (@cursor + 2)] =~ %r{\A\s*[^\s\]]}
-            @next_token[0] = :INTLINKSTART
-            @pair_stack.push @next_token
-            @cursor += 2
-        elsif @text[@cursor, 1] == "[" and link_protocol?(@cursor+1)
-            @next_token[0] = :LINKSTART
-            @pair_stack.push @next_token
-            @cursor += 1
+  def match_equal
+    if at_start_of_line?
+      @heading = extract_char_sequence('=')
+      @cursor += @heading.length
+      if at_end_of_line? or blank_line?
+        @cursor -= @heading.length
+        #special case - no header text, just "=" signs
+        #try to split header into "=" formatting and text with "=":
+        # example:
+        #  ==== should become: = == =
+        #  ===== should become: == = ==
+        if @heading =~ /(={6})(=+)(={6})/ or
+                @heading =~ /(={5})(=+)(={5})/ or
+                @heading =~ /(={4})(=+)(={4})/ or
+                @heading =~ /(={3})(=+)(={3})/ or
+                @heading =~ /(={2})(=+)(={2})/ or
+                @heading =~ /(=)(=+)(=)/
+            start_span(:SECTION, $1)
+            @cursor += $1.length
+            @tokens << [:TEXT, $2]
+            @cursor += $2.length
+            end_span(:SECTION, $3)
+            @cursor += $3.length
         else
-            match_other
+            match_text
         end
+      else
+        @cursor -= @heading.length
+        start_span(:SECTION, @heading)
+        @cursor += @heading.length
+        @lexer_table.push(@heading_lexer_table)
+      end
+    else
+      match_text
+    end
+  end
+  def match_equal_in_heading
+    heading = extract_char_sequence('=')
+    if @heading.length <= heading.length
+      end_span(:SECTION, heading)
+      @lexer_table.pop
+      @cursor += heading.length
+      skip_newline
+    else
+      @pending << heading
+      @cursor += heading.length
     end
+  end
+  def match_newline_in_heading
+    end_span(:SECTION)
+    @lexer_table.pop
+  end
-    #Matches end of the hyperlinks
-    # "]]"      { return INTLINKEND; }
-    # "]"       { return LINKEND; }
-    def match_link_end
-        if @text[@cursor, 2] == "]]" and @pair_stack.last[0] == :INTLINKSTART
-            @next_token[0] = :INTLINKEND
-            @pair_stack.pop
-            @cursor += 2
-        elsif @text[@cursor, 1] == "]" and @pair_stack.last[0] == :LINKSTART
-            @next_token[0] = :LINKEND
-            @pair_stack.pop
-            @cursor += 1
-        else
-            match_other
-        end
+  def match_left_square
+    if @text[@cursor, 2] == "[["
+      if @text[@cursor + 2, 1] != "]"
+        start_span(:INTLINK, "[[")
+        @cursor += 2
+        @lexer_table.push(@intlink_lexer_table)
+      else
+        match_text
+      end
+    elsif @text[@cursor + 1 .. -1] =~ %r{\A\s*((http|https|file)://|mailto:)}
+      start_span(:LINK, "[")
+      @cursor += 1
+      skip_whitespace
+      @lexer_table.push(@link_lexer_table)
+    else
+      match_text
     end
+  end
+  def match_right_square_in_link
+    end_span(:LINK, "]")
+    @cursor += 1
+    @lexer_table.pop
+  end
+  def match_right_square_in_intlink
+    if @text[@cursor, 2] == "]]"
+      end_span(:INTLINK, "]]")
+      @cursor += 2
+      @lexer_table.pop
+    else
+      match_text
+    end
+  end
+  def match_space_in_link
+    spaces = extract_char_sequence(' ')
+    append_to_tokens([:LINKSEP, ' ']) unless @text[@cursor, 1] == ']'
+    @cursor += spaces.length
+    @lexer_table.pop
+    @lexer_table.push(@link_opt_lexer_table)
+  end
-    #Matches link separator inside of internal links
-    def match_link_sep
-      if @tokens[-1][0] == :INTLINKSTART or inside_resource_link
-        @next_token[0] = :INTLINKSEP
-        @cursor += 1
-      else
-        match_other
+  def match_pipe_in_intlink
+    if @tokens.last[0] == :INTLINK_START
+      @lexer_table.pop
+      @lexer_table.push(@intlink_opt_lexer_table)
+    end
+    append_to_tokens([:INTLINKSEP, "|"])
+    @cursor += 1
+  end
+  def match_colon_in_intlink
+    if not @pending.is_empty_token?
+      @lexer_table.pop
+      @lexer_table.push(@resourcelink_opt_lexer_table)
+    end
+    append_to_tokens([:RESOURCESEP, ":"])
+    @cursor += 1
+  end
+  def match_c_char_in_intlink
+    if @text[@cursor, 9] == 'Category:'
+      append_to_tokens([:CATEGORY, 'Category:'])
+      @lexer_table.pop
+      @lexer_table.push(@intlink_cat_lexer_table)
+      @cursor += 9
+    else
+      match_text
+    end
+  end
+  def match_newline_in_link
+    end_span(:LINK)
+    @lexer_table.pop
+  end
+  def match_newline_in_intlink
+    end_span(:INTLINK)
+    @lexer_table.pop
+  end
+  def match_h_char
+    link = @text[@cursor, 7] if @text[@cursor, 7] == 'http://'
+    link = @text[@cursor, 8] if @text[@cursor, 8] == 'https://'
+    if link
+      start_span(:LINK)
+      i = @cursor + link.length
+      while i < @text.size and TOKEN_CHAR_TABLE[@text[i].ord] do
+        link << @text[i, 1]
+        i += 1
+      end
+      #exclude punctuation at the end
+      while link.length > 0 and PUNCTUATION_CHAR_TABLE[link[-1].ord] do
+        link = link[0..-2]
+        i -= 1
       end
+      @pending = TokenString.new(self)
+      @pending << link
+      @cursor = i
+      end_span(:LINK)
+    else
+      match_text
     end
+  end
-    #Matches inlined unformatted html link
-    # "http://[^\s]*"   { return [ LINKSTART TEXT LINKEND]; }
-    def match_inline_link
-        #if no link start token was detected and the text starts with http://
-        #then it's the inlined unformatted html link
-        last_pair_token = @pair_stack.last[0]
-        if link_protocol?(@cursor) and last_pair_token != :INTLINKSTART and last_pair_token != :LINKSTART
-            @next_token[0] = :LINKSTART
-            text = @text[@cursor..-1]
-            if last_pair_token == :ITALICSTART and text =~ /\A([^\s\n]+)''/
-              linkText = $1
-            elsif last_pair_token == :BOLDSTART and text =~ /\A([^\s\n]+)'''/
-              linkText = $1
-            elsif text =~ /\A([^\s\n]+)[\s\n]/
-              linkText = $1
-            else
-              linkText = text
-            end
-            @sub_tokens = []
-            @sub_tokens << [:TEXT, linkText]
-            @sub_tokens << [:LINKEND, ']']
-            @cursor += linkText.length
-            @token_start = @cursor
-        else
-            match_other
-        end
+  def match_space
+    if at_start_of_line? and !blank_line?
+      start_span(:PREINDENT)
+      @lexer_table.push(@indent_lexer_table)
+      match_text
+    else
+      match_text
+    end
+  end
+  def match_newline_in_indent
+    match_text
+    unless @text[@cursor, 1] == " "
+      @tokens.append_pending(@pending)
+      @pending = TokenString.new(self)
+      end_span(:PREINDENT)
+      @lexer_table.pop
     end
+  end
-    #Matches space to find preformatted areas which start with a space after a newline
-    # "\n\s[^\n]*"     { return PRE; }
-    def match_space
-        if at_start_of_line? and ! in_table?
-            match_untill_eol
-            @next_token[0] = :PRE
-            strip_ws_from_token_start
-        elsif @pair_stack.last[0] == :LINKSTART and @current_token[0] == :TEXT and @tokens.last[0] != :LINKSEP
-            @next_token[0] = :LINKSEP
-            @cursor += 1
-            strip_ws_from_token_start
-        else
-            match_other
+  def match_star
+    if at_start_of_line?
+      @list = extract_char_sequence('#*')
+      open_list(@list)
+      @lexer_table.push(@items_lexer_table)
+    else
+      match_text
+    end
+  end
+  def match_hash
+    if at_start_of_line?
+      @list = extract_char_sequence('#*')
+      open_list(@list)
+      @lexer_table.push(@items_lexer_table)
+    else
+      match_text
+    end
+  end
+  def match_underscore
+    if @text[@cursor, 7] == '__TOC__'
+      empty_span(:KEYWORD, 'TOC', 7)
+    elsif @text[@cursor, 9] == '__NOTOC__'
+      empty_span(:KEYWORD, 'NOTOC', 9)
+    else
+      match_text
+    end
+  end
+  def match_newline_in_items
+    if @text[@cursor, 1] == "\n"
+      newline = "\n"
+      char = @text[@cursor + 1, 1]
+    else
+      newline = "\r\n"
+      char = @text[@cursor + 2, 1]
+    end
+    @pending << newline
+    @cursor += newline.length
+    if (char == @list[0, 1])
+      list = extract_char_sequence('#*')
+      if list == @list
+        end_span(:LI)
+        start_span(:LI)
+        @cursor += list.length
+      else
+        l = @list.length > list.length ? list.length : @list.length
+        i = 0
+        i += 1 while (i < l and @list[i] == list[i])
+        if i < @list.length
+          close_list(@list[i .. -1])
+          if @context.last == :LI
+            end_span(:LI)
+            start_span(:LI)
+          end
         end
+        if i < list.length
+          start_span(:LI) if @context.last != :LI
+          open_list(list[i .. -1])
+        end
+        @cursor += i
+        @list = list
+      end
+    else
+      close_list(@list)
+      @lexer_table.pop
     end
+  end
+  def match_dash
+    if at_start_of_line? and @text[@cursor, 4] == "----"
+      empty_span(:HLINE, "----", 4)
+    else
+      match_text
+    end
+  end
+  def match_left_angle_in_nowiki
+    if @text[@cursor, 9] == '</nowiki>'
+      @cursor += 9
+      @lexer_table.pop
+    else
+      match_text
+    end
+  end
-    #Matches any kind of list by using sublexing technique. MediaWiki lists are context-sensitive
-    #therefore we need to do some special processing with lists. The idea here is to strip
-    #the leftmost symbol indicating the list from the group of input lines and use separate
-    #lexer to process extracted fragment.
-    def match_list
-        if at_start_of_line?
-            list_id = @text[@cursor, 1]
-            sub_text = extract_list_contents(list_id)
-            extracted = 0
-            #hack to tokenize everything inside the list
-            @sub_tokens = []
-            sub_lines = ""
-            @sub_tokens << [:LI_START, ""]
-            sub_text.each do |t|
-                extracted += 1
-                if text_is_list? t
-                    sub_lines += t
-                else
-                    if not sub_lines.empty?
-                        @sub_tokens += sub_lex(sub_lines)
-                        sub_lines = ""
-                    end
-                    if @sub_tokens.last[0] != :LI_START
-                        @sub_tokens << [:LI_END, ""]
-                        @sub_tokens << [:LI_START, ""]
-                    end
-                    @sub_tokens += sub_lex(t.lstrip)
-                end
-            end
-            if not sub_lines.empty?
-                @sub_tokens += sub_lex(sub_lines)
-                @sub_tokens << [:LI_END, ""]
-            else
-                @sub_tokens << [:LI_END, ""]
-            end
+  def match_left_angle_in_paste
+    if @text[@cursor, 8] == '</paste>'
+      @lexer_table.pop
+      append_to_tokens([:PASTE_END, ''])
+      @cursor += 8
+      maybe_open_para(:PASTE_END)
+    else
+      match_text
+    end
+  end
-            #end of hack
-            @cursor += sub_text.length + extracted
-            @token_start = @cursor
-            case
-                when list_id == "*"
-                    @next_token[0] = :UL_START
-                    @sub_tokens << [:UL_END, ""]
-                when list_id == "#"
-                    @next_token[0] = :OL_START
-                    @sub_tokens << [:OL_END, ""]
-                when list_id == ";", list_id == ":"
-                    @next_token[0] = :DL_START
-                    @sub_tokens << [:DL_END, ""]
-            end
-        elsif @text[@cursor, 1] == ':' and @tokens[-1][0] == :INTLINKSTART
-            @next_token[0] = :RESOURCE_SEP
-            @cursor += 1
-        else
-            match_other
-        end
+  def match_newline_in_paste
+    append_to_tokens([:TAG_START, 'br'])
+    if @text[@cursor, 1] == "\n"
+      @cursor += 1
+    elsif @text[@cursor, 2] == "\r\n"
+      @cursor += 2
     end
+    append_to_tokens([:TAG_END, 'br'])
+  end
-    #Matches the line until \n
-    def match_untill_eol
-        val = @text[@cursor, 1]
-        while (val != "\n") and (!val.nil?)
-            @cursor += 1
-            val = @text[@cursor, 1]
-        end
+  def match_left_angle_in_math
+    if @text[@cursor, 7] == '</math>'
+      end_span(:TAG, 'math')
+      @cursor += 7
+      @lexer_table.pop
+    else
+      match_text
+    end
+  end
+  def match_left_angle_in_pre
+    if @text[@cursor, 6] == '</pre>'
+      end_span(:TAG, 'pre')
+      @cursor += 6
+      #eat newline after </pre>
+      if @text[@cursor, 1] == "\n"
         @cursor += 1
+      elsif @text[@cursor, 2] == "\r\n"
+        @cursor += 2
+      end
+      @lexer_table.pop
+    else
+      match_text
     end
+  end
-    #Matches hline tag that start with "-"
-    # "\n----"      { return HLINE; }
-    def match_line
-        if at_start_of_line? and @text[@cursor, 4] == "----"
-            @next_token[0] = :HLINE
-            @cursor += 4
-        else
-            match_other
-        end
+  def match_space_in_code
+    match_text
+  end
+  def match_left_angle_in_code
+    if @text[@cursor, 7] == '</code>'
+      end_span(:TAG, 'code')
+      @cursor += 7
+      @lexer_table.pop
+    else
+      match_left_angle
     end
+  end
-    #Matches signature
-    # "~~~~~"      { return SIGNATURE_DATE; }
-    # "~~~~"      { return SIGNATURE_FULL; }
-    # "~~~"      { return SIGNATURE_NAME; }
-    def match_signature
-        if @text[@cursor, 5] == "~~~~~"
-            @next_token[0] = :SIGNATURE_DATE
-            @cursor += 5
-        elsif @text[@cursor, 4] == "~~~~"
-            @next_token[0] = :SIGNATURE_FULL
-            @cursor += 4
-        elsif @text[@cursor, 3] == "~~~"
-            @next_token[0] = :SIGNATURE_NAME
-            @cursor += 3
-        else
-            match_other
-        end
+  def match_left_curly
+    if at_start_of_line? and @text[@cursor + 1, 1] == '|'
+      start_span(:TABLE, "{|")
+      @cursor += 2
+      @lexer_table.push(@table_lexer_table)
+    elsif @text[@cursor + 1, 1] == '{' and @text[@cursor + 2, 2] != "}}"
+      start_span(:TEMPLATE, "{{")
+      @cursor += 2
+      @lexer_table.push(@template_lexer_table)
+    else
+      match_text
     end
-    def match_tag_start
-        if @text[@cursor, 8] == '<nowiki>'
-            @cursor += 8
-            @token_start = @cursor
-            @current_lexer_table = @tag_lexer_table
-            @current_lexer_table[@text[@cursor, 1]].call
-        else
-            match_other
-        end
+  end
+  def match_left_curly_in_template
+    if @text[@cursor + 1, 1] == '{' and @text[@cursor + 2, 2] != "}}"
+      start_span(:TEMPLATE, "{{")
+      @cursor += 2
+      @lexer_table.push(@template_lexer_table)
+    else
+      match_text
     end
-    def match_tag_end
-        if @text[@cursor, 9] == '</nowiki>'
-            @cursor += 9
-            @token_start = @cursor
-            @current_lexer_table = @lexer_table
-            @current_lexer_table[@text[@cursor, 1]].call
-        else
-            match_other
-        end
+  end
+  def match_right_curly_in_template
+    if @text[@cursor + 1, 1] == '}'
+      end_span(:TEMPLATE, "}}")
+      @cursor += 2
+      @lexer_table.pop
+    else
+      match_text
     end
-    def match_table
-        if at_start_of_line? and @text[@cursor + 1, 1] == '|'
-            tokens = []
-            if @para
-                tokens = end_tokens_for_open_pairs
-                if @tokens.last and @tokens.last[0] == :PARA_START and empty_text_token?
-                    tokens.pop
-                else
-                    tokens << [:PARA_END, ""]
-                end
-                @para = false
-            end
-            tokens << [:TABLE_START, '']
-            @pair_stack.push [:TABLE_START, '']
-            @next_token = tokens.shift
-            @sub_tokens = tokens
-            @cursor += 2
-        else
-            match_other
-        end
+  end
+  def match_pipe_in_template
+    if @tokens.last[0] == :TEMPLATE_START
+      @lexer_table.pop
+      @lexer_table.push(@template_param_lexer_table)
     end
+    append_to_tokens([:INTLINKSEP, "|"])
+    @cursor += 1
+  end
-    def match_table_head
-        if at_start_of_line? and in_table?
-            @cursor += 1
-            tokens = []
-            if @pair_stack.last[0] == :CELL_START
-                tokens << [:CELL_END, '']
-                @pair_stack.pop
-            elsif @pair_stack.last[0] == :HEAD_START
-                tokens << [:HEAD_END, '']
-                @pair_stack.pop
-            elsif @pair_stack.last[0] != :ROW_START
-                tokens << [:ROW_START, '']
-                @pair_stack.push [:ROW_START, '']
-            end
-            tokens << [:HEAD_START, '']
-            @pair_stack.push [:HEAD_START, '']
-            @next_token = tokens.shift
-            @sub_tokens = tokens
-        else
-            match_other
-        end
+  def match_bang_in_table
+    if at_start_of_line?
+      if @context.last == :CELL
+        end_span(:CELL)
+      elsif @context.last == :HEAD
+        end_span(:HEAD)
+      elsif @context.last != :ROW
+        start_span(:ROW)
+      end
+      start_span(:HEAD, "!")
+      @cursor += 1
+    else
+      match_text
     end
+  end
-    def match_link_sep_or_table_cell
-        if in_table?
-            tokens = []
-            if at_start_of_line?
-                @cursor += 1
-                close_table_cell(tokens)
-                if ['-', '}'].include?(@text[@cursor, 1])
-                    close_table_row(tokens)
-                    if @text[@cursor, 1] == '-'
-                        tokens << [:ROW_START, '']
-                        @pair_stack.push [:ROW_START, '']
-                    else
-                        tokens << [:TABLE_END, '']
-                        @pair_stack.pop
-                    end
-                    @cursor += 1
-                else
-                    if @pair_stack.last[0] != :ROW_START
-                        tokens << [:ROW_START, '']
-                        @pair_stack.push [:ROW_START, '']
-                    end
-                    tokens << [:CELL_START, '']
-                    @pair_stack.push [:CELL_START, '']
-                end
-                @next_token = tokens.shift
-                @sub_tokens = tokens
-            elsif @text[@cursor + 1, 1] == '|'
-                @cursor += 2
-                close_table_cell(tokens)
-                next_token = tokens.last[0] == :HEAD_END ? [:HEAD_START, ''] : [:CELL_START, '']
-                tokens << next_token
-                @pair_stack.push next_token
-                @next_token = tokens.shift
-                @sub_tokens = tokens
-            else
-                match_link_sep
-            end
-        else
-            match_link_sep
+  def match_pipe_in_table
+    if at_start_of_line?
+      context = @context[@context.rindex(:TABLE) + 1 .. -1]
+      if @text[@cursor+1, 1] == '-'
+        end_span(:ROW) if context.include? :ROW
+        start_span(:ROW, "|-")
+        @cursor += 2
+      elsif @text[@cursor+1, 1] == '}'
+        end_span(:TABLE, "|}")
+        @cursor += 2
+        @lexer_table.pop
+        skip_newline
+      else
+        if context.include? :CELL
+          end_span(:CELL)
+        elsif context.include? :HEAD
+          end_span(:HEAD)
         end
+        start_span(:ROW) unless @context.last == :ROW
+        start_span(:CELL, "|")
+        @cursor += 1
+      end
+    elsif @text[@cursor + 1, 1] == '|'
+      context = @context[@context.rindex(:TABLE) + 1 .. -1]
+      if context.include?:CELL
+        end_span(:CELL)
+        start_span(:CELL, "||")
+      elsif context.include? :HEAD
+        end_span(:HEAD)
+        start_span(:HEAD, "||")
+      end
+      @cursor += 2
+    else
+      context = @context[@context.rindex(:TABLE) + 1 .. -1]
+      if context.include? :CELL
+        end_span(:CELL, "attributes")
+        start_span(:CELL, "|")
+        @char = ''    #WTF?
+        #CHECK: this usecase and cursor increments
+      end
+      match_text
     end
+  end
-    #Matches a new line and breaks the paragraph if two newline characters
-    #("\n\n") are met.
-    def match_newline
-        if @text[@cursor, 2] == "\n\n"
-            if @para
-                @sub_tokens = end_tokens_for_open_pairs
-                @sub_tokens << [:PARA_END, '']
-                @sub_tokens << [:PARA_START, '']
-                @next_token[0] = @sub_tokens.slice!(0)[0]
-                @cursor += 2
-                return
-            end
-        end
-        match_other
-    end
-    #Matches a new line and breaks the paragraph if two carriage return - newline
-    #sequences ("\r\n\r\n") are met.
-    def match_carriagereturn
-        if @text[@cursor, 4] == "\r\n\r\n"
-            if @para
-                @sub_tokens = end_tokens_for_open_pairs
-                @sub_tokens << [:PARA_END, '']
-                @sub_tokens << [:PARA_START, '']
-                @next_token[0] = @sub_tokens.slice!(0)[0]
-                @cursor += 4
-                return
-            end
-        end
-        match_other
+  def match_newline
+    if @text[@cursor, 2] == "\n\n"
+      @pending << "\n\n"
+      @cursor += 2
+      end_span(:PARA)
+      start_span(:PARA)
+    elsif @text[@cursor, 4] == "\r\n\r\n"
+      @pending << "\r\n\r\n"
+      @cursor += 4
+      end_span(:PARA)
+      start_span(:PARA)
+    else
+      match_text
     end
+  end
+  def match_newline_in_table
+    if @text[@cursor, 2] == "\n\n"
+      start_span(:PARA)
+      append_to_tokens([:TEXT, "\n\n"])
+      @cursor += 2
+      end_span(:PARA)
+    elsif @text[@cursor, 4] == "\r\n\r\n"
+      start_span(:PARA)
+      append_to_tokens([:TEXT, "\r\n\r\n"])
+      @cursor += 4
+      end_span(:PARA)
+    else
+      match_text
+    end
+  end
+  def match_semicolon
+    if at_start_of_line?
+      start_span(:DL)
+      start_span(:DT, ';')
+      @lexer_table.push(@entries_lexer_table)
+      @cursor += 1
+    else
+      match_text
+    end
+  end
+  def match_colon
+    if at_start_of_line?
+      start_span(:DL)
+      start_span(:DD, ':')
+      @lexer_table.push(@entries_lexer_table)
+      @cursor += 1
+    else
+      match_text
+    end
+  end
+  def match_colon_in_entries
+    if @context.include? :DD
+      end_span(:DD)
+    elsif @context.include? :DT
+      end_span(:DT)
+    end
+    start_span(:DD, ':')
+    @cursor += 1
+  end
+  def match_newline_in_entries
+    match_text
+    unless @text[@cursor, 1] == ':'
+      if @context.include? :DD
+        end_span(:DD)
+      elsif @context.include? :DT
+        end_span(:DT)
+      end
+      end_span(:DL)
+      @lexer_table.pop
+    end
+  end
+  #-- ================== Helper methods ================== ++#
+  # Returns true if the text cursor is on the first character of a line
+  def at_start_of_line?
+    @cursor == 0 or @text[@cursor - 1, 1] == "\n"
+  end
-    #-- ================== Helper methods ================== ++#
+  # Returns true if the text cursor is after the last character of a line
+  def at_end_of_line?
+    @text[@cursor, 1] == "\n" or @text[@cursor, 1].nil?
+  end
-    # Checks if we are lexing inside a resource link like
-    # [[Image:example.png|100px|Embedded image]]
-    def inside_resource_link
-      if @pair_stack.last[0] == :INTLINKSTART
-        pos = -1
-        while((token = @tokens[pos][0])  != :INTLINKSTART)
-          if token == :RESOURCE_SEP
-            return true
-          else
-            pos -= 1
-          end
-        end
+  def blank_line?
+    i = @cursor
+    i += 1 while (@text[i,1] == ' ')
+    return (@text[i,1] == '' or (@text[i,1] == "\n") or (@text[i,2] == "\r\n"))
+  end
+  # Advances the text cursor to the next non-space character, without appending
+  # any of the skipped spaces to the pending text buffer
+  def skip_whitespace
+    @cursor += 1 while @text[@cursor, 1] == ' '
+  end
+  # Advances the text cursor beyond the next newline sequence, if any. This is
+  # used to strip newlines after certain block-level elements, like section
+  # headings and tables, to prevent an empty paragraph when the block is followed
+  # by an extra newline sequence.
+  def skip_newline
+    if @text[@cursor, 2] == "\r\n"
+      @cursor += 2
+    elsif @text[@cursor, 1] == "\n"
+      @cursor += 1
+    end
+  end
+  # Extracts from the input text the sequence of characters consisting of the
+  # character or characters specified, and returns the sequence as a string. The
+  # text cursor is not moved; callers advance it past the sequence themselves.
+  def extract_char_sequence(char)
+    sequence = ''
+    i = @cursor
+    if char.length == 1
+      while @text[i, 1] == char do
+        sequence << char
+        i += 1
+      end
+    else
+      chars = char.split('')
+      while chars.include?(@text[i, 1]) do
+        sequence << @text[i, 1]
+        i += 1
       end
-      false
     end
+    sequence
+  end
+  # Opens list and list item spans for each item symbol in the string specified.
+  def open_list(symbols)
+    symbols.split('').each do |symbol|
+      if symbol == '*'
+        start_span(:UL)
+      else
+        start_span(:OL)
+      end
+      start_span(:LI)
+      @cursor += symbol.length
+    end
+  end
+  # Closes list and list item spans for each item symbol in the string specified.
+  def close_list(symbols)
+    symbols.split('').reverse.each do |symbol|
+      end_span(:LI)
+      if symbol == '*'
+        end_span(:UL)
+      else
+        end_span(:OL)
+      end
+    end
+  end
+  # Open a token span for the symbol specified. This will append a token start
+  # to the list of output tokens, and push the symbol onto the context stack. If
+  # there is an open paragraph, and the symbol is a block element, then the
+  # open paragraph will be closed (or, if empty, removed) before the token start
+  # is appended.
+  def start_span(symbol, text='')
+    maybe_close_para(symbol, ['pre','table','p'].include?(text))
+    @context << symbol
+    append_to_tokens [(symbol.to_s + '_START').to_sym, text]
+  end
-    #Checks if the token is placed at the start of the line.
-    def at_start_of_line?
-        if @cursor == 0 or @text[@cursor-1, 1] == "\n"
-            true
+  # Close a token span for the symbol specified. This will append an end token
+  # to the list of output tokens, and pop the symbol from the context stack. Any
+  # unclosed contexts on top of this symbol's context will also be closed (this
+  # generally happens when in-line markup is not terminated before a new block
+  # begins). If the context is empty as a result, a new paragraph will be opened.
+  def end_span(symbol, text='')
+    while(@context.size > 0 and @context.last != symbol) do
+      append_to_tokens [(@context.pop.to_s + '_END').to_sym, '']
+    end
+    @context.pop
+    append_to_tokens [(symbol.to_s + '_END').to_sym, text]
+    maybe_open_para(symbol)
+  end
+  def empty_span(symbol, text, cursor_increment)
+    maybe_close_para(symbol)
+    append_to_tokens [symbol, text, @cursor, cursor_increment]
+    @cursor += cursor_increment
+    maybe_open_para(symbol)
+  end
+  def maybe_close_para(symbol, force = false)
+    if @context.size > 0 and (PARA_BREAK_ELEMENTS.include?(symbol) or force)
+      i = 1
+      i += 1 while INLINE_ELEMENTS.include?(@context[-i])
+      if @context[-i] == :PARA
+        if @pending.is_empty_token? and @tokens.last[0] == :PARA_START
+          @context.pop
+          @tokens.pop
         else
-            false
+          (1 .. i).each do
+            symbol = @context.pop
+            append_to_tokens [(symbol.to_s + '_END').to_sym, '']
+          end
         end
+      end
     end
-    def in_table?
-        @pair_stack.include?([:TABLE_START, ''])
-    end
-    #Checks if the text at position contains the start of a link using any of
-    #HTTP, HTTPS, MAILTO or FILE protocols
-    def link_protocol?(position)
-        return @text[position, @text.length - position] =~ %r{\A((http|https|file)://|mailto:)}
+  end
+  def maybe_open_para(symbol)
+    if @context.size == 0 and symbol != :PARA
+      @tokens << [:PARA_START, '']
+      @context << :PARA
     end
-    #Adjusts @token_start to skip leading whitespaces
-    def strip_ws_from_token_start
-        @token_start += 1 while @text[@token_start, 1] == " "
+  end
+  def append_to_tokens(token)
+    unless @pending.is_empty_token?
+      @tokens.append_pending(@pending)
     end
-    #Returns true if the TEXT token is empty or contains newline only
-    def empty_text_token?
-        @current_token[0] == :TEXT and
-            (@current_token[1] == '' or @current_token[1] == "\n" or @current_token[1] == "\r\n")
+    @pending = TokenString.new(self)
+    @tokens << token
+  end
+  class LexerTable
+    def initialize
+      @tables = []
     end
-    #Returns true if the text is a list, i.e. starts with one of #;*: symbols
-    #that indicate a list
-    def text_is_list?(text)
-        return text =~ /^[#;*:].*/
+    def push(table)
+      @tables << table
+      @table = table
     end
-    #Runs sublexer to tokenize sub_text
-    def sub_lex(sub_text, strip_paragraphs=true)
-        sub_lexer = MediaWikiLexer.new
-        sub_tokens = sub_lexer.tokenize(sub_text)
-        sub_tokens.pop #false token
-        if strip_paragraphs and sub_tokens.size > 0
-            #the last PARA_END token
-            sub_tokens.pop if sub_tokens.last[0] == :PARA_END
-            #the first PARA_START token
-            sub_tokens.delete_at(0) if sub_tokens[0][0] == :PARA_START
-        end
-        sub_tokens
-    end
-    #Extract list contents of list type set by list_id variable.
-    #Example list:
-    # *a
-    # **a
-    #Extracted list with id "*" will look like:
-    # a
-    # *a
-    def extract_list_contents(list_id)
-        i = @cursor+1
-        list = ""
-        while i < @text.length
-            curr = @text[i, 1]
-            if (curr == "\n") and (@text[i+1, 1] != list_id)
-                list+=curr
-                break
-            end
-            if (curr == list_id) and (@text[i-1, 1] == "\n")
-                list += "\n" if i + 1 == @text.length
-            else
-                list += curr
-            end
-            i += 1
-        end
-        list
-    end
-    def start_para
-        @tokens << [:PARA_START, ""]
-        @para = true
-    end
-    def end_para
-        @tokens += end_tokens_for_open_pairs
-        @tokens << [:PARA_END, ""]
-        @para = false
-    end
-    def end_tokens_for_open_pairs
-        tokens = []
-        restore = []
-        while(@pair_stack.size > 1) do
-          last = @pair_stack.pop
-          case last[0]
-          when :ITALICSTART
-              tokens << [:ITALICEND, '']
-          when :BOLDSTART
-              tokens << [:BOLDEND, '']
-          when :INTLINKSTART
-              tokens << [:INTLINKEND, '']
-          when :LINKSTART
-              tokens << [:LINKEND, '']
-          when :TABLE_START
-              tokens << [:TABLE_END, '']
-          when :ROW_START
-              tokens << [:ROW_END, '']
-          when :CELL_START
-              tokens << [:CELL_END, '']
-          when :HEAD_START
-              tokens << [:HEAD_END, '']
-          else
-              restore << last
-          end
-        end
-        @pair_stack += restore.reverse
-        tokens
-    end
-    def close_table_cell(tokens)
-        restore = []
-        last = @pair_stack.pop
-        while (last[0] != :CELL_START and last[0] != :HEAD_START and last[0] != :ROW_START and last[0] != :TABLE_START) do
-            case last[0]
-            when :ITALICSTART
-                tokens << [:ITALICEND, '']
-            when :BOLDSTART
-                tokens << [:BOLDEND, '']
-            when :INTLINKSTART
-                tokens << [:INTLINKEND, '']
-            when :LINKSTART
-                tokens << [:LINKEND, '']
-            end
-            last = @pair_stack.pop
-        end
-        if last[0] == :CELL_START
-            tokens << [:CELL_END, '']
-        elsif last[0] == :HEAD_START
-            tokens << [:HEAD_END, '']
-        else
-            @pair_stack.push last
-        end
+    def pop
+      @tables.pop
+      @table = @tables.last
     end
-    def close_table_row(tokens)
-        if @pair_stack.last[0] == :ROW_START
-            @pair_stack.pop
-            tokens << [:ROW_END, '']
-        end
+    def[] (char)
+      @table[char]
     end
+  end
 end