RubyGems - kramdown - Versions diffs - 0.7.0 → 0.8.0 - Mend

kramdown 0.7.0 → 0.8.0

Potentially problematic release.

This version of kramdown might be problematic. Click here for more details.

Files changed (93) hide show

data/CONTRIBUTERS +4 -0
data/ChangeLog +671 -0
data/README +10 -0
data/Rakefile +40 -23
data/VERSION +1 -1
data/data/kramdown/document.html +1 -1
data/data/kramdown/document.latex +10 -5
data/doc/default.less.css +52 -10
data/doc/default.template +4 -0
data/doc/documentation.page +72 -0
data/doc/index.page +8 -41
data/doc/installation.page +6 -6
data/doc/links.markdown +2 -0
data/doc/quickref.page +6 -2
data/doc/syntax.page +8 -7
data/doc/tests.page +1 -2
data/lib/kramdown.rb +1 -1
data/lib/kramdown/compatibility.rb +1 -1
data/lib/kramdown/converter.rb +8 -3
data/lib/kramdown/converter/base.rb +27 -5
data/lib/kramdown/converter/html.rb +26 -28
data/lib/kramdown/converter/latex.rb +29 -15
data/lib/kramdown/document.rb +15 -8
data/lib/kramdown/error.rb +1 -1
data/lib/kramdown/options.rb +21 -13
data/lib/kramdown/parser.rb +9 -3
data/lib/kramdown/parser/base.rb +95 -0
data/lib/kramdown/parser/html.rb +387 -0
data/lib/kramdown/parser/kramdown.rb +11 -56
data/lib/kramdown/parser/kramdown/attribute_list.rb +1 -1
data/lib/kramdown/parser/kramdown/autolink.rb +1 -1
data/lib/kramdown/parser/kramdown/blank_line.rb +1 -1
data/lib/kramdown/parser/kramdown/blockquote.rb +1 -1
data/lib/kramdown/parser/kramdown/codeblock.rb +1 -1
data/lib/kramdown/parser/kramdown/codespan.rb +1 -1
data/lib/kramdown/parser/kramdown/emphasis.rb +1 -1
data/lib/kramdown/parser/kramdown/eob.rb +1 -1
data/lib/kramdown/parser/kramdown/escaped_chars.rb +1 -1
data/lib/kramdown/parser/kramdown/extension.rb +2 -90
data/lib/kramdown/parser/kramdown/footnote.rb +1 -1
data/lib/kramdown/parser/kramdown/header.rb +1 -1
data/lib/kramdown/parser/kramdown/horizontal_rule.rb +1 -1
data/lib/kramdown/parser/kramdown/html.rb +69 -149
data/lib/kramdown/parser/kramdown/html_entity.rb +4 -4
data/lib/kramdown/parser/kramdown/line_break.rb +1 -1
data/lib/kramdown/parser/kramdown/link.rb +2 -2
data/lib/kramdown/parser/kramdown/list.rb +2 -6
data/lib/kramdown/parser/kramdown/math.rb +3 -3
data/lib/kramdown/parser/kramdown/paragraph.rb +1 -1
data/lib/kramdown/parser/kramdown/smart_quotes.rb +3 -2
data/lib/kramdown/parser/kramdown/table.rb +3 -2
data/lib/kramdown/parser/kramdown/typographic_symbol.rb +7 -3
data/lib/kramdown/version.rb +2 -2
data/man/man1/kramdown.1 +19 -0
data/test/run_tests.rb +1 -0
data/test/test_files.rb +68 -7
data/test/testcases/block/09_html/comment.html +5 -0
data/test/testcases/block/09_html/comment.text +3 -0
data/test/testcases/block/09_html/content_model/tables.html +2 -2
data/test/testcases/block/09_html/html_to_native/code.html +10 -0
data/test/testcases/block/09_html/html_to_native/code.text +9 -0
data/test/testcases/block/09_html/html_to_native/comment.html +7 -0
data/test/testcases/block/09_html/html_to_native/comment.text +8 -0
data/test/testcases/block/09_html/html_to_native/emphasis.html +1 -0
data/test/testcases/block/09_html/html_to_native/emphasis.text +1 -0
data/test/testcases/block/09_html/html_to_native/entity.html +1 -0
data/test/testcases/block/09_html/html_to_native/entity.text +1 -0
data/test/testcases/block/09_html/html_to_native/header.html +6 -0
data/test/testcases/block/09_html/html_to_native/header.options +2 -0
data/test/testcases/block/09_html/html_to_native/header.text +6 -0
data/test/testcases/block/09_html/html_to_native/list_dl.html +8 -0
data/test/testcases/block/09_html/html_to_native/list_dl.text +8 -0
data/test/testcases/block/09_html/html_to_native/list_ol.html +15 -0
data/test/testcases/block/09_html/html_to_native/list_ol.text +17 -0
data/test/testcases/block/09_html/html_to_native/list_ul.html +19 -0
data/test/testcases/block/09_html/html_to_native/list_ul.text +22 -0
data/test/testcases/block/09_html/html_to_native/options +1 -0
data/test/testcases/block/09_html/html_to_native/paragraph.html +3 -0
data/test/testcases/block/09_html/html_to_native/paragraph.text +4 -0
data/test/testcases/block/09_html/html_to_native/table_normal.html +13 -0
data/test/testcases/block/09_html/html_to_native/table_normal.text +12 -0
data/test/testcases/block/09_html/html_to_native/table_simple.html +10 -0
data/test/testcases/block/09_html/html_to_native/table_simple.text +14 -0
data/test/testcases/block/09_html/html_to_native/typography.html +1 -0
data/test/testcases/block/09_html/html_to_native/typography.text +1 -0
data/test/testcases/block/09_html/parse_as_raw.html +3 -5
data/test/testcases/block/09_html/parse_as_raw.text +0 -1
data/test/testcases/span/04_footnote/definitions.latex +18 -0
data/test/testcases/span/04_footnote/footnote_nr.latex +6 -0
data/test/testcases/span/04_footnote/markers.latex +32 -0
data/test/testcases/span/05_html/invalid.html +1 -0
data/test/testcases/span/05_html/invalid.text +1 -0
metadata +52 -5

@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 #
 #--
-# Copyright (C) 2009 Thomas Leitner <t_leitner@gmx.at>
+# Copyright (C) 2009-2010 Thomas Leitner <t_leitner@gmx.at>
 #
 # This file is part of kramdown.
 #
@@ -69,7 +69,8 @@ module Kramdown
     # #new) and the conversion phase.
     attr_reader :warnings
-    # Holds needed parse information like ALDs, link definitions and so on.
+    # Holds needed parse information which is dependent on the used parser, like ALDs, link
+    # definitions and so on. This information may be used by converters afterwards.
     attr_reader :parse_infos
     # Holds conversion information which is dependent on the used converter. A converter clears this
@@ -77,15 +78,16 @@ module Kramdown
     attr_reader :conversion_infos
-    # Create a new Kramdown document from the string +source+ and use the provided +options+.
+    # Create a new Kramdown document from the string +source+ and use the provided +options+. The
+    # options that can be used are defined in the Options module.
     #
     # The special options key <tt>:input</tt> can be used to select the parser that should parse the
     # +source+. It has to be the name of a class in the Kramdown::Parser module. For example, to
     # select the kramdown parser, one would set the <tt>:input</tt> key to +Kramdown+. If this key
     # is not set, it defaults to +Kramdown+.
     #
-    # The +source+ is immediately parsed by the selected parser so that after this call the document
-    # tree is available and the output can be generated.
+    # The +source+ is immediately parsed by the selected parser so that the document tree is
+    # immediately available and the output can be generated.
     def initialize(source, options = {})
       @options = Options.merge(options)
       @warnings = []
@@ -106,7 +108,7 @@ module Kramdown
     # For example, +to_html+ would instantiate the Kramdown::Converter::Html class.
     def method_missing(id, *attr, &block)
       if id.to_s =~ /^to_(\w+)$/
-        Converter.const_get($1.capitalize).convert(self)
+        Converter.const_get($1[0..0].upcase + $1[1..-1]).convert(self)
       else
         super
       end
@@ -125,7 +127,7 @@ module Kramdown
   # (paragraphs, headers, emphasis, ...). The type of element can be set via the #type accessor.
   class Element
-    # A symbol representing the element type. For example, +:p+ or +:blockquote+.
+    # A symbol representing the element type. For example, <tt>:p</tt> or <tt>:blockquote</tt>.
     attr_accessor :type
     # The value of the element. The interpretation of this field depends on the type of the element.
@@ -133,7 +135,12 @@ module Kramdown
     attr_accessor :value
     # The options hash for the element. It is used for storing arbitrary options as well as the
-    # *attributes* of the element under the <tt>:attr</tt> key.
+    # following special contents:
+    #
+    # - *Attributes* of the element under the <tt>:attr</tt> key
+    # - Category of the element, either <tt>:block</tt> or <tt>:span</tt>, under the
+    #   <tt>:category</tt> key. If this key is absent, it can be assumed that the element is in the
+    #   <tt>:span</tt> category.
     attr_accessor :options
     # The child elements of this element.

data/lib/kramdown/error.rb CHANGED

@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 #
 #--
-# Copyright (C) 2009 Thomas Leitner <t_leitner@gmx.at>
+# Copyright (C) 2009-2010 Thomas Leitner <t_leitner@gmx.at>
 #
 # This file is part of kramdown.
 #

data/lib/kramdown/options.rb CHANGED

@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 #
 #--
-# Copyright (C) 2009 Thomas Leitner <t_leitner@gmx.at>
+# Copyright (C) 2009-2010 Thomas Leitner <t_leitner@gmx.at>
 #
 # This file is part of kramdown.
 #
@@ -22,7 +22,8 @@
 module Kramdown
-  # This module defines all options that are used by parsers and/or converters.
+  # This module defines all options that are used by parsers and/or converters as well as providing
+  # methods to deal with the options.
   module Options
     # Helper class introducing a boolean type for specifying boolean values (+true+ and +false+) as
@@ -39,13 +40,13 @@ module Kramdown
     # ----------------------------
     # :section: Option definitions
     #
-    # This sections informs about the methods that can be used on the Options class.
+    # This section describes the methods that can be used on the Options module.
     # ----------------------------
     # Contains the definition of an option.
     Definition = Struct.new(:name, :type, :default, :desc)
-    # Allowed option types
+    # Allowed option types.
     ALLOWED_TYPES = [String, Integer, Float, Symbol, Boolean, Array, Object]
     @options = {}
@@ -54,7 +55,7 @@ module Kramdown
     # Symbol, Boolean, Array, Object), default value +default+ and the description +desc+.
     #
     # The type 'Object' should only be used if none of the other types suffices because such an
-    # option will be opaque!
+    # option will be opaque and cannot be used, for example, by CLI command!
     def self.define(name, type, default, desc)
       raise ArgumentError, "Option name #{name} is already used" if @options.has_key?(name)
       raise ArgumentError, "Invalid option type #{type} specified" if !ALLOWED_TYPES.include?(type)
@@ -67,7 +68,7 @@ module Kramdown
       @options
     end
-    # Return +true+ if an option +name+ is defined.
+    # Return +true+ if an option called +name+ is defined.
     def self.defined?(name)
       @options.has_key?(name)
     end
@@ -79,7 +80,8 @@ module Kramdown
       temp
     end
-    # Merge the #defaults Hash with the parsed options from the given Hash.
+    # Merge the #defaults Hash with the *parsed* options from the given Hash, i.e. only valid option
+    # names are considered and their value is run through the #parse method.
     def self.merge(hash)
       temp = defaults
       hash.each do |k,v|
@@ -185,14 +187,18 @@ Default: true
 Used by: kramdown parser
 EOF
-    define(:extension, Object, nil, <<EOF)
-An object for handling the extensions
+    define(:html_to_native, Boolean, false, <<EOF)
+Convert HTML elements to native elements
-The value for this option needs to be an object that can handle the
-extensions found in a kramdown document. If this option is `nil`, the
-default extension object is used.
+If this option is `true`, the parser converts HTML elements to native
+elements. For example, when parsing `<em>hallo</em>` the emphasis tag
+would normally be converted to an `:html` element with tag type `:em`.
+If `html_to_native` is `true`, then the emphasis would be converted to a
+native `:em` element.
-Default: nil
+This is useful for converters that cannot deal with HTML elements.
+Default: false
 Used by: kramdown parser
 EOF
@@ -207,6 +213,8 @@ Used by: HTML converter
 EOF
     define(:filter_html, Array, [], <<EOF)
+NOTE: This option is deprecated and will be removed in a future release!
 An array of HTML tags that should be filtered from the output
 The value can either be specified as array or as a space separated

data/lib/kramdown/parser.rb CHANGED

@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 #
 #--
-# Copyright (C) 2009 Thomas Leitner <t_leitner@gmx.at>
+# Copyright (C) 2009-2010 Thomas Leitner <t_leitner@gmx.at>
 #
 # This file is part of kramdown.
 #
@@ -22,11 +22,17 @@
 module Kramdown
-  # This module contains all available parsers. Currently, there is only one parser for parsing
-  # documents in kramdown format.
+  # == Parser Module
+  #
+  # This module contains all available parsers. Currently, there are two parsers:
+  #
+  # * Kramdown for parsing documents in kramdown format
+  # * Html for parsing HTML documents
   module Parser
+    autoload :Base, 'kramdown/parser/base'
     autoload :Kramdown, 'kramdown/parser/kramdown'
+    autoload :Html, 'kramdown/parser/html'
   end

data/lib/kramdown/parser/base.rb ADDED

@@ -0,0 +1,95 @@
+# -*- coding: utf-8 -*-
+#
+#--
+# Copyright (C) 2009-2010 Thomas Leitner <t_leitner@gmx.at>
+#
+# This file is part of kramdown.
+#
+# kramdown is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#++
+#
+module Kramdown
+  module Parser
+    # == Base class for parsers
+    #
+    # This class serves as base class for parsers. It provides common methods that can/should be
+    # used by all parsers, especially by those using StringScanner for parsing.
+    #
+    class Base
+      # Initialize the parser with the given Kramdown document +doc+.
+      def initialize(doc)
+        @doc = doc
+        @doc.parse_infos.clear
+        @text_type = :text
+      end
+      private_class_method(:new, :allocate)
+      # Parse the +source+ string into an element tree, using the information provided by the
+      # Kramdown document +doc+.
+      #
+      # Initializes a new instance of the calling class and then calls the #parse method that must
+      # be implemented by each subclass.
+      def self.parse(source, doc)
+        new(doc).parse(source)
+      end
+      # Add the given warning +text+ to the warning array of the Kramdown document.
+      def warning(text)
+        @doc.warnings << text
+        #TODO: add position information
+      end
+      # Modify the string +source+ to be usable by the parser.
+      def adapt_source(source)
+        source.gsub(/\r\n?/, "\n").chomp + "\n"
+      end
+      # This helper method adds the given +text+ either to the last element in the +tree+ if it is a
+      # +type+ element or creates a new text element with the given +type+.
+      def add_text(text, tree = @tree, type = @text_type)
+        if tree.children.last && tree.children.last.type == type
+          tree.children.last.value << text
+        elsif !text.empty?
+          tree.children << Element.new(type, text)
+        end
+      end
+      # Extract the part of the StringScanner +strscan+ backed string specified by the +range+. This
+      # method also works correctly under Ruby 1.9.
+      def extract_string(range, strscan)
+        result = nil
+        if RUBY_VERSION >= '1.9'
+          begin
+            enc = strscan.string.encoding
+            strscan.string.force_encoding('ASCII-8BIT')
+            result = strscan.string[range].force_encoding(enc)
+          ensure
+            strscan.string.force_encoding(enc)
+          end
+        else
+          result = strscan.string[range]
+        end
+        result
+      end
+    end
+  end
+end

data/lib/kramdown/parser/html.rb ADDED

@@ -0,0 +1,387 @@
+# -*- coding: utf-8 -*-
+#
+#--
+# Copyright (C) 2009-2010 Thomas Leitner <t_leitner@gmx.at>
+#
+# This file is part of kramdown.
+#
+# kramdown is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#++
+#
+require 'rexml/parsers/baseparser'
+require 'strscan'
+module Kramdown
+  module Parser
+    # Used for parsing a HTML document.
+    class Html < Base
+      # Contains all constants that are used when parsing.
+      module Constants
+        #:stopdoc:
+        # The following regexps are based on the ones used by REXML, with some slight modifications.
+        HTML_DOCTYPE_RE = /<!DOCTYPE.*?>/m
+        HTML_COMMENT_RE = /<!--(.*?)-->/m
+        HTML_INSTRUCTION_RE = /<\?(.*?)\?>/m
+        HTML_ATTRIBUTE_RE = /\s*(#{REXML::Parsers::BaseParser::UNAME_STR})\s*=\s*(["'])(.*?)\2/m
+        HTML_TAG_RE = /<((?>#{REXML::Parsers::BaseParser::UNAME_STR}))\s*((?>\s+#{REXML::Parsers::BaseParser::UNAME_STR}\s*=\s*(["']).*?\3)*)\s*(\/)?>/m
+        HTML_TAG_CLOSE_RE = /<\/(#{REXML::Parsers::BaseParser::NAME_STR})\s*>/m
+        HTML_ENTITY_RE = /&([\w:][\-\w\d\.:]*);|&#(\d+);|&\#x([0-9a-fA-F]+);/
+        HTML_PARSE_AS_BLOCK = %w{applet button blockquote colgroup dd div dl fieldset form iframe li
+                               map noscript object ol table tbody thead tfoot tr td ul}
+        HTML_PARSE_AS_SPAN  = %w{a abbr acronym address b bdo big cite caption del dfn dt em
+                               h1 h2 h3 h4 h5 h6 i ins kbd label legend optgroup p q rb rbc
+                               rp rt rtc ruby samp select small span strong sub sup th tt var}
+        HTML_PARSE_AS_RAW   = %w{script math option textarea pre code}
+        HTML_PARSE_AS = Hash.new {|h,k| h[k] = :raw}
+        HTML_PARSE_AS_BLOCK.each {|i| HTML_PARSE_AS[i] = :block}
+        HTML_PARSE_AS_SPAN.each {|i| HTML_PARSE_AS[i] = :span}
+        HTML_PARSE_AS_RAW.each {|i| HTML_PARSE_AS[i] = :raw}
+        # Some HTML elements like script belong to both categories (i.e. are valid in block and
+        # span HTML) and therefore don't appear here!
+        HTML_SPAN_ELEMENTS = %w{a abbr acronym b big bdo br button cite code del dfn em i img input
+                              ins kbd label option q rb rbc rp rt rtc ruby samp select small span
+                              strong sub sup textarea tt var}
+        HTML_BLOCK_ELEMENTS = %w{address article aside applet body button blockquote caption col colgroup dd div dl dt fieldset
+                               figcaption footer form h1 h2 h3 h4 h5 h6 header hgroup hr html head iframe legend listing menu
+                               li map nav ol optgroup p pre section summary table tbody td th thead tfoot tr ul}
+        HTML_ELEMENTS_WITHOUT_BODY = %w{area base br col command embed hr img input keygen link meta param source track wbr}
+      end
+      # Contains the parsing methods. This module can be mixed into any parser to get HTML parsing
+      # functionality. The only thing that must be provided by the class are instance variable
+      # <tt>@stack</tt> for storing needed state and <tt>@src</tt> (instance of StringScanner) for
+      # the actual parsing.
+      module Parser
+        include Constants
+        # Process the HTML start tag that has already been scanned/checked. Does the common processing
+        # steps and then yields to the caller for further processing.
+        def handle_html_start_tag
+          name = @src[1]
+          closed = !@src[4].nil?
+          attrs = {}
+          @src[2].scan(HTML_ATTRIBUTE_RE).each {|attr,sep,val| attrs[attr] = val}
+          el = Element.new(:html_element, name, :attr => attrs, :category => :block)
+          @tree.children << el
+          if !closed && HTML_ELEMENTS_WITHOUT_BODY.include?(el.value)
+            warning("The HTML tag '#{el.value}' cannot have any content - auto-closing it")
+            closed = true
+          end
+          if name == 'script'
+            handle_html_script_tag
+            yield(el, true)
+          else
+            yield(el, closed)
+          end
+        end
+        def handle_html_script_tag
+          curpos = @src.pos
+          if result = @src.scan_until(/(?=<\/script\s*>)/m)
+            add_text(extract_string(curpos...@src.pos, @src), @tree.children.last, :raw)
+            @src.scan(HTML_TAG_CLOSE_RE)
+          else
+            add_text(@src.scan(/.*/m), @tree.children.last, :raw)
+            warning("Found no end tag for 'script' - auto-closing it")
+          end
+        end
+        HTML_RAW_START = /(?=<(#{REXML::Parsers::BaseParser::UNAME_STR}|\/|!--|\?))/
+        # Parse raw HTML from the current source position, storing the found elements in +el+.
+        # Parsing continues until one of the following criteria is fulfilled:
+        #
+        # - The end of the document is reached.
+        # - The matching end tag for the element +el+ is found (only used if +el+ is an HTML
+        #   element).
+        #
+        # When an HTML start tag is found, processing is deferred to #handle_html_start_tag,
+        # providing the block given to this method.
+        def parse_raw_html(el, &block)
+          @stack.push(@tree)
+          @tree = el
+          done = false
+          while !@src.eos? && !done
+            if result = @src.scan_until(HTML_RAW_START)
+              add_text(result, @tree, :text)
+              if result = @src.scan(HTML_COMMENT_RE)
+                @tree.children << Element.new(:xml_comment, result, :category => :block, :parent_is_raw => true)
+              elsif result = @src.scan(HTML_INSTRUCTION_RE)
+                @tree.children << Element.new(:xml_pi, result, :category => :block, :parent_is_raw => true)
+              elsif @src.scan(HTML_TAG_RE)
+                handle_html_start_tag(&block)
+              elsif @src.scan(HTML_TAG_CLOSE_RE)
+                if @tree.value == @src[1]
+                  done = true
+                else
+                  warning("Found invalidly used HTML closing tag for '#{@src[1]}' - ignoring it")
+                end
+              else
+                add_text(@src.scan(/./), @tree, :text)
+              end
+            else
+              result = @src.scan(/.*/m)
+              add_text(result, @tree, :text)
+              warning("Found no end tag for '#{@tree.value}' - auto-closing it") if @tree.type == :html_element
+              done = true
+            end
+          end
+          @tree = @stack.pop
+        end
+      end
+      # Converts HTML elements to native elements if possible.
+      class ElementConverter
+        include Constants
+        REMOVE_TEXT_CHILDREN =  %w{html head hgroup ol ul dl table colgroup tbody thead tfoot tr select optgroup}
+        REMOVE_WHITESPACE_CHILDREN = %w{body section nav article aside header footer address
+                                        div li dd blockquote figure figcaption td th fieldset form}
+        STRIP_WHITESPACE = %w{address article aside blockquote body caption dd div dl dt fieldset figcaption form footer
+                              header h1 h2 h3 h4 h5 h6 legend li nav p section td th}
+        SIMPLE_ELEMENTS = %w{em strong blockquote hr br a img p thead tbody tfoot tr td th ul ol dl li dl dt dd}
+        # Convert the element +el+ and its children.
+        def process(el, convert_simple = true, parent = nil)
+          case el.type
+          when :xml_comment, :xml_pi, :html_doctype
+            ptype = if parent.nil?
+                      'div'
+                    else
+                      case parent.type
+                      when :html_element then parent.value
+                      when :code_span then 'code'
+                      when :code_block then 'pre'
+                      when :header then 'h1'
+                      else parent.type.to_s
+                      end
+                    end
+            el.options = {:category => HTML_PARSE_AS_SPAN.include?(ptype) ? :span : :block}
+            return
+          when :html_element
+          else return
+          end
+          type = el.value
+          remove_text_children(el) if REMOVE_TEXT_CHILDREN.include?(type)
+          mname = "convert_#{el.value}"
+          if self.class.method_defined?(mname)
+            send(mname, el)
+          elsif convert_simple && SIMPLE_ELEMENTS.include?(type)
+            set_basics(el, type.intern, HTML_SPAN_ELEMENTS.include?(type) ? :span : :block)
+            process_children(el, convert_simple)
+          else
+            process_html_element(el, convert_simple)
+          end
+          strip_whitespace(el) if STRIP_WHITESPACE.include?(type)
+          remove_whitespace_children(el) if REMOVE_WHITESPACE_CHILDREN.include?(type)
+        end
+        def process_children(el, convert_simple = true)
+          el.children.map! do |c|
+            if c.type == :text
+              process_text(c.value)
+            else
+              process(c, convert_simple, el)
+              c
+            end
+          end.flatten!
+        end
+        # Process the HTML text +raw+: compress whitespace (if +preserve+ is +false+) and convert
+        # entities into entity elements.
+        def process_text(raw, preserve = false)
+          raw.gsub!(/\s+/, ' ') unless preserve
+          src = StringScanner.new(raw)
+          result = []
+          while !src.eos?
+            if tmp = src.scan_until(/(?=#{HTML_ENTITY_RE})/)
+              result << Element.new(:text, tmp)
+              src.scan(HTML_ENTITY_RE)
+              val = src[1] || (src[2] && src[2].to_i) || src[3].hex
+              result << if %w{lsquo rsquo ldquo rdquo}.include?(val)
+                          Element.new(:smart_quote, val.intern)
+                        elsif %w{mdash ndash hellip laquo raquo}.include?(val)
+                          Element.new(:typographic_sym, val.intern)
+                        else
+                          Element.new(:entity, val)
+                        end
+            else
+              result << Element.new(:text, src.scan(/.*/m))
+            end
+          end
+          result
+        end
+        def process_html_element(el, convert_simple = true)
+          el.options = {:category => HTML_SPAN_ELEMENTS.include?(el.value) ? :span : :block,
+            :parse_type => HTML_PARSE_AS[el.value],
+            :attr => el.options[:attr]
+          }
+          process_children(el, convert_simple)
+        end
+        def remove_text_children(el)
+          el.children.delete_if {|c| c.type == :text}
+        end
+        def strip_whitespace(el)
+          return if el.children.empty?
+          if el.children.first.type == :text
+            el.children.first.value.lstrip!
+          end
+          if el.children.last.type == :text
+            el.children.last.value.rstrip!
+          end
+        end
+        def remove_whitespace_children(el)
+          i = -1
+          el.children.delete_if do |c|
+            i += 1
+            c.type == :text && c.value.strip.empty? &&
+              (i == 0 || i == el.children.length - 1 || (el.children[i-1].options[:category] == :block &&
+                                                         el.children[i+1].options[:category] == :block))
+          end
+        end
+        def set_basics(el, type, category, opts = {})
+          el.type = type
+          el.options = {:category => category, :attr => el.options[:attr]}.merge(opts)
+          el.value = nil
+        end
+        def extract_text(el, raw)
+          raw << el.value.to_s if el.type == :text
+          el.children.each {|c| extract_text(c, raw)}
+        end
+        def convert_h1(el)
+          set_basics(el, :header, :block, :level => el.value[1..1].to_i)
+          extract_text(el, el.options[:raw_text] = '')
+          process_children(el)
+        end
+        %w{h2 h3 h4 h5 h6}.each {|i| alias_method("convert_#{i}".intern, :convert_h1)}
+        def convert_code(el)
+          if el.value == 'code'
+            set_basics(el, :codespan, :span)
+          else
+            set_basics(el, :codeblock, :block)
+          end
+          raw = ''
+          extract_text(el, raw)
+          result = process_text(raw, true)
+          if result.length > 1 || result.first.type != :text
+            el.children = result
+          else
+            el.value = result.first.value
+          end
+        end
+        alias :convert_pre :convert_code
+        def convert_table(el)
+          if !is_simple_table?(el)
+            process_html_element(el, false)
+            return
+          end
+          process_children(el)
+          set_basics(el, :table, :block)
+          el.options[:alignment] = []
+          helper = lambda do |c|
+            if c.type == :tr && el.options[:alignment].empty?
+              el.options[:alignment] = [:default] * c.children.length
+              break
+            else
+              c.children.each {|cc| helper.call(cc)}
+            end
+          end
+          helper.call(el)
+          true
+        end
+        def is_simple_table?(el)
+          only_phrasing_content = lambda do |c|
+            c.children.all? do |cc|
+              (cc.type == :text || !HTML_BLOCK_ELEMENTS.include?(cc.value)) && only_phrasing_content.call(cc)
+            end
+          end
+          helper = Proc.new do |c|
+            if c.value == 'th' || c.value == 'td'
+              return false if !only_phrasing_content.call(c)
+            else
+              c.children.each {|cc| helper.call(cc)}
+            end
+          end
+          helper.call(el)
+          true
+        end
+      end
+      include Parser
+      # Parse +source+ as HTML document and return the created +tree+.
+      def parse(source)
+        @stack = []
+        @tree = Element.new(:root)
+        @src = StringScanner.new(adapt_source(source))
+        while true
+          if result = @src.scan(/\s*#{HTML_INSTRUCTION_RE}/)
+            @tree.children << Element.new(:xml_pi, result.strip, :category => :block)
+          elsif result = @src.scan(/\s*#{HTML_DOCTYPE_RE}/)
+            @tree.children << Element.new(:html_doctype, result.strip, :category => :block)
+          elsif result = @src.scan(/\s*#{HTML_COMMENT_RE}/)
+            @tree.children << Element.new(:xml_comment, result.strip, :category => :block)
+          else
+            break
+          end
+        end
+        tag_handler = lambda do |c, closed|
+          parse_raw_html(c, &tag_handler) if !closed
+        end
+        parse_raw_html(@tree, &tag_handler)
+        ec = ElementConverter.new
+        @tree.children.each {|c| ec.process(c)}
+        ec.remove_whitespace_children(@tree)
+        @tree
+      end
+    end
+  end
+end