RubyGems - commonmarker - Versions diffs - 0.2.1 → 0.3.0 - Mend

commonmarker 0.2.1 → 0.3.0

Potentially problematic release.

This version of commonmarker might be problematic. Click here for more details.

Files changed (82) hide show

data/ext/commonmarker/cmark/src/scanners.h CHANGED

@@ -10,7 +10,13 @@ bufsize_t _scan_scheme(const unsigned char *p);
 bufsize_t _scan_autolink_uri(const unsigned char *p);
 bufsize_t _scan_autolink_email(const unsigned char *p);
 bufsize_t _scan_html_tag(const unsigned char *p);
-bufsize_t _scan_html_block_tag(const unsigned char *p);
+bufsize_t _scan_html_block_start(const unsigned char *p);
+bufsize_t _scan_html_block_start_7(const unsigned char *p);
+bufsize_t _scan_html_block_end_1(const unsigned char *p);
+bufsize_t _scan_html_block_end_2(const unsigned char *p);
+bufsize_t _scan_html_block_end_3(const unsigned char *p);
+bufsize_t _scan_html_block_end_4(const unsigned char *p);
+bufsize_t _scan_html_block_end_5(const unsigned char *p);
 bufsize_t _scan_link_url(const unsigned char *p);
 bufsize_t _scan_link_title(const unsigned char *p);
 bufsize_t _scan_spacechars(const unsigned char *p);
@@ -20,12 +26,19 @@ bufsize_t _scan_hrule(const unsigned char *p);
 bufsize_t _scan_open_code_fence(const unsigned char *p);
 bufsize_t _scan_close_code_fence(const unsigned char *p);
 bufsize_t _scan_entity(const unsigned char *p);
+bufsize_t _scan_dangerous_url(const unsigned char *p);
 #define scan_scheme(c, n) _scan_at(&_scan_scheme, c, n)
 #define scan_autolink_uri(c, n) _scan_at(&_scan_autolink_uri, c, n)
 #define scan_autolink_email(c, n) _scan_at(&_scan_autolink_email, c, n)
 #define scan_html_tag(c, n) _scan_at(&_scan_html_tag, c, n)
-#define scan_html_block_tag(c, n) _scan_at(&_scan_html_block_tag, c, n)
+#define scan_html_block_start(c, n) _scan_at(&_scan_html_block_start, c, n)
+#define scan_html_block_start_7(c, n) _scan_at(&_scan_html_block_start_7, c, n)
+#define scan_html_block_end_1(c, n) _scan_at(&_scan_html_block_end_1, c, n)
+#define scan_html_block_end_2(c, n) _scan_at(&_scan_html_block_end_2, c, n)
+#define scan_html_block_end_3(c, n) _scan_at(&_scan_html_block_end_3, c, n)
+#define scan_html_block_end_4(c, n) _scan_at(&_scan_html_block_end_4, c, n)
+#define scan_html_block_end_5(c, n) _scan_at(&_scan_html_block_end_5, c, n)
 #define scan_link_url(c, n) _scan_at(&_scan_link_url, c, n)
 #define scan_link_title(c, n) _scan_at(&_scan_link_title, c, n)
 #define scan_spacechars(c, n) _scan_at(&_scan_spacechars, c, n)
@@ -35,6 +48,7 @@ bufsize_t _scan_entity(const unsigned char *p);
 #define scan_open_code_fence(c, n) _scan_at(&_scan_open_code_fence, c, n)
 #define scan_close_code_fence(c, n) _scan_at(&_scan_close_code_fence, c, n)
 #define scan_entity(c, n) _scan_at(&_scan_entity, c, n)
+#define scan_dangerous_url(c, n) _scan_at(&_scan_dangerous_url, c, n)
 #ifdef __cplusplus
 }

data/ext/commonmarker/cmark/src/scanners.re CHANGED

@@ -30,9 +30,9 @@ bufsize_t _scan_at(bufsize_t (*scanner)(const unsigned char *), cmark_chunk *c,
   escaped_char = [\\][!"#$%&'()*+,./:;<=>?@[\\\]^_`{|}~-];
-  tagname = [A-Za-z][A-Za-z0-9]*;
+  tagname = [A-Za-z][A-Za-z0-9-]*;
-  blocktagname = 'article'|'header'|'aside'|'hgroup'|'iframe'|'blockquote'|'hr'|'body'|'li'|'map'|'button'|'object'|'canvas'|'ol'|'caption'|'output'|'col'|'p'|'colgroup'|'pre'|'dd'|'progress'|'div'|'section'|'dl'|'table'|'td'|'dt'|'tbody'|'embed'|'textarea'|'fieldset'|'tfoot'|'figcaption'|'th'|'figure'|'thead'|'footer'|'footer'|'tr'|'form'|'ul'|'h1'|'h2'|'h3'|'h4'|'h5'|'h6'|'video'|'script'|'style';
+  blocktagname = 'address'|'article'|'aside'|'base'|'basefont'|'blockquote'|'body'|'caption'|'center'|'col'|'colgroup'|'dd'|'details'|'dialog'|'dir'|'div'|'dl'|'dt'|'fieldset'|'figcaption'|'figure'|'footer'|'form'|'frame'|'frameset'|'h1'|'head'|'header'|'hr'|'html'|'legend'|'li'|'link'|'main'|'menu'|'menuitem'|'meta'|'nav'|'noframes'|'ol'|'optgroup'|'option'|'p'|'param'|'pre'|'section'|'source'|'title'|'summary'|'table'|'tbody'|'td'|'tfoot'|'th'|'thead'|'title'|'tr'|'track'|'ul';
   attributename = [a-zA-Z_:][a-zA-Z0-9:._-]*;
@@ -117,16 +117,85 @@ bufsize_t _scan_html_tag(const unsigned char *p)
 */
 }
-// Try to match an HTML block tag including first <,
-// returning num of chars matched.
-bufsize_t _scan_html_block_tag(const unsigned char *p)
+// Try to match an HTML block tag start line, returning
+// an integer code for the type of block (1-6, matching the spec).
+// #7 is handled by a separate function, below.
+bufsize_t _scan_html_block_start(const unsigned char *p)
+{
+  const unsigned char *marker = NULL;
+/*!re2c
+  [<] ('script'|'pre'|'style') (spacechar | [>]) { return 1; }
+  '<!--' { return 2; }
+  '<?' { return 3; }
+  '<!' [A-Z] { return 4; }
+  '<![CDATA[' { return 5; }
+  [<] [/]? blocktagname (spacechar | [/]? [>])  { return 6; }
+  .? { return 0; }
+*/
+}
+// Try to match an HTML block tag start line of type 7, returning
+// 7 if successful, 0 if not.
+bufsize_t _scan_html_block_start_7(const unsigned char *p)
+{
+  const unsigned char *marker = NULL;
+/*!re2c
+  [<] (opentag | closetag) [\t\n\f ]* [\r\n] { return 7; }
+  .? { return 0; }
+*/
+}
+// Try to match an HTML block end line of type 1
+bufsize_t _scan_html_block_end_1(const unsigned char *p)
+{
+  const unsigned char *marker = NULL;
+  const unsigned char *start = p;
+/*!re2c
+  .* [<] [/] ('script'|'pre'|'style') [>] { return (bufsize_t)(p - start); }
+  .? { return 0; }
+*/
+}
+// Try to match an HTML block end line of type 2
+bufsize_t _scan_html_block_end_2(const unsigned char *p)
+{
+  const unsigned char *marker = NULL;
+  const unsigned char *start = p;
+/*!re2c
+  .* '-->' { return (bufsize_t)(p - start); }
+  .? { return 0; }
+*/
+}
+// Try to match an HTML block end line of type 3
+bufsize_t _scan_html_block_end_3(const unsigned char *p)
+{
+  const unsigned char *marker = NULL;
+  const unsigned char *start = p;
+/*!re2c
+  .* '?>' { return (bufsize_t)(p - start); }
+  .? { return 0; }
+*/
+}
+// Try to match an HTML block end line of type 4
+bufsize_t _scan_html_block_end_4(const unsigned char *p)
+{
+  const unsigned char *marker = NULL;
+  const unsigned char *start = p;
+/*!re2c
+  .* '>' { return (bufsize_t)(p - start); }
+  .? { return 0; }
+*/
+}
+// Try to match an HTML block end line of type 5
+bufsize_t _scan_html_block_end_5(const unsigned char *p)
 {
   const unsigned char *marker = NULL;
   const unsigned char *start = p;
 /*!re2c
-  [<] [/] blocktagname (spacechar | [>])  { return (bufsize_t)(p - start); }
-  [<] blocktagname (spacechar | [/>]) { return (bufsize_t)(p - start); }
-  [<] [!?] { return (bufsize_t)(p - start); }
+  .* ']]>' { return (bufsize_t)(p - start); }
   .? { return 0; }
 */
 }
@@ -141,7 +210,7 @@ bufsize_t _scan_link_url(const unsigned char *p)
   const unsigned char *start = p;
 /*!re2c
   [ \r\n]* [<] ([^<>\r\n\\\x00] | escaped_char | [\\])* [>] { return (bufsize_t)(p - start); }
-  [ \r\n]* (reg_char+ | escaped_char | in_parens_nosp | [\\])* { return (bufsize_t)(p - start); }
+  [ \r\n]* (reg_char+ | escaped_char | in_parens_nosp | [\\][^()])* { return (bufsize_t)(p - start); }
   .? { return 0; }
 */
 }
@@ -164,6 +233,7 @@ bufsize_t _scan_link_title(const unsigned char *p)
 // Match space characters, including newlines.
 bufsize_t _scan_spacechars(const unsigned char *p)
 {
+  const unsigned char *marker = NULL;
   const unsigned char *start = p; \
 /*!re2c
   [ \t\v\f\r\n]* { return (bufsize_t)(p - start); }
@@ -245,3 +315,17 @@ bufsize_t _scan_entity(const unsigned char *p)
   .? { return 0; }
 */
 }
+// Returns positive value if a URL begins in a way that is potentially
+// dangerous, with javascript:, vbscript:, file:, or data:, otherwise 0.
+bufsize_t _scan_dangerous_url(const unsigned char *p)
+{
+  const unsigned char *marker = NULL;
+  const unsigned char *start = p;
+/*!re2c
+  'data:image/' ('png'|'gif'|'jpeg'|'webp') { return 0; }
+  'javascript:' | 'vbscript:' | 'file:' | 'data:' { return (bufsize_t)(p - start); }
+  .? { return 0; }
+*/
+}

data/ext/commonmarker/cmark/test/__pycache__/cmark.cpython-34.pyc ADDED

Binary file

data/ext/commonmarker/cmark/test/__pycache__/normalize.cpython-34.pyc ADDED

Binary file

data/ext/commonmarker/cmark/test/smart_punct.txt CHANGED

@@ -1,5 +1,9 @@
 ## Smart punctuation
+Open quotes are matched with closed quotes.
+The same method is used for matching openers and closers
+as is used in emphasis parsing:
 .
 "Hello," said the spider.
 "'Shelob' is my name."
@@ -28,6 +32,10 @@ So is ‘pine.’</p>
 <p>‘He said, “I want to go.”’</p>
 .
+A single quote that isn't an open quote matched
+with a close quote will be treated as an
+apostrophe:
 .
 Were you alive in the 70's?
 .
@@ -40,12 +48,19 @@ Here is some quoted '`code`' and a "[quoted link](url)".
 <p>Here is some quoted ‘<code>code</code>’ and a “<a href="url">quoted link</a>”.</p>
 .
+Here the first `'` is treated as an apostrophe, not
+an open quote, because the final single quote is matched
+by the single quote before `jolly`:
 .
 'tis the season to be 'jolly'
 .
 <p>’tis the season to be ‘jolly’</p>
 .
+An unmatched double quote will be interpreted as a
+left double quote, to facilitate this style:
 .
 "A paragraph with no closing quote.
@@ -55,40 +70,89 @@ Here is some quoted '`code`' and a "[quoted link](url)".
 <p>“Second paragraph by same speaker, in fiction.”</p>
 .
+Quotes that are escaped come out as literal straight
+quotes:
 .
 \"This is not smart.\"
 This isn\'t either.
+5\'8\"
 .
 <p>&quot;This is not smart.&quot;
-This isn't either.</p>
+This isn't either.
+5'8&quot;</p>
 .
+Two hyphens form an en-dash, three an em-dash.
 .
-Some dashes:  one---two ---
-three---four --- five.
+Some dashes:  em---em
+en--en
+em --- em
+en -- en
+2--3
 .
-<p>Some dashes:  one—two —
-three—four — five.</p>
+<p>Some dashes:  em—em
+en–en
+em — em
+en – en
+2–3</p>
 .
+A sequence of more than three hyphens is
+parsed as a sequence of em and/or en dashes,
+with no hyphens. If possible, a homogeneous
+sequence of dashes is used (so, 10 hyphens
+= 5 en dashes, and 9 hyphens = 3 em dashes).
+When a heterogeneous sequence must be used,
+the em dashes come first, followed by the en
+dashes, and as few en dashes as possible are
+used (so, 7 hyphens = 1 em dash and 2 en
+dashes).
 .
-Escaped dashes: \-- \-\-\-.
-.
-<p>Escaped dashes: -- ---.</p>
+one-
+two--
+three---
+four----
+five-----
+six------
+seven-------
+eight--------
+nine---------
+thirteen-------------.
+.
+<p>one-
+two–
+three—
+four––
+five—–
+six——
+seven—––
+eight––––
+nine———
+thirteen———––.</p>
 .
+Hyphens can be escaped:
 .
-Dashes between numbers: 5--7, 255--66, 1987--1999.
+Escaped hyphens: \-- \-\-\-.
 .
-<p>Dashes between numbers: 5–7, 255–66, 1987–1999.</p>
+<p>Escaped hyphens: -- ---.</p>
 .
+Three periods form an ellipsis:
 .
 Ellipses...and...and....
 .
 <p>Ellipses…and…and….</p>
 .
+Periods can be escaped if ellipsis-formation
+is not wanted:
 .
 No ellipses\.\.\.
 .

data/ext/commonmarker/cmark/test/spec.txt CHANGED

@@ -1,8 +1,8 @@
 ---
 title: CommonMark Spec
 author: John MacFarlane
-version: 0.20
-date: 2015-06-08
+version: 0.21
+date:
 license: '[CC-BY-SA 4.0](http://creativecommons.org/licenses/by-sa/4.0/)'
 ...
@@ -237,7 +237,7 @@ or more [unicode whitespace character]s.
 A [space](@space) is `U+0020`.
-A [non-space character](@non-space-character) is any character
+A [non-whitespace character](@non-whitespace-character) is any character
 that is not a [whitespace character].
 An [ASCII punctuation character](@ascii-punctuation-character)
@@ -474,7 +474,7 @@ a------
 <p>---a---</p>
 .
-It is required that all of the [non-space character]s be the same.
+It is required that all of the [non-whitespace character]s be the same.
 So, this is not a horizontal rule:
 .
@@ -564,7 +564,7 @@ consists of a string of characters, parsed as inline content, between an
 opening sequence of 1--6 unescaped `#` characters and an optional
 closing sequence of any number of `#` characters.  The opening sequence
 of `#` characters cannot be followed directly by a
-[non-space character]. The optional closing sequence of `#`s must be
+[non-whitespace character]. The optional closing sequence of `#`s must be
 preceded by a [space] and may be followed by spaces only.  The opening
 `#` character may be indented 0-3 spaces.  The raw contents of the
 header are stripped of leading and trailing spaces before being parsed
@@ -696,7 +696,7 @@ Spaces are allowed after the closing sequence:
 .
 A sequence of `#` characters with a
-[non-space character] following it
+[non-whitespace character] following it
 is not a closing sequence, but counts as part of the contents of the
 header:
@@ -765,7 +765,7 @@ ATX headers can be empty:
 ## Setext headers
 A [setext header](@setext-header)
-consists of a line of text, containing at least one [non-space character],
+consists of a line of text, containing at least one [non-whitespace character],
 with no more than 3 spaces indentation, followed by a [setext header
 underline].  The line of text must be
 one that, were it not followed by the setext header underline,
@@ -1348,7 +1348,8 @@ aaa
 </code></pre>
 .
-Unclosed code blocks are closed by the end of the document:
+Unclosed code blocks are closed by the end of the document
+(or the enclosing [block quote] or [list item]):
 .
 ```
@@ -1368,6 +1369,19 @@ aaa
 </code></pre>
 .
+.
+> ```
+> aaa
+bbb
+.
+<blockquote>
+<pre><code>aaa
+</code></pre>
+</blockquote>
+<p>bbb</p>
+.
 A code block can have all empty lines as its content:
 .
@@ -1593,27 +1607,65 @@ Closing code fences cannot have [info string]s:
 ## HTML blocks
-An [HTML block tag](@html-block-tag) is
-an [open tag] or [closing tag] whose tag
-name is one of the following (case-insensitive):
-`article`, `header`, `aside`, `hgroup`, `blockquote`, `hr`, `iframe`,
-`body`, `li`, `map`, `button`, `object`, `canvas`, `ol`, `caption`,
-`output`, `col`, `p`, `colgroup`, `pre`, `dd`, `progress`, `div`,
-`section`, `dl`, `table`, `td`, `dt`, `tbody`, `embed`, `textarea`,
-`fieldset`, `tfoot`, `figcaption`, `th`, `figure`, `thead`, `footer`,
-`tr`, `form`, `ul`, `h1`, `h2`, `h3`, `h4`, `h5`, `h6`, `video`,
-`script`, `style`.
-An [HTML block](@html-block) begins with an
-[HTML block tag], [HTML comment], [processing instruction],
-[declaration], or [CDATA section].
-It ends when a [blank line] or the end of the
-input is encountered.  The initial line may be indented up to three
-spaces, and subsequent lines may have any indentation.  The contents
-of the HTML block are interpreted as raw HTML, and will not be escaped
-in HTML output.
-Some simple examples:
+An [HTML block](@html-block) is a group of lines that is treated
+as raw HTML (and will not be escaped in HTML output).
+There are seven kinds of [HTML block], which can be defined
+by their start and end conditions.  The block begins with a line that
+meets a [start condition](@start-condition) (after up to three spaces
+optional indentation).  It ends with the first subsequent line that
+meets a matching [end condition](@end-condition), or the last line of
+the document, if no line is encountered that meets the
+[end condition].  If the first line meets both the [start condition]
+and the [end condition], the block will contain just that line.
+1.  **Start condition:**  line begins with the string `<script`,
+`<pre`, or `<style` (case-insensitive), followed by whitespace,
+the string `>`, or the end of the line.\
+**End condition:**  line contains an end tag
+`</script>`, `</pre>`, or `</style>` (case-insensitive; it
+need not match the start tag).
+2.  **Start condition:** line begins with the string `<!--`.\
+**End condition:**  line contains the string `-->`.
+3.  **Start condition:** line begins with the string `<?`.\
+**End condition:** line contains the string `?>`.
+4.  **Start condition:** line begins with the string `<!`
+followed by an uppercase ASCII letter.\
+**End condition:** line contains the character `>`.
+5.  **Start condition:**  line begins with the string
+`<![CDATA[`.\
+**End condition:** line contains the string `]]>`.
+6.  **Start condition:** line begins with the string `<` or `</`
+followed by one of the strings (case-insensitive) `address`,
+`article`, `aside`, `base`, `basefont`, `blockquote`, `body`,
+`caption`, `center`, `col`, `colgroup`, `dd`, `details`, `dialog`,
+`dir`, `div`, `dl`, `dt`, `fieldset`, `figcaption`, `figure`,
+`footer`, `form`, `frame`, `frameset`, `h1`, `head`, `header`, `hr`,
+`html`, `legend`, `li`, `link`, `main`, `menu`, `menuitem`, `meta`,
+`nav`, `noframes`, `ol`, `optgroup`, `option`, `p`, `param`, `pre`,
+`section`, `source`, `summary`, `table`, `tbody`, `td`,
+`tfoot`, `th`, `thead`, `title`, `tr`, `track`, `ul`, followed
+by [whitespace], the end of the line, the string `>`, or
+the string `/>`.\
+**End condition:** line is followed by a [blank line].
+7.  **Start condition:**  line begins with an [open tag]
+(with any [tag name]) followed only by [whitespace] or the end
+of the line.\
+**End condition:** line is followed by a [blank line].
+All types of [HTML blocks] except type 7 may interrupt
+a paragraph.  Blocks of type 7 may not interrupt a paragraph.
+(This restriction is intended to prevent unwanted interpretation
+of long tags inside a wrapped paragraph as starting HTML blocks.)
+Some simple examples follow.  Here are some basic HTML blocks
+of type 6:
 .
 <table>
@@ -1646,6 +1698,16 @@ okay.
          <foo><a>
 .
+A block can also start with a closing tag:
+.
+</div>
+*foo*
+.
+</div>
+*foo*
+.
 Here we have two HTML blocks with a Markdown paragraph between them:
 .
@@ -1660,7 +1722,94 @@ Here we have two HTML blocks with a Markdown paragraph between them:
 </DIV>
 .
-In the following example, what looks like a Markdown code block
+The tag on the first line can be partial, as long
+as it is split where there would be whitespace:
+.
+<div id="foo"
+  class="bar">
+</div>
+.
+<div id="foo"
+  class="bar">
+</div>
+.
+.
+<div id="foo" class="bar
+  baz">
+</div>
+.
+<div id="foo" class="bar
+  baz">
+</div>
+.
+An open tag need not be closed:
+.
+<div>
+*foo*
+*bar*
+.
+<div>
+*foo*
+<p><em>bar</em></p>
+.
+A partial tag need not even be completed (garbage
+in, garbage out):
+.
+<div id="foo"
+*hi*
+.
+<div id="foo"
+*hi*
+.
+.
+<div class
+foo
+.
+<div class
+foo
+.
+The initial tag doesn't even need to be a valid
+tag, as long as it starts like one:
+.
+<div *???-&&&-<---
+*foo*
+.
+<div *???-&&&-<---
+*foo*
+.
+In type 6 blocks, the initial tag need not be on a line by
+itself:
+.
+<div><a href="bar">*foo*</a></div>
+.
+<div><a href="bar">*foo*</a></div>
+.
+.
+<table><tr><td>
+foo
+</td></tr></table>
+.
+<table><tr><td>
+foo
+</td></tr></table>
+.
+Everything until the next blank line or end of document
+gets included in the HTML block.  So, in the following
+example, what looks like a Markdown code block
 is actually part of the HTML block, which continues until a blank
 line or the end of the document is reached:
@@ -1676,43 +1825,267 @@ int x = 33;
 ```
 .
-A comment:
+To start an [HTML block] with a tag that is *not* in the
+list of block-level tags in (6), you must put the tag by
+itself on the first line (and it must be complete):
+.
+<a href="foo">
+*bar*
+</a>
+.
+<a href="foo">
+*bar*
+</a>
+.
+In type 7 blocks, the [tag name] can be anything:
+.
+<Warning>
+*bar*
+</Warning>
+.
+<Warning>
+*bar*
+</Warning>
+.
+.
+<i class="foo">
+*bar*
+</i>
+.
+<i class="foo">
+*bar*
+</i>
+.
+These rules are designed to allow us to work with tags that
+can function as either block-level or inline-level tags.
+The `<del>` tag is a nice example.  We can surround content with
+`<del>` tags in three different ways.  In this case, we get a raw
+HTML block, because the `<del>` tag is on a line by itself:
+.
+<del>
+*foo*
+</del>
+.
+<del>
+*foo*
+</del>
+.
+In this case, we get a raw HTML block that just includes
+the `<del>` tag (because it ends with the following blank
+line).  So the contents get interpreted as CommonMark:
+.
+<del>
+*foo*
+</del>
+.
+<del>
+<p><em>foo</em></p>
+</del>
+.
+Finally, in this case, the `<del>` tags are interpreted
+as [raw HTML] *inside* the CommonMark paragraph.  (Because
+the tag is not on a line by itself, we get inline HTML
+rather than an [HTML block].)
+.
+<del>*foo*</del>
+.
+<p><del><em>foo</em></del></p>
+.
+HTML tags designed to contain literal content
+(`script`, `style`, `pre`), comments, processing instructions,
+and declarations are treated somewhat differently.
+Instead of ending at the first blank line, these blocks
+end at the first line containing a corresponding end tag.
+As a result, these blocks can contain blank lines:
+A pre tag (type 1):
+.
+<pre language="haskell"><code>
+import Text.HTML.TagSoup
+main :: IO ()
+main = print $ parseTags tags
+</code></pre>
+.
+<pre language="haskell"><code>
+import Text.HTML.TagSoup
+main :: IO ()
+main = print $ parseTags tags
+</code></pre>
+.
+A script tag (type 1):
+.
+<script type="text/javascript">
+// JavaScript example
+document.getElementById("demo").innerHTML = "Hello JavaScript!";
+</script>
+.
+<script type="text/javascript">
+// JavaScript example
+document.getElementById("demo").innerHTML = "Hello JavaScript!";
+</script>
+.
+A style tag (type 1):
+.
+<style
+  type="text/css">
+h1 {color:red;}
+p {color:blue;}
+</style>
+.
+<style
+  type="text/css">
+h1 {color:red;}
+p {color:blue;}
+</style>
+.
+If there is no matching end tag, the block will end at the
+end of the document (or the enclosing [block quote] or
+[list item]):
+.
+<style
+  type="text/css">
+foo
+.
+<style
+  type="text/css">
+foo
+.
+.
+> <div>
+> foo
+bar
+.
+<blockquote>
+<div>
+foo
+</blockquote>
+<p>bar</p>
+.
+.
+- <div>
+- foo
+.
+<ul>
+<li>
+<div>
+</li>
+<li>foo</li>
+</ul>
+.
+The end tag can occur on the same line as the start tag:
+.
+<style>p{color:red;}</style>
+*foo*
+.
+<style>p{color:red;}</style>
+<p><em>foo</em></p>
+.
+.
+<!-- foo -->*bar*
+*baz*
+.
+<!-- foo -->*bar*
+<p><em>baz</em></p>
+.
+Note that anything on the last line after the
+end tag will be included in the [HTML block]:
+.
+<script>
+foo
+</script>1. *bar*
+.
+<script>
+foo
+</script>1. *bar*
+.
+A comment (type 2):
 .
 <!-- Foo
 bar
    baz -->
 .
 <!-- Foo
 bar
    baz -->
 .
-A processing instruction:
+A processing instruction (type 3):
 .
 <?php
   echo '>';
 ?>
 .
 <?php
   echo '>';
 ?>
 .
-CDATA:
+A declaration (type 4):
+.
+<!DOCTYPE html>
+.
+<!DOCTYPE html>
+.
+CDATA (type 5):
 .
 <![CDATA[
 function matchwo(a,b)
 {
-if (a < b && a < 0) then
-  {
-  return 1;
-  }
-else
-  {
-  return 0;
+  if (a < b && a < 0) then {
+    return 1;
+  } else {
+    return 0;
   }
 }
 ]]>
@@ -1720,13 +2093,12 @@ else
 <![CDATA[
 function matchwo(a,b)
 {
-if (a < b && a < 0) then
-  {
-  return 1;
-  }
-else
-  {
-  return 0;
+  if (a < b && a < 0) then {
+    return 1;
+  } else {
+    return 0;
   }
 }
 ]]>
@@ -1744,8 +2116,18 @@ The opening tag can be indented 1-3 spaces, but not 4:
 </code></pre>
 .
-An HTML block can interrupt a paragraph, and need not be preceded
-by a blank line.
+.
+  <div>
+    <div>
+.
+  <div>
+<pre><code>&lt;div&gt;
+</code></pre>
+.
+An HTML block of types 1--6 can interrupt a paragraph, and need not be
+preceded by a blank line.
 .
 Foo
@@ -1759,8 +2141,8 @@ bar
 </div>
 .
-However, a following blank line is always needed, except at the end of
-a document:
+However, a following blank line is needed, except at the end of
+a document, and except for blocks of types 1--5, above:
 .
 <div>
@@ -1774,14 +2156,16 @@ bar
 *foo*
 .
-An incomplete HTML block tag may also start an HTML block:
+HTML blocks of type 7 cannot interrupt a paragraph:
 .
-<div class
-foo
+Foo
+<a href="bar">
+baz
 .
-<div class
-foo
+<p>Foo
+<a href="bar">
+baz</p>
 .
 This rule differs from John Gruber's original Markdown syntax
@@ -1800,8 +2184,8 @@ here:
 - It requires a matching end tag, which it also does not allow to
   be indented.
-Indeed, most Markdown implementations, including some of Gruber's
-own perl implementations, do not impose these restrictions.
+Most Markdown implementations (including some of Gruber's own) do not
+respect all of these restrictions.
 There is one respect, however, in which Gruber's rule is more liberal
 than the one given here, since it allows blank lines to occur inside
@@ -1812,6 +2196,8 @@ if no matching end tag is found. Second, it provides a very simple
 and flexible way of including Markdown content inside HTML tags:
 simply separate the Markdown from the HTML using blank lines:
+Compare:
 .
 <div>
@@ -1824,8 +2210,6 @@ simply separate the Markdown from the HTML using blank lines:
 </div>
 .
-Compare:
 .
 <div>
 *Emphasized* text.
@@ -1869,11 +2253,37 @@ Hi
 </table>
 .
-Moreover, blank lines are usually not necessary and can be
-deleted.  The exception is inside `<pre>` tags; here, one can
-replace the blank lines with `&#10;` entities.
+There are problems, however, if the inner tags are indented
+*and* separated by spaces, as then they will be interpreted as
+an indented code block:
+.
+<table>
+  <tr>
+    <td>
+      Hi
+    </td>
+  </tr>
+</table>
+.
+<table>
+  <tr>
+<pre><code>&lt;td&gt;
+  Hi
+&lt;/td&gt;
+</code></pre>
+  </tr>
+</table>
+.
-So there is no important loss of expressive power with the new rule.
+Fortunately, blank lines are usually not necessary and can be
+deleted.  The exception is inside `<pre>` tags, but as described
+above, raw HTML blocks starting with `<pre>` *can* contain blank
+lines.
 ## Link reference definitions
@@ -1885,7 +2295,7 @@ optional [whitespace] (including up to one
 [line ending]), and an optional [link
 title], which if it is present must be separated
 from the [link destination] by [whitespace].
-No further [non-space character]s may occur on the line.
+No further [non-whitespace character]s may occur on the line.
 A [link reference definition]
 does not correspond to a structural element of a document.  Instead, it
@@ -2056,7 +2466,7 @@ bar
 .
 This is not a link reference definition, because there are
-[non-space character]s after the title:
+[non-whitespace character]s after the title:
 .
 [foo]: /url "title" ok
@@ -2305,7 +2715,7 @@ So, we explain what counts as a block quote or list item by explaining
 how these can be *generated* from their contents. This should suffice
 to define the syntax, although it does not give a recipe for *parsing*
 these constructions.  (A recipe is provided below in the section entitled
-[A parsing strategy](#appendix-a-a-parsing-strategy).)
+[A parsing strategy](#appendix-a-parsing-strategy).)
 ## Block quotes
@@ -2323,7 +2733,7 @@ The following rules define [block quotes]:
 2.  **Laziness.**  If a string of lines *Ls* constitute a [block
     quote](#block-quotes) with contents *Bs*, then the result of deleting
     the initial [block quote marker] from one or
-    more lines in which the next [non-space character] after the [block
+    more lines in which the next [non-whitespace character] after the [block
     quote marker] is [paragraph continuation
     text] is a block quote with *Bs* as its content.
     [Paragraph continuation text](@paragraph-continuation-text) is text
@@ -2694,13 +3104,15 @@ A [bullet list marker](@bullet-list-marker)
 is a `-`, `+`, or `*` character.
 An [ordered list marker](@ordered-list-marker)
-is a sequence of one of more digits (`0-9`), followed by either a
-`.` character or a `)` character.
+is a sequence of 1--9 arabic digits (`0-9`), followed by either a
+`.` character or a `)` character.  (The reason for the length
+limit is that with 10 digits we start seeing integer overflows
+in some browsers.)
 The following rules define [list items]:
 1.  **Basic case.**  If a sequence of lines *Ls* constitute a sequence of
-    blocks *Bs* starting with a [non-space character] and not separated
+    blocks *Bs* starting with a [non-whitespace character] and not separated
     from each other by more than one blank line, and *M* is a list
     marker of width *W* followed by 0 < *N* < 5 spaces, then the result
     of prepending *M* and the following spaces to the first line of
@@ -2758,7 +3170,7 @@ The most important thing to notice is that the position of
 the text after the list marker determines how much indentation
 is needed in subsequent blocks in the list item.  If the list
 marker takes up two spaces, and there are three spaces between
-the list marker and the next [non-space character], then blocks
+the list marker and the next [non-whitespace character], then blocks
 must be indented five spaces in order to fall under the list
 item.
@@ -2816,7 +3228,7 @@ put under the list item:
 It is tempting to think of this in terms of columns:  the continuation
 blocks must be indented at least to the column of the first
-[non-space character] after the list marker. However, that is not quite right.
+[non-whitespace character] after the list marker. However, that is not quite right.
 The spaces after the list marker determine how much relative indentation
 is needed.  Which column this indentation reaches will depend on
 how the list item is embedded in other constructions, as shown by
@@ -2964,6 +3376,49 @@ A list item may contain any kind of block:
 </ol>
 .
+Note that ordered list start numbers must be nine digits or less:
+.
+123456789. ok
+.
+<ol start="123456789">
+<li>ok</li>
+</ol>
+.
+.
+1234567890. not ok
+.
+<p>1234567890. not ok</p>
+.
+A start number may begin with 0s:
+.
+0. ok
+.
+<ol start="0">
+<li>ok</li>
+</ol>
+.
+.
+003. ok
+.
+<ol start="3">
+<li>ok</li>
+</ol>
+.
+A start number may not be negative:
+.
+-1. not ok
+.
+<p>-1. not ok</p>
+.
 2.  **Item starting with indented code.**  If a sequence of lines *Ls*
     constitute a sequence of blocks *Bs* starting with an indented code
     block and not separated from each other by more than one blank line,
@@ -3069,7 +3524,7 @@ inside the code block:
 Note that rules #1 and #2 only apply to two cases:  (a) cases
 in which the lines to be included in a list item begin with a
-[non-space character], and (b) cases in which
+[non-whitespace character], and (b) cases in which
 they begin with an indented code
 block.  In a case like the following, where the first block begins with
 a three-space indent, the rules do not allow us to form a list item by
@@ -3301,7 +3756,7 @@ Four spaces indent gives a code block:
 5.  **Laziness.**  If a string of lines *Ls* constitute a [list
     item](#list-items) with contents *Bs*, then the result of deleting
     some or all of the indentation from one or more lines in which the
-    next [non-space character] after the indentation is
+    next [non-whitespace character] after the indentation is
     [paragraph continuation text] is a
     list item with the same contents and attributes.  The unindented
     lines are called
@@ -4360,7 +4815,7 @@ raw HTML:
 .
 <a href="/bar\/)">
 .
-<p><a href="/bar\/)"></p>
+<a href="/bar\/)">
 .
 But they work in all other contexts, including URLs and link titles,
@@ -4474,7 +4929,7 @@ code blocks, including raw HTML, URLs, [link title]s, and
 .
 <a href="&ouml;&ouml;.html">
 .
-<p><a href="&ouml;&ouml;.html"></p>
+<a href="&ouml;&ouml;.html">
 .
 .
@@ -6031,6 +6486,20 @@ in Markdown:
 <p><a href="foo):">link</a></p>
 .
+A link can contain fragment identifiers and queries:
+.
+[link](#fragment)
+[link](http://example.com#fragment)
+[link](http://example.com?foo=bar&baz#fragment)
+.
+<p><a href="#fragment">link</a></p>
+<p><a href="http://example.com#fragment">link</a></p>
+<p><a href="http://example.com?foo=bar&amp;baz#fragment">link</a></p>
+.
 Note that a backslash before a non-escapable character is
 just a backslash:
@@ -6245,7 +6714,7 @@ that [matches] a [link reference definition] elsewhere in the document.
 A [link label](@link-label)  begins with a left bracket (`[`) and ends
 with the first right bracket (`]`) that is not backslash-escaped.
-Between these brackets there must be at least one non-[whitespace character].
+Between these brackets there must be at least one [non-whitespace character].
 Unescaped square bracket characters are not allowed in
 [link label]s.  A link label can have at most 999
 characters inside the square brackets.
@@ -6492,7 +6961,7 @@ backslash-escaped:
 <p><a href="/uri">foo</a></p>
 .
-A [link label] must contain at least one non-[whitespace character]:
+A [link label] must contain at least one [non-whitespace character]:
 .
 []
@@ -7074,7 +7543,8 @@ so custom tags (and even, say, DocBook tags) may be used.
 Here is the grammar for tags:
 A [tag name](@tag-name) consists of an ASCII letter
-followed by zero or more ASCII letters or digits.
+followed by zero or more ASCII letters, digits, or
+hyphens (`-`).
 An [attribute](@attribute) consists of [whitespace],
 an [attribute name], and an optional
@@ -7107,7 +7577,7 @@ consists of `"`, zero or more
 characters not including `"`, and a final `"`.
 An [open tag](@open-tag) consists of a `<` character, a [tag name],
-zero or more [attributes], optional [whitespace], an optional `/`
+zero or more [attributes](@attribute), optional [whitespace], an optional `/`
 character, and a `>` character.
 A [closing tag](@closing-tag) consists of the string `</`, a
@@ -7172,6 +7642,21 @@ _boolean zoop:33=zoop:33 />
 _boolean zoop:33=zoop:33 /></p>
 .
+Custom tag names can be used:
+.
+<responsive-image src="foo.jpg" />
+<My-Tag>
+foo
+</My-Tag>
+.
+<responsive-image src="foo.jpg" />
+<My-Tag>
+foo
+</My-Tag>
+.
 Illegal tag names, not parsed as HTML:
 .
@@ -7220,8 +7705,8 @@ Closing tags:
 </a>
 </foo >
 .
-<p></a>
-</foo ></p>
+</a>
+</foo >
 .
 Illegal attributes in closing tag:
@@ -7288,7 +7773,7 @@ Entities are preserved in HTML attributes:
 .
 <a href="&ouml;">
 .
-<p><a href="&ouml;"></p>
+<a href="&ouml;">
 .
 Backslash escapes do not work in HTML attributes:
@@ -7296,7 +7781,7 @@ Backslash escapes do not work in HTML attributes:
 .
 <a href="\*">
 .
-<p><a href="\*"></p>
+<a href="\*">
 .
 .
@@ -7500,7 +7985,10 @@ Multiple     spaces
 <!-- END TESTS -->
-# Appendix A: A parsing strategy {-}
+# Appendix: A parsing strategy {-}
+In this appendix we describe some features of the parsing strategy
+used in the CommonMark reference implementations.
 ## Overview {-}
@@ -7517,8 +8005,6 @@ are parsed into sequences of Markdown inline elements (strings,
 code spans, links, emphasis, and so on), using the map of link
 references constructed in phase 1.
-## The document tree {-}
 At each point in processing, the document is represented as a tree of
 **blocks**.  The root of the tree is a `document` block.  The `document`
 may have any number of other blocks as **children**.  These children
@@ -7542,7 +8028,7 @@ marked by arrows:
              "aliquando id"
 ```
-## How source lines alter the document tree {-}
+## Phase 1: block structure {-}
 Each line that is processed has an effect on this tree.  The line is
 analyzed and, depending on its contents, the document may be altered
@@ -7557,6 +8043,36 @@ in one or more of the following ways:
 Once a line has been incorporated into the tree in this way,
 it can be discarded, so input can be read in a stream.
+For each line, we follow this procedure:
+1. First we iterate through the open blocks, starting with the
+root document, and descending through last children down to the last
+open block.  Each block imposes a condition that the line must satisfy
+if the block is to remain open.  For example, a block quote requires a
+`>` character.  A paragraph requires a non-blank line.
+In this phase we may match all or just some of the open
+blocks.  But we cannot close unmatched blocks yet, because we may have a
+[lazy continuation line].
+2.  Next, after consuming the continuation markers for existing
+blocks, we look for new block starts (e.g. `>` for a block quote).
+If we encounter a new block start, we close any blocks unmatched
+in step 1 before creating the new block as a child of the last
+matched block.
+3.  Finally, we look at the remainder of the line (after block
+markers like `>`, list markers, and indentation have been consumed).
+This is text that can be incorporated into the last open
+block (a paragraph, code block, header, or raw HTML).
+Setext headers are formed when we detect that the second line of
+a paragraph is a setext header line.
+Reference link definitions are detected when a paragraph is closed;
+the accumulated text lines are parsed to see if they begin with
+one or more reference link definitions.  Any remainder becomes a
+normal paragraph.
 We can see how this works by considering how the tree above is
 generated by four lines of Markdown:
@@ -7654,7 +8170,7 @@ We thus obtain the final tree:
              "aliquando id"
 ```
-## From block structure to the final document {-}
+## Phase 2: inline structure {-}
 Once all of the input has been parsed, all open blocks are closed.
@@ -7685,5 +8201,123 @@ Notice how the [line ending] in the first paragraph has
 been parsed as a `softbreak`, and the asterisks in the first list item
 have become an `emph`.
-The document can be rendered as HTML, or in any other format, given
-an appropriate renderer.
+### An algorithm for parsing nested emphasis and links {-}
+By far the trickiest part of inline parsing is handling emphasis,
+strong emphasis, links, and images.  This is done using the following
+algorithm.
+When we're parsing inlines and we hit either
+- a run of `*` or `_` characters, or
+- a `[` or `![`
+we insert a text node with these symbols as its literal content, and we
+add a pointer to this text node to the [delimiter stack](@delimiter-stack).
+The [delimiter stack] is a doubly linked list.  Each
+element contains a pointer to a text node, plus information about
+- the type of delimiter (`[`, `![`, `*`, `_`)
+- the number of delimiters,
+- whether the delimiter is "active" (all are active to start), and
+- whether the delimiter is a potential opener, a potential closer,
+  or both (which depends on what sort of characters precede
+  and follow the delimiters).
+When we hit a `]` character, we call the *look for link or image*
+procedure (see below).
+When we hit the end of the input, we call the *process emphasis*
+procedure (see below), with `stack_bottom` = NULL.
+#### *look for link or image* {-}
+Starting at the top of the delimiter stack, we look backwards
+through the stack for an opening `[` or `![` delimiter.
+- If we don't find one, we return a literal text node `]`.
+- If we do find one, but it's not *active*, we remove the inactive
+  delimiter from the stack, and return a literal text node `]`.
+- If we find one and it's active, then we parse ahead to see if
+  we have an inline link/image, reference link/image, compact reference
+  link/image, or shortcut reference link/image.
+  + If we don't, then we remove the opening delimiter from the
+    delimiter stack and return a literal text node `]`.
+  + If we do, then
+    * We return a link or image node whose children are the inlines
+      after the text node pointed to by the opening delimiter.
+    * We run *process emphasis* on these inlines, with the `[` opener
+      as `stack_bottom`.
+    * We remove the opening delimiter.
+    * If we have a link (and not an image), we also set all
+      `[` delimiters before the opening delimiter to *inactive*.  (This
+      will prevent us from getting links within links.)
+#### *process emphasis* {-}
+Parameter `stack_bottom` sets a lower bound to how far we
+descend in the [delimiter stack].  If it is NULL, we can
+go all the way to the bottom.  Otherwise, we stop before
+visiting `stack_bottom`.
+Let `current_position` point to the element on the [delimiter stack]
+just above `stack_bottom` (or the first element if `stack_bottom`
+is NULL).
+We keep track of the `openers_bottom` for each delimiter
+type (`*`, `_`).  Initialize this to `stack_bottom`.
+Then we repeat the following until we run out of potential
+closers:
+- Move `current_position` forward in the delimiter stack (if needed)
+  until we find the first potential closer with delimiter `*` or `_`.
+  (This will be the potential closer closest
+  to the beginning of the input -- the first one in parse order.)
+- Now, look back in the stack (staying above `stack_bottom` and
+  the `openers_bottom` for this delimiter type) for the
+  first matching potential opener ("matching" means same delimiter).
+- If one is found:
+  + Figure out whether we have emphasis or strong emphasis:
+    if both closer and opener spans have length >= 2, we have
+    strong, otherwise regular.
+  + Insert an emph or strong emph node accordingly, after
+    the text node corresponding to the opener.
+  + Remove any delimiters between the opener and closer from
+    the delimiter stack.
+  + Remove 1 (for regular emph) or 2 (for strong emph) delimiters
+    from the opening and closing text nodes.  If they become empty
+    as a result, remove them and remove the corresponding element
+    of the delimiter stack.  If the closing node is removed, reset
+    `current_position` to the next element in the stack.
+- If none is found:
+  + Set `openers_bottom` to the element before `current_position`.
+    (We know that there are no openers for this kind of closer up to and
+    including this point, so this puts a lower bound on future searches.)
+  + If the closer at `current_position` is not a potential opener,
+    remove it from the delimiter stack (since we know it can't
+    be a closer either).
+  + Advance `current_position` to the next element in the stack.
+After we're done, we remove all delimiters above `stack_bottom` from the
+delimiter stack.