bean-kramdown 0.13.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (60) hide show
  1. data/AUTHORS +1 -0
  2. data/CONTRIBUTERS +11 -0
  3. data/COPYING +24 -0
  4. data/ChangeLog +6683 -0
  5. data/GPL +674 -0
  6. data/README +43 -0
  7. data/VERSION +1 -0
  8. data/bin/kramdown +78 -0
  9. data/lib/kramdown.rb +23 -0
  10. data/lib/kramdown/compatibility.rb +49 -0
  11. data/lib/kramdown/converter.rb +41 -0
  12. data/lib/kramdown/converter/base.rb +169 -0
  13. data/lib/kramdown/converter/bean_html.rb +71 -0
  14. data/lib/kramdown/converter/html.rb +411 -0
  15. data/lib/kramdown/converter/kramdown.rb +428 -0
  16. data/lib/kramdown/converter/latex.rb +607 -0
  17. data/lib/kramdown/converter/toc.rb +82 -0
  18. data/lib/kramdown/document.rb +119 -0
  19. data/lib/kramdown/element.rb +524 -0
  20. data/lib/kramdown/error.rb +30 -0
  21. data/lib/kramdown/options.rb +373 -0
  22. data/lib/kramdown/parser.rb +39 -0
  23. data/lib/kramdown/parser/base.rb +136 -0
  24. data/lib/kramdown/parser/bean_kramdown.rb +25 -0
  25. data/lib/kramdown/parser/bean_kramdown/info_box.rb +52 -0
  26. data/lib/kramdown/parser/bean_kramdown/oembed.rb +230 -0
  27. data/lib/kramdown/parser/html.rb +570 -0
  28. data/lib/kramdown/parser/kramdown.rb +339 -0
  29. data/lib/kramdown/parser/kramdown/abbreviation.rb +71 -0
  30. data/lib/kramdown/parser/kramdown/autolink.rb +53 -0
  31. data/lib/kramdown/parser/kramdown/blank_line.rb +43 -0
  32. data/lib/kramdown/parser/kramdown/block_boundary.rb +46 -0
  33. data/lib/kramdown/parser/kramdown/blockquote.rb +51 -0
  34. data/lib/kramdown/parser/kramdown/codeblock.rb +63 -0
  35. data/lib/kramdown/parser/kramdown/codespan.rb +56 -0
  36. data/lib/kramdown/parser/kramdown/emphasis.rb +70 -0
  37. data/lib/kramdown/parser/kramdown/eob.rb +39 -0
  38. data/lib/kramdown/parser/kramdown/escaped_chars.rb +38 -0
  39. data/lib/kramdown/parser/kramdown/extensions.rb +204 -0
  40. data/lib/kramdown/parser/kramdown/footnote.rb +74 -0
  41. data/lib/kramdown/parser/kramdown/header.rb +68 -0
  42. data/lib/kramdown/parser/kramdown/horizontal_rule.rb +39 -0
  43. data/lib/kramdown/parser/kramdown/html.rb +169 -0
  44. data/lib/kramdown/parser/kramdown/html_entity.rb +44 -0
  45. data/lib/kramdown/parser/kramdown/image.rb +157 -0
  46. data/lib/kramdown/parser/kramdown/line_break.rb +38 -0
  47. data/lib/kramdown/parser/kramdown/link.rb +154 -0
  48. data/lib/kramdown/parser/kramdown/list.rb +240 -0
  49. data/lib/kramdown/parser/kramdown/math.rb +65 -0
  50. data/lib/kramdown/parser/kramdown/paragraph.rb +63 -0
  51. data/lib/kramdown/parser/kramdown/smart_quotes.rb +214 -0
  52. data/lib/kramdown/parser/kramdown/table.rb +178 -0
  53. data/lib/kramdown/parser/kramdown/typographic_symbol.rb +52 -0
  54. data/lib/kramdown/parser/markdown.rb +69 -0
  55. data/lib/kramdown/utils.rb +42 -0
  56. data/lib/kramdown/utils/entities.rb +348 -0
  57. data/lib/kramdown/utils/html.rb +85 -0
  58. data/lib/kramdown/utils/ordered_hash.rb +100 -0
  59. data/lib/kramdown/version.rb +28 -0
  60. metadata +140 -0
@@ -0,0 +1,25 @@
1
+ require 'kramdown/parser/kramdown'
2
+
3
module Kramdown
  module Parser
    # kramdown parser variant used by bean-kramdown: it registers the additional :info_box block
    # parser and :oembed span parser and removes the parsers listed in EXCEPT.
    class BeanKramdown < Kramdown

      # Array with all the parsing methods that should be removed from the standard kramdown parser.
      EXCEPT = [:codeblock_fenced, :block_extensions, :span_extensions]

      # Sets up the parser lists: the new parsers are put at the front of the lists and the
      # parsers named in EXCEPT are dropped from both the block and the span list.
      def initialize(source, options)
        super

        # The oembed parsing methods read and write this table (oembed id => [url, title]). It has
        # to exist before the first reference-style oembed is looked up, otherwise the lookup
        # would be performed on nil.
        @oembed_defs ||= {}

        @block_parsers.unshift(:info_box)
        @span_parsers.unshift(:oembed)

        @block_parsers.delete_if {|i| EXCEPT.include?(i)}
        @span_parsers.delete_if {|i| EXCEPT.include?(i)}
      end

      # The parser method files reopen this class, so they are loaded once it is defined.
      require 'kramdown/parser/bean_kramdown/info_box'
      require 'kramdown/parser/bean_kramdown/oembed'
    end
  end
end
@@ -0,0 +1,52 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ #--
4
+ # Copyright (C) 2009-2012 Thomas Leitner <t_leitner@gmx.at>
5
+ #
6
+ # This file is part of kramdown.
7
+ #
8
+ # kramdown is free software: you can redistribute it and/or modify
9
+ # it under the terms of the GNU General Public License as published by
10
+ # the Free Software Foundation, either version 3 of the License, or
11
+ # (at your option) any later version.
12
+ #
13
+ # This program is distributed in the hope that it will be useful,
14
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ # GNU General Public License for more details.
17
+ #
18
+ # You should have received a copy of the GNU General Public License
19
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
20
+ #++
21
+ #
22
+
23
+ require 'kramdown/parser/kramdown/blank_line'
24
+ require 'kramdown/parser/kramdown/extensions'
25
+ require 'kramdown/parser/kramdown/eob'
26
+
27
module Kramdown
  module Parser
    class BeanKramdown

      # Start marker of an info box line: OPT_SPACE followed by a percent sign and at most one space.
      INFO_BOX_START = /^#{OPT_SPACE}% ?/

      # Parse the info box at the current location.
      #
      # Paragraph chunks are collected until LAZY_END matches, the leading info box markers are
      # stripped and the remaining text is parsed as block content of a new :info_box element that
      # is appended to the current tree. Always returns +true+.
      def parse_info_box
        content = @src.scan(PARAGRAPH_MATCH)
        until @src.match?(self.class::LAZY_END)
          content << @src.scan(PARAGRAPH_MATCH)
        end
        content.gsub!(INFO_BOX_START, '')

        box = new_block_el(:info_box)
        @tree.children << box
        parse_blocks(box, content)
        true
      end

      define_parser(:info_box, INFO_BOX_START)

    end
  end
end
@@ -0,0 +1,230 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ #--
4
+ # Copyright (C) 2009-2012 Thomas Leitner <t_leitner@gmx.at>
5
+ #
6
+ # This file is part of kramdown.
7
+ #
8
+ # kramdown is free software: you can redistribute it and/or modify
9
+ # it under the terms of the GNU General Public License as published by
10
+ # the Free Software Foundation, either version 3 of the License, or
11
+ # (at your option) any later version.
12
+ #
13
+ # This program is distributed in the hope that it will be useful,
14
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ # GNU General Public License for more details.
17
+ #
18
+ # You should have received a copy of the GNU General Public License
19
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
20
+ #++
21
+ #
22
+
23
+ #
24
+ # This file has been edited to suit the needs of The Beans Group Ltd. Changes were made to the types of media available:
25
+ # images keep their ! prefix, whereas the new media types (oEmbed etc.) use a ? prefix.
26
+ # If you wish to change the types of media you need to change the OEMBED_START constant to include your special symbol
27
+ # for the new media object as well as change the regex on the parser definition towards the bottom of this file.
28
+ #
29
+
30
+ require 'open-uri'
31
+ require 'cgi'
32
+ require 'json'
33
+
34
+ module Kramdown
35
+ module Parser
36
+ class BeanKramdown
37
+
38
# Normalize the oembed identifier +id+: every run of whitespace (newlines included) becomes a
# single space and the result is lower-cased. Returns a new string.
def normalize_oembed_id(id)
  single_spaced = id.gsub(/(\s|\n)+/, ' ')
  single_spaced.downcase
end
42
+
43
+ OEMBED_DEFINITION_START = /^#{OPT_SPACE}\[([^\n\]]+)\]:[ \t]*(?:<(.*?)>|([^'"\n]*?\S[^'"\n]*?))[ \t]*?(?:\n?[ \t]*?(["'])(.+?)\4[ \t]*?)?\n/
44
+
45
# Parse the oembed definition at the current location.
#
# Relies on the match data of OEMBED_DEFINITION_START still being available on @src: the scanner
# is advanced past the match, the [url, title] pair is stored in @oembed_defs under the
# normalized id (warning about duplicates) and an :eob element is appended to the tree.
# Always returns +true+.
def parse_oembed_definition
  @src.pos += @src.matched_size

  oembed_id = normalize_oembed_id(@src[1])
  oembed_url = @src[2] || @src[3]
  oembed_title = @src[5]

  if @oembed_defs[oembed_id]
    warning("Duplicate oembed ID '#{oembed_id}' - overwriting")
  end
  @oembed_defs[oembed_id] = [oembed_url, oembed_title]

  @tree.children << Element.new(:eob, :oembed_def)
  true
end
54
+ define_parser(:oembed_definition, OEMBED_DEFINITION_START)
55
+
56
+
57
# Fetches the oEmbed data for +href+ and appends a new :oembed element built from it to @tree.
#
# The provider is selected by searching +href+ for one of the known provider names. The
# provider's endpoint is then requested with the CGI-escaped +href+ and the JSON response is
# turned into an element:
#
# - type "photo": an :img child is added and the response title replaces +title+,
# - type "video": the embed HTML is stored in the 'html' attribute and the response title
#   replaces +title+,
# - type "rich": the embed HTML is stored in the 'html' attribute.
#
# If a title is available, a :figCaption child is added (with a link to the author for photos).
#
# A warning is issued - and nothing is added to the tree - if no provider matches or if
# anything goes wrong while fetching or processing the response.
#
# The +el+ and +alt_text+ arguments are not used; they are kept so that existing callers
# continue to work.
def add_oembed(el, href, title, alt_text = nil)
  providers = {
    :twitter => "https://api.twitter.com/1/statuses/oembed.json?url=%s",
    :youtube => "http://www.youtube.com/oembed?url=%s&format=json&maxwidth=550",
    :flickr => "http://flickr.com/services/oembed?url=%s&maxwidth=460&format=json&maxwidth=550",
    # the key must be spelled like the host name because it is matched against the URL
    :viddler => "http://lab.viddler.com/services/oembed/?url=%s&type=simple&format=json",
    :qik => "http://qik.com/api/oembed.json?url=%s&maxwidth=550",
    :revision3 => "http://revision3.com/api/oembed/?url=%s&format=json&maxwidth=550",
    :hulu => "http://www.hulu.com/api/oembed.json?url=%s&maxwidth=550",
    :vimeo => "http://vimeo.com/api/oembed.json?url=%s&maxwidth=550",
    :collegehumor => "http://www.collegehumor.com/oembed.json?url=%s&maxwidth=550",
    # :pollyeverywhere => "http://www.polleverywhere.com/services/oembed?url=%s&format=json",
    # :opera => "http://my.opera.com/service/oembed/?url=%s",
    # "%s" is required here: "%w" is not a valid format directive and makes String#% raise
    :embedly => "http://api.embed.ly/1/oembed?url=%s&maxwidth=550",
    :ifixit => "http://www.ifixit.com/Embed?url=%s&format=json",
    :smugmug => "http://api.smugmug.com/services/oembed/?url=%s&format=json",
    :slideshare => "http://www.slideshare.net/api/oembed/2?url=%s&format=json&maxwidth=550",
    :wordpress => "http://public-api.wordpress.com/oembed/1.0/?format=json&url=%s&maxwidth=550"
  }

  # build an alternation of all provider names and look for one of them inside the URL
  provider_pattern = providers.keys.map {|name| name.to_s}.join('|')
  matched = href.match(provider_pattern)

  if matched
    provider = matched[0].to_sym
    oembed_url = providers[provider] % CGI.escape(href)
    el = Element.new(:oembed)
    begin
      # get the oEmbed content; the block form makes sure the handle is closed again
      # NOTE(review): this relies on open-uri extending Kernel#open to URLs - confirm that this
      # is still the case for the Ruby versions that need to be supported.
      response = JSON.parse(open(oembed_url) {|io| io.read})
      el.attr['provider_name'] = response['provider_name']

      case response['type']
      when "photo"
        title = response['title']
        el.attr['role'] = "img"
        img = Element.new(:img)
        img.attr['src'] = response['url']
        img.attr['alt'] = response['title']
        img.attr['width'] = response['width']
        img.attr['height'] = response['height']
        el.children << img
      when "video"
        title = response['title']
        el.attr['html'] = CGI.unescapeHTML(response['html'])
      when "rich"
        el.attr['html'] = CGI.unescapeHTML(response['html'])
      end

      if title
        # the same random id is assigned to the element and to its caption
        el_id = rand(1000)
        el.attr['id'] = el_id
        cap = Element.new(:figCaption, title)
        cap.attr['id'] = el_id
        if el.attr['role'] == "img"
          link = Element.new(:a, response['author_name'])
          link.attr['href'] = response['author_url']
          cap.children << link
        end
        el.children << cap
      end
      @tree.children << el
    rescue
      # best effort: network, JSON and response-shape problems all end up as a warning
      warning("Could not retrieve oEmbed information for URL #{oembed_url}")
    end
  else
    warning("No oEmbed provider found for URL #{href}")
  end
end
147
+
148
+ OEMBED_BRACKET_STOP_RE = /(\])|!?\[/
149
+ OEMBED_PAREN_STOP_RE = /(\()|(\))|\s(?=['"])/
150
+ OEMBED_INLINE_ID_RE = /\s*?\[([^\]]+)?\]/
151
+ OEMBED_INLINE_TITLE_RE = /\s*?(["'])(.+?)\1\s*?\)/
152
+ OEMBED_START = /\?\[(?=[^^])/
153
+
154
# Parse the oembed at the current scanner position.
#
# Handles both notations that may follow the bracketed text:
#
# - reference style (an id in brackets, or nothing at all): the URL and title are taken from
#   @oembed_defs,
# - inline style: the URL - optionally enclosed in angle brackets and optionally followed by a
#   quoted title - is given in parentheses.
#
# On success #add_oembed is called with the URL. If the construct cannot be parsed or the
# referenced id is unknown, the scanner is reset to the position after the start marker and the
# marker itself is added as plain text.
def parse_oembed
  result = @src.scan(OEMBED_START)
  reset_pos = @src.pos

  # only used as container for the spans of the bracketed text; #add_oembed creates the element
  # that ends up in the tree
  el = Element.new(:img)

  # find the closing bracket that belongs to the opening one, taking nested brackets into account
  count = 1
  found = parse_spans(el, OEMBED_BRACKET_STOP_RE) do
    count = count + (@src[1] ? -1 : 1)
    count - el.children.select {|c| c.type == :img}.size == 0
  end
  if !found
    @src.pos = reset_pos
    add_text(result)
    return
  end
  alt_text = extract_string(reset_pos...@src.pos, @src)
  @src.scan(OEMBED_BRACKET_STOP_RE)

  # reference style oembed or no oembed url
  if @src.scan(OEMBED_INLINE_ID_RE) || !@src.check(/\(/)
    oembed_id = normalize_oembed_id(@src[1] || alt_text)
    # the definition table may not have been set up by the parser - treat that like an empty one
    defs = (@oembed_defs ||= {})
    if defs.key?(oembed_id)
      add_oembed(el, defs[oembed_id].first, defs[oembed_id].last, alt_text)
    else
      warning("No oembed definition for oembed ID '#{oembed_id}' found")
      @src.pos = reset_pos
      add_text(result)
    end
    return
  end

  # oembed url in parentheses
  if @src.scan(/\(<(.*?)>/)
    oembed_url = @src[1]
    if @src.scan(/\)/)
      add_oembed(el, oembed_url, nil, alt_text)
      return
    end
  else
    # collect everything up to the matching closing parenthesis or up to the start of a title
    oembed_url = ''
    nr_of_brackets = 0
    while temp = @src.scan_until(OEMBED_PAREN_STOP_RE)
      oembed_url << temp
      if @src[2]
        nr_of_brackets -= 1
        break if nr_of_brackets == 0
      elsif @src[1]
        nr_of_brackets += 1
      else
        break
      end
    end
    # drop the opening parenthesis and the character the scan stopped at
    oembed_url = oembed_url[1..-2]
    oembed_url.strip!

    if nr_of_brackets == 0
      add_oembed(el, oembed_url, nil, alt_text)
      return
    end
  end

  # a quoted title followed by the closing parenthesis has to come next
  if @src.scan(OEMBED_INLINE_TITLE_RE)
    add_oembed(el, oembed_url, @src[2], alt_text)
  else
    @src.pos = reset_pos
    add_text(result)
  end
end
226
+ define_parser(:oembed, OEMBED_START, '\?\[')
227
+
228
+ end
229
+ end
230
+ end
@@ -0,0 +1,570 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ #--
4
+ # Copyright (C) 2009-2012 Thomas Leitner <t_leitner@gmx.at>
5
+ #
6
+ # This file is part of kramdown.
7
+ #
8
+ # kramdown is free software: you can redistribute it and/or modify
9
+ # it under the terms of the GNU General Public License as published by
10
+ # the Free Software Foundation, either version 3 of the License, or
11
+ # (at your option) any later version.
12
+ #
13
+ # This program is distributed in the hope that it will be useful,
14
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ # GNU General Public License for more details.
17
+ #
18
+ # You should have received a copy of the GNU General Public License
19
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
20
+ #++
21
+ #
22
+
23
+ require 'rexml/parsers/baseparser'
24
+ require 'strscan'
25
+
26
+ module Kramdown
27
+
28
+ module Parser
29
+
30
+ # Used for parsing a HTML document.
31
+ #
32
+ # The parsing code is in the Parser module that can also be used by other parsers.
33
+ class Html < Base
34
+
35
+ # Contains all constants that are used when parsing.
36
+ module Constants
37
+
38
+ #:stopdoc:
39
+ # The following regexps are based on the ones used by REXML, with some slight modifications.
40
+ HTML_DOCTYPE_RE = /<!DOCTYPE.*?>/m
41
+ HTML_COMMENT_RE = /<!--(.*?)-->/m
42
+ HTML_INSTRUCTION_RE = /<\?(.*?)\?>/m
43
+ HTML_ATTRIBUTE_RE = /\s*(#{REXML::Parsers::BaseParser::UNAME_STR})(?:\s*=\s*(["'])(.*?)\2)?/m
44
+ HTML_TAG_RE = /<((?>#{REXML::Parsers::BaseParser::UNAME_STR}))\s*((?>\s+#{REXML::Parsers::BaseParser::UNAME_STR}(?:\s*=\s*(["']).*?\3)?)*)\s*(\/)?>/m
45
+ HTML_TAG_CLOSE_RE = /<\/(#{REXML::Parsers::BaseParser::UNAME_STR})\s*>/m
46
+ HTML_ENTITY_RE = /&([\w:][\-\w\.:]*);|&#(\d+);|&\#x([0-9a-fA-F]+);/
47
+
48
+ HTML_CONTENT_MODEL_BLOCK = %w{address applet article aside button blockquote body
49
+ dd div dl fieldset figure figcaption footer form header hgroup iframe li map menu nav
50
+ noscript object section td}
51
+ HTML_CONTENT_MODEL_SPAN = %w{a abbr acronym b bdo big button cite caption del dfn dt em
52
+ h1 h2 h3 h4 h5 h6 i ins kbd label legend optgroup p q rb rbc
53
+ rp rt rtc ruby samp select small span strong sub sup summary th tt var}
54
+ HTML_CONTENT_MODEL_RAW = %w{script style math option textarea pre code}
55
+ # The following elements are also parsed as raw since they need child elements that cannot
56
+ # be expressed using kramdown syntax: colgroup table tbody thead tfoot tr ul ol
57
+
58
+ HTML_CONTENT_MODEL = Hash.new {|h,k| h[k] = :raw}
59
+ HTML_CONTENT_MODEL_BLOCK.each {|i| HTML_CONTENT_MODEL[i] = :block}
60
+ HTML_CONTENT_MODEL_SPAN.each {|i| HTML_CONTENT_MODEL[i] = :span}
61
+ HTML_CONTENT_MODEL_RAW.each {|i| HTML_CONTENT_MODEL[i] = :raw}
62
+
63
+ # Some HTML elements like script belong to both categories (i.e. are valid in block and
64
+ # span HTML) and don't appear therefore!
65
+ HTML_SPAN_ELEMENTS = %w{a abbr acronym b big bdo br button cite code del dfn em i img input
66
+ ins kbd label option q rb rbc rp rt rtc ruby samp select small span
67
+ strong sub sup textarea tt var}
68
+ HTML_BLOCK_ELEMENTS = %w{address article aside applet body button blockquote caption col colgroup dd div dl dt fieldset
69
+ figcaption footer form h1 h2 h3 h4 h5 h6 header hgroup hr html head iframe legend menu
70
+ li map nav ol optgroup p pre section summary table tbody td th thead tfoot tr ul}
71
+ HTML_ELEMENTS_WITHOUT_BODY = %w{area base br col command embed hr img input keygen link meta param source track wbr}
72
+ end
73
+
74
+
75
+ # Contains the parsing methods. This module can be mixed into any parser to get HTML parsing
76
+ # functionality. The only thing that must be provided by the class are instance variable
77
+ # @stack for storing the needed state and @src (instance of StringScanner) for the actual
78
+ # parsing.
79
+ module Parser
80
+
81
+ include Constants
82
+
83
# Process the HTML start tag that has already been scanned/checked via @src.
#
# Creates an :html_element for the tag (name and attribute names are lower-cased), appends it
# to the current tree and then yields to the caller for further processing. The first block
# parameter is the created element, the second one is +true+ if the HTML element is already
# closed, i.e. contains no body. Elements that cannot have a body are auto-closed with a
# warning; the content of +script+ and +style+ elements is consumed as raw text before yielding.
def handle_html_start_tag # :yields: el, closed
  tag_name = @src[1].downcase
  closed = !@src[4].nil?

  attributes = Utils::OrderedHash.new
  @src[2].scan(HTML_ATTRIBUTE_RE) do |attr_name, _quote, attr_value|
    attributes[attr_name.downcase] = attr_value || ""
  end

  el = Element.new(:html_element, tag_name, attributes, :category => :block)
  @tree.children << el

  if !closed && HTML_ELEMENTS_WITHOUT_BODY.include?(el.value)
    warning("The HTML tag '#{el.value}' cannot have any content - auto-closing it")
    closed = true
  end

  raw_content = (tag_name == 'script' || tag_name == 'style')
  handle_raw_html_tag(tag_name) if raw_content
  yield(el, raw_content || closed)
end
108
+
109
# Handle the raw HTML tag at the current position.
#
# Everything up to the end tag for +name+ is added as :raw text to the last child of the
# current tree and the end tag itself is consumed. If there is no end tag, the rest of the
# source is used and a warning is issued.
def handle_raw_html_tag(name)
  start_pos = @src.pos
  if @src.scan_until(/(?=<\/#{name}\s*>)/mi)
    raw_text = extract_string(start_pos...@src.pos, @src)
    add_text(raw_text, @tree.children.last, :raw)
    @src.scan(HTML_TAG_CLOSE_RE)
  else
    add_text(@src.rest, @tree.children.last, :raw)
    @src.terminate
    warning("Found no end tag for '#{name}' - auto-closing it")
  end
end
121
+
122
+ HTML_RAW_START = /(?=<(#{REXML::Parsers::BaseParser::UNAME_STR}|\/|!--|\?))/ # :nodoc:
123
+
124
# Parse raw HTML from the current source position, storing the found elements in +el+.
# Parsing continues until one of the following criteria is fulfilled:
#
# - The end of the document is reached.
# - The matching end tag for the element +el+ is found (only used if +el+ is an HTML
#   element).
#
# When an HTML start tag is found, processing is deferred to #handle_html_start_tag,
# providing the block given to this method. While parsing, +el+ is the current tree; the
# previous tree is restored from @stack afterwards.
def parse_raw_html(el, &block)
  @stack.push(@tree)
  @tree = el

  finished = false
  until @src.eos? || finished
    text = @src.scan_until(HTML_RAW_START)
    if text
      add_text(text, @tree, :text)
      if (comment = @src.scan(HTML_COMMENT_RE))
        @tree.children << Element.new(:xml_comment, comment, nil, :category => :block)
      elsif (instruction = @src.scan(HTML_INSTRUCTION_RE))
        @tree.children << Element.new(:xml_pi, instruction, nil, :category => :block)
      elsif @src.scan(HTML_TAG_RE)
        handle_html_start_tag(&block)
      elsif @src.scan(HTML_TAG_CLOSE_RE)
        closing_name = @src[1].downcase
        if @tree.value == closing_name
          finished = true
        else
          warning("Found invalidly used HTML closing tag for '#{closing_name}' - ignoring it")
        end
      else
        # a lone '<' that does not start anything known is just text
        add_text(@src.getch, @tree, :text)
      end
    else
      # no further markup: the rest of the source is text
      add_text(@src.rest, @tree, :text)
      @src.terminate
      warning("Found no end tag for '#{@tree.value}' - auto-closing it") if @tree.type == :html_element
      finished = true
    end
  end

  @tree = @stack.pop
end
166
+
167
+ end
168
+
169
+
170
+ # Converts HTML elements to native elements if possible.
171
+ class ElementConverter
172
+
173
+ # :stopdoc:
174
+
175
+ include Constants
176
+ include ::Kramdown::Utils::Entities
177
+
178
+ REMOVE_TEXT_CHILDREN = %w{html head hgroup ol ul dl table colgroup tbody thead tfoot tr select optgroup}
179
+ WRAP_TEXT_CHILDREN = %w{body section nav article aside header footer address div li dd blockquote figure
180
+ figcaption fieldset form}
181
+ REMOVE_WHITESPACE_CHILDREN = %w{body section nav article aside header footer address
182
+ div li dd blockquote figure figcaption td th fieldset form}
183
+ STRIP_WHITESPACE = %w{address article aside blockquote body caption dd div dl dt fieldset figcaption form footer
184
+ header h1 h2 h3 h4 h5 h6 legend li nav p section td th}
185
+ SIMPLE_ELEMENTS = %w{em strong blockquote hr br img p thead tbody tfoot tr td th ul ol dl li dl dt dd}
186
+
187
+ def initialize(root)
188
+ @root = root
189
+ end
190
+
191
+ def self.convert(root, el = root)
192
+ new(root).process(el)
193
+ end
194
+
195
# Convert the element +el+ and its children.
#
# - XML comments and processing instructions only get their category set, based on the content
#   model of +parent+ ('div' is assumed if there is no parent).
# - For the root element all children are processed and whitespace-only children are removed.
# - HTML elements are converted via a convert_<name> method if +do_conversion+ is set and
#   such a method exists; otherwise the generic processing and whitespace handling is applied.
# - All other element types are left alone.
def process(el, do_conversion = true, preserve_text = false, parent = nil)
  kind = el.type
  if kind == :xml_comment || kind == :xml_pi
    parent_name =
      if parent.nil?
        'div'
      else
        case parent.type
        when :html_element then parent.value
        when :code_span then 'code'
        when :code_block then 'pre'
        when :header then 'h1'
        else parent.type.to_s
        end
      end
    category = (HTML_CONTENT_MODEL[parent_name] == :span ? :span : :block)
    el.options.replace({:category => category})
    return
  elsif kind == :root
    el.children.each {|child| process(child)}
    remove_whitespace_children(el)
    return
  elsif kind != :html_element
    return
  end

  # remember the tag name now, the conversion may reset the value of the element
  tag = el.value
  converter = "convert_#{tag}"
  if do_conversion && self.class.method_defined?(converter)
    send(converter, el)
  else
    remove_text_children(el) if do_conversion && REMOVE_TEXT_CHILDREN.include?(tag)

    if do_conversion && SIMPLE_ELEMENTS.include?(tag)
      set_basics(el, tag.intern)
      process_children(el, do_conversion, preserve_text)
    else
      process_html_element(el, do_conversion, preserve_text)
    end

    if do_conversion
      strip_whitespace(el) if STRIP_WHITESPACE.include?(tag)
      remove_whitespace_children(el) if REMOVE_WHITESPACE_CHILDREN.include?(tag)
      wrap_text_children(el) if WRAP_TEXT_CHILDREN.include?(tag)
    end
  end
end
241
+
242
# Processes the children of +el+ in place: a text child is replaced by the elements returned
# by #process_text (the resulting nested arrays are flattened), every other child is handed
# to #process with +el+ as parent and kept.
def process_children(el, do_conversion = true, preserve_text = false)
  keep_whitespace = preserve_text || !do_conversion
  replaced = el.children.map! do |child|
    if child.type == :text
      process_text(child.value, keep_whitespace)
    else
      process(child, do_conversion, preserve_text, el)
      child
    end
  end
  replaced.flatten!
end
252
+
253
# Process the HTML text +raw+: compress whitespace (if +preserve+ is +false+, modifying +raw+
# in place) and convert entities to entity elements.
#
# Returns an array of elements: :text elements for the plain parts, :smart_quote and
# :typographic_sym elements for the respective named entities and :entity elements for all
# other entities. An entity that cannot be resolved is treated as a literal ampersand.
def process_text(raw, preserve = false)
  raw.gsub!(/\s+/, ' ') unless preserve
  scanner = StringScanner.new(raw)
  elements = []
  until scanner.eos?
    leading = scanner.scan_until(/(?=#{HTML_ENTITY_RE})/)
    if leading.nil?
      # no further entity: the rest is plain text
      elements << Element.new(:text, scanner.rest)
      scanner.terminate
      next
    end

    elements << Element.new(:text, leading)
    scanner.scan(HTML_ENTITY_RE)
    # entity name, decimal code point or hexadecimal code point
    val = scanner[1] || (scanner[2] && scanner[2].to_i) || scanner[3].hex
    elements << case val
                when 'lsquo', 'rsquo', 'ldquo', 'rdquo'
                  Element.new(:smart_quote, val.intern)
                when 'mdash', 'ndash', 'hellip', 'laquo', 'raquo'
                  Element.new(:typographic_sym, val.intern)
                else
                  begin
                    Element.new(:entity, entity(val), nil, :original => scanner.matched)
                  rescue ::Kramdown::Error
                    # go back to just after the ampersand and emit the ampersand itself
                    scanner.pos -= scanner.matched_size - 1
                    Element.new(:entity, ::Kramdown::Utils::Entities.entity('amp'))
                  end
                end
  end
  elements
end
283
+
284
# Keeps +el+ as HTML element: its options are replaced by the category (span or block,
# depending on HTML_SPAN_ELEMENTS) and the content model (taken from HTML_CONTENT_MODEL, or
# :raw if +do_conversion+ is +false+) and afterwards its children are processed.
def process_html_element(el, do_conversion = true, preserve_text = false)
  category = HTML_SPAN_ELEMENTS.include?(el.value) ? :span : :block
  content_model = do_conversion ? HTML_CONTENT_MODEL[el.value] : :raw
  el.options.replace(:category => category, :content_model => content_model)
  process_children(el, do_conversion, preserve_text)
end
289
+
290
# Removes all direct :text children of +el+ in place.
def remove_text_children(el)
  el.children.delete_if do |child|
    child.type == :text
  end
end
293
+
294
# Wraps every run of consecutive non-block children (and text children) of +el+ in a
# transparent :p element; block children stay where they are.
def wrap_text_children(el)
  wrapped = []
  open_paragraph = nil
  el.children.each do |child|
    if Element.category(child) != :block || child.type == :text
      unless open_paragraph
        open_paragraph = Element.new(:p, nil, nil, :transparent => true)
        wrapped << open_paragraph
      end
      open_paragraph.children << child
    else
      wrapped << child
      # the next inline child has to start a new paragraph
      open_paragraph = nil
    end
  end
  el.children = wrapped
end
312
+
313
# Strips leading whitespace from the first child and trailing whitespace from the last child
# of +el+ (in place), provided the respective child is a :text element.
def strip_whitespace(el)
  children = el.children
  return if children.empty?

  head = children.first
  head.value.lstrip! if head.type == :text

  tail = children.last
  tail.value.rstrip! if tail.type == :text
end
322
+
323
# Removes whitespace-only :text children of +el+ that are the first child, the last child or
# located between two block-level siblings.
def remove_whitespace_children(el)
  siblings = el.children
  last_index = siblings.length - 1
  kept = []
  siblings.each_with_index do |child, idx|
    blank_text = child.type == :text && child.value.strip.empty?
    removable = blank_text &&
      (idx == 0 || idx == last_index ||
       (Element.category(siblings[idx - 1]) == :block && Element.category(siblings[idx + 1]) == :block))
    kept << child unless removable
  end
  el.children = kept
end
332
+
333
# Turns +el+ into a native element of the given +type+: the options are replaced by +opts+
# and the value is reset to +nil+.
def set_basics(el, type, opts = {})
  el.options.replace(opts)
  el.type = type
  el.value = nil
end
338
+
339
# Appends the values of all :text elements found in +el+ and its descendants to the string
# +raw+ (depth first, in document order).
def extract_text(el, raw)
  if el.type == :text
    raw << el.value.to_s
  end
  el.children.each do |child|
    extract_text(child, raw)
  end
end
343
+
344
# Converts an +a+ element to a native :a element if it has an +href+ attribute; otherwise it
# is kept as unconverted HTML element.
def convert_a(el)
  unless el.attr['href']
    return process_html_element(el, false)
  end
  set_basics(el, :a)
  process_children(el)
end
352
+
353
+ EMPHASIS_TYPE_MAP = {'em' => :em, 'i' => :em, 'strong' => :strong, 'b' => :strong}
354
# Converts an emphasis element to the native type given by EMPHASIS_TYPE_MAP. If the text
# content starts or ends with whitespace, the element is kept as unconverted HTML element.
def convert_em(el)
  text = ''
  extract_text(el, text)
  padded = text =~ /\A\s/ || text =~ /\s\z/
  if padded
    process_html_element(el, false)
  else
    set_basics(el, EMPHASIS_TYPE_MAP[el.value])
    process_children(el)
  end
end
364
+ %w{b strong i}.each do |i|
365
+ alias_method("convert_#{i}".to_sym, :convert_em)
366
+ end
367
+
368
# Converts a heading element to a native :header element. The level is taken from the digit
# of the tag name and the plain text content is stored in the :raw_text option.
def convert_h1(el)
  # the level has to be read before set_basics resets the value
  level = el.value[1..1].to_i
  set_basics(el, :header, :level => level)
  raw_text = ''
  el.options[:raw_text] = raw_text
  extract_text(el, raw_text)
  process_children(el)
end
373
+ %w{h2 h3 h4 h5 h6}.each do |i|
374
+ alias_method("convert_#{i}".to_sym, :convert_h1)
375
+ end
376
+
377
# Converts a +code+ (or +pre+) element to a native :codespan (or :codeblock) element.
#
# The text content is processed with whitespace preserved and the resulting elements are
# joined into one string. If that is not possible (the joining raised an error and more than
# one element or a non-text element remains), the element is kept as unconverted HTML element.
def convert_code(el)
  raw = ''
  extract_text(el, raw)
  result = process_text(raw, true)
  begin
    joined = result.inject('') do |mem, part|
      case part.type
      when :text
        mem << part.value
      when :entity
        if RUBY_VERSION >= '1.9'
          mem << part.value.char.encode(@root.options[:encoding])
        elsif [60, 62, 34, 38].include?(part.value.code_point)
          mem << part.value.code_point.chr
        end
      when :smart_quote, :typographic_sym
        mem << entity(part.value.to_s).char.encode(@root.options[:encoding])
      else
        raise "Bug - please report"
      end
    end
    result.clear
    result << Element.new(:text, joined)
  rescue
    # deliberately ignored: +result+ is left untouched and checked below
  end

  if result.length > 1 || result.first.type != :text
    process_html_element(el, false, true)
  else
    set_basics(el, el.value == 'code' ? :codespan : :codeblock)
    el.value = result.first.value
    el.children.clear
  end
end
413
+ alias :convert_pre :convert_code
414
+
415
# Converts a +table+ element to a native :table element if it is a simple table (see
# #is_simple_table?); otherwise it is kept as unconverted HTML element.
#
# During the conversion
#
# - the column alignment is taken from the +text-align+ style of the cells of a row (the
#   declaration is removed from the style attribute; cells without it use :default),
# - direct children that are still HTML elements are removed,
# - :th cells are changed to :td cells and
# - rows that are direct children of the table are wrapped in a :tbody element.
def convert_table(el)
  if !is_simple_table?(el)
    process_html_element(el, false)
    return
  end
  remove_text_children(el)
  process_children(el)
  set_basics(el, :table)

  calc_alignment = lambda do |c|
    if c.type == :tr
      el.options[:alignment] = c.children.map do |td|
        style = td.attr['style']
        if style
          style.slice!(/(?:;\s*)?text-align:\s+(center|left|right)/)
          # $1 is nil if the style attribute contains no text-align declaration
          alignment = $1
          td.attr.delete('style') if style.strip.empty?
          alignment ? alignment.to_sym : :default
        else
          :default
        end
      end
    else
      c.children.each {|cc| calc_alignment.call(cc)}
    end
  end
  calc_alignment.call(el)
  el.children.delete_if {|c| c.type == :html_element}

  change_th_type = lambda do |c|
    if c.type == :th
      c.type = :td
    else
      c.children.each {|cc| change_th_type.call(cc)}
    end
  end
  change_th_type.call(el)

  # a table without any children has nothing that needs to be wrapped
  first_child = el.children.first
  if first_child && first_child.type == :tr
    tbody = Element.new(:tbody)
    tbody.children = el.children
    el.children = [tbody]
  end
end
457
+
458
# Returns +true+ if the table element +el+ can be represented as native table, i.e. if
#
# - all cells contain only text and non-block HTML elements,
# - all rows have the same number of cells,
# - all rows use the same cell alignments and none of them is 'justify' or 'inherit' and
# - the table either consists only of rows with +td+ cells or only of +thead+ (rows with +th+
#   cells), +tbody+ and +tfoot+ (rows with +td+ cells) sections including at least one +tbody+.
def is_simple_table?(el)
  phrasing_only = lambda do |node|
    node.children.all? do |child|
      (child.type == :text || !HTML_BLOCK_ELEMENTS.include?(child.value)) && phrasing_only.call(child)
    end
  end
  # Proc.new is needed here so that +return+ leaves the method itself
  verify_cells = Proc.new do |node|
    if node.value == 'th' || node.value == 'td'
      return false if !phrasing_only.call(node)
    else
      node.children.each {|child| verify_cells.call(child)}
    end
  end
  verify_cells.call(el)

  # 0 = no row seen yet, -1 = rows with differing cell counts found
  nr_cells = 0
  count_cells = lambda do |node|
    if node.value == 'tr'
      count = node.children.select {|cell| cell.value == 'th' || cell.value == 'td'}.length
      if count != nr_cells
        nr_cells = (nr_cells == 0 ? count : -1)
      end
    else
      node.children.each {|child| count_cells.call(child)}
    end
  end
  count_cells.call(el)
  return false if nr_cells == -1

  alignment = nil
  verify_alignment = Proc.new do |node|
    if node.value == 'tr'
      cells = node.children.select {|cell| cell.value == 'th' || cell.value == 'td'}
      row_alignment = cells.map do |cell|
        md = /text-align:\s+(center|left|right|justify|inherit)/.match(cell.attr['style'].to_s)
        return false if md && (md[1] == 'justify' || md[1] == 'inherit')
        md.nil? ? :default : md[1]
      end
      alignment = row_alignment if alignment.nil?
      return false if alignment != row_alignment
    else
      node.children.each {|child| verify_alignment.call(child)}
    end
  end
  verify_alignment.call(el)

  rows_of = lambda do |section, cell_name|
    section.children.all? do |row|
      (row.value == 'tr' || row.type == :text) &&
        row.children.all? {|cell| cell.value == cell_name || cell.type == :text}
    end
  end
  sections_valid = lambda do
    el.children.all? do |section|
      section.type == :text || (section.value == 'thead' && rows_of.call(section, 'th')) ||
        ((section.value == 'tfoot' || section.value == 'tbody') && rows_of.call(section, 'td'))
    end
  end
  rows_of.call(el, 'td') ||
    (sections_valid.call && el.children.any? {|section| section.value == 'tbody'})
end
517
+
518
# Converts a +script+ element: math scripts (see #is_math_tag?) become math elements, all
# other scripts are processed like any other HTML element.
def convert_script(el)
  if is_math_tag?(el)
    handle_math_tag(el)
  else
    process_html_element(el)
  end
end
525
+
526
# Returns a true value (the match position) if the +type+ attribute of +el+ contains
# "math/tex", otherwise +nil+.
def is_math_tag?(el)
  type_attr = el.attr['type'].to_s
  type_attr =~ /\bmath\/tex\b/
end
529
+
530
# Converts the math script element +el+ to a native :math element.
#
# The category is :block if the +type+ attribute contains "mode=display", otherwise :span. The
# value is the value of the first child with an enclosing CDATA section removed, or the empty
# string if the element has no children (i.e. the script tag was empty). The +type+ attribute
# is removed.
def handle_math_tag(el)
  set_basics(el, :math, :category => (el.attr['type'] =~ /mode=display/ ? :block : :span))
  body = el.children.shift
  # an empty script element has no text child at all
  el.value = body ? body.value.sub(/\A<!\[CDATA\[(.*)\]\]>\z/m, '\1') : ''
  el.attr.delete('type')
end
535
+
536
+ end
537
+
538
+ include Parser
539
+
540
# Parse the source string provided on initialization as HTML document.
#
# Leading processing instructions and comments are added to the root element and a doctype
# is skipped. The rest of the source is parsed as raw HTML, descending into every element that
# is not already closed, and finally the resulting tree is handed to the ElementConverter.
def parse
  @stack, @tree = [], @root
  @src = StringScanner.new(adapt_source(source))

  loop do
    if (instruction = @src.scan(/\s*#{HTML_INSTRUCTION_RE}/))
      @tree.children << Element.new(:xml_pi, instruction.strip, nil, :category => :block)
    elsif @src.scan(/\s*#{HTML_DOCTYPE_RE}/)
      # ignore the doctype
    elsif (comment = @src.scan(/\s*#{HTML_COMMENT_RE}/))
      @tree.children << Element.new(:xml_comment, comment.strip, nil, :category => :block)
    else
      break
    end
  end

  # recursively parse the body of every element that has one
  tag_handler = lambda do |c, closed|
    parse_raw_html(c, &tag_handler) if !closed
  end
  parse_raw_html(@tree, &tag_handler)

  ElementConverter.convert(@tree)
end
564
+
565
+ end
566
+
567
+ end
568
+
569
+ end
570
+