bean-kramdown 0.13.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. data/AUTHORS +1 -0
  2. data/CONTRIBUTERS +11 -0
  3. data/COPYING +24 -0
  4. data/ChangeLog +6683 -0
  5. data/GPL +674 -0
  6. data/README +43 -0
  7. data/VERSION +1 -0
  8. data/bin/kramdown +78 -0
  9. data/lib/kramdown.rb +23 -0
  10. data/lib/kramdown/compatibility.rb +49 -0
  11. data/lib/kramdown/converter.rb +41 -0
  12. data/lib/kramdown/converter/base.rb +169 -0
  13. data/lib/kramdown/converter/bean_html.rb +71 -0
  14. data/lib/kramdown/converter/html.rb +411 -0
  15. data/lib/kramdown/converter/kramdown.rb +428 -0
  16. data/lib/kramdown/converter/latex.rb +607 -0
  17. data/lib/kramdown/converter/toc.rb +82 -0
  18. data/lib/kramdown/document.rb +119 -0
  19. data/lib/kramdown/element.rb +524 -0
  20. data/lib/kramdown/error.rb +30 -0
  21. data/lib/kramdown/options.rb +373 -0
  22. data/lib/kramdown/parser.rb +39 -0
  23. data/lib/kramdown/parser/base.rb +136 -0
  24. data/lib/kramdown/parser/bean_kramdown.rb +25 -0
  25. data/lib/kramdown/parser/bean_kramdown/info_box.rb +52 -0
  26. data/lib/kramdown/parser/bean_kramdown/oembed.rb +230 -0
  27. data/lib/kramdown/parser/html.rb +570 -0
  28. data/lib/kramdown/parser/kramdown.rb +339 -0
  29. data/lib/kramdown/parser/kramdown/abbreviation.rb +71 -0
  30. data/lib/kramdown/parser/kramdown/autolink.rb +53 -0
  31. data/lib/kramdown/parser/kramdown/blank_line.rb +43 -0
  32. data/lib/kramdown/parser/kramdown/block_boundary.rb +46 -0
  33. data/lib/kramdown/parser/kramdown/blockquote.rb +51 -0
  34. data/lib/kramdown/parser/kramdown/codeblock.rb +63 -0
  35. data/lib/kramdown/parser/kramdown/codespan.rb +56 -0
  36. data/lib/kramdown/parser/kramdown/emphasis.rb +70 -0
  37. data/lib/kramdown/parser/kramdown/eob.rb +39 -0
  38. data/lib/kramdown/parser/kramdown/escaped_chars.rb +38 -0
  39. data/lib/kramdown/parser/kramdown/extensions.rb +204 -0
  40. data/lib/kramdown/parser/kramdown/footnote.rb +74 -0
  41. data/lib/kramdown/parser/kramdown/header.rb +68 -0
  42. data/lib/kramdown/parser/kramdown/horizontal_rule.rb +39 -0
  43. data/lib/kramdown/parser/kramdown/html.rb +169 -0
  44. data/lib/kramdown/parser/kramdown/html_entity.rb +44 -0
  45. data/lib/kramdown/parser/kramdown/image.rb +157 -0
  46. data/lib/kramdown/parser/kramdown/line_break.rb +38 -0
  47. data/lib/kramdown/parser/kramdown/link.rb +154 -0
  48. data/lib/kramdown/parser/kramdown/list.rb +240 -0
  49. data/lib/kramdown/parser/kramdown/math.rb +65 -0
  50. data/lib/kramdown/parser/kramdown/paragraph.rb +63 -0
  51. data/lib/kramdown/parser/kramdown/smart_quotes.rb +214 -0
  52. data/lib/kramdown/parser/kramdown/table.rb +178 -0
  53. data/lib/kramdown/parser/kramdown/typographic_symbol.rb +52 -0
  54. data/lib/kramdown/parser/markdown.rb +69 -0
  55. data/lib/kramdown/utils.rb +42 -0
  56. data/lib/kramdown/utils/entities.rb +348 -0
  57. data/lib/kramdown/utils/html.rb +85 -0
  58. data/lib/kramdown/utils/ordered_hash.rb +100 -0
  59. data/lib/kramdown/version.rb +28 -0
  60. metadata +140 -0
@@ -0,0 +1,25 @@
1
+ require 'kramdown/parser/kramdown'
2
+
3
module Kramdown
  module Parser

    # kramdown dialect that adds an info box block parser and an oEmbed span parser to the
    # standard kramdown parser and removes the parsers listed in EXCEPT.
    class BeanKramdown < Kramdown

      # Array with all the parsing methods that should be removed from the standard kramdown parser.
      EXCEPT = [:codeblock_fenced, :block_extensions, :span_extensions]

      # Create the parser for +source+ with +options+ and adjust the inherited parser lists.
      def initialize(source, options)
        super

        # parse_oembed and parse_oembed_definition index into this table. Create it here unless
        # the parent class already did, so that a reference-style oembed cannot hit nil.
        @oembed_defs ||= {}

        # unshift puts the additional parsers at the front of the inherited lists.
        @block_parsers.unshift(:info_box)
        @span_parsers.unshift(:oembed)

        @block_parsers.delete_if {|i| EXCEPT.include?(i)}
        @span_parsers.delete_if {|i| EXCEPT.include?(i)}
      end

      # Both files reopen BeanKramdown without naming a superclass, so they are loaded only
      # after the class has been created above.
      require 'kramdown/parser/bean_kramdown/info_box'
      require 'kramdown/parser/bean_kramdown/oembed'

    end
  end
end
@@ -0,0 +1,52 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ #--
4
+ # Copyright (C) 2009-2012 Thomas Leitner <t_leitner@gmx.at>
5
+ #
6
+ # This file is part of kramdown.
7
+ #
8
+ # kramdown is free software: you can redistribute it and/or modify
9
+ # it under the terms of the GNU General Public License as published by
10
+ # the Free Software Foundation, either version 3 of the License, or
11
+ # (at your option) any later version.
12
+ #
13
+ # This program is distributed in the hope that it will be useful,
14
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ # GNU General Public License for more details.
17
+ #
18
+ # You should have received a copy of the GNU General Public License
19
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
20
+ #++
21
+ #
22
+
23
+ require 'kramdown/parser/kramdown/blank_line'
24
+ require 'kramdown/parser/kramdown/extensions'
25
+ require 'kramdown/parser/kramdown/eob'
26
+
27
module Kramdown
  module Parser
    class BeanKramdown

      # Start of an info box line: the parser's OPT_SPACE prefix, a percent sign and at most
      # one following space.
      INFO_BOX_START = /^#{OPT_SPACE}% ?/

      # Parse the info box at the current location.
      #
      # Collects PARAGRAPH_MATCH chunks until LAZY_END matches, strips the info box marker from
      # every line, and parses what is left as the block content of a new :info_box element
      # appended to the current tree. Always returns +true+.
      def parse_info_box
        raw = @src.scan(PARAGRAPH_MATCH)
        raw << @src.scan(PARAGRAPH_MATCH) until @src.match?(self.class::LAZY_END)
        raw.gsub!(INFO_BOX_START, '')

        box = new_block_el(:info_box)
        @tree.children << box
        parse_blocks(box, raw)
        true
      end

      define_parser(:info_box, INFO_BOX_START)

    end
  end
end
@@ -0,0 +1,230 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ #--
4
+ # Copyright (C) 2009-2012 Thomas Leitner <t_leitner@gmx.at>
5
+ #
6
+ # This file is part of kramdown.
7
+ #
8
+ # kramdown is free software: you can redistribute it and/or modify
9
+ # it under the terms of the GNU General Public License as published by
10
+ # the Free Software Foundation, either version 3 of the License, or
11
+ # (at your option) any later version.
12
+ #
13
+ # This program is distributed in the hope that it will be useful,
14
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ # GNU General Public License for more details.
17
+ #
18
+ # You should have received a copy of the GNU General Public License
19
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
20
+ #++
21
+ #
22
+
23
+ #
24
+ # This file has been edited to suit the needs of The Beans Group Ltd. Changes were made to the types of media available:
25
+ # images keep their ! prefix, whereas the new media types (oEmbed etc.) use a ? prefix.
26
+ # If you wish to change the types of media you need to change the OEMBED_START constant to include your special symbol
27
+ # for the new media object as well as change the reg ex on the parser definition towards the bottom of this file.
28
+ #
29
+
30
+ require 'open-uri'
31
+ require 'cgi'
32
+ require 'json'
33
+
34
+ module Kramdown
35
+ module Parser
36
+ class BeanKramdown
37
+
38
# Normalize the oembed identifier +id+: every run of whitespace (newlines included) is
# collapsed into a single space and the result is downcased. Returns a new string.
def normalize_oembed_id(id)
  collapsed = id.gsub(/(\s|\n)+/, ' ')
  collapsed.downcase
end
42
+
43
# Matches a definition line of the form <tt>[id]: url "title"</tt>. The URL may be wrapped in
# angle brackets; the quoted title is optional and may start on the following line.
# Captures: 1 = id, 2 = URL in angle brackets, 3 = bare URL, 5 = title.
OEMBED_DEFINITION_START = /^#{OPT_SPACE}\[([^\n\]]+)\]:[ \t]*(?:<(.*?)>|([^'"\n]*?\S[^'"\n]*?))[ \t]*?(?:\n?[ \t]*?(["'])(.+?)\4[ \t]*?)?\n/

# Parse the oembed definition at the current location.
#
# Stores <tt>[url, title]</tt> in @oembed_defs under the normalized ID (warning when an entry
# for that ID already exists) and appends an :eob element to the tree. Always returns +true+.
def parse_oembed_definition
  @src.pos += @src.matched_size
  id = normalize_oembed_id(@src[1])
  url = @src[2] || @src[3]
  title = @src[5]
  warning("Duplicate oembed ID '#{id}' - overwriting") if @oembed_defs[id]
  @oembed_defs[id] = [url, title]
  @tree.children << Element.new(:eob, :oembed_def)
  true
end
define_parser(:oembed_definition, OEMBED_DEFINITION_START)
55
+
56
+
57
# oEmbed endpoint templates, keyed by the provider name that is searched for in the embedded
# URL. The single <tt>%s</tt> in each template is replaced with the CGI-escaped URL.
OEMBED_PROVIDERS = {
  :twitter => "https://api.twitter.com/1/statuses/oembed.json?url=%s",
  :youtube => "http://www.youtube.com/oembed?url=%s&format=json&maxwidth=550",
  :flickr => "http://flickr.com/services/oembed?url=%s&maxwidth=460&format=json&maxwidth=550",
  :viddler => "http://lab.viddler.com/services/oembed/?url=%s&type=simple&format=json",
  :qik => "http://qik.com/api/oembed.json?url=%s&maxwidth=550",
  :revision3 => "http://revision3.com/api/oembed/?url=%s&format=json&maxwidth=550",
  :hulu => "http://www.hulu.com/api/oembed.json?url=%s&maxwidth=550",
  :vimeo => "http://vimeo.com/api/oembed.json?url=%s&maxwidth=550",
  :collegehumor => "http://www.collegehumor.com/oembed.json?url=%s&maxwidth=550",
  # :pollyeverywhere => "http://www.polleverywhere.com/services/oembed?url=%s&format=json",
  # :opera => "http://my.opera.com/service/oembed/?url=%s",
  :embedly => "http://api.embed.ly/1/oembed?url=%s&maxwidth=550",
  :ifixit => "http://www.ifixit.com/Embed?url=%s&format=json",
  :smugmug => "http://api.smugmug.com/services/oembed/?url=%s&format=json",
  :slideshare => "http://www.slideshare.net/api/oembed/2?url=%s&format=json&maxwidth=550",
  :wordpress => "http://public-api.wordpress.com/oembed/1.0/?format=json&url=%s&maxwidth=550"
}.freeze

# Internal helper: return the oEmbed endpoint URL for +href+, or +nil+ if +href+ contains none
# of the provider names in OEMBED_PROVIDERS.
def oembed_endpoint_for(href)
  names = OEMBED_PROVIDERS.keys.map {|name| name.to_s}.join('|')
  found = href.match(names)
  found && (OEMBED_PROVIDERS[found[0].to_sym] % CGI.escape(href))
end

# Internal helper: fetch +oembed_url+ and return the parsed JSON response.
#
# The URL is opened through URI#read (open-uri) rather than Kernel#open, which treats strings
# starting with a pipe as commands and no longer opens URLs on current Ruby versions.
def fetch_oembed_json(oembed_url)
  JSON.parse(URI.parse(oembed_url).read)
end

# Fetch the oEmbed information for +href+ and append a new :oembed element to @tree.
#
# The element gets a 'provider_name' attribute and, depending on the response type, an :img
# child ("photo") or an 'html' attribute ("video" and "rich"). For photos and videos +title+
# is replaced by the title from the response. If there is a title, a :figCaption child is
# added and both elements receive the same random 'id'; for photos the caption additionally
# contains a link to the author.
#
# A warning is issued and nothing is added to the tree if no provider matches +href+ or if
# fetching/processing the response fails. The parameters +el+ and +alt_text+ are accepted for
# compatibility with the callers but are not used.
def add_oembed(el, href, title, alt_text = nil)
  oembed_url = oembed_endpoint_for(href)
  return warning("No oEmbed provider found for URL #{href}") unless oembed_url

  embed = Element.new(:oembed)
  begin
    data = fetch_oembed_json(oembed_url)
    embed.attr['provider_name'] = data['provider_name']
    case data['type']
    when "photo"
      title = data['title']
      embed.attr['role'] = "img"
      img = Element.new(:img)
      img.attr['src'] = data['url']
      img.attr['alt'] = data['title']
      img.attr['width'] = data['width']
      img.attr['height'] = data['height']
      embed.children << img
    when "video"
      title = data['title']
      embed.attr['html'] = CGI.unescapeHTML(data['html'])
    when "rich"
      embed.attr['html'] = CGI.unescapeHTML(data['html'])
    end

    if title
      # figure id shared by the element and its caption
      el_id = rand(1000)
      embed.attr['id'] = el_id
      cap = Element.new(:figCaption, title)
      cap.attr['id'] = el_id
      if embed.attr['role'] == "img"
        link = Element.new(:a, data['author_name'])
        link.attr['href'] = data['author_url']
        cap.children << link
      end
      embed.children << cap
    end
    @tree.children << embed
  rescue
    # Best effort: any failure while fetching or interpreting the response only drops the embed.
    warning("Could not retrieve oEmbed information for URL #{oembed_url}")
  end
end
147
+
148
OEMBED_BRACKET_STOP_RE = /(\])|!?\[/
OEMBED_PAREN_STOP_RE = /(\()|(\))|\s(?=['"])/
OEMBED_INLINE_ID_RE = /\s*?\[([^\]]+)?\]/
OEMBED_INLINE_TITLE_RE = /\s*?(["'])(.+?)\1\s*?\)/
OEMBED_START = /\?\[(?=[^^])/

# Parse the oembed at the current scanner position.
#
# Handles the forms <tt>?[text][id]</tt>, <tt>?[text]</tt> (the text doubles as ID),
# <tt>?[text](<url>)</tt>, <tt>?[text](url)</tt> and <tt>?[text](url "title")</tt>. On success
# the embed is handed to #add_oembed; otherwise the scanner is reset to just after the start
# marker and the marker itself is added as plain text.
def parse_oembed
  marker = @src.scan(OEMBED_START)
  text_start = @src.pos
  el = Element.new(:img)

  # Track the nesting depth of square brackets while the bracketed text is parsed; capture 1
  # of the stop regexp is set for a closing bracket. Nested :img children are subtracted.
  # NOTE(review): the block presumably tells parse_spans whether to stop here - confirm
  # against parse_spans.
  depth = 1
  balanced = parse_spans(el, OEMBED_BRACKET_STOP_RE) do
    depth += (@src[1] ? -1 : 1)
    depth - el.children.select {|c| c.type == :img}.size == 0
  end
  unless balanced
    @src.pos = text_start
    add_text(marker)
    return
  end
  alt_text = extract_string(text_start...@src.pos, @src)
  @src.scan(OEMBED_BRACKET_STOP_RE)

  # reference style oembed or no oembed url
  if @src.scan(OEMBED_INLINE_ID_RE) || !@src.check(/\(/)
    oembed_id = normalize_oembed_id(@src[1] || alt_text)
    if @oembed_defs.has_key?(oembed_id)
      definition = @oembed_defs[oembed_id]
      add_oembed(el, definition.first, definition.last, alt_text)
    else
      warning("No oembed definition for oembed ID '#{oembed_id}' found")
      @src.pos = text_start
      add_text(marker)
    end
    return
  end

  # oembed url in parentheses
  if @src.scan(/\(<(.*?)>/)
    oembed_url = @src[1]
    if @src.scan(/\)/)
      add_oembed(el, oembed_url, nil, alt_text)
      return
    end
  else
    oembed_url = ''
    open_parens = 0
    # Consume up to the matching closing parenthesis, or up to whitespace that is followed by
    # a quote (the start of a title).
    while (piece = @src.scan_until(OEMBED_PAREN_STOP_RE))
      oembed_url << piece
      if @src[2]
        open_parens -= 1
        break if open_parens == 0
      elsif @src[1]
        open_parens += 1
      else
        break
      end
    end
    # Drop the first and the last character (the opening parenthesis and the stop character).
    oembed_url = oembed_url[1..-2]
    oembed_url.strip!

    if open_parens == 0
      add_oembed(el, oembed_url, nil, alt_text)
      return
    end
  end

  if @src.scan(OEMBED_INLINE_TITLE_RE)
    add_oembed(el, oembed_url, @src[2], alt_text)
  else
    @src.pos = text_start
    add_text(marker)
  end
end
define_parser(:oembed, OEMBED_START, '\?\[')
227
+
228
+ end
229
+ end
230
+ end
@@ -0,0 +1,570 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ #--
4
+ # Copyright (C) 2009-2012 Thomas Leitner <t_leitner@gmx.at>
5
+ #
6
+ # This file is part of kramdown.
7
+ #
8
+ # kramdown is free software: you can redistribute it and/or modify
9
+ # it under the terms of the GNU General Public License as published by
10
+ # the Free Software Foundation, either version 3 of the License, or
11
+ # (at your option) any later version.
12
+ #
13
+ # This program is distributed in the hope that it will be useful,
14
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ # GNU General Public License for more details.
17
+ #
18
+ # You should have received a copy of the GNU General Public License
19
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
20
+ #++
21
+ #
22
+
23
+ require 'rexml/parsers/baseparser'
24
+ require 'strscan'
25
+
26
+ module Kramdown
27
+
28
+ module Parser
29
+
30
+ # Used for parsing a HTML document.
31
+ #
32
+ # The parsing code is in the Parser module that can also be used by other parsers.
33
+ class Html < Base
34
+
35
+ # Contains all constants that are used when parsing.
36
+ module Constants
37
+
38
+ #:stopdoc:
39
+ # The following regexps are based on the ones used by REXML, with some slight modifications.
40
+ HTML_DOCTYPE_RE = /<!DOCTYPE.*?>/m
41
+ HTML_COMMENT_RE = /<!--(.*?)-->/m
42
+ HTML_INSTRUCTION_RE = /<\?(.*?)\?>/m
43
+ HTML_ATTRIBUTE_RE = /\s*(#{REXML::Parsers::BaseParser::UNAME_STR})(?:\s*=\s*(["'])(.*?)\2)?/m
44
+ HTML_TAG_RE = /<((?>#{REXML::Parsers::BaseParser::UNAME_STR}))\s*((?>\s+#{REXML::Parsers::BaseParser::UNAME_STR}(?:\s*=\s*(["']).*?\3)?)*)\s*(\/)?>/m
45
+ HTML_TAG_CLOSE_RE = /<\/(#{REXML::Parsers::BaseParser::UNAME_STR})\s*>/m
46
+ HTML_ENTITY_RE = /&([\w:][\-\w\.:]*);|&#(\d+);|&\#x([0-9a-fA-F]+);/
47
+
48
+ HTML_CONTENT_MODEL_BLOCK = %w{address applet article aside button blockquote body
49
+ dd div dl fieldset figure figcaption footer form header hgroup iframe li map menu nav
50
+ noscript object section td}
51
+ HTML_CONTENT_MODEL_SPAN = %w{a abbr acronym b bdo big button cite caption del dfn dt em
52
+ h1 h2 h3 h4 h5 h6 i ins kbd label legend optgroup p q rb rbc
53
+ rp rt rtc ruby samp select small span strong sub sup summary th tt var}
54
+ HTML_CONTENT_MODEL_RAW = %w{script style math option textarea pre code}
55
+ # The following elements are also parsed as raw since they need child elements that cannot
56
+ # be expressed using kramdown syntax: colgroup table tbody thead tfoot tr ul ol
57
+
58
+ HTML_CONTENT_MODEL = Hash.new {|h,k| h[k] = :raw}
59
+ HTML_CONTENT_MODEL_BLOCK.each {|i| HTML_CONTENT_MODEL[i] = :block}
60
+ HTML_CONTENT_MODEL_SPAN.each {|i| HTML_CONTENT_MODEL[i] = :span}
61
+ HTML_CONTENT_MODEL_RAW.each {|i| HTML_CONTENT_MODEL[i] = :raw}
62
+
63
+ # Some HTML elements like script belong to both categories (i.e. are valid in block and
64
+ # span HTML) and don't appear therefore!
65
+ HTML_SPAN_ELEMENTS = %w{a abbr acronym b big bdo br button cite code del dfn em i img input
66
+ ins kbd label option q rb rbc rp rt rtc ruby samp select small span
67
+ strong sub sup textarea tt var}
68
+ HTML_BLOCK_ELEMENTS = %w{address article aside applet body button blockquote caption col colgroup dd div dl dt fieldset
69
+ figcaption footer form h1 h2 h3 h4 h5 h6 header hgroup hr html head iframe legend menu
70
+ li map nav ol optgroup p pre section summary table tbody td th thead tfoot tr ul}
71
+ HTML_ELEMENTS_WITHOUT_BODY = %w{area base br col command embed hr img input keygen link meta param source track wbr}
72
+ end
73
+
74
+
75
+ # Contains the parsing methods. This module can be mixed into any parser to get HTML parsing
76
+ # functionality. The only thing that must be provided by the class are instance variable
77
+ # @stack for storing the needed state and @src (instance of StringScanner) for the actual
78
+ # parsing.
79
+ module Parser
80
+
81
+ include Constants
82
+
83
# Process the HTML start tag that has already been scanned/checked via @src.
#
# Creates an :html_element for the tag (downcased name, downcased attribute names, missing
# attribute values become ""), appends it to the current tree and then yields to the caller
# for further processing. The first block parameter is the created element, the second one is
# +true+ if the HTML element is already closed, ie. contains no body. Elements that cannot
# have a body are closed automatically with a warning. The body of script and style elements
# is consumed via #handle_raw_html_tag, so they are always yielded as closed.
def handle_html_start_tag # :yields: el, closed
  tag_name = @src[1].downcase
  self_closed = !@src[4].nil?
  attributes = Utils::OrderedHash.new
  @src[2].scan(HTML_ATTRIBUTE_RE) do |key, _quote, value|
    attributes[key.downcase] = value || ""
  end

  el = Element.new(:html_element, tag_name, attributes, :category => :block)
  @tree.children << el

  if !self_closed && HTML_ELEMENTS_WITHOUT_BODY.include?(el.value)
    warning("The HTML tag '#{el.value}' cannot have any content - auto-closing it")
    self_closed = true
  end

  raw_body = (tag_name == 'script' || tag_name == 'style')
  handle_raw_html_tag(tag_name) if raw_body
  yield(el, raw_body || self_closed)
end
108
+
109
# Handle the raw HTML tag at the current position.
#
# Everything up to the closing tag for +name+ (matched case-insensitively) is added as :raw
# text to the last child of the current tree and the closing tag itself is consumed. If there
# is no closing tag, the rest of the source is added, the scanner is terminated and a warning
# is issued.
def handle_raw_html_tag(name)
  curpos = @src.pos
  # The name is escaped so that characters like '.' in a tag name only match themselves.
  if @src.scan_until(/(?=<\/#{Regexp.escape(name)}\s*>)/mi)
    add_text(extract_string(curpos...@src.pos, @src), @tree.children.last, :raw)
    @src.scan(HTML_TAG_CLOSE_RE)
  else
    add_text(@src.rest, @tree.children.last, :raw)
    @src.terminate
    warning("Found no end tag for '#{name}' - auto-closing it")
  end
end
121
+
122
# Zero-width match in front of a '<' that starts a tag, a closing tag, a comment or a
# processing instruction.
HTML_RAW_START = /(?=<(#{REXML::Parsers::BaseParser::UNAME_STR}|\/|!--|\?))/ # :nodoc:

# Parse raw HTML from the current source position, storing the found elements in +el+.
# Parsing continues until one of the following criteria are fulfilled:
#
# - The end of the document is reached.
# - The matching end tag for the element +el+ is found (only used if +el+ is an HTML
#   element).
#
# When an HTML start tag is found, processing is deferred to #handle_html_start_tag,
# providing the block given to this method.
#
# While parsing, +el+ is the current tree; the previous tree is kept on @stack and restored
# before the method returns.
def parse_raw_html(el, &block)
  @stack.push(@tree)
  @tree = el

  until @src.eos?
    text = @src.scan_until(HTML_RAW_START)
    if text.nil?
      # No further markup: the remainder is plain text.
      add_text(@src.rest, @tree, :text)
      @src.terminate
      warning("Found no end tag for '#{@tree.value}' - auto-closing it") if @tree.type == :html_element
      break
    end

    add_text(text, @tree, :text)
    if (comment = @src.scan(HTML_COMMENT_RE))
      @tree.children << Element.new(:xml_comment, comment, nil, :category => :block)
    elsif (instruction = @src.scan(HTML_INSTRUCTION_RE))
      @tree.children << Element.new(:xml_pi, instruction, nil, :category => :block)
    elsif @src.scan(HTML_TAG_RE)
      handle_html_start_tag(&block)
    elsif @src.scan(HTML_TAG_CLOSE_RE)
      break if @tree.value == @src[1].downcase
      warning("Found invalidly used HTML closing tag for '#{@src[1].downcase}' - ignoring it")
    else
      # A '<' that starts nothing recognizable is treated as text.
      add_text(@src.getch, @tree, :text)
    end
  end

  @tree = @stack.pop
end
166
+
167
+ end
168
+
169
+
170
+ # Converts HTML elements to native elements if possible.
171
+ class ElementConverter
172
+
173
+ # :stopdoc:
174
+
175
+ include Constants
176
+ include ::Kramdown::Utils::Entities
177
+
178
# Elements whose direct text children are removed before conversion.
REMOVE_TEXT_CHILDREN = %w[html head hgroup ol ul dl table colgroup tbody thead tfoot tr select optgroup]
# Elements whose non-block children get wrapped in transparent paragraphs.
WRAP_TEXT_CHILDREN = %w[body section nav article aside header footer address div li dd blockquote figure
  figcaption fieldset form]
# Elements from which whitespace-only text children are removed.
REMOVE_WHITESPACE_CHILDREN = %w[body section nav article aside header footer address
  div li dd blockquote figure figcaption td th fieldset form]
# Elements whose leading and trailing text is stripped of whitespace.
STRIP_WHITESPACE = %w[address article aside blockquote body caption dd div dl dt fieldset figcaption form footer
  header h1 h2 h3 h4 h5 h6 legend li nav p section td th]
# Elements that are converted by just changing the element type to the tag name.
SIMPLE_ELEMENTS = %w[em strong blockquote hr br img p thead tbody tfoot tr td th ul ol dl li dl dt dd]
186
+
187
# Create a converter for the element tree rooted at +root+. The root is kept because
# #convert_code reads the :encoding option from it.
def initialize(root)
  @root = root
end
190
+
191
# Convenience entry point: create a converter for +root+ and process +el+ (by default the
# root itself) with it.
def self.convert(root, el = root)
  converter = new(root)
  converter.process(el)
end
194
+
195
# Convert the element +el+ and its children.
#
# - XML comments and processing instructions only get their :category option set: :span if
#   the content model of the parent's HTML tag is :span, :block otherwise (a missing parent
#   counts as 'div').
# - For the root element all children are processed and whitespace-only children removed.
# - HTML elements are converted by the matching convert_<tag> method if +do_conversion+ is
#   set and such a method exists, and generically otherwise.
# - All other element types are left untouched.
#
# +preserve_text+ is passed on to the text processing, +parent+ is the parent of +el+.
def process(el, do_conversion = true, preserve_text = false, parent = nil)
  case el.type
  when :xml_comment, :xml_pi
    ptype = if parent.nil?
              'div'
            elsif parent.type == :html_element
              parent.value
            else
              {:code_span => 'code', :code_block => 'pre', :header => 'h1'}.fetch(parent.type) { parent.type.to_s }
            end
    el.options.replace({:category => (HTML_CONTENT_MODEL[ptype] == :span ? :span : :block)})
    return
  when :root
    el.children.each {|c| process(c)}
    remove_whitespace_children(el)
    return
  when :html_element
    # converted below
  else
    return
  end

  mname = "convert_#{el.value}"
  return send(mname, el) if do_conversion && self.class.method_defined?(mname)

  type = el.value
  remove_text_children(el) if do_conversion && REMOVE_TEXT_CHILDREN.include?(type)

  if do_conversion && SIMPLE_ELEMENTS.include?(type)
    set_basics(el, type.intern)
    process_children(el, do_conversion, preserve_text)
  else
    process_html_element(el, do_conversion, preserve_text)
  end

  return unless do_conversion
  strip_whitespace(el) if STRIP_WHITESPACE.include?(type)
  remove_whitespace_children(el) if REMOVE_WHITESPACE_CHILDREN.include?(type)
  wrap_text_children(el) if WRAP_TEXT_CHILDREN.include?(type)
end
241
+
242
# Process all children of +el+ in place: text children are replaced by the elements that
# #process_text returns for them, all other children are converted via #process with +el+ as
# parent. Whitespace in text is kept as is if +preserve_text+ is set or if +do_conversion+ is
# not set.
def process_children(el, do_conversion = true, preserve_text = false)
  keep_whitespace = preserve_text || !do_conversion
  el.children.map! do |child|
    next process_text(child.value, keep_whitespace) if child.type == :text
    process(child, do_conversion, preserve_text, el)
    child
  end.flatten!
end
252
+
253
# Process the HTML text +raw+: compress whitespace (if +preserve+ is +false+) and convert
# entities in entity elements.
#
# Returns an array of elements in which :text elements alternate with the elements for the
# entities: :smart_quote for the four quote entities, :typographic_sym for dashes, ellipsis
# and guillemets, :entity for everything else. An entity that cannot be resolved is turned
# into an ampersand entity and the text after the '&' is scanned again. Note that +raw+
# itself is modified when whitespace gets compressed.
def process_text(raw, preserve = false)
  raw.gsub!(/\s+/, ' ') unless preserve
  scanner = StringScanner.new(raw)
  elements = []
  until scanner.eos?
    leading = scanner.scan_until(/(?=#{HTML_ENTITY_RE})/)
    if leading.nil?
      # no entity left, the remainder is text
      elements << Element.new(:text, scanner.rest)
      scanner.terminate
      next
    end

    elements << Element.new(:text, leading)
    scanner.scan(HTML_ENTITY_RE)
    # entity name, decimal code point or hexadecimal code point
    val = scanner[1] || (scanner[2] && scanner[2].to_i) || scanner[3].hex
    converted =
      if %w{lsquo rsquo ldquo rdquo}.include?(val)
        Element.new(:smart_quote, val.intern)
      elsif %w{mdash ndash hellip laquo raquo}.include?(val)
        Element.new(:typographic_sym, val.intern)
      else
        begin
          Element.new(:entity, entity(val), nil, :original => scanner.matched)
        rescue ::Kramdown::Error
          # unknown entity: go back to just behind the ampersand
          scanner.pos -= scanner.matched_size - 1
          Element.new(:entity, ::Kramdown::Utils::Entities.entity('amp'))
        end
      end
    elements << converted
  end
  elements
end
283
+
284
# Keep +el+ as HTML element: replace its options with the :category (:span for the tags in
# HTML_SPAN_ELEMENTS, :block otherwise) and the :content_model (:raw unless +do_conversion+
# is set), then process its children.
def process_html_element(el, do_conversion = true, preserve_text = false)
  category = HTML_SPAN_ELEMENTS.include?(el.value) ? :span : :block
  content_model = do_conversion ? HTML_CONTENT_MODEL[el.value] : :raw
  el.options.replace(:category => category, :content_model => content_model)
  process_children(el, do_conversion, preserve_text)
end
289
+
290
# Remove all direct :text children of +el+ (in place).
def remove_text_children(el)
  el.children.delete_if do |child|
    child.type == :text
  end
end
293
+
294
# Wrap every run of consecutive children of +el+ that are text or not of category :block in
# a new :p element with the option :transparent set. Block children stay where they are.
def wrap_text_children(el)
  wrapped = []
  open_p = nil
  el.children.each do |child|
    if Element.category(child) != :block || child.type == :text
      unless open_p
        open_p = Element.new(:p, nil, nil, :transparent => true)
        wrapped << open_p
      end
      open_p.children << child
    else
      wrapped << child
      open_p = nil
    end
  end
  el.children = wrapped
end
312
+
313
# Strip leading whitespace from the first child of +el+ and trailing whitespace from the last
# one, in place, if the respective child is a :text element.
def strip_whitespace(el)
  return if el.children.empty?
  head, tail = el.children.first, el.children.last
  head.value.lstrip! if head.type == :text
  tail.value.rstrip! if tail.type == :text
end
322
+
323
# Remove whitespace-only :text children of +el+ that are the first child, the last child, or
# that sit between two children of category :block. The children array is replaced by a new
# one.
def remove_whitespace_children(el)
  kids = el.children
  last_index = kids.length - 1
  kept = []
  kids.each_with_index do |child, i|
    blank = child.type == :text && child.value.strip.empty?
    droppable = blank && (i == 0 || i == last_index ||
      (Element.category(kids[i - 1]) == :block && Element.category(kids[i + 1]) == :block))
    kept << child unless droppable
  end
  el.children = kept
end
332
+
333
# Turn +el+ into a native element of the given +type+: the options are replaced by +opts+
# and the value (the tag name of an HTML element) is reset to +nil+.
def set_basics(el, type, opts = {})
  el.type = type
  el.options.replace(opts)
  el.value = nil
end
338
+
339
# Append the values of all :text elements found in +el+ and its descendants, in document
# order, to the string +raw+.
def extract_text(el, raw)
  raw << el.value.to_s if el.type == :text
  el.children.each do |child|
    extract_text(child, raw)
  end
end
343
+
344
# Convert an anchor: with an 'href' attribute it becomes a native :a element, without one it
# stays an unconverted HTML element.
def convert_a(el)
  return process_html_element(el, false) unless el.attr['href']
  set_basics(el, :a)
  process_children(el)
end
352
+
353
# Native element type for each emphasis tag.
EMPHASIS_TYPE_MAP = {'em' => :em, 'i' => :em, 'strong' => :strong, 'b' => :strong}

# Convert an emphasis element (also used for b, strong and i via the aliases below). If the
# contained text starts or ends with whitespace the element stays an unconverted HTML element,
# otherwise it becomes the native element given by EMPHASIS_TYPE_MAP.
def convert_em(el)
  text = ''
  extract_text(el, text)
  padded = text =~ /\A\s/ || text =~ /\s\z/
  return process_html_element(el, false) if padded

  set_basics(el, EMPHASIS_TYPE_MAP[el.value])
  process_children(el)
end
[:convert_b, :convert_strong, :convert_i].each do |name|
  alias_method(name, :convert_em)
end
367
+
368
# Convert a heading (also used for h2 to h6 via the aliases below) into a :header element.
# The level is taken from the digit of the tag name and the text of the heading is stored in
# the :raw_text option.
def convert_h1(el)
  level = el.value[1..1].to_i
  set_basics(el, :header, :level => level)
  raw_text = el.options[:raw_text] = ''
  extract_text(el, raw_text)
  process_children(el)
end
[:convert_h2, :convert_h3, :convert_h4, :convert_h5, :convert_h6].each do |name|
  alias_method(name, :convert_h1)
end
376
+
377
# Convert a code element (or, via the alias below, a pre element).
#
# The text of the element is split into text and entity elements, which are then merged back
# into a single string: entities are replaced by their character (encoded with the :encoding
# option of the root element). If this succeeds, +el+ becomes a :codespan (code) or a
# :codeblock (everything else) with that string as value and without children. If the merge
# fails for any reason, +el+ stays an unconverted HTML element.
def convert_code(el)
  raw = ''
  extract_text(el, raw)
  result = process_text(raw, true)
  begin
    # The value of the block is the accumulator for the next part; a nil accumulator makes
    # the next append fail, which ends up in the rescue below.
    merged = result.inject('') do |mem, part|
      case part.type
      when :text
        mem << part.value
      when :entity
        if RUBY_VERSION >= '1.9'
          mem << part.value.char.encode(@root.options[:encoding])
        elsif [60, 62, 34, 38].include?(part.value.code_point)
          mem << part.value.code_point.chr
        end
      when :smart_quote, :typographic_sym
        mem << entity(part.value.to_s).char.encode(@root.options[:encoding])
      else
        raise "Bug - please report"
      end
    end
    result.clear
    result << Element.new(:text, merged)
  rescue
    # deliberately ignored: +result+ is left as it was and handled by the check below
  end

  if result.length > 1 || result.first.type != :text
    process_html_element(el, false, true)
  else
    set_basics(el, el.value == 'code' ? :codespan : :codeblock)
    el.value = result.first.value
    el.children.clear
  end
end
alias :convert_pre :convert_code
414
+
415
# Convert a table element. Tables that are not simple (see #is_simple_table?) stay
# unconverted HTML elements. For simple tables
#
# - the children are converted and +el+ becomes a :table element,
# - the :alignment option is set to one entry per cell of a row (:center, :left, :right or
#   :default), taken from the 'text-align' declaration in the cell's 'style' attribute; the
#   declaration is removed from the attribute and an attribute left empty is deleted,
# - direct children that are still HTML elements are removed,
# - :th cells become :td cells, and
# - rows that are direct children of the table are wrapped in a :tbody element.
def convert_table(el)
  if !is_simple_table?(el)
    process_html_element(el, false)
    return
  end
  remove_text_children(el)
  process_children(el)
  set_basics(el, :table)

  calc_alignment = lambda do |c|
    if c.type == :tr
      el.options[:alignment] = c.children.map do |td|
        style = td.attr['style']
        next :default unless style
        # $1 is only meaningful if the declaration was actually found and cut out; a style
        # attribute without text-align must yield :default instead of failing on nil.
        align = $1 if style.slice!(/(?:;\s*)?text-align:\s+(center|left|right)/)
        td.attr.delete('style') if style.strip.empty?
        align ? align.to_sym : :default
      end
    else
      c.children.each {|cc| calc_alignment.call(cc)}
    end
  end
  calc_alignment.call(el)
  el.children.delete_if {|c| c.type == :html_element}

  change_th_type = lambda do |c|
    if c.type == :th
      c.type = :td
    else
      c.children.each {|cc| change_th_type.call(cc)}
    end
  end
  change_th_type.call(el)

  # A table without any children has no first row to inspect.
  first = el.children.first
  if first && first.type == :tr
    tbody = Element.new(:tbody)
    tbody.children = el.children
    el.children = [tbody]
  end
end
457
+
458
# Return +true+ if the table element +el+ is simple enough to be converted to a native
# table. All of the following must hold:
#
# - no cell contains (at any depth) an element listed in HTML_BLOCK_ELEMENTS,
# - all rows have the same number of cells,
# - all rows have the same 'text-align' alignments and none of them is 'justify' or
#   'inherit',
# - the table either consists only of rows with td cells, or only of thead (th cells),
#   tbody and tfoot (td cells) sections with at least one tbody.
#
# Note that check_cells and check_alignment are created with Proc.new on purpose: a +return+
# inside them returns from this method.
def is_simple_table?(el)
  # true if no descendant of +c+ is a block level HTML element
  only_phrasing_content = lambda do |c|
    c.children.all? do |cc|
      (cc.type == :text || !HTML_BLOCK_ELEMENTS.include?(cc.value)) && only_phrasing_content.call(cc)
    end
  end
  check_cells = Proc.new do |c|
    if c.value == 'th' || c.value == 'td'
      return false if !only_phrasing_content.call(c)
    else
      c.children.each {|cc| check_cells.call(cc)}
    end
  end
  check_cells.call(el)

  # nr_cells holds the cell count of the first row that has cells and is set to -1 as soon as
  # a row with a different count is found
  nr_cells = 0
  check_nr_cells = lambda do |t|
    if t.value == 'tr'
      count = t.children.select {|cc| cc.value == 'th' || cc.value == 'td'}.length
      if count != nr_cells
        if nr_cells == 0
          nr_cells = count
        else
          nr_cells = -1
          # only leaves this lambda call; the remaining rows are still visited
          break
        end
      end
    else
      t.children.each {|cc| check_nr_cells.call(cc)}
    end
  end
  check_nr_cells.call(el)
  return false if nr_cells == -1

  # alignment holds the alignments of the first row, all other rows are compared with it
  alignment = nil
  check_alignment = Proc.new do |t|
    if t.value == 'tr'
      cur_alignment = t.children.select {|cc| cc.value == 'th' || cc.value == 'td'}.map do |cell|
        md = /text-align:\s+(center|left|right|justify|inherit)/.match(cell.attr['style'].to_s)
        return false if md && (md[1] == 'justify' || md[1] == 'inherit')
        md.nil? ? :default : md[1]
      end
      alignment = cur_alignment if alignment.nil?
      return false if alignment != cur_alignment
    else
      t.children.each {|cc| check_alignment.call(cc)}
    end
  end
  check_alignment.call(el)

  # true if +t+ contains only text and rows, and the rows only text and cells of +type+
  check_rows = lambda do |t, type|
    t.children.all? {|r| (r.value == 'tr' || r.type == :text) && r.children.all? {|c| c.value == type || c.type == :text}}
  end
  check_rows.call(el, 'td') ||
    (el.children.all? do |t|
       t.type == :text || (t.value == 'thead' && check_rows.call(t, 'th')) ||
         ((t.value == 'tfoot' || t.value == 'tbody') && check_rows.call(t, 'td'))
     end && el.children.any? {|t| t.value == 'tbody'})
end
517
+
518
# Convert a script element: math scripts (see #is_math_tag?) become math elements, all other
# scripts are kept as HTML elements.
def convert_script(el)
  if is_math_tag?(el)
    handle_math_tag(el)
  else
    process_html_element(el)
  end
end
525
+
526
# Return a true value (the match position) if the 'type' attribute of +el+ contains
# 'math/tex', +nil+ otherwise.
def is_math_tag?(el)
  el.attr['type'].to_s =~ /\bmath\/tex\b/
end
529
+
530
# Turn the math script element +el+ into a :math element. The category is :block if the
# 'type' attribute contains 'mode=display' and :span otherwise. The value is taken from the
# first child, which is removed, with a surrounding CDATA section stripped; an element
# without children gets an empty value. The 'type' attribute is deleted.
def handle_math_tag(el)
  set_basics(el, :math, :category => (el.attr['type'] =~ /mode=display/ ? :block : :span))
  body = el.children.shift
  el.value = (body ? body.value : '').sub(/\A<!\[CDATA\[(.*)\]\]>\z/m, '\1')
  el.attr.delete('type')
end
535
+
536
+ end
537
+
538
+ include Parser
539
+
540
# Parse the source string provided on initialization as HTML document.
#
# Leading processing instructions and comments are added to the root element, a doctype is
# skipped. The rest is parsed as raw HTML - recursively for every element that is not
# closed - and the resulting tree is finally handed to ElementConverter.convert.
def parse
  @stack, @tree = [], @root
  @src = StringScanner.new(adapt_source(source))

  loop do
    if (instruction = @src.scan(/\s*#{HTML_INSTRUCTION_RE}/))
      @tree.children << Element.new(:xml_pi, instruction.strip, nil, :category => :block)
    elsif @src.scan(/\s*#{HTML_DOCTYPE_RE}/)
      # ignore the doctype
    elsif (comment = @src.scan(/\s*#{HTML_COMMENT_RE}/))
      @tree.children << Element.new(:xml_comment, comment.strip, nil, :category => :block)
    else
      break
    end
  end

  # Descend into every element that still has a body.
  tag_handler = lambda do |c, closed|
    parse_raw_html(c, &tag_handler) unless closed
  end
  parse_raw_html(@tree, &tag_handler)

  ElementConverter.convert(@tree)
end
564
+
565
+ end
566
+
567
+ end
568
+
569
+ end
570
+