html-pipeline-linuxfr 0.0.14

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,18 @@
1
module HTML
  class Pipeline
    # HTML filter that turns absolute http:// and https:// links pointing at
    # the configured host into protocol-relative links ("//host/...").
    #
    # Context options:
    #   :host - host name whose links are rewritten. It is interpolated as-is
    #           into the CSS attribute selectors used to find the links.
    class RelativeLinksFilter < Filter
      # Strips the scheme from every matching <a href>, then returns the doc.
      def call
        host = context[:host]
        selector = %w(http https).map { |scheme| %(a[href^="#{scheme}://#{host}"]) }.join(',')

        doc.css(selector).each do |link|
          link['href'] = link['href'].sub(/^https?:/, '')
        end

        doc
      end
    end
  end
end
@@ -0,0 +1,108 @@
1
+ require 'sanitize'
2
+
3
module HTML
  class Pipeline
    # HTML filter with sanitization routines and whitelists. This module
    # defines what HTML is allowed in user provided content and fixes up
    # issues with unbalanced tags and whatnot.
    #
    # See the Sanitize docs for more information on the underlying library:
    #
    # https://github.com/rgrove/sanitize/#readme
    #
    # Context options:
    #   :whitelist - The sanitizer whitelist configuration to use. This can be
    #                one of the options constants defined in this class or a
    #                custom sanitize options hash.
    #
    # This filter does not write additional information to the context.
    class SanitizationFilter < Filter
      # Elements that may legitimately contain <li>. The Set itself is frozen
      # (not just the temporary array it is built from).
      LISTS     = Set.new(%w(ul ol)).freeze
      LIST_ITEM = 'li'.freeze

      # List of table child elements. These must be contained by a <table>
      # element or they are not allowed through. Otherwise they can be used to
      # break out of places we're using tables to contain formatted user
      # content (like pull request review comments).
      TABLE_ITEMS = Set.new(%w(tr td th)).freeze
      TABLE       = 'table'.freeze

      # The main sanitization whitelist. Only these elements and attributes
      # are allowed through by default.
      WHITELIST = {
        :elements => %w(
          h1 h2 h3 h4 h5 h6 h7 h8 br b i strong em a pre code img tt
          div ins del sup sub p ol ul table blockquote dl dt dd
          kbd q samp var hr ruby rt rp li tr td th
        ),
        :remove_contents => ['script'],
        :attributes => {
          'a'   => ['href'],
          'img' => ['src'],
          'div' => ['itemscope', 'itemtype'],
          :all  => ['abbr', 'accept', 'accept-charset',
                    'accesskey', 'action', 'align', 'alt', 'axis',
                    'border', 'cellpadding', 'cellspacing', 'char',
                    'charoff', 'charset', 'checked', 'cite',
                    'clear', 'cols', 'colspan', 'color',
                    'compact', 'coords', 'datetime', 'dir',
                    'disabled', 'enctype', 'for', 'frame',
                    'headers', 'height', 'hreflang',
                    'hspace', 'ismap', 'label', 'lang',
                    'longdesc', 'maxlength', 'media', 'method',
                    'multiple', 'name', 'nohref', 'noshade',
                    'nowrap', 'prompt', 'readonly', 'rel', 'rev',
                    'rows', 'rowspan', 'rules', 'scope',
                    'selected', 'shape', 'size', 'span',
                    'start', 'summary', 'tabindex', 'target',
                    'title', 'type', 'usemap', 'valign', 'value',
                    'vspace', 'width', 'itemprop']
        },
        :protocols => {
          'a'   => {'href' => ['http', 'https', 'mailto', :relative, 'github-windows', 'github-mac']},
          'img' => {'src'  => ['http', 'https', :relative]}
        },
        :transformers => [
          # Top-level <li> elements are removed because they can break out of
          # containing markup. The element is replaced by its children, so its
          # content is kept.
          lambda { |env|
            name, node = env[:node_name], env[:node]
            if name == LIST_ITEM && node.ancestors.none? { |n| LISTS.include?(n.name) }
              node.replace(node.children)
            end
          },

          # Table child elements that are not contained by a <table> are
          # removed (again keeping their children).
          lambda { |env|
            name, node = env[:node_name], env[:node]
            if TABLE_ITEMS.include?(name) && node.ancestors.none? { |n| n.name == TABLE }
              node.replace(node.children)
            end
          }
        ]
      }

      # A more limited sanitization whitelist. This includes all attributes,
      # protocols, and transformers from WHITELIST but with a more locked down
      # set of allowed elements.
      LIMITED = WHITELIST.merge(
        :elements => %w(b i strong em a pre code img ins del sup sub p ol ul li))

      # Strip all HTML tags from the document.
      FULL = { :elements => [] }

      # Sanitize markup using the Sanitize library.
      #
      # Returns whatever Sanitize.clean_node! returns for the document.
      def call
        Sanitize.clean_node!(doc, whitelist)
      end

      # The whitelist to use when sanitizing. This can be passed in the
      # context hash to the filter but defaults to the WHITELIST constant
      # value above.
      def whitelist
        context[:whitelist] || WHITELIST
      end
    end
  end
end
@@ -0,0 +1,31 @@
1
+ require 'pygments'
2
+
3
module HTML
  class Pipeline
    # HTML filter that syntax highlights <code> elements whose class
    # attribute names a lexer known to Pygments (<code class="...">).
    # Elements without a class, or with a class Pygments has no lexer for,
    # are left untouched.
    class SyntaxHighlightFilter < Filter
      # Replaces the content of each highlightable <code> element with the
      # Pygments markup and returns the doc.
      def call
        doc.search('code').each do |node|
          lang = node['class']
          next unless lang

          lexer = Pygments::Lexer[lang]
          next unless lexer

          html = highlight_with_timeout_handling(lexer, node.inner_text)
          next if html.nil?

          # Replace the whole content rather than only the first child: this
          # also works for an empty <code> (no child at all) and does not
          # leave stale siblings behind when <code> has several children.
          node.inner_html = html
        end
        doc
      end

      # Highlights text with the given lexer.
      #
      # Returns the highlighted markup (without Pygments' wrapping element),
      # or nil when highlighting raised Timeout::Error.
      def highlight_with_timeout_handling(lexer, text)
        lexer.highlight(text, options: { nowrap: true })
      rescue Timeout::Error
        nil
      end
    end
  end
end
@@ -0,0 +1,14 @@
1
module HTML
  class Pipeline
    # Filter whose input is plain text rather than parsed HTML. The input is
    # kept as a String in #text, and nil is handed to the superclass
    # initializer as its first argument.
    class TextFilter < Filter
      attr_reader :text

      # text    - the input; anything except a DocumentFragment. Converted
      #           with #to_str when available, otherwise with #to_s.
      # context - passed through to the superclass initializer.
      # result  - passed through to the superclass initializer.
      #
      # Raises TypeError when text is a DocumentFragment.
      def initialize(text, context = nil, result = nil)
        if text.is_a?(DocumentFragment)
          raise TypeError, "text cannot be HTML"
        end

        # Ensure that this is always a string
        @text =
          if text.respond_to?(:to_str)
            text.to_str
          else
            text.to_s
          end

        super(nil, context, result)
      end
    end
  end
end
@@ -0,0 +1,61 @@
1
module HTML
  class Pipeline
    # HTML filter that adds an 'id' attribute to all headers (h1-h6) in a
    # document, so they can be accessed from a table of contents, and that
    # prepends such a table of contents to documents that are long enough.
    #
    # Context options:
    #   :toc_minimal_length (required) - Only add the table of contents to
    #                                    text with this number of characters
    #   :toc_header (required)         - Introduce the table of contents with
    #                                    this header
    #
    class TableOfContentsFilter < Filter
      # Markup closing one nested list opened for a deeper header level.
      CLOSE_NESTED_LIST = "</ul>\n</li>\n"
      # Markup opening one nested list for a deeper header level.
      OPEN_NESTED_LIST = "<li>\n<ul>"

      # Sets an id on every header (always), then inserts the table of
      # contents before the first node when the document text is at least
      # :toc_minimal_length characters long. Returns the doc.
      def call
        headers = Hash.new(0) # slug => number of headers seen with that slug
        was = 2               # level of the previous header; entries directly
                              # inside <ul class="toc"> correspond to h2
        opened = 0            # nested lists currently open in toc
        toc = ""

        doc.css('h1, h2, h3, h4, h5, h6').each do |node|
          level = node.name.scan(/\d/).first.to_i
          name = node.text.downcase
          name.gsub!(/[^\w\- ]/, '') # remove punctuation
          name.gsub!(' ', '-') # replace spaces with dash
          name = EscapeUtils.escape_uri(name) # escape extended UTF-8 chars

          # Repeated slugs get a numeric suffix so that ids stay unique.
          uniq = headers[name] > 0 ? "-#{headers[name]}" : ''
          headers[name] += 1
          anchor = "#{name}#{uniq}"
          node['id'] = anchor

          while was > level
            # Only close lists that were really opened: a header shallower
            # than the baseline (h1) must not emit a stray closing tag, which
            # would terminate the outer <ul class="toc"> early.
            if opened > 0
              toc << CLOSE_NESTED_LIST
              opened -= 1
            end
            was -= 1
          end
          while was < level
            toc << OPEN_NESTED_LIST
            opened += 1
            was += 1
          end
          toc << "<li><a href=\"##{anchor}\">#{node.inner_html}</a></li>"
        end

        # Total number of characters held in text nodes.
        length = 0
        doc.traverse { |node| length += node.text.length if node.text? }
        return doc unless length >= context[:toc_minimal_length]

        # Close exactly the nested lists that are still open.
        opened.times { toc << CLOSE_NESTED_LIST }

        unless headers.empty?
          first_child = doc.child
          first_child.add_previous_sibling context[:toc_header]
          first_child.add_previous_sibling "<ul class=\"toc\">#{toc}</ul>"
        end
        doc
      end

      # Declares the context keys this filter cannot work without.
      def validate
        needs :toc_minimal_length, :toc_header
      end
    end
  end
end
@@ -0,0 +1,5 @@
1
module HTML
  class Pipeline
    # Version string of this release.
    VERSION = '0.0.14'
  end
end
@@ -0,0 +1,17 @@
1
# Test double for an instrumentation service. It records, for the single
# event name it is subscribed to, every instrumented call as
# [event, payload, block result].
class MockedInstrumentationService
  attr_reader :events

  # event  - event name to subscribe to right away (nil by default).
  # events - Array the recorded calls are appended to.
  def initialize(event = nil, events = [])
    @events = events
    subscribe(event)
  end

  # Yields the payload (an empty Hash when none is given) to the block and
  # returns the block's result. The call is recorded only when event equals
  # the subscribed event.
  def instrument(event, payload = nil)
    payload ||= {}
    (yield payload).tap do |outcome|
      events << [event, payload, outcome] if @subscribe == event
    end
  end

  # Switches the recorded event name to event. Returns the Array of
  # recorded calls.
  def subscribe(event)
    @subscribe = event
    @events
  end
end
@@ -0,0 +1,56 @@
1
+ require "test_helper"
2
+
3
# Tests for HTML::Pipeline::AbsoluteSourceFilter: rewriting of relative
# image sources and validation of the required context.
class HTML::Pipeline::AbsoluteSourceFilterTest < Test::Unit::TestCase
  AbsoluteSourceFilter = HTML::Pipeline::AbsoluteSourceFilter

  def setup
    @image_base_url    = 'http://assets.example.com'
    @image_subpage_url = 'http://blog.example.com/a/post'
    @options = {
      :image_base_url    => @image_base_url,
      :image_subpage_url => @image_subpage_url
    }
  end

  def test_rewrites_root_relative_urls
    orig = %(<p><img src="/img.png"></p>)
    assert_equal "<p><img src=\"#{@image_base_url}/img.png\"></p>",
      AbsoluteSourceFilter.call(orig, @options).to_s
  end

  # Was also named test_rewrites_root_relative_urls, which silently replaced
  # the test above so that it never ran.
  def test_rewrites_relative_urls
    orig = %(<p><img src="post/img.png"></p>)
    assert_equal "<p><img src=\"#{@image_subpage_url}/img.png\"></p>",
      AbsoluteSourceFilter.call(orig, @options).to_s
  end

  def test_does_not_rewrite_absolute_urls
    orig = %(<p><img src="http://other.example.com/img.png"></p>)
    result = AbsoluteSourceFilter.call(orig, @options).to_s
    # Match against the configured URLs themselves; a bare /@image_base_url/
    # literal only matches the text "@image_base_url".
    assert_no_match Regexp.new(Regexp.escape(@image_base_url)), result
    assert_no_match Regexp.new(Regexp.escape(@image_subpage_url)), result
  end

  def test_fails_when_context_is_missing
    assert_raise RuntimeError do
      AbsoluteSourceFilter.call("<img src=\"img.png\">", {})
    end
    assert_raise RuntimeError do
      AbsoluteSourceFilter.call("<img src=\"/img.png\">", {})
    end
  end

  def test_tells_you_where_context_is_required
    exception = assert_raise(RuntimeError) {
      AbsoluteSourceFilter.call("<img src=\"img.png\">", {})
    }
    assert_match 'HTML::Pipeline::AbsoluteSourceFilter', exception.message

    exception = assert_raise(RuntimeError) {
      AbsoluteSourceFilter.call("<img src=\"/img.png\">", {})
    }
    assert_match 'HTML::Pipeline::AbsoluteSourceFilter', exception.message
  end
end
@@ -0,0 +1,47 @@
1
+ require "test_helper"
2
+
3
# Tests for HTML::Pipeline::CamoFilter: proxying of insecure image URLs and
# validation of the required context.
class HTML::Pipeline::CamoFilterTest < Test::Unit::TestCase
  CamoFilter = HTML::Pipeline::CamoFilter

  def setup
    # A well-formed URL; the fixture used to read 'https//...' (no colon).
    @asset_proxy_url        = 'https://assets.example.org'
    @asset_proxy_secret_key = 'ssssh-secret'
    @options = {
      :asset_proxy            => @asset_proxy_url,
      :asset_proxy_secret_key => @asset_proxy_secret_key
    }
  end

  def test_camouflaging_http_image_urls
    orig = %(<p><img src="http://twitter.com/img.png"></p>)
    assert_includes 'img src="' + @asset_proxy_url,
      CamoFilter.call(orig, @options).to_s
  end

  def test_rewrites_dotcom_image_urls
    orig = %(<p><img src="http://github.com/img.png"></p>)
    assert_equal "<p><img src=\"https://github.com/img.png\"></p>",
      CamoFilter.call(orig, @options).to_s
  end

  def test_not_camouflaging_https_image_urls
    orig = %(<p><img src="https://foo.com/img.png"></p>)
    assert_doesnt_include 'img src="' + @asset_proxy_url,
      CamoFilter.call(orig, @options).to_s
  end

  def test_handling_images_with_no_src_attribute
    orig = %(<p><img></p>)
    assert_nothing_raised do
      CamoFilter.call(orig, @options).to_s
    end
  end

  def test_required_context_validation
    exception = assert_raise(ArgumentError) {
      CamoFilter.call("", {})
    }
    # Parenthesised so the regexp literals are not parsed ambiguously.
    assert_match(/:asset_proxy[^_]/, exception.message)
    assert_match(/:asset_proxy_secret_key/, exception.message)
  end
end
@@ -0,0 +1,50 @@
1
+ require "test_helper"
2
+
3
# Tests for HTML::Pipeline::ImageMaxWidthFilter: the max-width style added
# to images and the link wrapped around images that are not linked yet.
class HTML::Pipeline::ImageMaxWidthFilterTest < Test::Unit::TestCase
  # Runs the filter under test on html without any context.
  def filter(html)
    HTML::Pipeline::ImageMaxWidthFilter.call(html)
  end

  def test_rewrites_image_style_tags
    doc = Nokogiri::HTML::DocumentFragment.parse("<p>Screenshot: <img src='screenshot.png'></p>")
    expected = %q(<p>Screenshot: <a target="_blank" href="screenshot.png"><img src="screenshot.png" style="max-width:100%;"></a></p>)

    assert_equal_html expected, filter(doc).to_html
  end

  def test_leaves_existing_image_style_tags_alone
    doc = Nokogiri::HTML::DocumentFragment.parse("<p><img src='screenshot.png' style='width:100px;'></p>")
    expected = '<p><img src="screenshot.png" style="width:100px;"></p>'

    assert_equal_html expected, filter(doc).to_html
  end

  def test_links_to_image
    doc = Nokogiri::HTML::DocumentFragment.parse("<p>Screenshot: <img src='screenshot.png'></p>")
    expected = '<p>Screenshot: <a target="_blank" href="screenshot.png"><img src="screenshot.png" style="max-width:100%;"></a></p>'

    assert_equal_html expected, filter(doc).to_html
  end

  def test_doesnt_link_to_image_when_already_linked
    doc = Nokogiri::HTML::DocumentFragment.parse("<p>Screenshot: <a href='blah.png'><img src='screenshot.png'></a></p>")
    expected = %q(<p>Screenshot: <a href="blah.png"><img src="screenshot.png" style="max-width:100%;"></a></p>)

    assert_equal_html expected, filter(doc).to_html
  end

  def test_doesnt_screw_up_inlined_images
    doc = Nokogiri::HTML::DocumentFragment.parse("<p>Screenshot <img src='screenshot.png'>, yes, this is a <b>screenshot</b> indeed.</p>")
    expected = %q(<p>Screenshot <a target="_blank" href="screenshot.png"><img src="screenshot.png" style="max-width:100%;"></a>, yes, this is a <b>screenshot</b> indeed.</p>)

    assert_equal_html expected, filter(doc).to_html
  end
end
@@ -0,0 +1,101 @@
1
+ require "test_helper"
2
+
3
# Top-level shortcut, also used by the other test classes in this file.
MarkdownFilter = HTML::Pipeline::MarkdownFilter

# Tests for HTML::Pipeline::MarkdownFilter: input type checking, the :gfm
# switch and fenced code blocks.
class HTML::Pipeline::MarkdownFilterTest < Test::Unit::TestCase
  def setup
    @haiku =
      "Pointing at the moon\n" +
      "Reminded of simple things\n" +
      "Moments matter most"
    @links =
      "See http://example.org/ for more info"
    # Each source line ends with a newline so that the closing fence sits on
    # a line of its own and really terminates the fenced block.
    @code =
      "```\n" +
      "def hello()\n" +
      "  'world'\n" +
      "end\n" +
      "```"
  end

  def test_fails_when_given_a_documentfragment
    body = "<p>heyo</p>"
    doc  = HTML::Pipeline.parse(body)
    assert_raise(TypeError) { MarkdownFilter.call(doc, {}) }
  end

  def test_gfm_enabled_by_default
    doc = MarkdownFilter.to_document(@haiku, {})
    assert doc.kind_of?(HTML::Pipeline::DocumentFragment)
    assert_equal 2, doc.search('br').size
  end

  def test_disabling_gfm
    doc = MarkdownFilter.to_document(@haiku, :gfm => false)
    assert doc.kind_of?(HTML::Pipeline::DocumentFragment)
    assert_equal 0, doc.search('br').size
  end

  def test_fenced_code_blocks
    doc = MarkdownFilter.to_document(@code)
    assert doc.kind_of?(HTML::Pipeline::DocumentFragment)
    assert_equal 1, doc.search('pre').size
  end

  def test_fenced_code_blocks_with_language
    # String#sub only touches the first (opening) fence.
    doc = MarkdownFilter.to_document(@code.sub("```", "``` ruby"))
    assert doc.kind_of?(HTML::Pipeline::DocumentFragment)
    assert_equal 1, doc.search('pre').size
    assert_equal 'ruby', doc.search('pre').first['lang']
  end
end
52
+
53
# Tests for the GitHub Flavored Markdown behaviour of MarkdownFilter:
# underscores inside words and conversion of newlines to <br>.
class GFMTest < Test::Unit::TestCase
  # Renders text with GFM explicitly enabled.
  def gfm(text)
    MarkdownFilter.call(text, :gfm => true)
  end

  def test_not_touch_single_underscores_inside_words
    assert_equal "<p>foo_bar</p>",
      gfm("foo_bar")
  end

  def test_not_touch_underscores_in_code_blocks
    # An indented code block needs four leading spaces; with fewer the line
    # is an ordinary paragraph and no <pre><code> is produced.
    assert_equal "<pre><code>foo_bar_baz\n</code></pre>",
      gfm("    foo_bar_baz")
  end

  def test_not_touch_underscores_in_pre_blocks
    assert_equal "<pre>\nfoo_bar_baz\n</pre>",
      gfm("<pre>\nfoo_bar_baz\n</pre>")
  end

  def test_not_touch_two_or_more_underscores_inside_words
    assert_equal "<p>foo_bar_baz</p>",
      gfm("foo_bar_baz")
  end

  def test_turn_newlines_into_br_tags_in_simple_cases
    assert_equal "<p>foo<br>\nbar</p>",
      gfm("foo\nbar")
  end

  def test_convert_newlines_in_all_groups
    assert_equal "<p>apple<br>\npear<br>\norange</p>\n\n" +
      "<p>ruby<br>\npython<br>\nerlang</p>",
      gfm("apple\npear\norange\n\nruby\npython\nerlang")
  end

  def test_convert_newlines_in_even_long_groups
    assert_equal "<p>apple<br>\npear<br>\norange<br>\nbanana</p>\n\n" +
      "<p>ruby<br>\npython<br>\nerlang</p>",
      gfm("apple\npear\norange\nbanana\n\nruby\npython\nerlang")
  end

  # Covers consecutive headers as well as list items: neither may get <br>.
  def test_not_convert_newlines_in_lists
    assert_equal "<h1>foo</h1>\n\n<h1>bar</h1>",
      gfm("# foo\n# bar")
    assert_equal "<ul>\n<li>foo</li>\n<li>bar</li>\n</ul>",
      gfm("* foo\n* bar")
  end
end
+ end