html-pipeline-linuxfr 0.0.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,18 @@
module HTML
  class Pipeline
    # HTML filter that rewrites links pointing at the configured host so they
    # become protocol-relative ("//host/path" instead of "http://host/path").
    #
    # Context options:
    #   :host - Host name interpolated into the CSS selector that picks the
    #           links to rewrite.
    class RelativeLinksFilter < Filter
      # Strips the leading "http:" / "https:" from the href of every <a>
      # whose href starts with "http://<host>" or "https://<host>".
      #
      # Returns the document.
      def call
        host = context[:host]
        selector = %w(http https).map { |scheme| %(a[href^="#{scheme}://#{host}"]) }.join(',')

        doc.css(selector).each do |link|
          link['href'] = link['href'].sub(/^https?:/, '')
        end

        doc
      end
    end
  end
end
@@ -0,0 +1,108 @@
1
+ require 'sanitize'
2
+
module HTML
  class Pipeline
    # HTML filter with sanitization routines and whitelists. This module
    # defines what HTML is allowed in user provided content and fixes up issues
    # with unbalanced tags and whatnot.
    #
    # See the Sanitize docs for more information on the underlying library:
    #
    # https://github.com/rgrove/sanitize/#readme
    #
    # Context options:
    #   :whitelist - The sanitizer whitelist configuration to use. This can be
    #                one of the options constants defined in this class or a
    #                custom sanitize options hash.
    #
    # This filter does not write additional information to the context.
    class SanitizationFilter < Filter
      # Elements that may legitimately contain an <li>.
      LISTS     = Set.new(%w(ul ol).freeze)
      LIST_ITEM = 'li'.freeze

      # List of table child elements. These must be contained by a <table>
      # element or they are not allowed through. Otherwise they can be used to
      # break out of places we're using tables to contain formatted user
      # content (like pull request review comments).
      TABLE_ITEMS = Set.new(%w(tr td th).freeze)
      TABLE       = 'table'.freeze

      # The main sanitization whitelist. Only these elements and attributes
      # are allowed through by default.
      WHITELIST = {
        elements: %w(
          h1 h2 h3 h4 h5 h6 h7 h8 br b i strong em a pre code img tt
          div ins del sup sub p ol ul table blockquote dl dt dd
          kbd q samp var hr ruby rt rp li tr td th
        ),
        remove_contents: ['script'],
        attributes: {
          'a'   => ['href'],
          'img' => ['src'],
          'div' => ['itemscope', 'itemtype'],
          all: %w(
            abbr accept accept-charset
            accesskey action align alt axis
            border cellpadding cellspacing char
            charoff charset checked cite
            clear cols colspan color
            compact coords datetime dir
            disabled enctype for frame
            headers height hreflang
            hspace ismap label lang
            longdesc maxlength media method
            multiple name nohref noshade
            nowrap prompt readonly rel rev
            rows rowspan rules scope
            selected shape size span
            start summary tabindex target
            title type usemap valign value
            vspace width itemprop
          )
        },
        protocols: {
          'a'   => { 'href' => ['http', 'https', 'mailto', :relative, 'github-windows', 'github-mac'] },
          'img' => { 'src' => ['http', 'https', :relative] }
        },
        transformers: [
          # An <li> with no <ul>/<ol> ancestor is replaced by its own children
          # because a stray list item can break out of containing markup.
          lambda do |env|
            element_name = env[:node_name]
            element = env[:node]
            if element_name == LIST_ITEM && element.ancestors.none? { |ancestor| LISTS.include?(ancestor.name) }
              element.replace(element.children)
            end
          end,

          # Table child elements that are not contained by a <table> are
          # replaced by their own children.
          lambda do |env|
            element_name = env[:node_name]
            element = env[:node]
            if TABLE_ITEMS.include?(element_name) && element.ancestors.none? { |ancestor| ancestor.name == TABLE }
              element.replace(element.children)
            end
          end
        ]
      }

      # A more limited sanitization whitelist. This includes all attributes,
      # protocols, and transformers from WHITELIST but with a more locked down
      # set of allowed elements.
      LIMITED = WHITELIST.merge(
        elements: %w(b i strong em a pre code img ins del sup sub p ol ul li)
      )

      # Strip all HTML tags from the document.
      FULL = { elements: [] }

      # Sanitize markup using the Sanitize library, applying #whitelist to
      # the document.
      def call
        Sanitize.clean_node!(doc, whitelist)
      end

      # The whitelist to use when sanitizing: the :whitelist context entry
      # when it is set, the WHITELIST constant otherwise.
      def whitelist
        configured = context[:whitelist]
        return WHITELIST unless configured

        configured
      end
    end
  end
end
@@ -0,0 +1,31 @@
1
+ require 'pygments'
2
+
module HTML
  class Pipeline
    # HTML filter that syntax highlights the content of <code> elements.
    #
    # The lexer is looked up from the element's class attribute
    # (<code class="ruby">). Elements without a class attribute, or whose
    # class does not name a known Pygments lexer, are left untouched.
    class SyntaxHighlightFilter < Filter
      # Highlights every eligible <code> element in place.
      #
      # Returns the document.
      def call
        doc.search('code').each do |node|
          lang = node['class']
          next unless lang

          lexer = Pygments::Lexer[lang]
          next unless lexer

          html = highlight_with_timeout_handling(lexer, node.inner_text)
          next if html.nil?

          # Replace the whole content of the element. Replacing only the
          # first child raised on an empty <code> and left any additional
          # children behind, duplicating their text after the highlighted
          # markup.
          node.inner_html = html
        end
        doc
      end

      # Runs the lexer on +text+ without the wrapping markup (nowrap).
      #
      # Returns the highlighted HTML, or nil when highlighting timed out so
      # the caller can leave the element as it is.
      def highlight_with_timeout_handling(lexer, text)
        lexer.highlight(text, options: { nowrap: true })
      rescue Timeout::Error
        nil
      end
    end
  end
end
@@ -0,0 +1,14 @@
module HTML
  class Pipeline
    # Filter that works on plain text rather than on an HTML document
    # fragment. The text is exposed through #text.
    class TextFilter < Filter
      attr_reader :text

      # text    - Input to filter; anything that is not a DocumentFragment.
      #           It is converted with #to_str when available, #to_s otherwise.
      # context - Forwarded unchanged to the superclass initializer.
      # result  - Forwarded unchanged to the superclass initializer.
      #
      # Raises TypeError when text is a DocumentFragment.
      def initialize(text, context = nil, result = nil)
        if text.is_a?(DocumentFragment)
          raise TypeError, "text cannot be HTML"
        end

        # Ensure that this is always a string
        @text =
          if text.respond_to?(:to_str)
            text.to_str
          else
            text.to_s
          end

        # nil is passed as the first argument of the superclass initializer.
        super(nil, context, result)
      end
    end
  end
end
@@ -0,0 +1,61 @@
module HTML
  class Pipeline
    # HTML filter that sets an 'id' attribute on every header (h1 to h6) so
    # they can be linked to, and that inserts a table of contents at the top
    # of documents that are long enough.
    #
    # Context options:
    #   :toc_minimal_length (required) - Only add the table of contents to
    #                                    text with at least this number of
    #                                    characters
    #   :toc_header (required)         - Inserted right before the table of
    #                                    contents to introduce it
    class TableOfContentsFilter < Filter
      # Header level whose entries sit directly inside <ul class="toc">.
      # Deeper headers are nested in sub-lists.
      TOP_LEVEL = 2

      # Assigns ids to the headers and, when the text is long enough and at
      # least one header exists, prepends the table of contents.
      #
      # Returns the document.
      def call
        seen  = Hash.new(0)
        depth = TOP_LEVEL
        toc   = ""

        doc.css('h1, h2, h3, h4, h5, h6').each do |node|
          # Headers shallower than TOP_LEVEL (h1) are listed at TOP_LEVEL;
          # going below it would emit closing tags that have no opening tag.
          level  = [node.name[/\d/].to_i, TOP_LEVEL].max
          anchor = unique_anchor(node.text, seen)
          node['id'] = anchor

          while depth > level
            toc << "</ul>\n</li>\n"
            depth -= 1
          end
          while depth < level
            toc << "<li>\n<ul>"
            depth += 1
          end
          toc << "<li><a href=\"##{anchor}\">#{node.inner_html}</a></li>"
        end

        # The ids above are always set; only the table itself depends on the
        # amount of text in the document.
        length = 0
        doc.traverse { |node| length += node.text.length if node.text? }
        return doc unless length >= context[:toc_minimal_length]

        # Close only the sub-lists that were opened: the list starts at
        # TOP_LEVEL, so closing further would produce unmatched end tags.
        while depth > TOP_LEVEL
          toc << "</ul>\n</li>\n"
          depth -= 1
        end

        unless seen.empty?
          first_child = doc.child
          first_child.add_previous_sibling context[:toc_header]
          first_child.add_previous_sibling "<ul class=\"toc\">#{toc}</ul>"
        end
        doc
      end

      # Declares the context entries this filter cannot work without.
      def validate
        needs :toc_minimal_length, :toc_header
      end

      private

      # Builds the anchor for a header text: lowercased, punctuation removed,
      # spaces turned into dashes, then URI-escaped. A "-N" suffix is added
      # from the second occurrence of the same anchor on, using +seen+ as the
      # per-document occurrence counter.
      def unique_anchor(text, seen)
        slug = text.downcase.gsub(/[^\w\- ]/, '').gsub(' ', '-')
        slug = EscapeUtils.escape_uri(slug) # escape extended UTF-8 chars

        suffix = seen[slug] > 0 ? "-#{seen[slug]}" : ''
        seen[slug] += 1
        "#{slug}#{suffix}"
      end
    end
  end
end
@@ -0,0 +1,5 @@
module HTML
  class Pipeline
    # Version of the html-pipeline-linuxfr gem. Frozen so the shared constant
    # cannot be modified in place.
    VERSION = "0.0.14".freeze
  end
end
@@ -0,0 +1,17 @@
# Minimal instrumentation service used by the tests. It records the calls
# made for one subscribed event name so assertions can inspect them.
class MockedInstrumentationService
  # Array of [event, payload, result] triples recorded so far.
  attr_reader :events

  # event  - Event name to record (see #subscribe); nil by default.
  # events - Array the recorded triples are appended to.
  def initialize(event = nil, events = [])
    @events = events
    subscribe event
  end

  # Yields the payload (an empty Hash when none is given) to the block and
  # records the call when +event+ is the subscribed event.
  #
  # The block is optional: without one nothing is yielded and the result is
  # nil, instead of failing with LocalJumpError.
  #
  # Returns the value of the block, or nil when no block was given.
  def instrument(event, payload = nil)
    payload ||= {}
    res = block_given? ? yield(payload) : nil
    events << [event, payload, res] if @subscribe == event
    res
  end

  # Selects the event name whose calls are recorded.
  #
  # Returns the array of recorded events.
  def subscribe(event)
    @subscribe = event
    @events
  end
end
@@ -0,0 +1,56 @@
1
+ require "test_helper"
2
+
# Tests for HTML::Pipeline::AbsoluteSourceFilter.
class HTML::Pipeline::AbsoluteSourceFilterTest < Test::Unit::TestCase
  AbsoluteSourceFilter = HTML::Pipeline::AbsoluteSourceFilter

  def setup
    @image_base_url = 'http://assets.example.com'
    @image_subpage_url = 'http://blog.example.com/a/post'
    @options = {
      :image_base_url    => @image_base_url,
      :image_subpage_url => @image_subpage_url
    }
  end

  def test_rewrites_root_relative_urls
    orig = %(<p><img src="/img.png"></p>)
    assert_equal "<p><img src=\"#{@image_base_url}/img.png\"></p>",
      AbsoluteSourceFilter.call(orig, @options).to_s
  end

  # This test used to share its name with the one above, which silently
  # replaced it so the root-relative case never ran.
  def test_rewrites_relative_urls
    orig = %(<p><img src="post/img.png"></p>)
    assert_equal "<p><img src=\"#{@image_subpage_url}/img.png\"></p>",
      AbsoluteSourceFilter.call(orig, @options).to_s
  end

  def test_does_not_rewrite_absolute_urls
    orig = %(<p><img src="http://other.example.com/img.png"></p>)
    result = AbsoluteSourceFilter.call(orig, @options).to_s
    # Interpolate the configured URLs: a bare /@image_base_url/ literal only
    # matches the text "@image_base_url" and can never fail.
    assert_no_match(/#{Regexp.escape(@image_base_url)}/, result)
    assert_no_match(/#{Regexp.escape(@image_subpage_url)}/, result)
  end

  def test_fails_when_context_is_missing
    assert_raise RuntimeError do
      AbsoluteSourceFilter.call("<img src=\"img.png\">", {})
    end
    assert_raise RuntimeError do
      AbsoluteSourceFilter.call("<img src=\"/img.png\">", {})
    end
  end

  def test_tells_you_where_context_is_required
    exception = assert_raise(RuntimeError) {
      AbsoluteSourceFilter.call("<img src=\"img.png\">", {})
    }
    assert_match 'HTML::Pipeline::AbsoluteSourceFilter', exception.message

    exception = assert_raise(RuntimeError) {
      AbsoluteSourceFilter.call("<img src=\"/img.png\">", {})
    }
    assert_match 'HTML::Pipeline::AbsoluteSourceFilter', exception.message
  end
end
@@ -0,0 +1,47 @@
1
+ require "test_helper"
2
+
# Tests for HTML::Pipeline::CamoFilter.
class HTML::Pipeline::CamoFilterTest < Test::Unit::TestCase
  CamoFilter = HTML::Pipeline::CamoFilter

  def setup
    # Well-formed proxy URL (the fixture used to lack the ':' after the
    # scheme).
    @asset_proxy_url        = 'https://assets.example.org'
    @asset_proxy_secret_key = 'ssssh-secret'
    @options = {
      :asset_proxy            => @asset_proxy_url,
      :asset_proxy_secret_key => @asset_proxy_secret_key
    }
  end

  def test_camouflaging_http_image_urls
    orig = %(<p><img src="http://twitter.com/img.png"></p>)
    assert_includes 'img src="' + @asset_proxy_url,
      CamoFilter.call(orig, @options).to_s
  end

  def test_rewrites_dotcom_image_urls
    orig = %(<p><img src="http://github.com/img.png"></p>)
    assert_equal "<p><img src=\"https://github.com/img.png\"></p>",
      CamoFilter.call(orig, @options).to_s
  end

  def test_not_camouflaging_https_image_urls
    orig = %(<p><img src="https://foo.com/img.png"></p>)
    assert_doesnt_include 'img src="' + @asset_proxy_url,
      CamoFilter.call(orig, @options).to_s
  end

  def test_handling_images_with_no_src_attribute
    orig = %(<p><img></p>)
    assert_nothing_raised do
      CamoFilter.call(orig, @options).to_s
    end
  end

  def test_required_context_validation
    exception = assert_raise(ArgumentError) {
      CamoFilter.call("", {})
    }
    assert_match(/:asset_proxy[^_]/, exception.message)
    assert_match(/:asset_proxy_secret_key/, exception.message)
  end
end
@@ -0,0 +1,50 @@
1
+ require "test_helper"
2
+
# Tests for HTML::Pipeline::ImageMaxWidthFilter.
class HTML::Pipeline::ImageMaxWidthFilterTest < Test::Unit::TestCase
  # Runs the filter under test on +html+ and returns its result.
  def filter(html)
    HTML::Pipeline::ImageMaxWidthFilter.call(html)
  end

  # Parses +body+ as a Nokogiri fragment, filters it and returns the
  # serialized HTML.
  def filtered_html(body)
    fragment = Nokogiri::HTML::DocumentFragment.parse(body)
    filter(fragment).to_html
  end

  def test_rewrites_image_style_tags
    expected = %q(<p>Screenshot: <a target="_blank" href="screenshot.png"><img src="screenshot.png" style="max-width:100%;"></a></p>)

    assert_equal_html expected,
      filtered_html("<p>Screenshot: <img src='screenshot.png'></p>")
  end

  def test_leaves_existing_image_style_tags_alone
    expected = '<p><img src="screenshot.png" style="width:100px;"></p>'

    assert_equal_html expected,
      filtered_html("<p><img src='screenshot.png' style='width:100px;'></p>")
  end

  def test_links_to_image
    expected = '<p>Screenshot: <a target="_blank" href="screenshot.png"><img src="screenshot.png" style="max-width:100%;"></a></p>'

    assert_equal_html expected,
      filtered_html("<p>Screenshot: <img src='screenshot.png'></p>")
  end

  def test_doesnt_link_to_image_when_already_linked
    expected = %q(<p>Screenshot: <a href="blah.png"><img src="screenshot.png" style="max-width:100%;"></a></p>)

    assert_equal_html expected,
      filtered_html("<p>Screenshot: <a href='blah.png'><img src='screenshot.png'></a></p>")
  end

  def test_doesnt_screw_up_inlined_images
    expected = %q(<p>Screenshot <a target="_blank" href="screenshot.png"><img src="screenshot.png" style="max-width:100%;"></a>, yes, this is a <b>screenshot</b> indeed.</p>)

    assert_equal_html expected,
      filtered_html("<p>Screenshot <img src='screenshot.png'>, yes, this is a <b>screenshot</b> indeed.</p>")
  end
end
@@ -0,0 +1,101 @@
1
+ require "test_helper"
2
+
# Short alias shared by the test classes of this file.
MarkdownFilter = HTML::Pipeline::MarkdownFilter

# Tests for HTML::Pipeline::MarkdownFilter: input type checking, the :gfm
# option and fenced code blocks.
class HTML::Pipeline::MarkdownFilterTest < Test::Unit::TestCase
  def setup
    @haiku = [
      "Pointing at the moon",
      "Reminded of simple things",
      "Moments matter most"
    ].join("\n")

    @links = "See http://example.org/ for more info"

    # NOTE(review): only the opening fence ends with a newline; the other
    # pieces are joined as-is, so the closing fence is not on a line of its
    # own. Confirm this is the intended fixture.
    @code = ["```\n", "def hello()", " 'world'", "end", "```"].join
  end

  def test_fails_when_given_a_documentfragment
    fragment = HTML::Pipeline.parse("<p>heyo</p>")

    assert_raise(TypeError) do
      MarkdownFilter.call(fragment, {})
    end
  end

  def test_gfm_enabled_by_default
    doc = MarkdownFilter.to_document(@haiku, {})

    assert doc.kind_of?(HTML::Pipeline::DocumentFragment)
    assert_equal 2, doc.search('br').size
  end

  def test_disabling_gfm
    doc = MarkdownFilter.to_document(@haiku, gfm: false)

    assert doc.kind_of?(HTML::Pipeline::DocumentFragment)
    assert_equal 0, doc.search('br').size
  end

  def test_fenced_code_blocks
    doc = MarkdownFilter.to_document(@code)

    assert doc.kind_of?(HTML::Pipeline::DocumentFragment)
    assert_equal 1, doc.search('pre').size
  end

  def test_fenced_code_blocks_with_language
    ruby_code = @code.sub("```", "``` ruby")
    doc = MarkdownFilter.to_document(ruby_code)
    pres = doc.search('pre')

    assert doc.kind_of?(HTML::Pipeline::DocumentFragment)
    assert_equal 1, pres.size
    assert_equal 'ruby', doc.search('pre').first['lang']
  end
end
52
+
# Tests for the GitHub Flavored Markdown behaviour of MarkdownFilter
# (underscores inside words, newlines turned into <br>).
class GFMTest < Test::Unit::TestCase
  # Renders +text+ with MarkdownFilter and the :gfm option enabled.
  def gfm(text)
    MarkdownFilter.call(text, :gfm => true)
  end

  def test_not_touch_single_underscores_inside_words
    assert_equal "<p>foo_bar</p>",
      gfm("foo_bar")
  end

  def test_not_touch_underscores_in_code_blocks
    # An indented Markdown code block needs at least four leading spaces;
    # with fewer the input is an ordinary paragraph and cannot produce the
    # <pre><code> output expected here.
    assert_equal "<pre><code>foo_bar_baz\n</code></pre>",
      gfm("    foo_bar_baz")
  end

  def test_not_touch_underscores_in_pre_blocks
    assert_equal "<pre>\nfoo_bar_baz\n</pre>",
      gfm("<pre>\nfoo_bar_baz\n</pre>")
  end

  def test_not_touch_two_or_more_underscores_inside_words
    assert_equal "<p>foo_bar_baz</p>",
      gfm("foo_bar_baz")
  end

  def test_turn_newlines_into_br_tags_in_simple_cases
    assert_equal "<p>foo<br>\nbar</p>",
      gfm("foo\nbar")
  end

  def test_convert_newlines_in_all_groups
    assert_equal "<p>apple<br>\npear<br>\norange</p>\n\n" +
      "<p>ruby<br>\npython<br>\nerlang</p>",
      gfm("apple\npear\norange\n\nruby\npython\nerlang")
  end

  def test_convert_newlines_in_even_long_groups
    assert_equal "<p>apple<br>\npear<br>\norange<br>\nbanana</p>\n\n" +
      "<p>ruby<br>\npython<br>\nerlang</p>",
      gfm("apple\npear\norange\nbanana\n\nruby\npython\nerlang")
  end

  def test_not_convert_newlines_in_lists
    assert_equal "<h1>foo</h1>\n\n<h1>bar</h1>",
      gfm("# foo\n# bar")
    assert_equal "<ul>\n<li>foo</li>\n<li>bar</li>\n</ul>",
      gfm("* foo\n* bar")
  end
end