html-pipeline-linuxfr 0.14.1 → 0.14.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9c82883511b9b07b3ad833693c7c9724358f25d9
|
4
|
+
data.tar.gz: a4ec53810d2bb95168ef0f4f901e027c34e1ff04
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4de55f3dd8c5d90fdeef900aff669e2d603d5648ddc64704604e1d3fad7d9fbf378e3e012c4aada5258b56acaf22545d676833f110092a3b0aeed54ea52dd50f
|
7
|
+
data.tar.gz: 5ee83a859f31924e334f02e2acbea4b6548fa20fa8d02c30a0696ff2d24f11020ad28e24edc52220fba0e2eb00a2d703d80fe39a201a4db1fa876e4356e5cba3
|
@@ -15,11 +15,18 @@ module HTML
|
|
15
15
|
HTML::Pipeline::TableOfContentsFilter,
|
16
16
|
HTML::Pipeline::SyntaxHighlightFilter,
|
17
17
|
HTML::Pipeline::RelativeLinksFilter,
|
18
|
-
HTML::Pipeline::CustomLinksFilter
|
18
|
+
HTML::Pipeline::CustomLinksFilter,
|
19
|
+
HTML::Pipeline::SanitizationFilter
|
19
20
|
], CONTEXT
|
20
21
|
result = pipeline.call text
|
21
22
|
result[:output].to_s
|
22
23
|
end
|
24
|
+
|
25
|
+
def self.sanitize(text)
|
26
|
+
pipeline = HTML::Pipeline.new [HTML::Pipeline::SanitizationFilter], CONTEXT
|
27
|
+
result = pipeline.call text
|
28
|
+
result[:output].to_s
|
29
|
+
end
|
23
30
|
end
|
24
31
|
|
25
32
|
end
|
@@ -1,3 +1,4 @@
|
|
1
|
+
# encoding: utf-8
|
1
2
|
require 'sanitize'
|
2
3
|
|
3
4
|
module HTML
|
@@ -18,69 +19,31 @@ module HTML
|
|
18
19
|
#
|
19
20
|
# This filter does not write additional information to the context.
|
20
21
|
class SanitizationFilter < Filter
|
21
|
-
LISTS = Set.new(%w(ul ol).freeze)
|
22
|
-
LIST_ITEM = 'li'.freeze
|
23
|
-
|
24
|
-
# List of table child elements. These must be contained by a <table> element
|
25
|
-
# or they are not allowed through. Otherwise they can be used to break out
|
26
|
-
# of places we're using tables to contain formatted user content (like pull
|
27
|
-
# request review comments).
|
28
|
-
TABLE_ITEMS = Set.new(%w(tr td th).freeze)
|
29
|
-
TABLE = 'table'.freeze
|
30
22
|
|
31
23
|
# The main sanitization whitelist. Only these elements and attributes are
|
32
24
|
# allowed through by default.
|
33
25
|
WHITELIST = {
|
34
|
-
:
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
26
|
+
:output => :xhtml,
|
27
|
+
:elements => %w(a abbr b blockquote br cite code dd del dfn div dl dt em
|
28
|
+
h1 h2 h3 h4 h5 h6 hr i img ins kbd li mark meter ol p pre
|
29
|
+
q s samp small span strong sub sup table tbody td tfooter
|
30
|
+
th thead tr time ul var video wbr),
|
39
31
|
:remove_contents => ['script'],
|
40
32
|
:attributes => {
|
41
|
-
'
|
42
|
-
'
|
43
|
-
'
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
'clear', 'cols', 'colspan', 'color',
|
49
|
-
'compact', 'coords', 'datetime', 'dir',
|
50
|
-
'disabled', 'enctype', 'for', 'frame',
|
51
|
-
'headers', 'height', 'hreflang',
|
52
|
-
'hspace', 'ismap', 'label', 'lang',
|
53
|
-
'longdesc', 'maxlength', 'media', 'method',
|
54
|
-
'multiple', 'name', 'nohref', 'noshade',
|
55
|
-
'nowrap', 'prompt', 'readonly', 'rel', 'rev',
|
56
|
-
'rows', 'rowspan', 'rules', 'scope',
|
57
|
-
'selected', 'shape', 'size', 'span',
|
58
|
-
'start', 'summary', 'tabindex', 'target',
|
59
|
-
'title', 'type', 'usemap', 'valign', 'value',
|
60
|
-
'vspace', 'width', 'itemprop']
|
33
|
+
:all => ['data-after', 'data-id', 'id', 'title', 'class'],
|
34
|
+
'a' => ['href', 'name'],
|
35
|
+
'blockquote' => ['cite'],
|
36
|
+
'img' => ['alt', 'height', 'src', 'width'],
|
37
|
+
'q' => ['cite'],
|
38
|
+
'time' => ['datetime'],
|
39
|
+
'video' => ['src']
|
61
40
|
},
|
62
41
|
:protocols => {
|
63
|
-
'a'
|
64
|
-
'
|
65
|
-
},
|
66
|
-
|
67
|
-
|
68
|
-
# containing markup.
|
69
|
-
lambda { |env|
|
70
|
-
name, node = env[:node_name], env[:node]
|
71
|
-
if name == LIST_ITEM && !node.ancestors.any?{ |n| LISTS.include?(n.name) }
|
72
|
-
node.replace(node.children)
|
73
|
-
end
|
74
|
-
},
|
75
|
-
|
76
|
-
# Table child elements that are not contained by a <table> are removed.
|
77
|
-
lambda { |env|
|
78
|
-
name, node = env[:node_name], env[:node]
|
79
|
-
if TABLE_ITEMS.include?(name) && !node.ancestors.any? { |n| n.name == TABLE }
|
80
|
-
node.replace(node.children)
|
81
|
-
end
|
82
|
-
}
|
83
|
-
]
|
42
|
+
'a' => {'href' => ['ftp', 'http', 'https', 'irc', 'mailto', 'xmpp', :relative]},
|
43
|
+
'blockquote' => {'cite' => ['http', 'https', :relative]},
|
44
|
+
'img' => {'src' => ['http', 'https', :relative]},
|
45
|
+
'q' => {'cite' => ['http', 'https', :relative]}
|
46
|
+
}
|
84
47
|
}
|
85
48
|
|
86
49
|
# A more limited sanitization whitelist. This includes all attributes,
|
@@ -92,9 +55,21 @@ module HTML
|
|
92
55
|
# Strip all HTML tags from the document.
|
93
56
|
FULL = { :elements => [] }
|
94
57
|
|
58
|
+
# Match unicode chars encoded on 4 bytes in UTF-8
|
59
|
+
MB4_REGEXP = /[^\u{9}-\u{999}]/
|
60
|
+
|
61
|
+
# Remove utf-8 characters encoded on 4 bytes,
|
62
|
+
# because MySQL doesn't handle them.
|
63
|
+
def encode_mb4(doc)
|
64
|
+
doc.search("text()").each do |node|
|
65
|
+
node.content = node.content.gsub(MB4_REGEXP) { |c| "&##{c.unpack('U')[0]};" }
|
66
|
+
end
|
67
|
+
doc
|
68
|
+
end
|
69
|
+
|
95
70
|
# Sanitize markup using the Sanitize library.
|
96
71
|
def call
|
97
|
-
Sanitize.clean_node!(doc, whitelist)
|
72
|
+
encode_mb4 Sanitize.clean_node!(doc, whitelist)
|
98
73
|
end
|
99
74
|
|
100
75
|
# The whitelist to use when sanitizing. This can be passed in the context
|
@@ -12,9 +12,9 @@ module HTML
|
|
12
12
|
|
13
13
|
def call
|
14
14
|
headers = Hash.new 0
|
15
|
-
was =
|
15
|
+
was = 1
|
16
16
|
toc = ""
|
17
|
-
doc.css('
|
17
|
+
doc.css('h2, h3, h4, h5, h6').each do |node|
|
18
18
|
level = node.name.scan(/\d/).first.to_i
|
19
19
|
name = node.text.downcase
|
20
20
|
name.gsub!(/[^\w\- ]/, '') # remove punctuation
|
@@ -24,15 +24,21 @@ module HTML
|
|
24
24
|
uniq = (headers[name] > 0) ? "-#{headers[name]}" : ''
|
25
25
|
headers[name] += 1
|
26
26
|
node['id'] = "#{name}#{uniq}"
|
27
|
-
|
28
|
-
|
29
|
-
was
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
27
|
+
|
28
|
+
if was < level
|
29
|
+
while was < level
|
30
|
+
toc << "<ul>\n<li>"
|
31
|
+
was += 1
|
32
|
+
end
|
33
|
+
else
|
34
|
+
toc << "</li>\n"
|
35
|
+
while was > level
|
36
|
+
toc << "</ul></li>\n"
|
37
|
+
was -= 1
|
38
|
+
end
|
39
|
+
toc << "<li>"
|
34
40
|
end
|
35
|
-
toc << "<
|
41
|
+
toc << "<a href=\"##{name}#{uniq}\">#{node.inner_html}</a>"
|
36
42
|
end
|
37
43
|
|
38
44
|
length = 0
|
@@ -40,14 +46,15 @@ module HTML
|
|
40
46
|
return doc unless length >= context[:toc_minimal_length]
|
41
47
|
|
42
48
|
while was > 1
|
43
|
-
toc << "</
|
49
|
+
toc << "</li>\n</ul>\n"
|
44
50
|
was -= 1
|
45
51
|
end
|
52
|
+
toc.sub!('<ul>', '<ul class="toc">')
|
46
53
|
|
47
54
|
unless headers.empty?
|
48
55
|
first_child = doc.child
|
49
56
|
first_child.add_previous_sibling context[:toc_header]
|
50
|
-
first_child.add_previous_sibling
|
57
|
+
first_child.add_previous_sibling toc
|
51
58
|
end
|
52
59
|
doc
|
53
60
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html-pipeline-linuxfr
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.14.
|
4
|
+
version: 0.14.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ryan Tomayko
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date: 2013-06-
|
13
|
+
date: 2013-06-25 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: nokogiri
|