html-pipeline-linuxfr 0.14.1 → 0.14.2
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9c82883511b9b07b3ad833693c7c9724358f25d9
|
4
|
+
data.tar.gz: a4ec53810d2bb95168ef0f4f901e027c34e1ff04
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4de55f3dd8c5d90fdeef900aff669e2d603d5648ddc64704604e1d3fad7d9fbf378e3e012c4aada5258b56acaf22545d676833f110092a3b0aeed54ea52dd50f
|
7
|
+
data.tar.gz: 5ee83a859f31924e334f02e2acbea4b6548fa20fa8d02c30a0696ff2d24f11020ad28e24edc52220fba0e2eb00a2d703d80fe39a201a4db1fa876e4356e5cba3
|
@@ -15,11 +15,18 @@ module HTML
|
|
15
15
|
HTML::Pipeline::TableOfContentsFilter,
|
16
16
|
HTML::Pipeline::SyntaxHighlightFilter,
|
17
17
|
HTML::Pipeline::RelativeLinksFilter,
|
18
|
-
HTML::Pipeline::CustomLinksFilter
|
18
|
+
HTML::Pipeline::CustomLinksFilter,
|
19
|
+
HTML::Pipeline::SanitizationFilter
|
19
20
|
], CONTEXT
|
20
21
|
result = pipeline.call text
|
21
22
|
result[:output].to_s
|
22
23
|
end
|
24
|
+
|
25
|
+
def self.sanitize(text)
|
26
|
+
pipeline = HTML::Pipeline.new [HTML::Pipeline::SanitizationFilter], CONTEXT
|
27
|
+
result = pipeline.call text
|
28
|
+
result[:output].to_s
|
29
|
+
end
|
23
30
|
end
|
24
31
|
|
25
32
|
end
|
@@ -1,3 +1,4 @@
|
|
1
|
+
# encoding: utf-8
|
1
2
|
require 'sanitize'
|
2
3
|
|
3
4
|
module HTML
|
@@ -18,69 +19,31 @@ module HTML
|
|
18
19
|
#
|
19
20
|
# This filter does not write additional information to the context.
|
20
21
|
class SanitizationFilter < Filter
|
21
|
-
LISTS = Set.new(%w(ul ol).freeze)
|
22
|
-
LIST_ITEM = 'li'.freeze
|
23
|
-
|
24
|
-
# List of table child elements. These must be contained by a <table> element
|
25
|
-
# or they are not allowed through. Otherwise they can be used to break out
|
26
|
-
# of places we're using tables to contain formatted user content (like pull
|
27
|
-
# request review comments).
|
28
|
-
TABLE_ITEMS = Set.new(%w(tr td th).freeze)
|
29
|
-
TABLE = 'table'.freeze
|
30
22
|
|
31
23
|
# The main sanitization whitelist. Only these elements and attributes are
|
32
24
|
# allowed through by default.
|
33
25
|
WHITELIST = {
|
34
|
-
:
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
26
|
+
:output => :xhtml,
|
27
|
+
:elements => %w(a abbr b blockquote br cite code dd del dfn div dl dt em
|
28
|
+
h1 h2 h3 h4 h5 h6 hr i img ins kbd li mark meter ol p pre
|
29
|
+
q s samp small span strong sub sup table tbody td tfooter
|
30
|
+
th thead tr time ul var video wbr),
|
39
31
|
:remove_contents => ['script'],
|
40
32
|
:attributes => {
|
41
|
-
'
|
42
|
-
'
|
43
|
-
'
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
'clear', 'cols', 'colspan', 'color',
|
49
|
-
'compact', 'coords', 'datetime', 'dir',
|
50
|
-
'disabled', 'enctype', 'for', 'frame',
|
51
|
-
'headers', 'height', 'hreflang',
|
52
|
-
'hspace', 'ismap', 'label', 'lang',
|
53
|
-
'longdesc', 'maxlength', 'media', 'method',
|
54
|
-
'multiple', 'name', 'nohref', 'noshade',
|
55
|
-
'nowrap', 'prompt', 'readonly', 'rel', 'rev',
|
56
|
-
'rows', 'rowspan', 'rules', 'scope',
|
57
|
-
'selected', 'shape', 'size', 'span',
|
58
|
-
'start', 'summary', 'tabindex', 'target',
|
59
|
-
'title', 'type', 'usemap', 'valign', 'value',
|
60
|
-
'vspace', 'width', 'itemprop']
|
33
|
+
:all => ['data-after', 'data-id', 'id', 'title', 'class'],
|
34
|
+
'a' => ['href', 'name'],
|
35
|
+
'blockquote' => ['cite'],
|
36
|
+
'img' => ['alt', 'height', 'src', 'width'],
|
37
|
+
'q' => ['cite'],
|
38
|
+
'time' => ['datetime'],
|
39
|
+
'video' => ['src']
|
61
40
|
},
|
62
41
|
:protocols => {
|
63
|
-
'a'
|
64
|
-
'
|
65
|
-
},
|
66
|
-
|
67
|
-
|
68
|
-
# containing markup.
|
69
|
-
lambda { |env|
|
70
|
-
name, node = env[:node_name], env[:node]
|
71
|
-
if name == LIST_ITEM && !node.ancestors.any?{ |n| LISTS.include?(n.name) }
|
72
|
-
node.replace(node.children)
|
73
|
-
end
|
74
|
-
},
|
75
|
-
|
76
|
-
# Table child elements that are not contained by a <table> are removed.
|
77
|
-
lambda { |env|
|
78
|
-
name, node = env[:node_name], env[:node]
|
79
|
-
if TABLE_ITEMS.include?(name) && !node.ancestors.any? { |n| n.name == TABLE }
|
80
|
-
node.replace(node.children)
|
81
|
-
end
|
82
|
-
}
|
83
|
-
]
|
42
|
+
'a' => {'href' => ['ftp', 'http', 'https', 'irc', 'mailto', 'xmpp', :relative]},
|
43
|
+
'blockquote' => {'cite' => ['http', 'https', :relative]},
|
44
|
+
'img' => {'src' => ['http', 'https', :relative]},
|
45
|
+
'q' => {'cite' => ['http', 'https', :relative]}
|
46
|
+
}
|
84
47
|
}
|
85
48
|
|
86
49
|
# A more limited sanitization whitelist. This includes all attributes,
|
@@ -92,9 +55,21 @@ module HTML
|
|
92
55
|
# Strip all HTML tags from the document.
|
93
56
|
FULL = { :elements => [] }
|
94
57
|
|
58
|
+
# Match unicode chars encoded on 4 bytes in UTF-8
|
59
|
+
MB4_REGEXP = /[^\u{9}-\u{999}]/
|
60
|
+
|
61
|
+
# Remove utf-8 characters encoded on 4 bytes,
|
62
|
+
# because MySQL doesn't handle them.
|
63
|
+
def encode_mb4(doc)
|
64
|
+
doc.search("text()").each do |node|
|
65
|
+
node.content = node.content.gsub(MB4_REGEXP) { |c| "&##{c.unpack('U')[0]};" }
|
66
|
+
end
|
67
|
+
doc
|
68
|
+
end
|
69
|
+
|
95
70
|
# Sanitize markup using the Sanitize library.
|
96
71
|
def call
|
97
|
-
Sanitize.clean_node!(doc, whitelist)
|
72
|
+
encode_mb4 Sanitize.clean_node!(doc, whitelist)
|
98
73
|
end
|
99
74
|
|
100
75
|
# The whitelist to use when sanitizing. This can be passed in the context
|
@@ -12,9 +12,9 @@ module HTML
|
|
12
12
|
|
13
13
|
def call
|
14
14
|
headers = Hash.new 0
|
15
|
-
was =
|
15
|
+
was = 1
|
16
16
|
toc = ""
|
17
|
-
doc.css('
|
17
|
+
doc.css('h2, h3, h4, h5, h6').each do |node|
|
18
18
|
level = node.name.scan(/\d/).first.to_i
|
19
19
|
name = node.text.downcase
|
20
20
|
name.gsub!(/[^\w\- ]/, '') # remove punctuation
|
@@ -24,15 +24,21 @@ module HTML
|
|
24
24
|
uniq = (headers[name] > 0) ? "-#{headers[name]}" : ''
|
25
25
|
headers[name] += 1
|
26
26
|
node['id'] = "#{name}#{uniq}"
|
27
|
-
|
28
|
-
|
29
|
-
was
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
27
|
+
|
28
|
+
if was < level
|
29
|
+
while was < level
|
30
|
+
toc << "<ul>\n<li>"
|
31
|
+
was += 1
|
32
|
+
end
|
33
|
+
else
|
34
|
+
toc << "</li>\n"
|
35
|
+
while was > level
|
36
|
+
toc << "</ul></li>\n"
|
37
|
+
was -= 1
|
38
|
+
end
|
39
|
+
toc << "<li>"
|
34
40
|
end
|
35
|
-
toc << "<
|
41
|
+
toc << "<a href=\"##{name}#{uniq}\">#{node.inner_html}</a>"
|
36
42
|
end
|
37
43
|
|
38
44
|
length = 0
|
@@ -40,14 +46,15 @@ module HTML
|
|
40
46
|
return doc unless length >= context[:toc_minimal_length]
|
41
47
|
|
42
48
|
while was > 1
|
43
|
-
toc << "</
|
49
|
+
toc << "</li>\n</ul>\n"
|
44
50
|
was -= 1
|
45
51
|
end
|
52
|
+
toc.sub!('<ul>', '<ul class="toc">')
|
46
53
|
|
47
54
|
unless headers.empty?
|
48
55
|
first_child = doc.child
|
49
56
|
first_child.add_previous_sibling context[:toc_header]
|
50
|
-
first_child.add_previous_sibling
|
57
|
+
first_child.add_previous_sibling toc
|
51
58
|
end
|
52
59
|
doc
|
53
60
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html-pipeline-linuxfr
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.14.
|
4
|
+
version: 0.14.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ryan Tomayko
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date: 2013-06-
|
13
|
+
date: 2013-06-25 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: nokogiri
|