html-pipeline-linuxfr 0.14.1 → 0.14.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 0c0be6927338ac92e554cdaaf3e0302fba5c13df
4
- data.tar.gz: 3a9e9f543beef05060fd97e260677b2bc805345c
3
+ metadata.gz: 9c82883511b9b07b3ad833693c7c9724358f25d9
4
+ data.tar.gz: a4ec53810d2bb95168ef0f4f901e027c34e1ff04
5
5
  SHA512:
6
- metadata.gz: 4545be908cb22bc45b21b84c24268137721ab0954289f9829919c995db9472a67bcd063df6fb884503aa45c3363c4924733431b277cfa1848f79b5d9dfef119b
7
- data.tar.gz: 0eea8eb7b8db4ad93ea7dde485ad28d03c3e5b7ae407bfb30532c96082f08566ae5f33c7b3f4c6be5e0e87acf064096af58d3d8477ada451bb03c2fa82558f23
6
+ metadata.gz: 4de55f3dd8c5d90fdeef900aff669e2d603d5648ddc64704604e1d3fad7d9fbf378e3e012c4aada5258b56acaf22545d676833f110092a3b0aeed54ea52dd50f
7
+ data.tar.gz: 5ee83a859f31924e334f02e2acbea4b6548fa20fa8d02c30a0696ff2d24f11020ad28e24edc52220fba0e2eb00a2d703d80fe39a201a4db1fa876e4356e5cba3
@@ -15,11 +15,18 @@ module HTML
15
15
  HTML::Pipeline::TableOfContentsFilter,
16
16
  HTML::Pipeline::SyntaxHighlightFilter,
17
17
  HTML::Pipeline::RelativeLinksFilter,
18
- HTML::Pipeline::CustomLinksFilter
18
+ HTML::Pipeline::CustomLinksFilter,
19
+ HTML::Pipeline::SanitizationFilter
19
20
  ], CONTEXT
20
21
  result = pipeline.call text
21
22
  result[:output].to_s
22
23
  end
24
+
25
+ def self.sanitize(text)
26
+ pipeline = HTML::Pipeline.new [HTML::Pipeline::SanitizationFilter], CONTEXT
27
+ result = pipeline.call text
28
+ result[:output].to_s
29
+ end
23
30
  end
24
31
 
25
32
  end
@@ -1,5 +1,6 @@
1
1
  # encoding: utf-8
2
2
  require 'redcarpet'
3
+ require 'cgi'
3
4
 
4
5
  module HTML
5
6
  class Pipeline
@@ -1,3 +1,4 @@
1
+ # encoding: utf-8
1
2
  require 'sanitize'
2
3
 
3
4
  module HTML
@@ -18,69 +19,31 @@ module HTML
18
19
  #
19
20
  # This filter does not write additional information to the context.
20
21
  class SanitizationFilter < Filter
21
- LISTS = Set.new(%w(ul ol).freeze)
22
- LIST_ITEM = 'li'.freeze
23
-
24
- # List of table child elements. These must be contained by a <table> element
25
- # or they are not allowed through. Otherwise they can be used to break out
26
- # of places we're using tables to contain formatted user content (like pull
27
- # request review comments).
28
- TABLE_ITEMS = Set.new(%w(tr td th).freeze)
29
- TABLE = 'table'.freeze
30
22
 
31
23
  # The main sanitization whitelist. Only these elements and attributes are
32
24
  # allowed through by default.
33
25
  WHITELIST = {
34
- :elements => %w(
35
- h1 h2 h3 h4 h5 h6 h7 h8 br b i strong em a pre code img tt
36
- div ins del sup sub p ol ul table blockquote dl dt dd
37
- kbd q samp var hr ruby rt rp li tr td th
38
- ),
26
+ :output => :xhtml,
27
+ :elements => %w(a abbr b blockquote br cite code dd del dfn div dl dt em
28
+ h1 h2 h3 h4 h5 h6 hr i img ins kbd li mark meter ol p pre
29
+ q s samp small span strong sub sup table tbody td tfooter
30
+ th thead tr time ul var video wbr),
39
31
  :remove_contents => ['script'],
40
32
  :attributes => {
41
- 'a' => ['href'],
42
- 'img' => ['src'],
43
- 'div' => ['itemscope', 'itemtype'],
44
- :all => ['abbr', 'accept', 'accept-charset',
45
- 'accesskey', 'action', 'align', 'alt', 'axis',
46
- 'border', 'cellpadding', 'cellspacing', 'char',
47
- 'charoff', 'charset', 'checked', 'cite',
48
- 'clear', 'cols', 'colspan', 'color',
49
- 'compact', 'coords', 'datetime', 'dir',
50
- 'disabled', 'enctype', 'for', 'frame',
51
- 'headers', 'height', 'hreflang',
52
- 'hspace', 'ismap', 'label', 'lang',
53
- 'longdesc', 'maxlength', 'media', 'method',
54
- 'multiple', 'name', 'nohref', 'noshade',
55
- 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
56
- 'rows', 'rowspan', 'rules', 'scope',
57
- 'selected', 'shape', 'size', 'span',
58
- 'start', 'summary', 'tabindex', 'target',
59
- 'title', 'type', 'usemap', 'valign', 'value',
60
- 'vspace', 'width', 'itemprop']
33
+ :all => ['data-after', 'data-id', 'id', 'title', 'class'],
34
+ 'a' => ['href', 'name'],
35
+ 'blockquote' => ['cite'],
36
+ 'img' => ['alt', 'height', 'src', 'width'],
37
+ 'q' => ['cite'],
38
+ 'time' => ['datetime'],
39
+ 'video' => ['src']
61
40
  },
62
41
  :protocols => {
63
- 'a' => {'href' => ['http', 'https', 'mailto', :relative, 'github-windows', 'github-mac']},
64
- 'img' => {'src' => ['http', 'https', :relative]}
65
- },
66
- :transformers => [
67
- # Top-level <li> elements are removed because they can break out of
68
- # containing markup.
69
- lambda { |env|
70
- name, node = env[:node_name], env[:node]
71
- if name == LIST_ITEM && !node.ancestors.any?{ |n| LISTS.include?(n.name) }
72
- node.replace(node.children)
73
- end
74
- },
75
-
76
- # Table child elements that are not contained by a <table> are removed.
77
- lambda { |env|
78
- name, node = env[:node_name], env[:node]
79
- if TABLE_ITEMS.include?(name) && !node.ancestors.any? { |n| n.name == TABLE }
80
- node.replace(node.children)
81
- end
82
- }
83
- ]
42
+ 'a' => {'href' => ['ftp', 'http', 'https', 'irc', 'mailto', 'xmpp', :relative]},
43
+ 'blockquote' => {'cite' => ['http', 'https', :relative]},
44
+ 'img' => {'src' => ['http', 'https', :relative]},
45
+ 'q' => {'cite' => ['http', 'https', :relative]}
46
+ }
84
47
  }
85
48
 
86
49
  # A more limited sanitization whitelist. This includes all attributes,
@@ -92,9 +55,21 @@ module HTML
92
55
  # Strip all HTML tags from the document.
93
56
  FULL = { :elements => [] }
94
57
 
58
+ # Match unicode chars encoded on 4 bytes in UTF-8
59
+ MB4_REGEXP = /[^\u{9}-\u{999}]/
60
+
61
+ # Remove utf-8 characters encoded on 4 bytes,
62
+ # because MySQL doesn't handle them.
63
+ def encode_mb4(doc)
64
+ doc.search("text()").each do |node|
65
+ node.content = node.content.gsub(MB4_REGEXP) { |c| "&##{c.unpack('U')[0]};" }
66
+ end
67
+ doc
68
+ end
69
+
95
70
  # Sanitize markup using the Sanitize library.
96
71
  def call
97
- Sanitize.clean_node!(doc, whitelist)
72
+ encode_mb4 Sanitize.clean_node!(doc, whitelist)
98
73
  end
99
74
 
100
75
  # The whitelist to use when sanitizing. This can be passed in the context
@@ -12,9 +12,9 @@ module HTML
12
12
 
13
13
  def call
14
14
  headers = Hash.new 0
15
- was = 2
15
+ was = 1
16
16
  toc = ""
17
- doc.css('h1, h2, h3, h4, h5, h6').each do |node|
17
+ doc.css('h2, h3, h4, h5, h6').each do |node|
18
18
  level = node.name.scan(/\d/).first.to_i
19
19
  name = node.text.downcase
20
20
  name.gsub!(/[^\w\- ]/, '') # remove punctuation
@@ -24,15 +24,21 @@ module HTML
24
24
  uniq = (headers[name] > 0) ? "-#{headers[name]}" : ''
25
25
  headers[name] += 1
26
26
  node['id'] = "#{name}#{uniq}"
27
- while was > level
28
- toc << "</ul>\n</li>\n"
29
- was -= 1
30
- end
31
- while was < level
32
- toc << "<li>\n<ul>"
33
- was += 1
27
+
28
+ if was < level
29
+ while was < level
30
+ toc << "<ul>\n<li>"
31
+ was += 1
32
+ end
33
+ else
34
+ toc << "</li>\n"
35
+ while was > level
36
+ toc << "</ul></li>\n"
37
+ was -= 1
38
+ end
39
+ toc << "<li>"
34
40
  end
35
- toc << "<li><a href=\"##{name}#{uniq}\">#{node.inner_html}</a></li>"
41
+ toc << "<a href=\"##{name}#{uniq}\">#{node.inner_html}</a>"
36
42
  end
37
43
 
38
44
  length = 0
@@ -40,14 +46,15 @@ module HTML
40
46
  return doc unless length >= context[:toc_minimal_length]
41
47
 
42
48
  while was > 1
43
- toc << "</ul>\n</li>\n"
49
+ toc << "</li>\n</ul>\n"
44
50
  was -= 1
45
51
  end
52
+ toc.sub!('<ul>', '<ul class="toc">')
46
53
 
47
54
  unless headers.empty?
48
55
  first_child = doc.child
49
56
  first_child.add_previous_sibling context[:toc_header]
50
- first_child.add_previous_sibling "<ul class=\"toc\">#{toc}</ul>"
57
+ first_child.add_previous_sibling toc
51
58
  end
52
59
  doc
53
60
  end
@@ -1,5 +1,5 @@
1
1
  module HTML
2
2
  class Pipeline
3
- VERSION = "0.14.1"
3
+ VERSION = "0.14.2"
4
4
  end
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html-pipeline-linuxfr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.14.1
4
+ version: 0.14.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ryan Tomayko
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2013-06-23 00:00:00.000000000 Z
13
+ date: 2013-06-25 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: nokogiri