html-pipeline-linuxfr 0.14.1 → 0.14.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 0c0be6927338ac92e554cdaaf3e0302fba5c13df
4
- data.tar.gz: 3a9e9f543beef05060fd97e260677b2bc805345c
3
+ metadata.gz: 9c82883511b9b07b3ad833693c7c9724358f25d9
4
+ data.tar.gz: a4ec53810d2bb95168ef0f4f901e027c34e1ff04
5
5
  SHA512:
6
- metadata.gz: 4545be908cb22bc45b21b84c24268137721ab0954289f9829919c995db9472a67bcd063df6fb884503aa45c3363c4924733431b277cfa1848f79b5d9dfef119b
7
- data.tar.gz: 0eea8eb7b8db4ad93ea7dde485ad28d03c3e5b7ae407bfb30532c96082f08566ae5f33c7b3f4c6be5e0e87acf064096af58d3d8477ada451bb03c2fa82558f23
6
+ metadata.gz: 4de55f3dd8c5d90fdeef900aff669e2d603d5648ddc64704604e1d3fad7d9fbf378e3e012c4aada5258b56acaf22545d676833f110092a3b0aeed54ea52dd50f
7
+ data.tar.gz: 5ee83a859f31924e334f02e2acbea4b6548fa20fa8d02c30a0696ff2d24f11020ad28e24edc52220fba0e2eb00a2d703d80fe39a201a4db1fa876e4356e5cba3
@@ -15,11 +15,18 @@ module HTML
15
15
  HTML::Pipeline::TableOfContentsFilter,
16
16
  HTML::Pipeline::SyntaxHighlightFilter,
17
17
  HTML::Pipeline::RelativeLinksFilter,
18
- HTML::Pipeline::CustomLinksFilter
18
+ HTML::Pipeline::CustomLinksFilter,
19
+ HTML::Pipeline::SanitizationFilter
19
20
  ], CONTEXT
20
21
  result = pipeline.call text
21
22
  result[:output].to_s
22
23
  end
24
+
25
# Run +text+ through a pipeline whose only filter is the SanitizationFilter
# (configured with CONTEXT) and return the pipeline output as a String.
def self.sanitize(text)
  filters = [HTML::Pipeline::SanitizationFilter]
  HTML::Pipeline.new(filters, CONTEXT).call(text)[:output].to_s
end
23
30
  end
24
31
 
25
32
  end
@@ -1,5 +1,6 @@
1
1
  # encoding: utf-8
2
2
  require 'redcarpet'
3
+ require 'cgi'
3
4
 
4
5
  module HTML
5
6
  class Pipeline
@@ -1,3 +1,4 @@
1
+ # encoding: utf-8
1
2
  require 'sanitize'
2
3
 
3
4
  module HTML
@@ -18,69 +19,31 @@ module HTML
18
19
  #
19
20
  # This filter does not write additional information to the context.
20
21
  class SanitizationFilter < Filter
21
- LISTS = Set.new(%w(ul ol).freeze)
22
- LIST_ITEM = 'li'.freeze
23
-
24
- # List of table child elements. These must be contained by a <table> element
25
- # or they are not allowed through. Otherwise they can be used to break out
26
- # of places we're using tables to contain formatted user content (like pull
27
- # request review comments).
28
- TABLE_ITEMS = Set.new(%w(tr td th).freeze)
29
- TABLE = 'table'.freeze
30
22
 
31
23
  # The main sanitization whitelist. Only these elements and attributes are
32
24
  # allowed through by default.
33
25
  WHITELIST = {
34
- :elements => %w(
35
- h1 h2 h3 h4 h5 h6 h7 h8 br b i strong em a pre code img tt
36
- div ins del sup sub p ol ul table blockquote dl dt dd
37
- kbd q samp var hr ruby rt rp li tr td th
38
- ),
26
+ :output => :xhtml,
27
+ :elements => %w(a abbr b blockquote br cite code dd del dfn div dl dt em
28
+ h1 h2 h3 h4 h5 h6 hr i img ins kbd li mark meter ol p pre
29
+ q s samp small span strong sub sup table tbody td tfooter
30
+ th thead tr time ul var video wbr),
39
31
  :remove_contents => ['script'],
40
32
  :attributes => {
41
- 'a' => ['href'],
42
- 'img' => ['src'],
43
- 'div' => ['itemscope', 'itemtype'],
44
- :all => ['abbr', 'accept', 'accept-charset',
45
- 'accesskey', 'action', 'align', 'alt', 'axis',
46
- 'border', 'cellpadding', 'cellspacing', 'char',
47
- 'charoff', 'charset', 'checked', 'cite',
48
- 'clear', 'cols', 'colspan', 'color',
49
- 'compact', 'coords', 'datetime', 'dir',
50
- 'disabled', 'enctype', 'for', 'frame',
51
- 'headers', 'height', 'hreflang',
52
- 'hspace', 'ismap', 'label', 'lang',
53
- 'longdesc', 'maxlength', 'media', 'method',
54
- 'multiple', 'name', 'nohref', 'noshade',
55
- 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
56
- 'rows', 'rowspan', 'rules', 'scope',
57
- 'selected', 'shape', 'size', 'span',
58
- 'start', 'summary', 'tabindex', 'target',
59
- 'title', 'type', 'usemap', 'valign', 'value',
60
- 'vspace', 'width', 'itemprop']
33
+ :all => ['data-after', 'data-id', 'id', 'title', 'class'],
34
+ 'a' => ['href', 'name'],
35
+ 'blockquote' => ['cite'],
36
+ 'img' => ['alt', 'height', 'src', 'width'],
37
+ 'q' => ['cite'],
38
+ 'time' => ['datetime'],
39
+ 'video' => ['src']
61
40
  },
62
41
  :protocols => {
63
- 'a' => {'href' => ['http', 'https', 'mailto', :relative, 'github-windows', 'github-mac']},
64
- 'img' => {'src' => ['http', 'https', :relative]}
65
- },
66
- :transformers => [
67
- # Top-level <li> elements are removed because they can break out of
68
- # containing markup.
69
- lambda { |env|
70
- name, node = env[:node_name], env[:node]
71
- if name == LIST_ITEM && !node.ancestors.any?{ |n| LISTS.include?(n.name) }
72
- node.replace(node.children)
73
- end
74
- },
75
-
76
- # Table child elements that are not contained by a <table> are removed.
77
- lambda { |env|
78
- name, node = env[:node_name], env[:node]
79
- if TABLE_ITEMS.include?(name) && !node.ancestors.any? { |n| n.name == TABLE }
80
- node.replace(node.children)
81
- end
82
- }
83
- ]
42
+ 'a' => {'href' => ['ftp', 'http', 'https', 'irc', 'mailto', 'xmpp', :relative]},
43
+ 'blockquote' => {'cite' => ['http', 'https', :relative]},
44
+ 'img' => {'src' => ['http', 'https', :relative]},
45
+ 'q' => {'cite' => ['http', 'https', :relative]}
46
+ }
84
47
  }
85
48
 
86
49
  # A more limited sanitization whitelist. This includes all attributes,
@@ -92,9 +55,21 @@ module HTML
92
55
  # Strip all HTML tags from the document.
93
56
  FULL = { :elements => [] }
94
57
 
58
# Matches every character outside U+0009..U+FFFF: the supplementary-plane
# characters that UTF-8 encodes on 4 bytes, plus the control characters
# below TAB. Characters up to U+FFFF take at most 3 bytes and must not match.
MB4_REGEXP = /[^\u{9}-\u{ffff}]/

# Replace every character matched by MB4_REGEXP in the text nodes of +doc+
# with a decimal numeric character reference ("&#<codepoint>;"), because
# MySQL doesn't handle UTF-8 characters encoded on 4 bytes.
#
# doc - the document to rewrite; it must respond to search("text()") and
#       yield nodes exposing content / content=.
#
# Returns the same doc, modified in place.
def encode_mb4(doc)
  doc.search("text()").each do |node|
    # NOTE(review): depending on the node implementation, content= may escape
    # the "&" of the generated reference -- confirm against the serializer.
    node.content = node.content.gsub(MB4_REGEXP) { |char| "&##{char.ord};" }
  end
  doc
end
69
+
95
70
  # Sanitize markup using the Sanitize library.
96
71
  def call
97
- Sanitize.clean_node!(doc, whitelist)
72
+ encode_mb4 Sanitize.clean_node!(doc, whitelist)
98
73
  end
99
74
 
100
75
  # The whitelist to use when sanitizing. This can be passed in the context
@@ -12,9 +12,9 @@ module HTML
12
12
 
13
13
  def call
14
14
  headers = Hash.new 0
15
- was = 2
15
+ was = 1
16
16
  toc = ""
17
- doc.css('h1, h2, h3, h4, h5, h6').each do |node|
17
+ doc.css('h2, h3, h4, h5, h6').each do |node|
18
18
  level = node.name.scan(/\d/).first.to_i
19
19
  name = node.text.downcase
20
20
  name.gsub!(/[^\w\- ]/, '') # remove punctuation
@@ -24,15 +24,21 @@ module HTML
24
24
  uniq = (headers[name] > 0) ? "-#{headers[name]}" : ''
25
25
  headers[name] += 1
26
26
  node['id'] = "#{name}#{uniq}"
27
- while was > level
28
- toc << "</ul>\n</li>\n"
29
- was -= 1
30
- end
31
- while was < level
32
- toc << "<li>\n<ul>"
33
- was += 1
27
+
28
+ if was < level
29
+ while was < level
30
+ toc << "<ul>\n<li>"
31
+ was += 1
32
+ end
33
+ else
34
+ toc << "</li>\n"
35
+ while was > level
36
+ toc << "</ul></li>\n"
37
+ was -= 1
38
+ end
39
+ toc << "<li>"
34
40
  end
35
- toc << "<li><a href=\"##{name}#{uniq}\">#{node.inner_html}</a></li>"
41
+ toc << "<a href=\"##{name}#{uniq}\">#{node.inner_html}</a>"
36
42
  end
37
43
 
38
44
  length = 0
@@ -40,14 +46,15 @@ module HTML
40
46
  return doc unless length >= context[:toc_minimal_length]
41
47
 
42
48
  while was > 1
43
- toc << "</ul>\n</li>\n"
49
+ toc << "</li>\n</ul>\n"
44
50
  was -= 1
45
51
  end
52
+ toc.sub!('<ul>', '<ul class="toc">')
46
53
 
47
54
  unless headers.empty?
48
55
  first_child = doc.child
49
56
  first_child.add_previous_sibling context[:toc_header]
50
- first_child.add_previous_sibling "<ul class=\"toc\">#{toc}</ul>"
57
+ first_child.add_previous_sibling toc
51
58
  end
52
59
  doc
53
60
  end
@@ -1,5 +1,5 @@
1
1
  module HTML
2
2
  class Pipeline
3
- VERSION = "0.14.1"
3
+ VERSION = "0.14.2"
4
4
  end
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html-pipeline-linuxfr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.14.1
4
+ version: 0.14.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ryan Tomayko
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2013-06-23 00:00:00.000000000 Z
13
+ date: 2013-06-25 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: nokogiri