RubyGems - html-pipeline-linuxfr - Versions diffs - 0.14.1 → 0.14.2 - Mend

html-pipeline-linuxfr 0.14.1 → 0.14.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

checksums.yaml +4 -4
data/lib/html/pipeline/linuxfr.rb +8 -1
data/lib/html/pipeline/markdown_filter.rb +1 -0
data/lib/html/pipeline/sanitization_filter.rb +31 -56
data/lib/html/pipeline/toc_filter.rb +19 -12
data/lib/html/pipeline/version.rb +1 -1
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 0c0be6927338ac92e554cdaaf3e0302fba5c13df
-  data.tar.gz: 3a9e9f543beef05060fd97e260677b2bc805345c
+  metadata.gz: 9c82883511b9b07b3ad833693c7c9724358f25d9
+  data.tar.gz: a4ec53810d2bb95168ef0f4f901e027c34e1ff04
 SHA512:
-  metadata.gz: 4545be908cb22bc45b21b84c24268137721ab0954289f9829919c995db9472a67bcd063df6fb884503aa45c3363c4924733431b277cfa1848f79b5d9dfef119b
-  data.tar.gz: 0eea8eb7b8db4ad93ea7dde485ad28d03c3e5b7ae407bfb30532c96082f08566ae5f33c7b3f4c6be5e0e87acf064096af58d3d8477ada451bb03c2fa82558f23
+  metadata.gz: 4de55f3dd8c5d90fdeef900aff669e2d603d5648ddc64704604e1d3fad7d9fbf378e3e012c4aada5258b56acaf22545d676833f110092a3b0aeed54ea52dd50f
+  data.tar.gz: 5ee83a859f31924e334f02e2acbea4b6548fa20fa8d02c30a0696ff2d24f11020ad28e24edc52220fba0e2eb00a2d703d80fe39a201a4db1fa876e4356e5cba3

data/lib/html/pipeline/linuxfr.rb CHANGED Viewed

@@ -15,11 +15,18 @@ module HTML
           HTML::Pipeline::TableOfContentsFilter,
           HTML::Pipeline::SyntaxHighlightFilter,
           HTML::Pipeline::RelativeLinksFilter,
-          HTML::Pipeline::CustomLinksFilter
+          HTML::Pipeline::CustomLinksFilter,
+          HTML::Pipeline::SanitizationFilter
         ], CONTEXT
         result = pipeline.call text
         result[:output].to_s
       end
+      def self.sanitize(text)
+        pipeline = HTML::Pipeline.new [HTML::Pipeline::SanitizationFilter], CONTEXT
+        result = pipeline.call text
+        result[:output].to_s
+      end
     end
   end

data/lib/html/pipeline/markdown_filter.rb CHANGED Viewed

@@ -1,5 +1,6 @@
 # encoding: utf-8
 require 'redcarpet'
+require 'cgi'
 module HTML
   class Pipeline

data/lib/html/pipeline/sanitization_filter.rb CHANGED Viewed

@@ -1,3 +1,4 @@
+# encoding: utf-8
 require 'sanitize'
 module HTML
@@ -18,69 +19,31 @@ module HTML
     #
     # This filter does not write additional information to the context.
     class SanitizationFilter < Filter
-      LISTS     = Set.new(%w(ul ol).freeze)
-      LIST_ITEM = 'li'.freeze
-      # List of table child elements. These must be contained by a <table> element
-      # or they are not allowed through. Otherwise they can be used to break out
-      # of places we're using tables to contain formatted user content (like pull
-      # request review comments).
-      TABLE_ITEMS = Set.new(%w(tr td th).freeze)
-      TABLE       = 'table'.freeze
       # The main sanitization whitelist. Only these elements and attributes are
       # allowed through by default.
       WHITELIST = {
-        :elements => %w(
-          h1 h2 h3 h4 h5 h6 h7 h8 br b i strong em a pre code img tt
-          div ins del sup sub p ol ul table blockquote dl dt dd
-          kbd q samp var hr ruby rt rp li tr td th
-        ),
+        :output => :xhtml,
+        :elements => %w(a abbr b blockquote br cite code dd del dfn div dl dt em
+                        h1 h2 h3 h4 h5 h6 hr i img ins kbd li mark meter ol p pre
+                        q s samp small span strong sub sup table tbody td tfooter
+                        th thead tr time ul var video wbr),
         :remove_contents => ['script'],
         :attributes => {
-          'a' => ['href'],
-          'img' => ['src'],
-          'div' => ['itemscope', 'itemtype'],
-          :all  => ['abbr', 'accept', 'accept-charset',
-                    'accesskey', 'action', 'align', 'alt', 'axis',
-                    'border', 'cellpadding', 'cellspacing', 'char',
-                    'charoff', 'charset', 'checked', 'cite',
-                    'clear', 'cols', 'colspan', 'color',
-                    'compact', 'coords', 'datetime', 'dir',
-                    'disabled', 'enctype', 'for', 'frame',
-                    'headers', 'height', 'hreflang',
-                    'hspace', 'ismap', 'label', 'lang',
-                    'longdesc', 'maxlength', 'media', 'method',
-                    'multiple', 'name', 'nohref', 'noshade',
-                    'nowrap', 'prompt', 'readonly', 'rel', 'rev',
-                    'rows', 'rowspan', 'rules', 'scope',
-                    'selected', 'shape', 'size', 'span',
-                    'start', 'summary', 'tabindex', 'target',
-                    'title', 'type', 'usemap', 'valign', 'value',
-                    'vspace', 'width', 'itemprop']
+          :all         => ['data-after', 'data-id', 'id', 'title', 'class'],
+          'a'          => ['href', 'name'],
+          'blockquote' => ['cite'],
+          'img'        => ['alt', 'height', 'src', 'width'],
+          'q'          => ['cite'],
+          'time'       => ['datetime'],
+          'video'      => ['src']
         },
         :protocols => {
-          'a'   => {'href' => ['http', 'https', 'mailto', :relative, 'github-windows', 'github-mac']},
-          'img' => {'src'  => ['http', 'https', :relative]}
-        },
-        :transformers => [
-          # Top-level <li> elements are removed because they can break out of
-          # containing markup.
-          lambda { |env|
-            name, node = env[:node_name], env[:node]
-            if name == LIST_ITEM && !node.ancestors.any?{ |n| LISTS.include?(n.name) }
-              node.replace(node.children)
-            end
-          },
-          # Table child elements that are not contained by a <table> are removed.
-          lambda { |env|
-            name, node = env[:node_name], env[:node]
-            if TABLE_ITEMS.include?(name) && !node.ancestors.any? { |n| n.name == TABLE }
-              node.replace(node.children)
-            end
-          }
-        ]
+          'a'          => {'href' => ['ftp', 'http', 'https', 'irc', 'mailto', 'xmpp', :relative]},
+          'blockquote' => {'cite' => ['http', 'https', :relative]},
+          'img'        => {'src'  => ['http', 'https', :relative]},
+          'q'          => {'cite' => ['http', 'https', :relative]}
+        }
       }
       # A more limited sanitization whitelist. This includes all attributes,
@@ -92,9 +55,21 @@ module HTML
       # Strip all HTML tags from the document.
       FULL = { :elements => [] }
+      # Match unicode chars encoded on 4 bytes in UTF-8
+      MB4_REGEXP = /[^\u{9}-\u{999}]/
+      # Remove utf-8 characters encoded on 4 bytes,
+      # because MySQL doesn't handle them.
+      def encode_mb4(doc)
+        doc.search("text()").each do |node|
+          node.content = node.content.gsub(MB4_REGEXP) { |c| "&##{c.unpack('U')[0]};" }
+        end
+        doc
+      end
       # Sanitize markup using the Sanitize library.
       def call
-        Sanitize.clean_node!(doc, whitelist)
+        encode_mb4 Sanitize.clean_node!(doc, whitelist)
       end
       # The whitelist to use when sanitizing. This can be passed in the context

data/lib/html/pipeline/toc_filter.rb CHANGED Viewed

@@ -12,9 +12,9 @@ module HTML
       def call
         headers = Hash.new 0
-        was = 2
+        was = 1
         toc = ""
-        doc.css('h1, h2, h3, h4, h5, h6').each do |node|
+        doc.css('h2, h3, h4, h5, h6').each do |node|
           level = node.name.scan(/\d/).first.to_i
           name = node.text.downcase
           name.gsub!(/[^\w\- ]/, '') # remove punctuation
@@ -24,15 +24,21 @@ module HTML
           uniq = (headers[name] > 0) ? "-#{headers[name]}" : ''
           headers[name] += 1
           node['id'] = "#{name}#{uniq}"
-          while was > level
-            toc << "</ul>\n</li>\n"
-            was -= 1
-          end
-          while was < level
-            toc << "<li>\n<ul>"
-            was += 1
+          if was < level
+            while was < level
+              toc << "<ul>\n<li>"
+              was += 1
+            end
+          else
+            toc << "</li>\n"
+            while was > level
+              toc << "</ul></li>\n"
+              was -= 1
+            end
+            toc << "<li>"
           end
-          toc << "<li><a href=\"##{name}#{uniq}\">#{node.inner_html}</a></li>"
+          toc << "<a href=\"##{name}#{uniq}\">#{node.inner_html}</a>"
         end
         length = 0
@@ -40,14 +46,15 @@ module HTML
         return doc unless length >= context[:toc_minimal_length]
         while was > 1
-          toc << "</ul>\n</li>\n"
+          toc << "</li>\n</ul>\n"
           was -= 1
         end
+        toc.sub!('<ul>', '<ul class="toc">')
         unless headers.empty?
           first_child = doc.child
           first_child.add_previous_sibling context[:toc_header]
-          first_child.add_previous_sibling "<ul class=\"toc\">#{toc}</ul>"
+          first_child.add_previous_sibling toc
         end
         doc
       end

data/lib/html/pipeline/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 module HTML
   class Pipeline
-    VERSION = "0.14.1"
+    VERSION = "0.14.2"
   end
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: html-pipeline-linuxfr
 version: !ruby/object:Gem::Version
-  version: 0.14.1
+  version: 0.14.2
 platform: ruby
 authors:
 - Ryan Tomayko
@@ -10,7 +10,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-06-23 00:00:00.000000000 Z
+date: 2013-06-25 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri