RubyGems - html-pipeline-linuxfr - Versions diffs - 0.0.14 - Mend

html-pipeline-linuxfr 0.0.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

checksums.yaml +7 -0
data/.gitignore +19 -0
data/.travis.yml +13 -0
data/CHANGELOG.md +51 -0
data/Gemfile +9 -0
data/LICENSE +22 -0
data/README.md +294 -0
data/Rakefile +11 -0
data/bin/html-pipeline +80 -0
data/html-pipeline-linuxfr.gemspec +24 -0
data/lib/html/pipeline.rb +167 -0
data/lib/html/pipeline/custom_links_filter.rb +47 -0
data/lib/html/pipeline/filter.rb +166 -0
data/lib/html/pipeline/linuxfr.rb +25 -0
data/lib/html/pipeline/markdown_filter.rb +76 -0
data/lib/html/pipeline/relative_links_filter.rb +18 -0
data/lib/html/pipeline/sanitization_filter.rb +108 -0
data/lib/html/pipeline/syntax_highlight_filter.rb +31 -0
data/lib/html/pipeline/text_filter.rb +14 -0
data/lib/html/pipeline/toc_filter.rb +61 -0
data/lib/html/pipeline/version.rb +5 -0
data/test/helpers/mocked_instrumentation_service.rb +17 -0
data/test/html/pipeline/absolute_source_filter_test.rb +56 -0
data/test/html/pipeline/camo_filter_test.rb +47 -0
data/test/html/pipeline/image_max_width_filter_test.rb +50 -0
data/test/html/pipeline/markdown_filter_test.rb +101 -0
data/test/html/pipeline/mention_filter_test.rb +156 -0
data/test/html/pipeline/plain_text_input_filter_test.rb +22 -0
data/test/html/pipeline/sanitization_filter_test.rb +52 -0
data/test/html/pipeline/toc_filter_test.rb +47 -0
data/test/html/pipeline_test.rb +74 -0
data/test/test_helper.rb +38 -0
metadata +175 -0

data/html-pipeline-linuxfr.gemspec ADDED Viewed

@@ -0,0 +1,24 @@
+# -*- encoding: utf-8 -*-
+require File.expand_path("../lib/html/pipeline/version", __FILE__)
+Gem::Specification.new do |gem|
+  gem.name          = "html-pipeline-linuxfr"
+  gem.version       = HTML::Pipeline::VERSION
+  gem.license       = "MIT"
+  gem.authors       = ["Ryan Tomayko", "Jerry Cheung", "Bruno Michel"]
+  gem.email         = ["ryan@github.com", "jerry@github.com", "bmichel@menfin.info"]
+  gem.description   = %q{LinuxFr.org HTML processing filters and utilities, adapted from those of GitHub}
+  gem.summary       = %q{Helpers for processing content through a chain of filters}
+  gem.homepage      = "https://github.com/nono/html-pipeline-linuxfr"
+  gem.files         = `git ls-files`.split $/
+  gem.test_files    = gem.files.grep(%r{^test})
+  gem.require_paths = ["lib"]
+  gem.add_dependency "nokogiri",        "~> 1.4"
+  gem.add_dependency "redcarpet",       "~> 2.1"
+  gem.add_dependency "pygments.rb",     "~> 0.5"
+  gem.add_dependency "sanitize",        "~> 2.0"
+  gem.add_dependency "escape_utils",    "~> 0.3"
+  gem.add_dependency "activesupport",   ">= 2"
+end

data/lib/html/pipeline.rb ADDED Viewed

@@ -0,0 +1,167 @@
+require "nokogiri"
+require "active_support/xml_mini/nokogiri" # convert Documents to hashes
+require "escape_utils"
+module HTML
+  # GitHub HTML processing filters and utilities. This module includes a small
+  # framework for defining DOM based content filters and applying them to user
+  # provided content.
+  #
+  # See HTML::Pipeline::Filter for information on building filters.
+  #
+  # Construct a Pipeline for running multiple HTML filters.  A pipeline is created once
+  # with one to many filters, and it then can be `call`ed many times over the course
+  # of its lifetime with input.
+  #
+  # filters         - Array of Filter objects. Each must respond to call(doc,
+  #                   context) and return the modified DocumentFragment or a
+  #                   String containing HTML markup. Filters are performed in the
+  #                   order provided.
+  # default_context - The default context hash. Values specified here will be merged
+  #                   into values from the each individual pipeline run.  Can NOT be
+  #                   nil.  Default: empty Hash.
+  # result_class    - The default Class of the result object for individual
+  #                   calls.  Default: Hash.  Protip:  Pass in a Struct to get
+  #                   some semblance of type safety.
+  class Pipeline
+    autoload :VERSION,               'html/pipeline/version'
+    autoload :Filter,                'html/pipeline/filter'
+    autoload :TextFilter,            'html/pipeline/text_filter'
+    autoload :MarkdownFilter,        'html/pipeline/markdown_filter'
+    autoload :TableOfContentsFilter, 'html/pipeline/toc_filter'
+    autoload :SyntaxHighlightFilter, 'html/pipeline/syntax_highlight_filter'
+    autoload :RelativeLinksFilter,   'html/pipeline/relative_links_filter'
+    autoload :CustomLinksFilter,     'html/pipeline/custom_links_filter'
+    autoload :SanitizationFilter,    'html/pipeline/sanitization_filter'
+    autoload :LinuxFr,               'html/pipeline/linuxfr'
+    # Our DOM implementation.
+    DocumentFragment = Nokogiri::HTML::DocumentFragment
+    # Parse a String into a DocumentFragment object. When a DocumentFragment is
+    # provided, return it verbatim.
+    def self.parse(document_or_html)
+      document_or_html ||= ''
+      if document_or_html.is_a?(String)
+        DocumentFragment.parse(document_or_html)
+      else
+        document_or_html
+      end
+    end
+    # Public: Returns an Array of Filter objects for this Pipeline.
+    attr_reader :filters
+    # Public: Instrumentation service for the pipeline.
+    # Set an ActiveSupport::Notifications compatible object to enable.
+    attr_accessor :instrumentation_service
+    # Public: String name for this Pipeline. Defaults to Class name.
+    attr_writer :instrumentation_name
+    def instrumentation_name
+      @instrumentation_name || self.class.name
+    end
+    class << self
+      # Public: Default instrumentation service for new pipeline objects.
+      attr_accessor :default_instrumentation_service
+    end
+    def initialize(filters, default_context = {}, result_class = nil)
+      raise ArgumentError, "default_context cannot be nil" if default_context.nil?
+      @filters = filters.flatten.freeze
+      @default_context = default_context.freeze
+      @result_class = result_class || Hash
+      @instrumentation_service = self.class.default_instrumentation_service
+    end
+    # Apply all filters in the pipeline to the given HTML.
+    #
+    # html    - A String containing HTML or a DocumentFragment object.
+    # context - The context hash passed to each filter. See the Filter docs
+    #           for more info on possible values. This object MUST NOT be modified
+    #           in place by filters.  Use the Result for passing state back.
+    # result  - The result Hash passed to each filter for modification.  This
+    #           is where Filters store extracted information from the content.
+    #
+    # Returns the result Hash after being filtered by this Pipeline.  Contains an
+    # :output key with the DocumentFragment or String HTML markup based on the
+    # output of the last filter in the pipeline.
+    def call(html, context = {}, result = nil)
+      context = @default_context.merge(context)
+      context = context.freeze
+      result ||= @result_class.new
+      payload = default_payload :filters => @filters.map(&:name),
+        :context => context, :result => result
+      instrument "call_pipeline.html_pipeline", payload do
+        result[:output] =
+          @filters.inject(html) do |doc, filter|
+            perform_filter(filter, doc, context, result)
+          end
+      end
+      result
+    end
+    # Internal: Applies a specific filter to the supplied doc.
+    #
+    # The filter is instrumented.
+    #
+    # Returns the result of the filter.
+    def perform_filter(filter, doc, context, result)
+      payload = default_payload :filter => filter.name,
+        :context => context, :result => result
+      instrument "call_filter.html_pipeline", payload do
+        filter.call(doc, context, result)
+      end
+    end
+    # Like call but guarantee the value returned is a DocumentFragment.
+    # Pipelines may return a DocumentFragment or a String. Callers that need a
+    # DocumentFragment should use this method.
+    def to_document(input, context = {}, result = nil)
+      result = call(input, context, result)
+      HTML::Pipeline.parse(result[:output])
+    end
+    # Like call but guarantee the value returned is a string of HTML markup.
+    def to_html(input, context = {}, result = nil)
+      result = call(input, context, result = nil)
+      output = result[:output]
+      if output.respond_to?(:to_html)
+        output.to_html
+      else
+        output.to_s
+      end
+    end
+    # Public: setup instrumentation for this pipeline.
+    #
+    # Returns nothing.
+    def setup_instrumentation(name = nil, service = nil)
+      self.instrumentation_name = name
+      self.instrumentation_service =
+        service || self.class.default_instrumentation_service
+    end
+    # Internal: if the `instrumentation_service` object is set, instruments the
+    # block, otherwise the block is ran without instrumentation.
+    #
+    # Returns the result of the provided block.
+    def instrument(event, payload = nil)
+      payload ||= default_payload
+      return yield(payload) unless instrumentation_service
+      instrumentation_service.instrument event, payload do |payload|
+        yield payload
+      end
+    end
+    # Internal: Default payload for instrumentation.
+    #
+    # Accepts a Hash of additional payload data to be merged.
+    #
+    # Returns a Hash.
+    def default_payload(payload = {})
+      {:pipeline => instrumentation_name}.merge(payload)
+    end
+  end
+end

data/lib/html/pipeline/custom_links_filter.rb ADDED Viewed

@@ -0,0 +1,47 @@
+module HTML
+  class Pipeline
+    class CustomLinksFilter < Filter
+      LF_REGEXP = /\[\[\[([ '\.:\-\p{Word}]+)\]\]\]/
+      WP_REGEXP = /\[\[([ '\.+:!\-\(\)\p{Word}]+)\]\]/
+      LF_TITLE = "Lien du wiki interne LinuxFr.org"
+      WP_TITLE = "Définition Wikipédia"
+      # Don't look for links in text nodes that are children of these elements
+      IGNORE_PARENTS = %w(pre code a).to_set
+      def call
+        doc.search('text()').each do |node|
+          content = node.to_html
+          next if !content.include?('[[')
+          next if has_ancestor?(node, IGNORE_PARENTS)
+          html = content
+          html = process_internal_wiki_links html
+          html = process_wikipedia_links html
+          next if html == content
+          node.replace(html)
+        end
+        doc
+      end
+      def process_internal_wiki_links(text)
+        base_url = "//#{context[:host]}/wiki"
+        text.gsub(LF_REGEXP, "<a href=\"#{base_url}/\1\" title=\"#{LF_TITLE}\">\\1</a>")
+      end
+      def process_wikipedia_links(text)
+        text.gsub(WP_REGEXP) do
+          word = $1
+          escaped = word.gsub(/\(|\)|'/) {|x| "\\#{x}" }
+          parts = word.split(":")
+          parts.shift if %w(de en es eo wikt).include?(parts.first)
+          "<a href=\"http://fr.wikipedia.org/wiki/#{escaped}\" title=\"#{WP_TITLE}\")>#{parts.join ':'}</a>"
+        end
+      end
+    end
+  end
+end

data/lib/html/pipeline/filter.rb ADDED Viewed

@@ -0,0 +1,166 @@
+module HTML
+  class Pipeline
+    # Base class for user content HTML filters. Each filter takes an
+    # HTML string or Nokogiri::HTML::DocumentFragment, performs
+    # modifications and/or writes information to the result hash. Filters must
+    # return a DocumentFragment (typically the same instance provided to the call
+    # method) or a String with HTML markup.
+    #
+    # Example filter that replaces all images with trollface:
+    #
+    #   class FuuuFilter < HTML::Pipeline::Filter
+    #     def call
+    #       doc.search('img').each do |img|
+    #         img['src'] = "http://paradoxdgn.com/junk/avatars/trollface.jpg"
+    #       end
+    #     end
+    #   end
+    #
+    # The context Hash passes options to filters and should not be changed in
+    # place.  A Result Hash allows filters to make extracted information
+    # available to the caller and is mutable.
+    #
+    # Common context options:
+    #   :base_url   - The site's base URL
+    #   :repository - A Repository providing context for the HTML being processed
+    #
+    # Each filter may define additional options and output values. See the class
+    # docs for more info.
+    class Filter
+      class InvalidDocumentException < StandardError; end
+      def initialize(doc, context = nil, result = nil)
+        if doc.kind_of?(String)
+          @html = doc.to_str
+          @doc = nil
+        else
+          @doc = doc
+          @html = nil
+        end
+        @context = context || {}
+        @result = result || {}
+        validate
+      end
+      # Public: Returns a simple Hash used to pass extra information into filters
+      # and also to allow filters to make extracted information available to the
+      # caller.
+      attr_reader :context
+      # Public: Returns a Hash used to allow filters to pass back information
+      # to callers of the various Pipelines.  This can be used for
+      # #mentioned_users, for example.
+      attr_reader :result
+      # The Nokogiri::HTML::DocumentFragment to be manipulated. If the filter was
+      # provided a String, parse into a DocumentFragment the first time this
+      # method is called.
+      def doc
+        @doc ||= parse_html(html)
+      end
+      # The String representation of the document. If a DocumentFragment was
+      # provided to the Filter, it is serialized into a String when this method is
+      # called.
+      def html
+        raise InvalidDocumentException if @html.nil? && @doc.nil?
+        @html || doc.to_html
+      end
+      # The main filter entry point. The doc attribute is guaranteed to be a
+      # Nokogiri::HTML::DocumentFragment when invoked. Subclasses should modify
+      # this document in place or extract information and add it to the context
+      # hash.
+      def call
+        raise NotImplementedError
+      end
+      # Make sure the context has everything we need. Noop: Subclasses can override.
+      def validate
+      end
+      # The Repository object provided in the context hash, or nil when no
+      # :repository was specified.
+      #
+      # It's assumed that the repository context has already been checked
+      # for permissions
+      def repository
+        context[:repository]
+      end
+      # The User object provided in the context hash, or nil when no user
+      # was specified
+      def current_user
+        context[:current_user]
+      end
+      # The site's base URL provided in the context hash, or '/' when no
+      # base URL was specified.
+      def base_url
+        context[:base_url] || '/'
+      end
+      # Ensure the passed argument is a DocumentFragment. When a string is
+      # provided, it is parsed and returned; otherwise, the DocumentFragment is
+      # returned unmodified.
+      def parse_html(html)
+        HTML::Pipeline.parse(html)
+      end
+      # Helper method for filter subclasses used to determine if any of a node's
+      # ancestors have one of the tag names specified.
+      #
+      # node - The Node object to check.
+      # tags - An array of tag name strings to check. These should be downcase.
+      #
+      # Returns true when the node has a matching ancestor.
+      def has_ancestor?(node, tags)
+        while node = node.parent
+          if tags.include?(node.name.downcase)
+            break true
+          end
+        end
+      end
+      # Perform a filter on doc with the given context.
+      #
+      # Returns a HTML::Pipeline::DocumentFragment or a String containing HTML
+      # markup.
+      def self.call(doc, context = nil, result = nil)
+        new(doc, context, result).call
+      end
+      # Like call but guarantees that a DocumentFragment is returned, even when
+      # the last filter returns a String.
+      def self.to_document(input, context = nil)
+        html = call(input, context)
+        HTML::Pipeline::parse(html)
+      end
+      # Like call but guarantees that a string of HTML markup is returned.
+      def self.to_html(input, context = nil)
+        output = call(input, context)
+        if output.respond_to?(:to_html)
+          output.to_html
+        else
+          output.to_s
+        end
+      end
+      # Validator for required context. This will check that anything passed in
+      # contexts exists in @contexts
+      #
+      # If any errors are found an ArgumentError will be raised with a
+      # message listing all the missing contexts and the filters that
+      # require them.
+      def needs(*keys)
+        missing = keys.reject { |key| context.include? key }
+        if missing.any?
+          raise ArgumentError,
+            "Missing context keys for #{self.class.name}: #{missing.map(&:inspect).join ', '}"
+        end
+      end
+    end
+  end
+end

data/lib/html/pipeline/linuxfr.rb ADDED Viewed

@@ -0,0 +1,25 @@
+module HTML
+  class Pipeline
+    class LinuxFr
+      CONTEXT = {
+        toc_minimal_length: 5000,
+        toc_header: "<h2 class=\"sommaire\">Sommaire</h2>\n",
+        host: "linuxfr.org"
+      }
+      def self.render(text)
+        pipeline = HTML::Pipeline.new [
+          HTML::Pipeline::MarkdownFilter,
+          HTML::Pipeline::TableOfContentsFilter,
+          HTML::Pipeline::SyntaxHighlightFilter,
+          HTML::Pipeline::RelativeLinksFilter,
+          HTML::Pipeline::CustomLinksFilter
+        ], CONTEXT
+        result = pipeline.call text
+        result[:output].to_s
+      end
+    end
+  end
+end

data/lib/html/pipeline/markdown_filter.rb ADDED Viewed

@@ -0,0 +1,76 @@
+require 'cgi'
+require 'redcarpet'
+module HTML
+  class Pipeline
+    # LinuxFr Flavored Markdown
+    class LFMarkdown < Redcarpet::Render::HTML
+      attr_accessor :image_class
+      PARSER_OPTIONS = {
+        :no_intra_emphasis  => true,
+        :tables             => true,
+        :fenced_code_blocks => true,
+        :autolink           => true,
+        :strikethrough      => true,
+        :superscript        => true
+      }
+      HTML_OPTIONS = {
+        :filter_html        => true,
+        :no_styles          => true,
+        :hard_wrap          => true,
+        :xhtml              => true
+      }
+      def initialize(extensions={})
+        super extensions.merge(HTML_OPTIONS)
+      end
+      def header(text, header_level)
+        l = header_level + 1
+        "<h#{l}>#{text}</h#{l}>\n"
+      end
+      def strikethrough(text)
+        "<s>#{text}</s>"
+      end
+      def image(link, title, alt_text)
+        return "" if link.blank?
+        ::Image.new(link, title, alt_text).to_html  # FIXME
+      end
+      def normal_text(text)
+        text = CGI.escapeHTML(text)
+        text.gsub!('« ', '«&nbsp;')
+        text.gsub!(/ ([:;»!?])/, '&nbsp;\1')
+        text.gsub!(' -- ', '—')
+        text.gsub!('...', '…')
+        text
+      end
+    end
+    # HTML Filter that converts Markdown text into HTML and converts into a
+    # DocumentFragment. This is different from most filters in that it can take a
+    # non-HTML as input. It must be used as the first filter in a pipeline.
+    #
+    # This filter does not write any additional information to the context hash.
+    class MarkdownFilter < TextFilter
+      def initialize(text, context = nil, result = nil)
+        super text, context, result
+        @text = @text.gsub "\r", ''
+      end
+      # Convert Markdown to HTML using the best available implementation
+      # and convert into a DocumentFragment.
+      def call
+        lfm = Redcarpet::Markdown.new LFMarkdown, LFMarkdown::PARSER_OPTIONS
+        lfm.render @text
+      end
+    end
+  end
+end