RubyGems - pagehub-markdown - Versions diffs - 0.1.0 - Mend

pagehub-markdown 0.1.0

Files changed (7) hide show

data/lib/pagehub-markdown.rb +13 -0
data/lib/pagehub-markdown/markdown.rb +128 -0
data/lib/pagehub-markdown/mutators/date_injector.rb +18 -0
data/lib/pagehub-markdown/processors/embedder.rb +250 -0
data/lib/pagehub-markdown/processors/pagehub_options.rb +21 -0
data/lib/pagehub-markdown/processors/toc_generator.rb +98 -0
metadata +115 -0

@@ -0,0 +1,13 @@
+# Entry point of the gem: loads the third-party renderer and highlighter,
+# then the Markdown module, then the processors and mutators that register
+# themselves on it through add_processor / add_mutator.
+require 'redcarpet'
+require 'albino'
+require 'pagehub-markdown/markdown'
+# NOTE(review): pagehub-markdown/processor and pagehub-markdown/mutator are
+# not among the files listed in this gem's metadata -- confirm that these
+# two requires resolve.
+require 'pagehub-markdown/processor'
+require 'pagehub-markdown/mutator'
+require 'pagehub-markdown/processors/embedder'
+require 'pagehub-markdown/processors/pagehub_options'
+require 'pagehub-markdown/processors/toc_generator'
+require 'pagehub-markdown/mutators/date_injector'
+# Top-level namespace of the gem.
+module PageHub
+end

data/lib/pagehub-markdown/markdown.rb ADDED

@@ -0,0 +1,128 @@
+module PageHub
+  module Markdown
+    class << self
+      # Registers a processor to be run at one or more rendering stages.
+      #
+      # stage:: a stage name (Symbol or String) or an Array of stage names,
+      #         each of which must be listed in Stages
+      # p::     any object that responds to #call; render! invokes it with
+      #         the document string
+      #
+      # Raises RuntimeError for an unknown stage or a non-callable processor.
+      def add_processor(stage, p) # :nodoc:
+        Stages.each { |s| @@hooks[s] ||= [] }
+        # normalise first, so that an Array of stages is validated element by
+        # element instead of failing on Array#to_sym
+        stages  = Array(stage).map { |s| s.to_sym }
+        invalid = stages.reject { |s| Stages.include?(s) }
+        unless invalid.empty?
+          raise "Invalid stage #{invalid.join(', ')}. Allowed stages are #{Stages.join(', ')}"
+        end
+        unless p.respond_to?(:call)
+          raise "Processor must be a callable object."
+        end
+        stages.each { |s| @@hooks[s] << p }
+      end
+      # Appends a mutator to the list that mutate! runs.
+      # Raises RuntimeError unless m responds to #call.
+      def add_mutator(m) # :nodoc:
+        raise "Mutator must be a callable object." unless m.respond_to?(:call)
+        @@mutators << m
+      end
+      # (Re)builds the shared renderer. Every argument is merged over its
+      # defaults: ph_options over PageHubOptions, options over
+      # RendererOptions and extensions over RendererExtensions; see those
+      # constants for the accepted keys.
+      def configure(ph_options = {}, options = {}, extensions = {})
+        @@options = PageHubOptions.merge(ph_options)
+        html_renderer      = HTMLWithAlbino.new(RendererOptions.merge(options))
+        enabled_extensions = RendererExtensions.merge(extensions)
+        @@renderer = Redcarpet::Markdown.new(html_renderer, enabled_extensions)
+      end
+      # Renders Markdown to HTML in place: when this returns, str itself
+      # holds the HTML, and str is also the return value.
+      #
+      # Steps: run the :pre_render processors on the Markdown, escape
+      # <script> tags when the :escape_scripts option is set, render, then
+      # run the :post_render processors on the HTML.
+      def render!(str)
+        configure unless @@renderer
+        # a stage has no list until the first processor is registered for it
+        (@@hooks[:pre_render] || []).each { |processor| processor.call(str) }
+        # escape any JavaScript snippets
+        if @@options[:escape_scripts]
+          # [^>]* keeps every match inside one tag, so several tags on the
+          # same line are each escaped
+          str.gsub!(/<script([^>]*)>/i) { "&lt;script#{$1}&gt;" }
+        end
+        # replace the receiver's content: assigning to the local variable
+        # would leave the caller's string holding unrendered Markdown
+        str.replace(@@renderer.render(str))
+        (@@hooks[:post_render] || []).each { |processor| processor.call(str) }
+        str
+      end
+      # Non-destructive variant of render!: str is left untouched and the
+      # rendered HTML is returned.
+      def render(str)
+        # hand back what render! returns; the copy passed in is not
+        # guaranteed to be the object that ends up holding the HTML
+        render!(str.dup)
+      end
+      # Runs every registered mutator on str, in registration order.
+      # Returns true when at least one mutator reported a change.
+      def mutate!(str)
+        mutated = false
+        # every mutator has to be invoked: `mutated ||= m.call(str)` stops
+        # calling them as soon as one has returned a truthy value
+        @@mutators.each { |m| mutated = true if m.call(str) }
+        mutated
+      end
+    end
+    # NOTE(review): a bare `protected` only affects instance methods defined
+    # after it; the constants and class variables below are not restricted.
+    protected
+    # The stages a processor can be registered for (see add_processor).
+    Stages      = [ :pre_render, :post_render ]
+    # Processors per stage, filled by add_processor.
+    @@hooks     = { }
+    # Callables run by mutate!, filled by add_mutator.
+    @@mutators  = [ ]
+    # Effective PageHub options, set by configure.
+    @@options   = { }
+    # Defaults for the first argument of configure.
+    PageHubOptions = {
+      escape_scripts:   true
+    }
+    # Defaults merged with the second argument of configure and passed to
+    # the HTMLWithAlbino renderer.
+    RendererOptions = {
+      filter_html:      false,
+      no_images:        false,
+      no_links:         false,
+      no_styles:        false,
+      safe_links_only:  false,
+      with_toc_data:    true,
+      hard_wrap:        false,
+      xhtml:            false
+    }
+    # Defaults merged with the third argument of configure and passed to
+    # Redcarpet::Markdown.new.
+    RendererExtensions = {
+      no_intra_emphasis:    true,
+      tables:               false,
+      fenced_code_blocks:   true,
+      autolink:             true,
+      strikethrough:        true,
+      lax_html_blocks:      false,
+      space_after_headers:  true,
+      superscript:          true
+    }
+    private
+    # a renderer that uses Albino to highlight syntax
+    class HTMLWithAlbino < Redcarpet::Render::HTML
+      # characters replaced when code has to be emitted without highlighting
+      EscapedCharacters = { "&" => "&amp;", "<" => "&lt;", ">" => "&gt;", '"' => "&quot;" }
+      # Renders a code block. The code is highlighted for the given
+      # language, then as "text" if that produced nothing, and finally
+      # emitted as an escaped <pre><code> element if Albino produced nothing
+      # at all (so an empty block or a failing highlighter cannot make this
+      # method call itself forever).
+      def block_code(code, language)
+        # TODO: try to figure out whether @language is valid
+        out = highlight(code, language)
+        # just render the code as plain text if the language is invalid
+        out = highlight(code, "text") if out.empty? && language != "text"
+        out.empty? ? plain(code) : out
+      end
+      private
+      # Albino's output for code, or "" when highlighting raises
+      def highlight(code, language)
+        Albino.colorize(code, language).to_s
+      rescue StandardError
+        ""
+      end
+      # code as an HTML-escaped, unhighlighted block
+      def plain(code)
+        "<pre><code>#{code.to_s.gsub(/[&<>"]/) { |c| EscapedCharacters[c] }}</code></pre>"
+      end
+    end
+    @@renderer = nil
+  end
+end

data/lib/pagehub-markdown/mutators/date_injector.rb ADDED

@@ -0,0 +1,18 @@
+require 'date' # DateTime is not loaded by default
+module PageHub
+module Markdown
+  # Mutator that replaces every [!date!] or [!date FORMAT!] tag with the
+  # current date and time. FORMAT is a strftime pattern; "%D" is used when
+  # none is given. Returns true when at least one tag was replaced.
+  add_mutator lambda { |str|
+    mutated = false
+    # (.*?) stops at the first "!]", so several tags on one line are
+    # expanded one by one instead of being swallowed as a single format
+    str.gsub!(/\[\!date(.*?)\!\]/) {
+      mutated = true
+      # strip before testing, so a whitespace-only format means "no format"
+      format = $1.strip
+      DateTime.now.strftime(format.empty? ? "%D" : format)
+    }
+    mutated
+  }
+end # Markdown module
+end # PageHub module

data/lib/pagehub-markdown/processors/embedder.rb ADDED

@@ -0,0 +1,250 @@
+require 'open-uri'
+require 'net/http'
+require 'nokogiri'
+module PageHub
+module Markdown
+  # Downloads remote textual resources from websites
+  # and allows for content extraction from HTML pages
+  # so it can be neatly embedded in another page.
+  module Embedder
+    # Base class of every error raised while embedding a resource
+    class EmbeddingError    < RuntimeError; end
+    # Raised by get_resource when the reported length exceeds MaximumLength
+    class InvalidSizeError  < EmbeddingError; end
+    # Raised by get_resource when the content-type matches no AllowedTypes entry
+    class InvalidTypeError  < EmbeddingError; end
+    # Resources whose content-type is not specified in this
+    # list will be rejected
+    AllowedTypes = [/text\/plain/, /text\/html/, /application\/html/]
+    # Resources larger than 1 MByte will be rejected
+    MaximumLength = 1 * 1024 * 1024
+    # Resources served by any of the hosts specified in this list
+    # will be rejected
+    FilteredHosts = []
+    # Value assigned to the open and read timeouts of the HTTP connection.
+    # NOTE(review): inside this module the name hides Ruby's ::Timeout module.
+    Timeout = 5
+    # NOTE(review): private/public do not change the visibility of a class
+    # variable; they only apply to methods defined after them.
+    private
+    # Registered Processor instances, in registration order
+    @@processors = []
+    public
+    class << self
+      # Performs a HEAD request to validate the resource, and if it
+      # passes the checks it will be downloaded and processed by the first
+      # registered Embedder::Processor that applies, if any.
+      #
+      # Arguments:
+      # 1. raw_uri  the full raw URI (http or https) of the file to be embedded
+      # 2. source   an optional identifier to specify the Processor
+      #             that should be used to post-process the content
+      # 3. args     options that can be meaningful to the Processor, if any
+      #
+      # Returns:
+      # The downloaded content, or whatever the applicable Processor made
+      # of it, or an empty string when the host is listed in FilteredHosts
+      #
+      # Raises:
+      # * InvalidTypeError  when the content-type is not in AllowedTypes
+      # * InvalidSizeError  when the resource is larger than MaximumLength
+      # * EmbeddingError    for any other failure (bad URI, network error, ...)
+      def get_resource(raw_uri, source = "", args = "")
+        begin
+          uri = URI.parse(raw_uri)
+          # only remote resources may be embedded; this also keeps local
+          # paths and other schemes away from open-uri
+          unless uri.is_a?(URI::HTTP)
+            raise EmbeddingError.new "only http and https resources can be embedded"
+          end
+          # reject if the host is banned
+          return "" if FilteredHosts.include?(uri.host)
+          # the timeouts have to be given to start(): by the time its block
+          # runs the connection is already open
+          connection = {
+            :use_ssl      => uri.is_a?(URI::HTTPS),
+            :open_timeout => Timeout,
+            :read_timeout => Timeout
+          }
+          # request_uri is never empty and keeps the query string
+          head = Net::HTTP.start(uri.host, uri.port, connection) { |http|
+            http.head(uri.request_uri)
+          }
+          # get the content type and length
+          ctype   = head["content-type"] || ""
+          clength = head["content-length"].to_i
+          raise InvalidTypeError.new ctype if !self.allowed?(ctype)
+          raise InvalidSizeError.new clength if clength > MaximumLength
+          content = uri.open(:read_timeout => Timeout) { |f| f.read }
+          # the length announced by HEAD is optional, so check what was
+          # actually received as well
+          raise InvalidSizeError.new content.bytesize if content.bytesize > MaximumLength
+          # invoke the first processor bound to the source key or to the URI
+          keys = []
+          keys << source unless source.empty?
+          keys << raw_uri
+          processor = @@processors.find { |p| p.applies_to?(keys) }
+          processor ? processor.process(content, raw_uri, args) : content
+        rescue EmbeddingError
+          # we want to escalate these errors
+          raise
+        rescue StandardError => e
+          # mask as a generic EmbeddingError
+          raise EmbeddingError.new e.message
+        end
+      end
+      # True when ctype matches one of the AllowedTypes patterns.
+      def allowed?(ctype)
+        AllowedTypes.any? { |pattern| pattern.match ctype }
+      end
+      # Adds proc to the end of the processor list, creating the list if
+      # it does not exist yet.
+      def register_processor(proc)
+        (@@processors ||= []) << proc
+      end
+    end # class << self
+    # Base class of the content extractors run by Embedder.get_resource.
+    # Subclasses bind to keys and implement process.
+    class Processor
+      # Processors apply to "keys" which can be written manually
+      # in Markdown by the user, or are found in the host portion
+      # of the resource URI
+      #
+      # E.g., a Github Wiki processor would bind to the keys:
+      # "github-wiki", or/and <tt>/github.com.*\/wiki\//</tt>
+      #
+      # Manual keys are injected after the !include keyword:
+      # [!include github-wiki!](https://github.com/some-dude/wiki/Home)
+      #
+      def initialize(keys)
+        @keys = keys
+        super()
+      end
+      # Turns the downloaded content into what gets embedded; has to be
+      # implemented by subclasses.
+      def process(content, uri, args = "")
+        raise NotImplementedError
+      end
+      # True when one of the given keys matches one of the bound keys:
+      # bound Regexps are matched against the key, bound Strings have to
+      # be equal to it.
+      def applies_to?(keys)
+        @keys.any? { |bound|
+          keys.any? { |k|
+            # String#match would compile k, which is user input, into a
+            # Regexp; bound strings are therefore compared literally
+            bound.is_a?(Regexp) ? !bound.match(k.to_s).nil? : bound.to_s == k.to_s
+          }
+        }
+      end
+      # Node should be the root node that contains the embedded content,
+      # which will be stripped of all attributes and injected with new ones:
+      # 1. data-embed-uri containing the URI of the embedded resource
+      # 2. data-embed-src the name of the processor used for embedding
+      #
+      # All descendant nodes that have an @id attribute will have that
+      # attribute removed as well.
+      def stamp(node, uri, key)
+        # ".//" searches below node; a bare "//" would start at the
+        # document root
+        node.xpath(".//*[@id]").each { |child| child.remove_attribute "id" }
+        node.attributes.each_pair { |name,_| node.remove_attribute name }
+        node['data-embed-uri'] = uri
+        node['data-embed-src'] = key
+      end
+    end
+    # Extracts content from GitHub Wiki pages
+    #
+    # Bound keys:
+    # * "github-wiki"
+    # * URI("[...]github.com/[...]/wiki/[...]")
+    #
+    class GithubWikiProcessor < Processor
+      def initialize()
+        super(["github-wiki", /github\.com.*\/wiki\//])
+      end
+      # Returns the node <div class='markdown-body'></div> of the page,
+      # with the id attributes of all its content nodes removed.
+      #
+      # Supported options:
+      # 1. reduce-headings: all heading nodes (<h1> through <h5>) will be
+      # stepped one level, so h1 becomes h2, etc.
+      #
+      # Raises EmbeddingError when the page has no markdown-body element.
+      def process(content, uri, args = "")
+        html_doc = Nokogiri::HTML(content) do |config| config.noerror end
+        # match markdown-body as one class among possibly several
+        node = html_doc.xpath("//div[contains(concat(' ', normalize-space(@class), ' '), ' markdown-body ')]").first
+        raise EmbeddingError.new "no markdown-body element found in #{uri}" if node.nil?
+        stamp(node, uri, 'github-wiki')
+        if args.include?("reduce-headings") then
+          # deepest level first, so that a heading is renamed only once
+          5.downto(1) { |level|
+            # ".//" restricts the search to the embedded content
+            node.xpath(".//h#{level}").each { |heading_node|
+              heading_node.name = "h#{level+1}"
+            }
+          }
+        end
+        node
+      end
+    end
+    # Extracts content from PageHub shared documents
+    #
+    # Bound keys:
+    # * "pagehub"
+    # * URI([...]pagehub.org/[...])
+    class PageHubProcessor < Processor
+      def initialize()
+        super(["pagehub", /pagehub\.org/])
+      end
+      # Returns the node <div id='content'></div> of the page without its
+      # "breadcrumbs" and "bottom" child divs.
+      #
+      # Raises EmbeddingError when the page has no content element.
+      def process(content, uri, args = "")
+        html_doc = Nokogiri::HTML(content) do |config| config.noerror end
+        node = html_doc.xpath("//div[@id='content']").first
+        raise EmbeddingError.new "no content element found in #{uri}" if node.nil?
+        node.xpath('div[@id="breadcrumbs"]').remove
+        node.xpath('div[@id="bottom"]').remove
+        stamp(node, uri, 'pagehub')
+        node
+      end
+    end
+    # Register the built-in processors. get_resource stops at the first
+    # processor that applies, so the order of registration is significant.
+    register_processor(GithubWikiProcessor.new)
+    register_processor(PageHubProcessor.new)
+  end # Embedder module
+  # Pre-render processor that expands embed directives of the form
+  #   [!include SOURCE ARGS!](URI)
+  # into the content that Embedder.get_resource returns for URI. SOURCE
+  # and ARGS are optional. Embedding errors are written into the document
+  # as a Markdown message that links back to the source.
+  add_processor :pre_render, lambda {|str|
+    # Embed remote references, if any
+    str.gsub!(/^\B\[\!include\s?(.*)\!\]\((.*)\)/) {
+      uri   = $2
+      words = ($1 || "").split
+      # the first word names the content source, the rest are its args
+      source = words.first || ""
+      args   = words.empty? ? words : words[1..words.length].join(' ')
+      origin = " (**Source**: [#{uri}](#{uri}))\n\n"
+      begin
+        Embedder.get_resource(uri, source, args)
+      rescue Embedder::InvalidSizeError => e
+        "**Embedding error**: the file you tried to embed is too big - #{e.message.to_i} bytes." << origin
+      rescue Embedder::InvalidTypeError => e
+        "**Embedding error**: the file type you tried to embed (`#{e.message}`) is not supported." << origin
+      rescue Embedder::EmbeddingError => e
+        "**Embedding error**: #{e.message}." << origin
+      end
+    }
+    str
+  }
+end # Markdown module
+end # PageHub module

data/lib/pagehub-markdown/processors/pagehub_options.rb ADDED

@@ -0,0 +1,21 @@
+module PageHub
+module Markdown
+  # Post-render processor that replaces every [!options ...!] tag with the
+  # markup of the options it lists. The only option handled is "no-title",
+  # which emits a style rule hiding the header's h1; anything else is
+  # dropped.
+  add_processor :post_render, lambda { |str|
+    str.gsub!(/\[\!options(.*)\!\]/) {
+      $1.split(' ').inject("") { |markup, opt|
+        if opt == "no-title"
+          markup + "<style>header h1 { display: none }</style>"
+        else
+          markup
+        end
+      }
+    }
+  }
+end # Markdown module
+end # PageHub module

data/lib/pagehub-markdown/processors/toc_generator.rb ADDED

@@ -0,0 +1,98 @@
+module PageHub
+module Markdown
+  # Builds a nested table of contents out of the headings of a document.
+  module ToC
+    # Builds a tree of headings from a given block of Markdown
+    # text, the returned list can be turned into HTML using
+    # ToC::to_html()
+    #
+    # Only "#"-style headings that start a line are picked up. Headings
+    # deeper than threshold are left out of the tree.
+    def self.from_markdown(markdown, threshold = 6)
+      self.from_content(/^(#+)[ \t]+([^\n]+)/, lambda { |l, t| return l.length, t.strip }, markdown, threshold)
+    end
+    # renders a table of content using nested <ol> list nodes
+    # from a given list of Heading objects produced by ToC::from_markdown()
+    def self.to_html(toc)
+      html = "<ol>"
+      toc.each { |heading| html << heading.to_html }
+      html << "</ol>"
+      html
+    end
+    # Scans content with pattern, which has to capture a level marker and
+    # a title; formatter turns the two captures into [level, title].
+    # Returns the top-level headings, each carrying its children.
+    def self.from_content(pattern, formatter, content, threshold)
+      toc       = []
+      current   = [] # current[n] is the most recent heading of level n
+      toc_index = 0
+      content.scan(pattern).each { |l, t|
+        level,title = formatter.call(l, t)
+        # toc_index is used for hyperlinking: every heading takes a number,
+        # including the ones left out below, so that the links keep
+        # pointing at the right heading
+        index = toc_index
+        toc_index += 1
+        next if level > threshold
+        h = Heading.new(title, level, index)
+        # the parent is the closest shallower heading still in effect,
+        # which also covers documents that skip a level
+        parent  = current[0...level].compact.last
+        # forget headings of this level and deeper, they belong to a
+        # section that has ended
+        current = current[0...level]
+        current[level] = h
+        if parent then
+          parent << h
+        else
+          toc << h
+        end
+      }
+      toc
+    end
+    # A heading of the document and the headings nested below it.
+    class Heading
+      attr_accessor :level, :title, :children, :parent, :index
+      def initialize(title, level, index)
+        @title = title
+        @level = level
+        @index = index
+        @parent = nil
+        @children = []
+        super()
+      end
+      # Attaches h as a child of this heading. Headings that share a title
+      # are kept, as each one has its own index.
+      def <<(h)
+        h.parent = self
+        @children << h
+      end
+      # Renders this heading as an <li> linking to #toc_<index>, followed
+      # by a nested <ol> of its children, if any.
+      def to_html()
+        html = ""
+        html << "<li>"
+        html << "<a href=\"\#toc_#{index}\">" << title << "</a>"
+        if children.any? then
+          html << "<ol>"
+          children.each { |child| html << child.to_html }
+          html << "</ol>"
+        end
+        html << "</li>"
+      end
+    end
+  end
+  # register the processor: a [!toc!] or [!toc N!] tag at the start of a
+  # line is replaced with the table of contents of the document, N being
+  # the deepest heading level to include (6 when omitted)
+  add_processor :pre_render, lambda { |str|
+    str.gsub!(/^\B\[\!toc(.*)\!\]/) {
+      threshold = $1.strip.to_i
+      # a blank or non-numeric argument converts to 0, which would produce
+      # an empty list; fall back to the default instead
+      threshold = 6 unless threshold > 0
+      ToC.to_html ToC.from_markdown(str, threshold)
+    }
+    str
+  }
+end # Markdown module
+end # PageHub module

metadata ADDED

@@ -0,0 +1,115 @@
+--- !ruby/object:Gem::Specification
+name: pagehub-markdown
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+  prerelease:
+platform: ruby
+authors:
+- Ahmad Amireh
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2012-10-03 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: redcarpet
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 2.1.1
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 2.1.1
+- !ruby/object:Gem::Dependency
+  name: albino
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 1.3.3
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 1.3.3
+- !ruby/object:Gem::Dependency
+  name: json
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 1.7.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 1.7.0
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 1.5.5
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 1.5.5
+description: A bunch of neat features added to the Markdown renderer via pure Markdown
+  syntax.
+email: ahmad@amireh.net
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- lib/pagehub-markdown.rb
+- lib/pagehub-markdown/processors/toc_generator.rb
+- lib/pagehub-markdown/processors/pagehub_options.rb
+- lib/pagehub-markdown/processors/embedder.rb
+- lib/pagehub-markdown/markdown.rb
+- lib/pagehub-markdown/mutators/date_injector.rb
+homepage: http://github.com/amireh/pagehub-markdown
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 1.8.23
+signing_key:
+specification_version: 3
+summary: PageHub's extensions of GitHub's Redcarpet Markdown renderer.
+test_files: []