html-pipeline-linuxfr 0.0.14

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,24 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path("../lib/html/pipeline/version", __FILE__)
3
+
4
# Gem specification for html-pipeline-linuxfr. The version constant is read
# from lib/html/pipeline/version, which is required before this block.
Gem::Specification.new do |spec|
  spec.name        = "html-pipeline-linuxfr"
  spec.version     = HTML::Pipeline::VERSION
  spec.license     = "MIT"
  spec.authors     = ["Ryan Tomayko", "Jerry Cheung", "Bruno Michel"]
  spec.email       = ["ryan@github.com", "jerry@github.com", "bmichel@menfin.info"]
  spec.description = %q{LinuxFr.org HTML processing filters and utilities, adapted from those of GitHub}
  spec.summary     = %q{Helpers for processing content through a chain of filters}
  spec.homepage    = "https://github.com/nono/html-pipeline-linuxfr"

  # Package every file tracked by git; files whose path starts with "test"
  # are additionally registered as test files.
  spec.files         = `git ls-files`.split($/)
  spec.test_files    = spec.files.grep(%r{^test})
  spec.require_paths = ["lib"]

  # Runtime dependencies, declared in the same order as before.
  [
    ["nokogiri",      "~> 1.4"],
    ["redcarpet",     "~> 2.1"],
    ["pygments.rb",   "~> 0.5"],
    ["sanitize",      "~> 2.0"],
    ["escape_utils",  "~> 0.3"],
    ["activesupport", ">= 2"]
  ].each do |dependency, requirement|
    spec.add_dependency dependency, requirement
  end
end
@@ -0,0 +1,167 @@
1
+ require "nokogiri"
2
+ require "active_support/xml_mini/nokogiri" # convert Documents to hashes
3
+ require "escape_utils"
4
+
5
module HTML
  # GitHub HTML processing filters and utilities. This module includes a small
  # framework for defining DOM based content filters and applying them to user
  # provided content.
  #
  # See HTML::Pipeline::Filter for information on building filters.
  #
  # Construct a Pipeline for running multiple HTML filters. A pipeline is created once
  # with one to many filters, and it then can be `call`ed many times over the course
  # of its lifetime with input.
  #
  # filters         - Array of Filter objects. Each must respond to call(doc,
  #                   context, result) and return the modified DocumentFragment
  #                   or a String containing HTML markup. Filters are performed
  #                   in the order provided.
  # default_context - The default context hash. Values specified here are merged
  #                   with the context given to each individual pipeline run
  #                   (per-run values win). Can NOT be nil. Default: empty Hash.
  # result_class    - The default Class of the result object for individual
  #                   calls. Default: Hash. Protip: Pass in a Struct to get
  #                   some semblance of type safety.
  class Pipeline
    autoload :VERSION,               'html/pipeline/version'
    autoload :Filter,                'html/pipeline/filter'
    autoload :TextFilter,            'html/pipeline/text_filter'
    autoload :MarkdownFilter,        'html/pipeline/markdown_filter'
    autoload :TableOfContentsFilter, 'html/pipeline/toc_filter'
    autoload :SyntaxHighlightFilter, 'html/pipeline/syntax_highlight_filter'
    autoload :RelativeLinksFilter,   'html/pipeline/relative_links_filter'
    autoload :CustomLinksFilter,     'html/pipeline/custom_links_filter'
    autoload :SanitizationFilter,    'html/pipeline/sanitization_filter'
    autoload :LinuxFr,               'html/pipeline/linuxfr'

    # Our DOM implementation.
    DocumentFragment = Nokogiri::HTML::DocumentFragment

    # Parse a String into a DocumentFragment object. When a DocumentFragment is
    # provided, return it verbatim. nil is treated as an empty String.
    def self.parse(document_or_html)
      document_or_html ||= ''
      if document_or_html.is_a?(String)
        DocumentFragment.parse(document_or_html)
      else
        document_or_html
      end
    end

    # Public: Returns an Array of Filter objects for this Pipeline.
    attr_reader :filters

    # Public: Instrumentation service for the pipeline.
    # Set an ActiveSupport::Notifications compatible object to enable.
    attr_accessor :instrumentation_service

    # Public: String name for this Pipeline. Defaults to Class name.
    attr_writer :instrumentation_name
    def instrumentation_name
      @instrumentation_name || self.class.name
    end

    class << self
      # Public: Default instrumentation service for new pipeline objects.
      attr_accessor :default_instrumentation_service
    end

    # Build a pipeline.
    #
    # filters         - Array (possibly nested) of filters; flattened and frozen.
    # default_context - Hash merged into every run's context. It is frozen in
    #                   place, so the caller's Hash becomes immutable.
    # result_class    - Class instantiated for the result when none is passed
    #                   to #call. Default: Hash.
    #
    # Raises ArgumentError when default_context is nil.
    def initialize(filters, default_context = {}, result_class = nil)
      raise ArgumentError, "default_context cannot be nil" if default_context.nil?
      @filters         = filters.flatten.freeze
      @default_context = default_context.freeze
      @result_class    = result_class || Hash
      @instrumentation_service = self.class.default_instrumentation_service
    end

    # Apply all filters in the pipeline to the given HTML.
    #
    # html    - A String containing HTML or a DocumentFragment object.
    # context - The context hash passed to each filter. See the Filter docs
    #           for more info on possible values. This object MUST NOT be modified
    #           in place by filters. Use the Result for passing state back.
    # result  - The result Hash passed to each filter for modification.  This
    #           is where Filters store extracted information from the content.
    #
    # Returns the result Hash after being filtered by this Pipeline. Contains an
    # :output key with the DocumentFragment or String HTML markup based on the
    # output of the last filter in the pipeline.
    def call(html, context = {}, result = nil)
      # Per-run values override the defaults; the merged Hash is frozen so
      # filters cannot modify it.
      context = @default_context.merge(context).freeze
      result ||= @result_class.new
      payload = default_payload :filters => @filters.map(&:name),
        :context => context, :result => result
      instrument "call_pipeline.html_pipeline", payload do
        # Each filter receives the previous filter's output as its input.
        result[:output] =
          @filters.inject(html) do |doc, filter|
            perform_filter(filter, doc, context, result)
          end
      end
      result
    end

    # Internal: Applies a specific filter to the supplied doc.
    #
    # The filter is instrumented.
    #
    # Returns the result of the filter.
    def perform_filter(filter, doc, context, result)
      payload = default_payload :filter => filter.name,
        :context => context, :result => result
      instrument "call_filter.html_pipeline", payload do
        filter.call(doc, context, result)
      end
    end

    # Like call but guarantee the value returned is a DocumentFragment.
    # Pipelines may return a DocumentFragment or a String. Callers that need a
    # DocumentFragment should use this method.
    def to_document(input, context = {}, result = nil)
      result = call(input, context, result)
      HTML::Pipeline.parse(result[:output])
    end

    # Like call but guarantee the value returned is a string of HTML markup.
    #
    # The caller's result object is forwarded to #call so that information
    # stored by filters is visible to the caller afterwards.
    def to_html(input, context = {}, result = nil)
      result = call(input, context, result)
      output = result[:output]
      if output.respond_to?(:to_html)
        output.to_html
      else
        output.to_s
      end
    end

    # Public: setup instrumentation for this pipeline.
    #
    # name    - String instrumentation name (nil falls back to the class name).
    # service - Instrumentation service; defaults to the class-level
    #           default_instrumentation_service.
    #
    # Returns nothing.
    def setup_instrumentation(name = nil, service = nil)
      self.instrumentation_name = name
      self.instrumentation_service =
        service || self.class.default_instrumentation_service
    end

    # Internal: if the `instrumentation_service` object is set, instruments the
    # block, otherwise the block is ran without instrumentation.
    #
    # Returns the result of the provided block.
    def instrument(event, payload = nil)
      payload ||= default_payload
      return yield(payload) unless instrumentation_service
      instrumentation_service.instrument event, payload do |event_payload|
        yield event_payload
      end
    end

    # Internal: Default payload for instrumentation.
    #
    # Accepts a Hash of additional payload data to be merged.
    #
    # Returns a Hash.
    def default_payload(payload = {})
      {:pipeline => instrumentation_name}.merge(payload)
    end
  end
end
@@ -0,0 +1,47 @@
1
require 'set' # IGNORE_PARENTS is built with Array#to_set

module HTML
  class Pipeline

    # HTML filter that turns bracket markup found in text nodes into links:
    #
    #   [[[Page]]] - link to the internal wiki, //<host>/wiki/Page
    #   [[Word]]   - link to the French Wikipedia article for Word
    #
    # Text nodes located inside <pre>, <code> or <a> elements are left alone.
    #
    # Context options:
    #   :host - host name used to build the internal wiki URLs
    class CustomLinksFilter < Filter

      LF_REGEXP = /\[\[\[([ '\.:\-\p{Word}]+)\]\]\]/
      WP_REGEXP = /\[\[([ '\.+:!\-\(\)\p{Word}]+)\]\]/

      LF_TITLE = "Lien du wiki interne LinuxFr.org"
      WP_TITLE = "Définition Wikipédia"

      # Don't look for links in text nodes that are children of these elements
      IGNORE_PARENTS = %w(pre code a).to_set

      # Rewrite every eligible text node of the document in place.
      #
      # Returns the (modified) document.
      def call
        doc.search('text()').each do |node|
          content = node.to_html
          # Cheap pre-check: both link syntaxes start with "[[".
          next unless content.include?('[[')
          next if has_ancestor?(node, IGNORE_PARENTS)
          # Triple brackets must be handled first, otherwise the Wikipedia
          # pattern would match the inner "[[...]]" of an internal wiki link.
          html = process_wikipedia_links(process_internal_wiki_links(content))
          next if html == content
          node.replace(html)
        end
        doc
      end

      # Replace [[[Page]]] with a link to the internal wiki.
      #
      # text - String of HTML markup for one text node.
      #
      # Returns a new String.
      def process_internal_wiki_links(text)
        base_url = "//#{context[:host]}/wiki"
        # The block form avoids backreference escaping pitfalls: "\1" inside a
        # double-quoted replacement string is the byte 0x01, not the capture.
        text.gsub(LF_REGEXP) do
          page = Regexp.last_match(1)
          "<a href=\"#{base_url}/#{page}\" title=\"#{LF_TITLE}\">#{page}</a>"
        end
      end

      # Replace [[Word]] with a link to the French Wikipedia.
      #
      # A leading de:/en:/es:/eo:/wikt: prefix is kept in the URL but removed
      # from the visible link label.
      #
      # text - String of HTML markup for one text node.
      #
      # Returns a new String.
      def process_wikipedia_links(text)
        text.gsub(WP_REGEXP) do
          # The captured word goes into the href as-is. WP_REGEXP only admits
          # characters that are harmless inside a double-quoted attribute, and
          # backslash-escaping "(", ")" or "'" would corrupt the URL because
          # browsers treat "\" as "/" in http URLs.
          word  = Regexp.last_match(1)
          parts = word.split(":")
          parts.shift if %w(de en es eo wikt).include?(parts.first)
          "<a href=\"http://fr.wikipedia.org/wiki/#{word}\" title=\"#{WP_TITLE}\">#{parts.join ':'}</a>"
        end
      end

    end

  end
end
@@ -0,0 +1,166 @@
1
module HTML
  class Pipeline
    # Base class for user content HTML filters. Each filter takes an
    # HTML string or Nokogiri::HTML::DocumentFragment, performs
    # modifications and/or writes information to the result hash. Filters must
    # return a DocumentFragment (typically the same instance provided to the call
    # method) or a String with HTML markup.
    #
    # Example filter that replaces all images with trollface:
    #
    #   class FuuuFilter < HTML::Pipeline::Filter
    #     def call
    #       doc.search('img').each do |img|
    #         img['src'] = "http://paradoxdgn.com/junk/avatars/trollface.jpg"
    #       end
    #       doc
    #     end
    #   end
    #
    # The context Hash passes options to filters and should not be changed in
    # place. A Result Hash allows filters to make extracted information
    # available to the caller and is mutable.
    #
    # Common context options:
    #   :base_url   - The site's base URL
    #   :repository - A Repository providing context for the HTML being processed
    #
    # Each filter may define additional options and output values. See the class
    # docs for more info.
    class Filter
      class InvalidDocumentException < StandardError; end

      # doc     - String of HTML markup or a DocumentFragment.
      # context - Hash of options for the filter (nil becomes an empty Hash).
      # result  - Hash the filter may write extracted information to (nil
      #           becomes an empty Hash).
      #
      # Runs #validate once the state is set up.
      def initialize(doc, context = nil, result = nil)
        if doc.kind_of?(String)
          @html = doc.to_str
          @doc = nil
        else
          @doc = doc
          @html = nil
        end
        @context = context || {}
        @result = result || {}
        validate
      end

      # Public: Returns a simple Hash used to pass extra information into filters.
      # It should not be modified in place.
      attr_reader :context

      # Public: Returns a Hash used to allow filters to pass back information
      # to callers of the various Pipelines. This can be used for
      # #mentioned_users, for example.
      attr_reader :result

      # The Nokogiri::HTML::DocumentFragment to be manipulated. If the filter was
      # provided a String, parse into a DocumentFragment the first time this
      # method is called.
      def doc
        @doc ||= parse_html(html)
      end

      # The String representation of the document. If a DocumentFragment was
      # provided to the Filter, it is serialized into a String when this method is
      # called.
      #
      # Raises InvalidDocumentException when the filter was given neither a
      # String nor a document (i.e. nil).
      def html
        raise InvalidDocumentException if @html.nil? && @doc.nil?
        @html || doc.to_html
      end

      # The main filter entry point. The doc attribute is guaranteed to be a
      # Nokogiri::HTML::DocumentFragment when invoked. Subclasses should modify
      # this document in place or extract information and add it to the result
      # hash.
      def call
        raise NotImplementedError
      end

      # Make sure the context has everything we need. Noop: Subclasses can override.
      def validate
      end

      # The Repository object provided in the context hash, or nil when no
      # :repository was specified.
      #
      # It's assumed that the repository context has already been checked
      # for permissions
      def repository
        context[:repository]
      end

      # The User object provided in the context hash, or nil when no user
      # was specified
      def current_user
        context[:current_user]
      end

      # The site's base URL provided in the context hash, or '/' when no
      # base URL was specified.
      def base_url
        context[:base_url] || '/'
      end

      # Ensure the passed argument is a DocumentFragment. When a string is
      # provided, it is parsed and returned; otherwise, the DocumentFragment is
      # returned unmodified.
      def parse_html(html)
        HTML::Pipeline.parse(html)
      end

      # Helper method for filter subclasses used to determine if any of a node's
      # ancestors have one of the tag names specified.
      #
      # node - The Node object to check.
      # tags - A collection of tag name strings (anything responding to
      #        include?). These should be downcase.
      #
      # Returns true when the node has a matching ancestor, false otherwise.
      def has_ancestor?(node, tags)
        # Walk up the tree until there is no parent left.
        while (node = node.parent)
          return true if tags.include?(node.name.downcase)
        end
        false
      end

      # Perform a filter on doc with the given context.
      #
      # Returns a HTML::Pipeline::DocumentFragment or a String containing HTML
      # markup.
      def self.call(doc, context = nil, result = nil)
        new(doc, context, result).call
      end

      # Like call but guarantees that a DocumentFragment is returned, even when
      # the filter returns a String.
      def self.to_document(input, context = nil, result = nil)
        html = call(input, context, result)
        HTML::Pipeline.parse(html)
      end

      # Like call but guarantees that a string of HTML markup is returned.
      def self.to_html(input, context = nil, result = nil)
        output = call(input, context, result)
        if output.respond_to?(:to_html)
          output.to_html
        else
          output.to_s
        end
      end

      # Validator for required context. This checks that every key passed in
      # exists in the context hash.
      #
      # If any key is missing an ArgumentError is raised with a message
      # listing the missing keys and the filter class that requires them.
      def needs(*keys)
        missing = keys.reject { |key| context.include? key }

        if missing.any?
          raise ArgumentError,
            "Missing context keys for #{self.class.name}: #{missing.map(&:inspect).join ', '}"
        end
      end
    end
  end
end
@@ -0,0 +1,25 @@
1
module HTML
  class Pipeline

    # Ready-made pipeline for rendering LinuxFr.org user content.
    class LinuxFr
      # Default context handed to every filter of the pipeline. Frozen up
      # front: Pipeline#initialize freezes its default context anyway, so
      # without this the constant would only become immutable after the first
      # call to render.
      CONTEXT = {
        toc_minimal_length: 5000,
        toc_header: "<h2 class=\"sommaire\">Sommaire</h2>\n",
        host: "linuxfr.org"
      }.freeze

      # Run the given text through the Markdown, table of contents, syntax
      # highlighting, relative links and custom links filters, in that order.
      #
      # text - String handed to the first filter (MarkdownFilter).
      #
      # Returns the pipeline output converted to a String.
      def self.render(text)
        pipeline = HTML::Pipeline.new [
          HTML::Pipeline::MarkdownFilter,
          HTML::Pipeline::TableOfContentsFilter,
          HTML::Pipeline::SyntaxHighlightFilter,
          HTML::Pipeline::RelativeLinksFilter,
          HTML::Pipeline::CustomLinksFilter
        ], CONTEXT
        result = pipeline.call text
        result[:output].to_s
      end
    end

  end
end
@@ -0,0 +1,76 @@
1
+ require 'redcarpet'
2
+
3
require 'cgi' # CGI.escapeHTML is used by LFMarkdown#normal_text

module HTML
  class Pipeline

    # LinuxFr Flavored Markdown: a Redcarpet HTML renderer with LinuxFr.org
    # specific output (shifted heading levels, <s> for strikethrough, French
    # typography in plain text).
    class LFMarkdown < Redcarpet::Render::HTML
      attr_accessor :image_class

      # Options for Redcarpet::Markdown (the parser).
      PARSER_OPTIONS = {
        :no_intra_emphasis  => true,
        :tables             => true,
        :fenced_code_blocks => true,
        :autolink           => true,
        :strikethrough      => true,
        :superscript        => true
      }

      # Options for Redcarpet::Render::HTML (the renderer).
      HTML_OPTIONS = {
        :filter_html => true,
        :no_styles   => true,
        :hard_wrap   => true,
        :xhtml       => true
      }

      # extensions - Hash of renderer options. HTML_OPTIONS always win over
      #              the values passed in.
      def initialize(extensions={})
        super extensions.merge(HTML_OPTIONS)
      end

      # Render a heading one level deeper than written (h1 becomes h2, ...).
      # NOTE(review): a level 6 heading is therefore emitted as <h7>, which is
      # not an HTML element - confirm whether this should be capped at 6.
      def header(text, header_level)
        l = header_level + 1
        "<h#{l}>#{text}</h#{l}>\n"
      end

      # Render struck-through text with the <s> element.
      def strikethrough(text)
        "<s>#{text}</s>"
      end

      # Render an image; an empty link produces no markup at all.
      # NOTE(review): `blank?` is presumably the ActiveSupport core extension
      # and `::Image` a class supplied by the host application; neither is
      # defined in this file (see the FIXME) - confirm both are loaded.
      def image(link, title, alt_text)
        return "" if link.blank?
        ::Image.new(link, title, alt_text).to_html  # FIXME
      end

      # Escape plain text and apply French typographic conventions:
      # non-breaking space after "«" and before ":", ";", "»", "!" and "?",
      # " -- " becomes an em dash and "..." becomes an ellipsis.
      def normal_text(text)
        text = CGI.escapeHTML(text)
        text.gsub!('« ', '«&nbsp;')
        text.gsub!(/ ([:;»!?])/, '&nbsp;\1')
        text.gsub!(' -- ', '—')
        text.gsub!('...', '…')
        text
      end

    end


    # HTML Filter that converts Markdown text into HTML markup. This is
    # different from most filters in that it can take a non-HTML as input. It
    # must be used as the first filter in a pipeline.
    #
    # This filter does not write any additional information to the context hash.
    class MarkdownFilter < TextFilter
      # Carriage returns are removed from the text before rendering.
      def initialize(text, context = nil, result = nil)
        super text, context, result
        @text = @text.gsub "\r", ''
      end

      # Convert Markdown to HTML with Redcarpet and the LFMarkdown renderer.
      #
      # Returns whatever Redcarpet::Markdown#render returns for the text.
      def call
        lfm = Redcarpet::Markdown.new LFMarkdown, LFMarkdown::PARSER_OPTIONS
        lfm.render @text
      end
    end

  end
end