html-pipeline-linuxfr 0.0.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,24 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path("../lib/html/pipeline/version", __FILE__)
3
+
4
# Gem specification for html-pipeline-linuxfr: package metadata, the list of
# packaged files (whatever git tracks) and the runtime dependencies.
Gem::Specification.new do |spec|
  spec.name        = "html-pipeline-linuxfr"
  spec.version     = HTML::Pipeline::VERSION
  spec.license     = "MIT"
  spec.authors     = ["Ryan Tomayko", "Jerry Cheung", "Bruno Michel"]
  spec.email       = ["ryan@github.com", "jerry@github.com", "bmichel@menfin.info"]
  spec.description = %q{LinuxFr.org HTML processing filters and utilities, adapted from those of GitHub}
  spec.summary     = %q{Helpers for processing content through a chain of filters}
  spec.homepage    = "https://github.com/nono/html-pipeline-linuxfr"

  # One path per line of `git ls-files` output; paths starting with "test"
  # are additionally registered as test files.
  spec.files         = `git ls-files`.split($/)
  spec.test_files    = spec.files.grep(%r{^test})
  spec.require_paths = ["lib"]

  # Runtime dependencies, declared in the same order as before.
  [
    ["nokogiri",      "~> 1.4"],
    ["redcarpet",     "~> 2.1"],
    ["pygments.rb",   "~> 0.5"],
    ["sanitize",      "~> 2.0"],
    ["escape_utils",  "~> 0.3"],
    ["activesupport", ">= 2"]
  ].each do |dependency, requirement|
    spec.add_dependency dependency, requirement
  end
end
@@ -0,0 +1,167 @@
1
+ require "nokogiri"
2
+ require "active_support/xml_mini/nokogiri" # convert Documents to hashes
3
+ require "escape_utils"
4
+
5
+ module HTML
6
+ # GitHub HTML processing filters and utilities. This module includes a small
7
+ # framework for defining DOM based content filters and applying them to user
8
+ # provided content.
9
+ #
10
+ # See HTML::Pipeline::Filter for information on building filters.
11
+ #
12
+ # Construct a Pipeline for running multiple HTML filters. A pipeline is created once
13
+ # with one to many filters, and it then can be `call`ed many times over the course
14
+ # of its lifetime with input.
15
+ #
16
+ # filters - Array of Filter objects. Each must respond to call(doc,
17
+ # context) and return the modified DocumentFragment or a
18
+ # String containing HTML markup. Filters are performed in the
19
+ # order provided.
20
+ # default_context - The default context hash. Values specified here will be merged
21
+ # into values from each individual pipeline run. Can NOT be
22
+ # nil. Default: empty Hash.
23
+ # result_class - The default Class of the result object for individual
24
+ # calls. Default: Hash. Protip: Pass in a Struct to get
25
+ # some semblance of type safety.
26
+ class Pipeline
27
+ autoload :VERSION, 'html/pipeline/version'
28
+ autoload :Filter, 'html/pipeline/filter'
29
+ autoload :TextFilter, 'html/pipeline/text_filter'
30
+ autoload :MarkdownFilter, 'html/pipeline/markdown_filter'
31
+ autoload :TableOfContentsFilter, 'html/pipeline/toc_filter'
32
+ autoload :SyntaxHighlightFilter, 'html/pipeline/syntax_highlight_filter'
33
+ autoload :RelativeLinksFilter, 'html/pipeline/relative_links_filter'
34
+ autoload :CustomLinksFilter, 'html/pipeline/custom_links_filter'
35
+ autoload :SanitizationFilter, 'html/pipeline/sanitization_filter'
36
+ autoload :LinuxFr, 'html/pipeline/linuxfr'
37
+
38
+ # Our DOM implementation.
39
+ DocumentFragment = Nokogiri::HTML::DocumentFragment
40
+
41
# Coerce the argument into a DocumentFragment.
#
# document_or_html - A String of HTML, an already parsed document, or nil.
#
# nil is treated as the empty String. Strings are parsed with
# DocumentFragment.parse; any other object is handed back untouched.
def self.parse(document_or_html)
  source = document_or_html || ''
  return source unless source.is_a?(String)

  DocumentFragment.parse(source)
end
51
+
52
+ # Public: Returns an Array of Filter objects for this Pipeline.
53
+ attr_reader :filters
54
+
55
+ # Public: Instrumentation service for the pipeline.
56
+ # Set an ActiveSupport::Notifications compatible object to enable.
57
+ attr_accessor :instrumentation_service
58
+
59
+ # Public: String name for this Pipeline. Defaults to Class name.
60
+ attr_writer :instrumentation_name
61
# Returns the name set through the writer, falling back to the name of
# the receiver's class when none (or a falsy one) was assigned.
def instrumentation_name
  return @instrumentation_name if @instrumentation_name

  self.class.name
end
64
+
65
+ class << self
66
+ # Public: Default instrumentation service for new pipeline objects.
67
+ attr_accessor :default_instrumentation_service
68
+ end
69
+
70
# Build a pipeline.
#
# filters         - Array of filters; nested Arrays are flattened and the
#                   resulting list is frozen.
# default_context - Hash used as the base context of every run. It is
#                   frozen in place. Must not be nil.
# result_class    - Class stored as the result class. Defaults to Hash.
#
# The instrumentation service starts out as the class-level default.
#
# Raises ArgumentError when default_context is nil.
def initialize(filters, default_context = {}, result_class = nil)
  if default_context.nil?
    raise ArgumentError, "default_context cannot be nil"
  end

  @filters                 = filters.flatten.freeze
  @default_context         = default_context.freeze
  @result_class            = result_class || Hash
  @instrumentation_service = self.class.default_instrumentation_service
end
77
+
78
# Run every filter of the pipeline, in order, over the given input.
#
# html    - The input handed to the first filter (a String of markup or a
#           DocumentFragment).
# context - Hash merged over the default context. The merged Hash is frozen
#           before any filter sees it.
# result  - Object receiving the output. When nil, a new instance of the
#           pipeline's result class is created.
#
# The whole run is wrapped in a "call_pipeline.html_pipeline" instrumentation
# event whose payload carries the filter names, the context and the result.
#
# Returns the result object, with the value returned by the last filter
# stored under :output.
def call(html, context = {}, result = nil)
  context = @default_context.merge(context).freeze
  result ||= @result_class.new

  payload = default_payload(
    :filters => @filters.map(&:name),
    :context => context,
    :result  => result
  )

  instrument("call_pipeline.html_pipeline", payload) do
    output = html
    @filters.each do |filter|
      output = perform_filter(filter, output, context, result)
    end
    result[:output] = output
  end

  result
end
104
+
105
# Internal: Run a single filter inside a "call_filter.html_pipeline"
# instrumentation event.
#
# filter  - The filter to run; must respond to name and call.
# doc     - The input for this filter.
# context - The context Hash passed through to the filter.
# result  - The result object passed through to the filter.
#
# Returns whatever the filter returns.
def perform_filter(filter, doc, context, result)
  payload = default_payload(:filter => filter.name, :context => context, :result => result)
  instrument("call_filter.html_pipeline", payload) { filter.call(doc, context, result) }
end
117
+
118
# Run the pipeline and return its output as a DocumentFragment.
#
# Takes the same arguments as call. The :output of the run is passed through
# HTML::Pipeline.parse, so String output is parsed and an existing
# document is returned unchanged.
def to_document(input, context = {}, result = nil)
  output = call(input, context, result)[:output]
  HTML::Pipeline.parse(output)
end
125
+
126
# Run the pipeline and return its output as a String of HTML markup.
#
# input   - The input handed to call.
# context - The per-run context Hash handed to call.
# result  - Optional result object. It is forwarded to call so that
#           information written by filters reaches the caller. (It used to
#           be overwritten with nil before the call and was silently lost.)
#
# Returns output.to_html when the pipeline output responds to to_html,
# output.to_s otherwise.
def to_html(input, context = {}, result = nil)
  output = call(input, context, result)[:output]
  output.respond_to?(:to_html) ? output.to_html : output.to_s
end
136
+
137
# Public: Configure instrumentation for this pipeline.
#
# name    - Value assigned to instrumentation_name (nil is assigned as well).
# service - Instrumentation service to use. When nil or false, the
#           class-level default service is used instead.
#
# The return value is the service that was assigned; callers are not
# expected to rely on it.
def setup_instrumentation(name = nil, service = nil)
  self.instrumentation_name = name
  service ||= self.class.default_instrumentation_service
  self.instrumentation_service = service
end
145
+
146
# Internal: Run the block, instrumented when a service is configured.
#
# event   - String event name passed to the service.
# payload - Hash payload; default_payload is used when nil.
#
# Without an instrumentation service the block is simply yielded the
# payload. With one, the service's instrument method is called and the
# block is yielded whatever payload the service yields.
#
# Returns the value produced by the no-service yield, or the return value
# of the service's instrument method.
def instrument(event, payload = nil)
  payload ||= default_payload
  service = instrumentation_service
  return yield(payload) unless service

  service.instrument(event, payload) { |event_payload| yield event_payload }
end
157
+
158
# Internal: Build an instrumentation payload.
#
# payload - Hash of extra entries; its keys win over the defaults.
#
# Returns a new Hash holding :pipeline (the instrumentation name) merged
# with the given entries.
def default_payload(payload = {})
  base = { :pipeline => instrumentation_name }
  base.merge(payload)
end
166
+ end
167
+ end
@@ -0,0 +1,47 @@
1
+ module HTML
2
+ class Pipeline
3
+
4
+ class CustomLinksFilter < Filter
5
+
6
+ LF_REGEXP = /\[\[\[([ '\.:\-\p{Word}]+)\]\]\]/
7
+ WP_REGEXP = /\[\[([ '\.+:!\-\(\)\p{Word}]+)\]\]/
8
+
9
+ LF_TITLE = "Lien du wiki interne LinuxFr.org"
10
+ WP_TITLE = "Définition Wikipédia"
11
+
12
+ # Don't look for links in text nodes that are children of these elements
13
+ IGNORE_PARENTS = %w(pre code a).to_set
14
+
15
# Replace wiki-style link markup in the document's text nodes with anchors.
#
# Text nodes without "[[" and text nodes that have an ancestor listed in
# IGNORE_PARENTS are skipped. For the others, the serialized text goes
# through the internal-wiki rewrite and then the Wikipedia rewrite, and the
# node is replaced only when the markup actually changed.
#
# Returns the document.
def call
  doc.search('text()').each do |text_node|
    original = text_node.to_html
    next unless original.include?('[[')
    next if has_ancestor?(text_node, IGNORE_PARENTS)

    linked = process_wikipedia_links(process_internal_wiki_links(original))
    text_node.replace(linked) unless linked == original
  end
  doc
end
28
+
29
# Turn every [[[Page]]] occurrence into a link to the internal wiki.
#
# text - String of HTML-serialized text.
#
# The link target is "//<context[:host]>/wiki/<Page>" and the link text is
# the page name. The captured name is inserted with a block rather than a
# replacement string: in a double-quoted String "\1" is the byte 0x01, not
# a backreference, which is what previously corrupted the href.
#
# Returns a new String.
def process_internal_wiki_links(text)
  base_url = "//#{context[:host]}/wiki"
  text.gsub(LF_REGEXP) do
    page = Regexp.last_match(1)
    "<a href=\"#{base_url}/#{page}\" title=\"#{LF_TITLE}\">#{page}</a>"
  end
end
33
+
34
# Turn every [[Word]] occurrence into a link to French Wikipedia.
#
# text - String of HTML-serialized text.
#
# The href uses the whole captured word. The visible label drops a leading
# "de", "en", "es", "eo" or "wikt" segment (segments are separated by ":").
# The stray ")" that used to follow the title attribute has been removed.
#
# Returns a new String.
def process_wikipedia_links(text)
  text.gsub(WP_REGEXP) do
    word = Regexp.last_match(1)
    # NOTE(review): parentheses and apostrophes are backslash-prefixed in the
    # href. That looks like escaping meant for Markdown output rather than
    # for a URL; kept as-is here, confirm whether it is still wanted.
    escaped = word.gsub(/\(|\)|'/) { |char| "\\#{char}" }
    parts = word.split(":")
    parts.shift if %w(de en es eo wikt).include?(parts.first)
    "<a href=\"http://fr.wikipedia.org/wiki/#{escaped}\" title=\"#{WP_TITLE}\">#{parts.join ':'}</a>"
  end
end
43
+
44
+ end
45
+
46
+ end
47
+ end
@@ -0,0 +1,166 @@
1
+ module HTML
2
+ class Pipeline
3
+ # Base class for user content HTML filters. Each filter takes an
4
+ # HTML string or Nokogiri::HTML::DocumentFragment, performs
5
+ # modifications and/or writes information to the result hash. Filters must
6
+ # return a DocumentFragment (typically the same instance provided to the call
7
+ # method) or a String with HTML markup.
8
+ #
9
+ # Example filter that replaces all images with trollface:
10
+ #
11
+ # class FuuuFilter < HTML::Pipeline::Filter
12
+ # def call
13
+ # doc.search('img').each do |img|
14
+ # img['src'] = "http://paradoxdgn.com/junk/avatars/trollface.jpg"
15
+ # end
16
+ # end
17
+ # end
18
+ #
19
+ # The context Hash passes options to filters and should not be changed in
20
+ # place. A Result Hash allows filters to make extracted information
21
+ # available to the caller and is mutable.
22
+ #
23
+ # Common context options:
24
+ # :base_url - The site's base URL
25
+ # :repository - A Repository providing context for the HTML being processed
26
+ #
27
+ # Each filter may define additional options and output values. See the class
28
+ # docs for more info.
29
class Filter
  # Raised by #html when the filter holds neither an HTML String nor a
  # document object.
  class InvalidDocumentException < StandardError; end

  # Public: Returns the Hash of options handed to the filter.
  attr_reader :context

  # Public: Returns the Hash filters write extracted information to, so that
  # it reaches callers of the pipeline.
  attr_reader :result

  # doc     - A String of HTML, or an already parsed document object.
  # context - Hash of options (default: a new empty Hash).
  # result  - Hash receiving filter output (default: a new empty Hash).
  #
  # #validate runs at the end of construction, so a subclass can reject an
  # incomplete context before any work is done.
  def initialize(doc, context = nil, result = nil)
    if doc.kind_of?(String)
      @html = doc.to_str
      @doc = nil
    else
      @doc = doc
      @html = nil
    end
    @context = context || {}
    @result = result || {}
    validate
  end

  # The document to be manipulated. When the filter was given a String, it
  # is parsed the first time this method is called and then memoized.
  def doc
    @doc ||= parse_html(html)
  end

  # The String representation of the document. When the filter was given a
  # document object, it is serialized with to_html on each call.
  #
  # Raises InvalidDocumentException when the filter has neither form.
  def html
    raise InvalidDocumentException if @html.nil? && @doc.nil?
    @html || doc.to_html
  end

  # The main filter entry point. Subclasses must override it; they should
  # modify the document in place and/or add extracted information to the
  # result hash (the context hash is not meant to be modified).
  #
  # Raises NotImplementedError in this base class.
  def call
    raise NotImplementedError
  end

  # Make sure the context has everything we need. No-op here; subclasses can
  # override it, typically by calling #needs.
  def validate
  end

  # The value stored under :repository in the context hash, or nil.
  def repository
    context[:repository]
  end

  # The value stored under :current_user in the context hash, or nil.
  def current_user
    context[:current_user]
  end

  # The site's base URL from the context hash, or '/' when none was given.
  def base_url
    context[:base_url] || '/'
  end

  # Ensure the argument is a document: delegates to HTML::Pipeline.parse.
  def parse_html(html)
    HTML::Pipeline.parse(html)
  end

  # Helper for subclasses: does any ancestor of node have one of the given
  # tag names?
  #
  # node - The node to start from; its parents are walked until nil.
  # tags - A collection of lowercase tag names responding to include?.
  #
  # Returns true when a matching ancestor exists, false otherwise (this
  # used to be nil, which was falsy but not a proper boolean).
  def has_ancestor?(node, tags)
    while (node = node.parent)
      return true if tags.include?(node.name.downcase)
    end
    false
  end

  # Perform the filter on doc with the given context and result.
  #
  # Returns whatever the instance-level #call returns.
  def self.call(doc, context = nil, result = nil)
    new(doc, context, result).call
  end

  # Like call, but the return value is passed through HTML::Pipeline.parse
  # so that a document is returned even when the filter returns a String.
  def self.to_document(input, context = nil)
    html = call(input, context)
    HTML::Pipeline::parse(html)
  end

  # Like call, but guarantees that a String of HTML markup is returned.
  def self.to_html(input, context = nil)
    output = call(input, context)
    if output.respond_to?(:to_html)
      output.to_html
    else
      output.to_s
    end
  end

  # Validator for required context: checks that every given key is present
  # in the context hash.
  #
  # keys - The context keys this filter requires.
  #
  # Raises ArgumentError naming the filter class and all missing keys.
  def needs(*keys)
    missing = keys.reject { |key| context.include? key }

    if missing.any?
      raise ArgumentError,
        "Missing context keys for #{self.class.name}: #{missing.map(&:inspect).join ', '}"
    end
  end
end
165
+ end
166
+ end
@@ -0,0 +1,25 @@
1
+ module HTML
2
+ class Pipeline
3
+
4
# Ready-made pipeline configuration for LinuxFr.org content.
class LinuxFr
  # Default context handed to the pipeline built by render.
  CONTEXT = {
    toc_minimal_length: 5000,
    toc_header: "<h2 class=\"sommaire\">Sommaire</h2>\n",
    host: "linuxfr.org"
  }

  # Render text through the LinuxFr filter chain: Markdown, table of
  # contents, syntax highlighting, relative links, then custom links.
  #
  # text - The source text handed to the pipeline.
  #
  # A new pipeline is built on every call. Returns the pipeline's :output
  # converted with to_s.
  def self.render(text)
    filters = [
      HTML::Pipeline::MarkdownFilter,
      HTML::Pipeline::TableOfContentsFilter,
      HTML::Pipeline::SyntaxHighlightFilter,
      HTML::Pipeline::RelativeLinksFilter,
      HTML::Pipeline::CustomLinksFilter
    ]
    HTML::Pipeline.new(filters, CONTEXT).call(text)[:output].to_s
  end
end
23
+
24
+ end
25
+ end
@@ -0,0 +1,76 @@
1
require 'cgi'
require 'redcarpet'
2
+
3
+ module HTML
4
+ class Pipeline
5
+
6
+ # LinuxFr Flavored Markdown
7
+ class LFMarkdown < Redcarpet::Render::HTML
8
+ attr_accessor :image_class
9
+
10
+ PARSER_OPTIONS = {
11
+ :no_intra_emphasis => true,
12
+ :tables => true,
13
+ :fenced_code_blocks => true,
14
+ :autolink => true,
15
+ :strikethrough => true,
16
+ :superscript => true
17
+ }
18
+
19
+ HTML_OPTIONS = {
20
+ :filter_html => true,
21
+ :no_styles => true,
22
+ :hard_wrap => true,
23
+ :xhtml => true
24
+ }
25
+
26
# Build the renderer.
#
# extensions - Hash of render options; entries of HTML_OPTIONS override
#              any key they share with it.
def initialize(extensions={})
  render_options = extensions.merge(HTML_OPTIONS)
  super(render_options)
end
29
+
30
# Render a header one level deeper than requested (level 1 becomes <h2>).
#
# text         - The header's inner markup.
# header_level - Integer level of the source header.
#
# The shifted level is capped at 6: HTML has no <h7>, which is what a
# level-6 header used to produce.
#
# Returns the header markup followed by a newline.
def header(text, header_level)
  level = [header_level + 1, 6].min
  "<h#{level}>#{text}</h#{level}>\n"
end
34
+
35
# Render struck-through text with an <s> element.
def strikethrough(text)
  opening = "<s>"
  closing = "</s>"
  "#{opening}#{text}#{closing}"
end
38
+
39
# Render an image.
#
# link     - The image URL; a blank link renders as the empty String.
# title    - The image title.
# alt_text - The alternative text.
#
# Rendering is delegated to a top-level Image class that is not defined in
# this file (see the FIXME below) and relies on blank? being available.
def image(link, title, alt_text)
  if link.blank?
    ""
  else
    ::Image.new(link, title, alt_text).to_html # FIXME
  end
end
43
+
44
# Escape plain text and apply French typographic conventions.
#
# text - The raw text run.
#
# After HTML-escaping (CGI.escapeHTML, from the 'cgi' standard library,
# which this file has to require itself), the following replacements are
# applied in order:
#   * the space after an opening guillemet becomes a non-breaking space
#   * the space before : ; » ! ? becomes a non-breaking space
#   * " -- " becomes an em dash
#   * "..." becomes an ellipsis character
#
# Returns a new String; the argument is never modified.
def normal_text(text)
  CGI.escapeHTML(text)
     .gsub('« ', '«&nbsp;')
     .gsub(/ ([:;»!?])/, '&nbsp;\1')
     .gsub(' -- ', '—')
     .gsub('...', '…')
end
52
+
53
+ end
54
+
55
+
56
+ # HTML Filter that converts Markdown text into HTML and converts into a
57
+ # DocumentFragment. This is different from most filters in that it can take a
58
+ # non-HTML as input. It must be used as the first filter in a pipeline.
59
+ #
60
+ # This filter does not write any additional information to the context hash.
61
class MarkdownFilter < TextFilter
  # text    - The Markdown source.
  # context - Optional context Hash.
  # result  - Optional result Hash.
  #
  # Carriage returns are removed from @text after the superclass
  # constructor has run (presumably TextFilter sets @text; it is not
  # defined in this file).
  def initialize(text, context = nil, result = nil)
    super(text, context, result)
    @text = @text.gsub("\r", '')
  end

  # Render the Markdown source with an LFMarkdown renderer configured with
  # LFMarkdown::PARSER_OPTIONS. A new Redcarpet::Markdown instance is built
  # on every call.
  #
  # Returns the value of Redcarpet::Markdown#render.
  def call
    markdown = Redcarpet::Markdown.new(LFMarkdown, LFMarkdown::PARSER_OPTIONS)
    markdown.render(@text)
  end
end
74
+
75
+ end
76
+ end