pagehub-markdown 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,13 @@
1
+ require 'redcarpet'
2
+ require 'albino'
3
+
4
+ require 'pagehub-markdown/markdown'
5
+ require 'pagehub-markdown/processor'
6
+ require 'pagehub-markdown/mutator'
7
+ require 'pagehub-markdown/processors/embedder'
8
+ require 'pagehub-markdown/processors/pagehub_options'
9
+ require 'pagehub-markdown/processors/toc_generator'
10
+ require 'pagehub-markdown/mutators/date_injector'
11
+
12
# Root namespace of the pagehub-markdown gem. The files required above
# reopen it to define PageHub::Markdown and its processors and mutators.
module PageHub
end
@@ -0,0 +1,128 @@
1
+ module PageHub
2
+ module Markdown
3
+
4
+ class << self
5
+ def add_processor(stage, p) # :nodoc:
6
+ Stages.each { |s| @@hooks[s] ||= [] }
7
+
8
+ unless Stages.include?(stage.to_sym)
9
+ raise "Invalid stage #{stage}. Allowed stages are #{Stages.join(', ')}"
10
+ end
11
+
12
+ unless p.respond_to?(:call)
13
+ raise "Processor must be a callable object."
14
+ end
15
+
16
+ if stage.is_a? Array
17
+ stage.each { |s| @@hooks[s] << p }
18
+ else
19
+ @@hooks[stage.to_sym] << p
20
+ end
21
+
22
+ end
23
+
24
+ def add_mutator(m) # :nodoc:
25
+ unless m.respond_to?(:call)
26
+ raise "Mutator must be a callable object."
27
+ end
28
+
29
+ @@mutators << m
30
+ end
31
+
32
+ # (re)constructs the renderer with the given options, see
33
+ # PageHubOptions, RendererOptions, and RendererExtensions
34
+ # for accepted values
35
+ def configure(ph_options = {}, options = {}, extensions = {})
36
+ @@options = PageHubOptions.merge(ph_options)
37
+
38
+ @@renderer = Redcarpet::Markdown.new(
39
+ HTMLWithAlbino.new(RendererOptions.merge(options)),
40
+ RendererExtensions.merge(extensions))
41
+ end
42
+
43
+ def render!(str)
44
+ configure unless @@renderer
45
+
46
+ @@hooks[:pre_render].each { |processor| processor.call(str) }
47
+
48
+ # escape any JavaScript snippets
49
+ if @@options[:escape_scripts]
50
+ str.gsub!(/\<script(.*)\>/i) {
51
+ mutated = true
52
+ "&lt;script#{$1}&gt;"
53
+ }
54
+ end
55
+
56
+ str = @@renderer.render(str)
57
+
58
+ @@hooks[:post_render].each { |processor| processor.call(str) }
59
+
60
+ str
61
+ end
62
+
63
+ def render(str)
64
+ o = str.dup; render!(o); o
65
+ end
66
+
67
+ def mutate!(str)
68
+ mutated = false
69
+ @@mutators.each { |m| mutated ||= m.call(str) }
70
+ mutated
71
+ end
72
+
73
+ end
74
+
75
+ protected
76
+
77
+ Stages = [ :pre_render, :post_render ]
78
+ @@hooks = { }
79
+ @@mutators = [ ]
80
+ @@options = { }
81
+
82
+ PageHubOptions = {
83
+ escape_scripts: true
84
+ }
85
+
86
+ RendererOptions = {
87
+ filter_html: false,
88
+ no_images: false,
89
+ no_links: false,
90
+ no_styles: false,
91
+ safe_links_only: false,
92
+ with_toc_data: true,
93
+ hard_wrap: false,
94
+ xhtml: false
95
+ }
96
+
97
+ RendererExtensions = {
98
+ no_intra_emphasis: true,
99
+ tables: false,
100
+ fenced_code_blocks: true,
101
+ autolink: true,
102
+ strikethrough: true,
103
+ lax_html_blocks: false,
104
+ space_after_headers: true,
105
+ superscript: true
106
+ }
107
+
108
+ private
109
+
110
+ # a renderer that uses Albino to highlight syntax
111
+ class HTMLWithAlbino < Redcarpet::Render::HTML
112
+ def block_code(code, language)
113
+ begin
114
+ # TODO: try to figure out whether @language is valid
115
+ out = Albino.colorize(code, language)
116
+ rescue Exception => e
117
+ out = ""
118
+ # return "-- INVALID CODE BLOCK, MAKE SURE YOU'VE SURROUNDED CODE WITH ```"
119
+ end
120
+
121
+ # just render the code as plain text if the language is invalid
122
+ out.empty? ? block_code(code, "text") : out
123
+ end
124
+ end
125
+
126
+ @@renderer = nil
127
+ end
128
+ end
@@ -0,0 +1,18 @@
1
+ module PageHub
2
+ module Markdown
3
+
4
+ add_mutator lambda { |str|
5
+ mutated = false
6
+ str.gsub!(/\[\!date(.*)\!\]/) {
7
+ mutated = true
8
+
9
+ format = $1.empty? ? "%D" : $1.strip
10
+
11
+ DateTime.now.strftime(format)
12
+ }
13
+
14
+ mutated
15
+ }
16
+
17
+ end # Markdown module
18
+ end # PageHub module
@@ -0,0 +1,250 @@
1
+ require 'open-uri'
2
+ require 'net/http'
3
+ require 'nokogiri'
4
+
5
+ module PageHub
6
+ module Markdown
7
+
8
+ # Downloads remote textual resources from websites
9
+ # and allows for content extraction from HTML pages
10
+ # so it can be neatly embedded in another page.
11
+ module Embedder
12
+
13
+ class EmbeddingError < RuntimeError; end
14
+ class InvalidSizeError < EmbeddingError; end
15
+ class InvalidTypeError < EmbeddingError; end
16
+
17
+ # Resources whose content-type is not specified in this
18
+ # list will be rejected
19
+ AllowedTypes = [/text\/plain/, /text\/html/, /application\/html/]
20
+
21
+ # Resources larger than 1 MByte will be rejected
22
+ MaximumLength = 1 * 1024 * 1024
23
+
24
+ # Resources served by any of the hosts specified in this list
25
+ # will be rejected
26
+ FilteredHosts = []
27
+
28
+ Timeout = 5
29
+
30
+ private
31
+
32
+ @@processors = []
33
+
34
+ public
35
+
36
+ class << self
37
+
38
+ # Performs a HEAD request to validate the resource, and if it
39
+ # passes the checks it will be downloaded and processed if
40
+ # any eligible Embedder::Processor is registered.
41
+ #
42
+ # Arguments:
43
+ # 1. raw_uri the full raw URI of the file to be embedded
44
+ # 2. source an optional identifier to specify the Processor
45
+ # that should be used to post-process the content
46
+ # 3. args options that can be meaningful to the Processor, if any
47
+ #
48
+ # Returns:
49
+ # A string containing the extracted data, or an empty one
50
+ def get_resource(raw_uri, source = "", args = "")
51
+ begin
52
+ uri = URI.parse(raw_uri)
53
+
54
+ # reject if the host is banned
55
+ return "" if FilteredHosts.include?(uri.host)
56
+
57
+ Net::HTTP.start(uri.host, uri.port) do |http|
58
+ http.open_timeout = Timeout
59
+ http.read_timeout = Timeout
60
+
61
+ # get the content type and length
62
+ ctype = ""
63
+ clength = 0
64
+ http.head(uri.path).each { |k,v|
65
+ # puts "#{k} => #{v}"
66
+ ctype = v if k == "content-type"
67
+ clength = v.to_i if k == "content-length"
68
+ }
69
+
70
+ raise InvalidTypeError.new ctype if !self.allowed?(ctype)
71
+ raise InvalidSizeError.new clength if clength > MaximumLength
72
+
73
+ open(raw_uri) { |f|
74
+ content = f.read
75
+
76
+ # invoke processors
77
+ keys = []
78
+ keys << source unless source.empty?
79
+ keys << raw_uri
80
+ @@processors.each { |p|
81
+ if p.applies_to?(keys) then
82
+ content = p.process(content, raw_uri, args)
83
+ break
84
+ end
85
+ }
86
+
87
+ return content
88
+ }
89
+ end
90
+ rescue EmbeddingError => e
91
+ # we want to escalate these errors
92
+ raise e
93
+ rescue Exception => e
94
+ # mask as a generic EmbeddingError
95
+ raise EmbeddingError.new e.message
96
+ end
97
+
98
+ ""
99
+ end
100
+
101
+ def allowed?(ctype)
102
+ AllowedTypes.each { |t| return true if t.match ctype }
103
+ false
104
+ end
105
+
106
+ def register_processor(proc)
107
+ @@processors ||= []
108
+ @@processors << proc
109
+ end
110
+
111
+ end # class << self
112
+
113
+ class Processor
114
+
115
+ # Processors apply to "keys" which can be written manually
116
+ # in Markdown by the user, or are found in the host portion
117
+ # of the resource URI
118
+ #
119
+ # IE, a Github Wiki processor would bind to the keys:
120
+ # "github-wiki", or/and <tt>/github.com.*\/wiki\//</tt>
121
+ #
122
+ # Manual keys are injected after the !include keyword:
123
+ # [!include github-wiki!](https://github.com/some-dude/wiki/Home)
124
+ #
125
+ def initialize(keys)
126
+ @keys = keys
127
+ super()
128
+ end
129
+
130
+ def process(content, uri, args = "")
131
+ raise NotImplementedError
132
+ end
133
+
134
+ def applies_to?(keys)
135
+ @keys.each { |h| keys.each { |k| return true if h.match k } }
136
+ false
137
+ end
138
+
139
+ # Node should be the root node that contains the embedded content,
140
+ # which will be stripped of all attributes and injected with new ones:
141
+ # 1. data-embed-uri containing the URI of the embedded resource
142
+ # 2. data-embed-src the name of the processor used for embedding
143
+ #
144
+ # All children nodes that have an @id attribute will have that attribute
145
+ # removed as well.
146
+ def stamp(node, uri, key)
147
+ node.xpath("//*[@id]").each { |node| node.remove_attribute "id" }
148
+ node.attributes.each_pair { |name,_| node.remove_attribute name }
149
+ node['data-embed-uri'] = uri
150
+ node['data-embed-src'] = key
151
+ end
152
+ end
153
+
154
+ # Extracts content from GitHub Wiki pages
155
+ #
156
+ # Bound keys:
157
+ # * "github-wiki"
158
+ # * URI("[...]github.com/[...]/wiki/[...]")
159
+ #
160
+ class GithubWikiProcessor < Processor
161
+ def initialize()
162
+ super(["github-wiki", /github.com.*\/wiki\//])
163
+ end
164
+
165
+ # Returns the content of the node <div class='markdown-body'></div>,
166
+ # it will also remove all id attributes of all content nodes.
167
+ #
168
+ # Supported options:
169
+ # 1. reduce-headings: all heading nodes (<h1> through <h5>) will be
170
+ # stepped one level, so h1 becomes h2, etc.
171
+ def process(content, uri, args = "")
172
+ html_doc = Nokogiri::HTML(content) do |config| config.noerror end
173
+
174
+ node = html_doc.xpath("//div[@class='markdown-body']").first
175
+
176
+ stamp(node, uri, 'github-wiki')
177
+
178
+ if args.include?("reduce-headings") then
179
+ 5.downto(1) { |level|
180
+ node.xpath("//h#{level}").each { |heading_node|
181
+ heading_node.name = "h#{level+1}"
182
+ }
183
+ }
184
+ end
185
+
186
+ node
187
+ end
188
+
189
+ end
190
+
191
+ # Extracts content from PageHub shared documents
192
+ #
193
+ # Bound keys:
194
+ # * "pagehub"
195
+ # * URI([...]pagehub.org/[...])
196
+ class PageHubProcessor < Processor
197
+ def initialize()
198
+ super(["pagehub", /pagehub.org/])
199
+ end
200
+
201
+ def process(content, uri, args = "")
202
+ html_doc = Nokogiri::HTML(content) do |config| config.noerror end
203
+ node = html_doc.xpath("//div[@id='content']").first
204
+ node.xpath('div[@id="breadcrumbs"]').remove
205
+ node.xpath('div[@id="bottom"]').remove
206
+ stamp(node, uri, 'pagehub')
207
+ node
208
+ end
209
+ end
210
+
211
+ register_processor(GithubWikiProcessor.new)
212
+ register_processor(PageHubProcessor.new)
213
+
214
+ end # Embedder module
215
+
216
+ add_processor :pre_render, lambda {|str|
217
+ # Embed remote references, if any
218
+ str.gsub!(/^\B\[\!include\s?(.*)\!\]\((.*)\)/) {
219
+ content = ""
220
+
221
+ uri = $2
222
+
223
+ # parse the content source and args, if any
224
+ source = ($1 || "").split.first || ""
225
+ args = ($1 || "").split || []
226
+ args = args[1..args.length].join(' ') unless args.empty?
227
+
228
+ begin
229
+ content = Embedder.get_resource(uri, source, args)
230
+ rescue Embedder::InvalidSizeError => e
231
+ content << "**Embedding error**: the file you tried to embed is too big - #{e.message.to_i} bytes."
232
+ content << " (**Source**: [#{$2}](#{$2}))\n\n"
233
+ rescue Embedder::InvalidTypeError => e
234
+ content << "**Embedding error**: the file type you tried to embed (`#{e.message}`) is not supported."
235
+ content << " (**Source**: [#{$2}](#{$2}))\n\n"
236
+ rescue Embedder::EmbeddingError => e
237
+ content << "**Embedding error**: #{e.message}."
238
+ content << " (**Source**: [#{$2}](#{$2}))\n\n"
239
+ end
240
+
241
+ # content = "<div data-embedded=true>#{content.to_s.to_markdown}</div>".to_markdown
242
+ # content = "#{content}"
243
+ content
244
+ }
245
+
246
+ str
247
+ }
248
+
249
+ end # Markdown module
250
+ end # PageHub module
@@ -0,0 +1,21 @@
1
+ module PageHub
2
+ module Markdown
3
+ add_processor :post_render, lambda { |str|
4
+ str.gsub!(/\[\!options(.*)\!\]/) {
5
+ opts = $1
6
+ out = ""
7
+
8
+ unless opts.empty?
9
+ opts = opts.split(' ').each { |opt|
10
+ case opt
11
+ when "no-title"
12
+ out += "<style>header h1 { display: none }</style>"
13
+ end
14
+ }
15
+ end
16
+
17
+ out
18
+ }
19
+ }
20
+ end # Markdown module
21
+ end # PageHub module
@@ -0,0 +1,98 @@
1
+ module PageHub
2
+ module Markdown
3
+ module ToC
4
+
5
+ # Builds a tree of headings from a given block of Markdown
6
+ # text, the returned list can be turned into HTML using
7
+ # ToC::to_html()
8
+ def self.from_markdown(markdown, threshold = 6)
9
+ self.from_content(/(#+)\s([^\n]+)/, lambda { |l, t| return l.length, t }, markdown, threshold)
10
+ end
11
+
12
+ # renders a table of content using nested <ol> list nodes
13
+ # from a given list of Heading objects produced by ToC::from_markdown()
14
+ def self.to_html(toc)
15
+ html = "<ol>"
16
+ toc.each { |heading| html << heading.to_html }
17
+ html << "</ol>"
18
+ html
19
+ end
20
+
21
+ private
22
+
23
+ def self.from_content(pattern, formatter, content, threshold)
24
+ headings = []
25
+ current = []
26
+ toc_index = 0
27
+ content.scan(pattern).each { |l, t|
28
+ level,title = formatter.call(l, t)
29
+
30
+ if level <= threshold
31
+ h = Heading.new(title, level, toc_index)
32
+ headings << h
33
+ current[level] = h
34
+ toc_index += 1 # toc_index is used for hyperlinking
35
+
36
+ # if there's a parent, attach this heading as a child to it
37
+ if current[level-1] then
38
+ current[level-1] << h
39
+ end
40
+ end
41
+ }
42
+
43
+ toc = []
44
+ headings.each { |h|
45
+ next if h.parent
46
+ toc << h
47
+ }
48
+
49
+ toc
50
+ end
51
+
52
+ class Heading
53
+ attr_accessor :level, :title, :children, :parent, :index
54
+
55
+ def initialize(title, level, index)
56
+ @title = title
57
+ @level = level
58
+ @index = index
59
+ @parent = nil
60
+ @children = []
61
+ super()
62
+ end
63
+
64
+ def <<(h)
65
+ @children.each { |child|
66
+ return if child.title == h.title
67
+ }
68
+
69
+ h.parent = self
70
+ @children << h
71
+ end
72
+
73
+ def to_html()
74
+ html = ""
75
+ html << "<li>"
76
+ html << "<a href=\"\#toc_#{index}\">" << title << "</a>"
77
+
78
+ if children.any? then
79
+ html << "<ol>"
80
+ children.each { |child| html << child.to_html }
81
+ html << "</ol>"
82
+ end
83
+
84
+ html << "</li>"
85
+ end
86
+ end
87
+ end
88
+
89
+ # register the processor
90
+ add_processor :pre_render, lambda { |str|
91
+ str.gsub!(/^\B\[\!toc(.*)\!\]/) {
92
+ ToC.to_html ToC.from_markdown(str, $1.empty? ? 6 : $1.strip.to_i)
93
+ }
94
+ str
95
+ }
96
+
97
+ end # Markdown module
98
+ end # PageHub module
metadata ADDED
@@ -0,0 +1,115 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: pagehub-markdown
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Ahmad Amireh
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-10-03 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: redcarpet
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: 2.1.1
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: 2.1.1
30
+ - !ruby/object:Gem::Dependency
31
+ name: albino
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: 1.3.3
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: 1.3.3
46
+ - !ruby/object:Gem::Dependency
47
+ name: json
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: 1.7.0
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: 1.7.0
62
+ - !ruby/object:Gem::Dependency
63
+ name: nokogiri
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: 1.5.5
70
+ type: :runtime
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: 1.5.5
78
+ description: A bunch of neat features added to the Markdown renderer via pure Markdown
79
+ syntax.
80
+ email: ahmad@amireh.net
81
+ executables: []
82
+ extensions: []
83
+ extra_rdoc_files: []
84
+ files:
85
+ - lib/pagehub-markdown.rb
86
+ - lib/pagehub-markdown/processors/toc_generator.rb
87
+ - lib/pagehub-markdown/processors/pagehub_options.rb
88
+ - lib/pagehub-markdown/processors/embedder.rb
89
+ - lib/pagehub-markdown/markdown.rb
90
+ - lib/pagehub-markdown/mutators/date_injector.rb
91
+ homepage: http://github.com/amireh/pagehub-markdown
92
+ licenses: []
93
+ post_install_message:
94
+ rdoc_options: []
95
+ require_paths:
96
+ - lib
97
+ required_ruby_version: !ruby/object:Gem::Requirement
98
+ none: false
99
+ requirements:
100
+ - - ! '>='
101
+ - !ruby/object:Gem::Version
102
+ version: '0'
103
+ required_rubygems_version: !ruby/object:Gem::Requirement
104
+ none: false
105
+ requirements:
106
+ - - ! '>='
107
+ - !ruby/object:Gem::Version
108
+ version: '0'
109
+ requirements: []
110
+ rubyforge_project:
111
+ rubygems_version: 1.8.23
112
+ signing_key:
113
+ specification_version: 3
114
+ summary: PageHub's extensions of GitHub's Redcarpet Markdown renderer.
115
+ test_files: []