pagehub-markdown 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,13 @@
1
# Entry point of the gem: loads the renderer dependencies, the Markdown
# module, and then every bundled processor and mutator.
require 'redcarpet'
require 'albino'

require 'pagehub-markdown/markdown'

# NOTE(review): the published file list does not contain
# pagehub-markdown/processor.rb or pagehub-markdown/mutator.rb, so requiring
# them unconditionally aborts loading of the whole gem with a LoadError.
# They are still loaded when present; any other LoadError is re-raised.
%w[processor mutator].each do |optional|
  feature = "pagehub-markdown/#{optional}"
  begin
    require feature
  rescue LoadError => e
    raise unless e.message.include?(feature)
  end
end

require 'pagehub-markdown/processors/embedder'
require 'pagehub-markdown/processors/pagehub_options'
require 'pagehub-markdown/processors/toc_generator'
require 'pagehub-markdown/mutators/date_injector'

# Root namespace of the gem.
module PageHub
end
@@ -0,0 +1,128 @@
1
module PageHub
  # Markdown rendering pipeline built around a Redcarpet renderer.
  #
  # render! passes the text through every :pre_render processor, the
  # Redcarpet renderer, and finally every :post_render processor.
  # Mutators are a separate list of callables that are applied by mutate!.
  module Markdown

    # Stages a processor can be attached to, see add_processor
    Stages = [ :pre_render, :post_render ]

    # Defaults of the PageHub-specific options accepted by configure
    PageHubOptions = {
      escape_scripts: true
    }

    # Defaults handed to the HTML renderer, see configure
    RendererOptions = {
      filter_html: false,
      no_images: false,
      no_links: false,
      no_styles: false,
      safe_links_only: false,
      with_toc_data: true,
      hard_wrap: false,
      xhtml: false
    }

    # Defaults of the Markdown extensions handed to Redcarpet, see configure
    RendererExtensions = {
      no_intra_emphasis: true,
      tables: false,
      fenced_code_blocks: true,
      autolink: true,
      strikethrough: true,
      lax_html_blocks: false,
      space_after_headers: true,
      superscript: true
    }

    @@hooks = { }
    @@mutators = [ ]
    @@options = { }
    @@renderer = nil

    class << self
      # Registers the callable +p+ for +stage+, which is either a single
      # stage name (Symbol or String) or an Array of stage names.
      #
      # Raises a RuntimeError if any stage is not listed in Stages or if
      # +p+ does not respond to #call.
      def add_processor(stage, p) # :nodoc:
        Stages.each { |s| @@hooks[s] ||= [] }

        # normalise first so that an Array of stages is validated too
        # (calling #to_sym on the Array itself raised NoMethodError)
        stages = Array(stage).map { |s| s.to_sym }
        invalid = stages - Stages

        unless invalid.empty?
          raise "Invalid stage #{invalid.join(', ')}. Allowed stages are #{Stages.join(', ')}"
        end

        unless p.respond_to?(:call)
          raise "Processor must be a callable object."
        end

        stages.each { |s| @@hooks[s] << p }
      end

      # Registers the callable +m+ as a mutator, see mutate!
      #
      # Raises a RuntimeError if +m+ does not respond to #call.
      def add_mutator(m) # :nodoc:
        unless m.respond_to?(:call)
          raise "Mutator must be a callable object."
        end

        @@mutators << m
      end

      # (re)constructs the renderer with the given options, see
      # PageHubOptions, RendererOptions, and RendererExtensions
      # for accepted values
      def configure(ph_options = {}, options = {}, extensions = {})
        @@options = PageHubOptions.merge(ph_options)

        @@renderer = Redcarpet::Markdown.new(
          HTMLWithAlbino.new(RendererOptions.merge(options)),
          RendererExtensions.merge(extensions))
      end

      # Renders +str+ in place: after the call +str+ holds the generated
      # HTML. Returns +str+.
      def render!(str)
        configure unless @@renderer

        hooks_for(:pre_render).each { |processor| processor.call(str) }

        # escape any JavaScript snippets; [^>]* stops at the end of the
        # opening tag so markup that follows on the same line is untouched
        if @@options[:escape_scripts]
          str.gsub!(/<script([^>]*)>/i) { "&lt;script#{$1}&gt;" }
        end

        # replace the content of the caller's string instead of rebinding
        # the local variable, otherwise the HTML never reaches the caller
        str.replace(@@renderer.render(str))

        hooks_for(:post_render).each { |processor| processor.call(str) }

        str
      end

      # Returns the rendered HTML of +str+ and leaves +str+ untouched.
      def render(str)
        render!(str.dup)
      end

      # Runs every registered mutator on +str+. Returns true if at least
      # one of them reported a change, false otherwise.
      def mutate!(str)
        mutated = false
        # not `mutated ||= m.call(str)`: that stops calling mutators as
        # soon as one of them has reported a change
        @@mutators.each { |m| mutated = true if m.call(str) }
        mutated
      end

      private

      # Processors registered for +stage+, or an empty list if none were
      def hooks_for(stage)
        @@hooks[stage] || []
      end

    end

    # a renderer that uses Albino to highlight syntax
    class HTMLWithAlbino < Redcarpet::Render::HTML
      def block_code(code, language)
        out =
          begin
            # TODO: try to figure out whether @language is valid
            Albino.colorize(code, language).to_s
          rescue StandardError
            ""
          end

        return out unless out.empty?

        # just render the code as plain text if the language is invalid
        return block_code(code, "text") unless language.to_s == "text"

        # even the plain text lexer produced nothing; emit an escaped block
        # rather than recursing forever
        escaped = code.to_s.gsub(/[&<>]/, "&" => "&amp;", "<" => "&lt;", ">" => "&gt;")
        "<pre><code>#{escaped}</code></pre>"
      end
    end

  end
end
@@ -0,0 +1,18 @@
1
module PageHub
  module Markdown

    # Mutator that replaces every [!date!] marker with the current date.
    #
    # An optional strftime format may follow the keyword, as in
    # [!date %Y-%m-%d!]; without one "%D" is used. Returns true if at
    # least one marker was replaced, false otherwise.
    add_mutator lambda { |str|
      # (.*?) keeps two markers on the same line from being merged into one
      replaced = str.gsub!(/\[\!date(.*?)\!\]/) {
        # strip before testing so that "[!date !]" falls back to the default
        format = $1.strip
        format = "%D" if format.empty?

        # Time is always loaded, DateTime needs `require 'date'`
        Time.now.strftime(format)
      }

      !replaced.nil?
    }

  end # Markdown module
end # PageHub module
@@ -0,0 +1,250 @@
1
+ require 'open-uri'
2
+ require 'net/http'
3
+ require 'nokogiri'
4
+
5
+ module PageHub
6
+ module Markdown
7
+
8
# Downloads remote textual resources from websites
# and allows for content extraction from HTML pages
# so it can be neatly embedded in another page.
module Embedder

  class EmbeddingError < RuntimeError; end
  class InvalidSizeError < EmbeddingError; end
  class InvalidTypeError < EmbeddingError; end

  # Resources whose content-type is not specified in this
  # list will be rejected
  AllowedTypes = [/text\/plain/, /text\/html/, /application\/html/]

  # Resources larger than 1 MByte will be rejected
  MaximumLength = 1 * 1024 * 1024

  # Resources served by any of the hosts specified in this list
  # will be rejected
  FilteredHosts = []

  # Seconds allowed for opening the connection and for reading from it
  # while the resource headers are validated
  Timeout = 5

  @@processors = []

  class << self

    # Performs a HEAD request to validate the resource, and if it
    # passes the checks it will be downloaded and processed if
    # any eligible Embedder::Processor is registered.
    #
    # Arguments:
    # 1. raw_uri the full raw URI of the file to be embedded
    # 2. source an optional identifier to specify the Processor
    #    that should be used to post-process the content
    # 3. args options that can be meaningful to the Processor, if any
    #
    # Returns:
    # A string containing the extracted data, or an empty one if the
    # host is listed in FilteredHosts
    #
    # Raises:
    # InvalidTypeError, InvalidSizeError, or a plain EmbeddingError for
    # any other failure (bad URI, network error, processor error, ...)
    def get_resource(raw_uri, source = "", args = "")
      uri = URI.parse(raw_uri)

      # the text comes straight from a document, never hand anything
      # but a plain web address to the network layer
      unless uri.is_a?(URI::HTTP) && uri.host
        raise EmbeddingError, "only http and https resources can be embedded"
      end

      # reject if the host is banned
      return "" if FilteredHosts.include?(uri.host)

      validate_headers(uri)

      content = uri.open { |f| f.read }

      # the server may not have announced a length, check what we got
      if content.bytesize > MaximumLength
        raise InvalidSizeError, content.bytesize.to_s
      end

      # invoke the first processor bound to the source or to the URI
      keys = []
      keys << source unless source.to_s.empty?
      keys << raw_uri

      processor = @@processors.find { |p| p.applies_to?(keys) }
      content = processor.process(content, raw_uri, args) if processor

      content.to_s
    rescue EmbeddingError
      # we want to escalate these errors
      raise
    rescue StandardError, NotImplementedError => e
      # mask as a generic EmbeddingError
      raise EmbeddingError, e.message
    end

    # Whether +ctype+ matches one of the patterns in AllowedTypes
    def allowed?(ctype)
      AllowedTypes.any? { |t| t.match(ctype.to_s) }
    end

    # Adds +proc+ to the processors tried by get_resource; processors are
    # tried in registration order and the first applicable one is used.
    def register_processor(proc)
      @@processors ||= []
      @@processors << proc
    end

    private

    # Issues a HEAD request for +uri+ and raises InvalidTypeError or
    # InvalidSizeError if the announced type or length is not acceptable.
    def validate_headers(uri)
      # timeouts and TLS have to be set before the connection is opened
      options = {
        use_ssl: uri.is_a?(URI::HTTPS),
        open_timeout: Timeout,
        read_timeout: Timeout
      }

      Net::HTTP.start(uri.host, uri.port, options) do |http|
        # request_uri keeps the query string and is "/" for a bare host
        head = http.head(uri.request_uri)

        ctype = head["content-type"].to_s
        clength = head["content-length"].to_i

        raise InvalidTypeError, ctype unless allowed?(ctype)
        raise InvalidSizeError, clength.to_s if clength > MaximumLength
      end
    end

  end # class << self

  class Processor

    # Processors apply to "keys" which can be written manually
    # in Markdown by the user, or are found in the host portion
    # of the resource URI
    #
    # IE, a Github Wiki processor would bind to the keys:
    # "github-wiki", or/and <tt>/github.com.*\/wiki\//</tt>
    #
    # Manual keys are injected after the !include keyword:
    # [!include github-wiki!](https://github.com/some-dude/wiki/Home)
    #
    def initialize(keys)
      @keys = keys
      super()
    end

    # Turns the downloaded +content+ into what should be embedded;
    # must be implemented by subclasses.
    def process(content, uri, args = "")
      raise NotImplementedError
    end

    # Whether any bound key applies to any of the given +keys+: Regexp
    # keys are matched against the candidate, every other key has to be
    # equal to it. The candidates come from the document, so they are
    # never compiled into a pattern themselves.
    def applies_to?(keys)
      @keys.any? { |bound|
        keys.any? { |candidate|
          if bound.is_a?(Regexp)
            !(bound =~ candidate.to_s).nil?
          else
            bound.to_s == candidate.to_s
          end
        }
      }
    end

    # Node should be the root node that contains the embedded content,
    # which will be stripped of all attributes and injected with new ones:
    # 1. data-embed-uri containing the URI of the embedded resource
    # 2. data-embed-src the name of the processor used for embedding
    #
    # All children nodes that have an @id attribute will have that attribute
    # removed as well.
    def stamp(node, uri, key)
      # ".//" restricts the search to the node, "//" scans the whole document
      node.xpath(".//*[@id]").each { |child| child.remove_attribute "id" }
      node.attributes.each_pair { |name, _| node.remove_attribute name }
      node['data-embed-uri'] = uri
      node['data-embed-src'] = key
      node
    end
  end

  # Extracts content from GitHub Wiki pages
  #
  # Bound keys:
  # * "github-wiki"
  # * URI("[...]github.com/[...]/wiki/[...]")
  #
  class GithubWikiProcessor < Processor
    def initialize()
      super(["github-wiki", /github.com.*\/wiki\//])
    end

    # Returns the content of the node <div class='markdown-body'></div>,
    # it will also remove all id attributes of all content nodes.
    #
    # Supported options:
    # 1. reduce-headings: all heading nodes (<h1> through <h5>) will be
    #    stepped one level, so h1 becomes h2, etc.
    #
    # Raises EmbeddingError if the page has no such node.
    def process(content, uri, args = "")
      html_doc = Nokogiri::HTML(content) do |config| config.noerror end

      # token match, so the div is found when it carries other classes too
      node = html_doc.xpath(
        "//div[contains(concat(' ', normalize-space(@class), ' '), ' markdown-body ')]"
      ).first
      raise EmbeddingError, "no wiki content found at #{uri}" unless node

      stamp(node, uri, 'github-wiki')

      if args.to_s.include?("reduce-headings")
        # deepest level first, so a heading is never stepped twice
        5.downto(1) { |level|
          node.xpath(".//h#{level}").each { |heading_node|
            heading_node.name = "h#{level + 1}"
          }
        }
      end

      node
    end

  end

  # Extracts content from PageHub shared documents
  #
  # Bound keys:
  # * "pagehub"
  # * URI([...]pagehub.org/[...])
  class PageHubProcessor < Processor
    def initialize()
      super(["pagehub", /pagehub.org/])
    end

    # Returns the node <div id='content'></div> without its breadcrumbs
    # and bottom blocks. Raises EmbeddingError if the page has no such node.
    def process(content, uri, args = "")
      html_doc = Nokogiri::HTML(content) do |config| config.noerror end

      node = html_doc.xpath("//div[@id='content']").first
      raise EmbeddingError, "no PageHub content found at #{uri}" unless node

      node.xpath('div[@id="breadcrumbs"]').remove
      node.xpath('div[@id="bottom"]').remove
      stamp(node, uri, 'pagehub')
      node
    end
  end

  register_processor(GithubWikiProcessor.new)
  register_processor(PageHubProcessor.new)

end # Embedder module
215
+
216
# Pre-render processor that embeds remote references, if any.
#
# A line of the form [!include SOURCE OPTIONS!](URI) is replaced by the
# content Embedder.get_resource returns for URI; SOURCE and OPTIONS are
# optional. When embedding fails the marker is replaced by a Markdown
# error note that links to the source.
add_processor :pre_render, lambda { |str|
  str.gsub!(/^\B\[\!include\s?(.*)\!\]\((.*)\)/) {
    # capture both groups right away instead of re-reading $1/$2 later
    directive, uri = $1.to_s, $2

    # the first word is the content source, the rest are its options;
    # args is always a String, also when no options were given
    source, *options = directive.split
    source ||= ""
    args = options.join(' ')

    origin = " (**Source**: [#{uri}](#{uri}))\n\n"

    begin
      Embedder.get_resource(uri, source, args).to_s
    rescue Embedder::InvalidSizeError => e
      "**Embedding error**: the file you tried to embed is too big - #{e.message.to_i} bytes." + origin
    rescue Embedder::InvalidTypeError => e
      "**Embedding error**: the file type you tried to embed (`#{e.message}`) is not supported." + origin
    rescue Embedder::EmbeddingError => e
      "**Embedding error**: #{e.message}." + origin
    end
  }

  str
}
248
+
249
+ end # Markdown module
250
+ end # PageHub module
@@ -0,0 +1,21 @@
1
module PageHub
  module Markdown
    # Post-render processor that replaces every [!options ...!] marker by
    # the markup of the options listed in it. The only recognised option is
    # "no-title", which emits a <style> rule hiding the header's <h1>;
    # unknown options are ignored.
    add_processor :post_render, lambda { |str|
      str.gsub!(/\[\!options(.*)\!\]/) {
        $1.split(' ').inject("") { |markup, opt|
          if opt == "no-title"
            markup + "<style>header h1 { display: none }</style>"
          else
            markup
          end
        }
      }
    }
  end # Markdown module
end # PageHub module
@@ -0,0 +1,98 @@
1
+ module PageHub
2
+ module Markdown
3
# Builds a table of contents out of the headings of a document.
module ToC

  # Builds a tree of headings from a given block of Markdown
  # text, the returned list can be turned into HTML using
  # ToC::to_html()
  #
  # Only "#"-style headings that start a line are recognised (one to six
  # "#" followed by blanks). Headings deeper than +threshold+ are left
  # out. Lines inside code blocks are not excluded.
  def self.from_markdown(markdown, threshold = 6)
    self.from_content(/^(\#{1,6})[ \t]+([^\n]+)/, lambda { |l, t| return l.length, t }, markdown, threshold)
  end

  # renders a table of content using nested <ol> list nodes
  # from a given list of Heading objects produced by ToC::from_markdown()
  def self.to_html(toc)
    "<ol>#{toc.map { |heading| heading.to_html }.join}</ol>"
  end

  # Scans +content+ for +pattern+ and hands the captures of every match to
  # +formatter+, which returns the level and the title of the heading.
  # Returns the root headings; nested ones are reachable via #children.
  def self.from_content(pattern, formatter, content, threshold)
    headings = []
    current = [] # current[level] is the open heading of that level

    # toc_index is used for hyperlinking; it counts every heading, also
    # the ones left out below, so it stays aligned with the position of
    # the heading in the document
    content.scan(pattern).each_with_index { |(l, t), toc_index|
      level, title = formatter.call(l, t)
      next if level > threshold

      h = Heading.new(title, level, toc_index)
      headings << h

      # attach to the closest open heading of a shallower level
      parent = current[0...level].compact.last
      parent << h if parent

      # headings of the same or a deeper level end here
      current = current[0...level]
      current[level] = h
    }

    headings.reject { |h| h.parent }
  end

  class Heading
    attr_accessor :level, :title, :children, :parent, :index

    def initialize(title, level, index)
      @title = title
      @level = level
      @index = index
      @parent = nil
      @children = []
      super()
    end

    # Adds +h+ as a child of this heading; adding the same heading twice
    # is a no-op. Two different headings may share a title.
    def <<(h)
      return if @children.any? { |child| child.equal?(h) }

      h.parent = self
      @children << h
    end

    # Renders this heading as an <li> that links to #toc_<index>, followed
    # by a nested <ol> of its children, if any.
    def to_html()
      html = "<li><a href=\"\#toc_#{index}\">#{title}</a>"
      html << "<ol>#{children.map { |child| child.to_html }.join}</ol>" if children.any?
      html << "</li>"
    end
  end
end
88
+
89
# register the processor: a line of the form [!toc!] or [!toc DEPTH!] is
# replaced by the table of contents of the document, limited to headings
# of at most DEPTH levels (6 when DEPTH is missing or not a positive number)
add_processor :pre_render, lambda { |str|
  str.gsub!(/^\B\[\!toc(.*)\!\]/) {
    # "[!toc !]" or a non-numeric argument used to yield depth 0 and
    # therefore an empty table
    depth = $1.strip.to_i
    depth = 6 unless depth > 0

    ToC.to_html(ToC.from_markdown(str, depth))
  }
  str
}
96
+
97
+ end # Markdown module
98
+ end # PageHub module
metadata ADDED
@@ -0,0 +1,115 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: pagehub-markdown
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Ahmad Amireh
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-10-03 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: redcarpet
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: 2.1.1
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: 2.1.1
30
+ - !ruby/object:Gem::Dependency
31
+ name: albino
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: 1.3.3
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: 1.3.3
46
+ - !ruby/object:Gem::Dependency
47
+ name: json
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: 1.7.0
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: 1.7.0
62
+ - !ruby/object:Gem::Dependency
63
+ name: nokogiri
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: 1.5.5
70
+ type: :runtime
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: 1.5.5
78
+ description: A bunch of neat features added to the Markdown renderer via pure Markdown
79
+ syntax.
80
+ email: ahmad@amireh.net
81
+ executables: []
82
+ extensions: []
83
+ extra_rdoc_files: []
84
+ files:
85
+ - lib/pagehub-markdown.rb
86
+ - lib/pagehub-markdown/processors/toc_generator.rb
87
+ - lib/pagehub-markdown/processors/pagehub_options.rb
88
+ - lib/pagehub-markdown/processors/embedder.rb
89
+ - lib/pagehub-markdown/markdown.rb
90
+ - lib/pagehub-markdown/mutators/date_injector.rb
91
+ homepage: http://github.com/amireh/pagehub-markdown
92
+ licenses: []
93
+ post_install_message:
94
+ rdoc_options: []
95
+ require_paths:
96
+ - lib
97
+ required_ruby_version: !ruby/object:Gem::Requirement
98
+ none: false
99
+ requirements:
100
+ - - ! '>='
101
+ - !ruby/object:Gem::Version
102
+ version: '0'
103
+ required_rubygems_version: !ruby/object:Gem::Requirement
104
+ none: false
105
+ requirements:
106
+ - - ! '>='
107
+ - !ruby/object:Gem::Version
108
+ version: '0'
109
+ requirements: []
110
+ rubyforge_project:
111
+ rubygems_version: 1.8.23
112
+ signing_key:
113
+ specification_version: 3
114
+ summary: PageHub's extensions of GitHub's Redcarpet Markdown renderer.
115
+ test_files: []