devcenter-parser 1.3.9 → 1.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: c86b19b7db5df1fbf3ff75efcf6e8c5e820d4144
4
- data.tar.gz: 005abc3e6957539c3c459748fae88dcc839338c0
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: 731e954adb885153b1d9f5cecb351e93af1c249d
4
+ data.tar.gz: bc8bca7b5de1f10ff4400b43da93c93b299bf352
5
5
  SHA512:
6
- metadata.gz: be7ebb71b4c901ed408ca12a52772abfd4ea2dbd12cf4ba24d8f57c0e4198e04e2685c9e2f484dc06c44ba12a4047d089cf1b7ef9f418cfd27d5cecfa9c3ccea
7
- data.tar.gz: e3440ec9fcd6918dec57d4e16942a4d1ed98a871699ecc1afbe2f027b26f3593f38b30357ca31b496bf399556468639831e1d7ae7f246a92ac4538960adb15df
6
+ metadata.gz: dbfa585bb7106530d36e94166f845acd49c96c952fab64a1a96e89eb0fc3d76772ff6537357047388d40596e264132d96eaa7d2c1a7f479c6d8f70676317b4da
7
+ data.tar.gz: 443d781a6d537f9612f8b8dcb4809a8d766f31c362f9826ecccb4c56892f11654a89a54c7461027405c5598afbe4ad25b012f3eeea41f1a67b40e2f901310627
data/README.md CHANGED
@@ -20,8 +20,14 @@ Usage:
20
20
  end
21
21
  ```
22
22
 
23
+ Test:
24
+
25
+ ```bash
26
+ $ rake
27
+ ```
28
+
23
29
  ## License
24
30
  See the LICENSE file included in the distribution.
25
31
 
26
32
  ## Copyright
27
- Copyright (C) 2013 Heroku <raul@heroku.com>.
33
+ Copyright (C) 2013 Heroku <raul@heroku.com>.
@@ -3,45 +3,29 @@ require 'redcarpet'
3
3
  require 'nokogiri'
4
4
  require 'uri'
5
5
  require 'sanitize'
6
+ require 'ostruct'
7
+ require_relative './devcenter-parser/header_id_generator'
8
+ require_relative './devcenter-parser/github_parser'
9
+ require_relative './devcenter-parser/maruku_parser'
6
10
 
7
11
  module DevcenterParser
8
-
9
12
  AVAILABLE_FLAVOURS = [:github, :maruku]
10
13
 
11
14
  class InvalidMarkdownError < Exception; end
12
15
  class InvalidRawHTMLError < Exception; end
13
16
  class UnknownFlavourError < Exception; end
14
17
 
15
- class HTMLWithPantsRenderer < Redcarpet::Render::HTML
16
- include Redcarpet::Render::SmartyPants
17
- end
18
-
19
18
  def self.to_html(markdown, flavour)
20
19
  html = to_unsanitized_html(markdown, flavour.to_sym)
21
20
  sanitize(html)
22
21
  end
23
22
 
24
23
  def self.to_unsanitized_html(markdown, flavour)
25
- markdown = normalize_newlines(markdown.to_s)
26
- markdown = separate_consecutive_blockquote_blocks(markdown)
27
- doc = case flavour.to_sym
28
- when :maruku
29
- html = Maruku.new(markdown, :on_error => :raise).to_html
30
- doc = Nokogiri::HTML::DocumentFragment.parse(html)
31
- maruku_code_blocks(doc)
32
- maruku_underscores_to_dashes_in_subheader_anchors(doc)
33
- when :github
34
- html = github_parser.render(markdown.to_s)
35
- doc = Nokogiri::HTML::DocumentFragment.parse(html)
36
- github_parse_special_blocks(doc)
37
- github_underscores_to_dashes_in_subheader_anchors(doc)
38
- else
39
- raise UnknownFlavourError, "Markdown flavour '#{flavour}' not supported"
40
- end
41
- convert_to_article_links_all_relative_links_with_missing_initial_slashes(doc)
42
- html = doc.to_html(:encoding => 'utf-8')
43
- verify_raw_html(html)
44
- html
24
+ raise(UnknownFlavourError, "Markdown flavour '#{flavour}' not supported") unless %w{ maruku github }.include?(flavour.to_s)
25
+ markdown = normalize_markdown(markdown)
26
+ markdown_parser = flavour.to_s == 'maruku' ? MarukuParser : GitHubParser
27
+ doc = markdown_parser.parse(markdown)
28
+ doc_to_html(doc)
45
29
  rescue InvalidRawHTMLError => e
46
30
  raise InvalidMarkdownError, e.message
47
31
  rescue => e
@@ -54,9 +38,22 @@ module DevcenterParser
54
38
 
55
39
  private
56
40
 
41
+ def self.doc_to_html(doc)
42
+ HeaderIdGenerator.apply!(doc)
43
+ convert_to_article_links_all_relative_links_with_missing_initial_slashes(doc)
44
+ html = doc.to_html(:encoding => 'utf-8')
45
+ verify_raw_html(html)
46
+ html
47
+ end
48
+
49
+ def self.normalize_markdown(markdown)
50
+ markdown = normalize_newlines(markdown.to_s)
51
+ separate_consecutive_blockquote_blocks(markdown)
52
+ end
53
+
57
54
  # The current parsers consider something like:
58
55
  # > foo
59
- #
56
+ #
60
57
  # > bar
61
58
  # as a single blockquote, while we want it to be two different ones.
62
59
  # This method adds an empty paragraph between consecutive blocks so parsers process them separately
@@ -69,10 +66,6 @@ module DevcenterParser
69
66
  markdown.lines.map{ |l| l.rstrip }.join("\n")
70
67
  end
71
68
 
72
- def self.github_parser
73
- @@github_parser ||= Redcarpet::Markdown.new(HTMLWithPantsRenderer, fenced_code_blocks: true, tables: true)
74
- end
75
-
76
69
  def self.sanitize_config
77
70
  return @@sanitize_config if defined?(@@sanitize_config)
78
71
  config = Sanitize::Config::RELAXED
@@ -92,50 +85,6 @@ module DevcenterParser
92
85
  @@sanitize_config = config.merge({remove_contents: true, allow_comments: true})
93
86
  end
94
87
 
95
- def self.maruku_code_blocks(doc)
96
- doc.css('pre>code').each do |node|
97
- if match = node.content.match(/\A\s*:::\s*(\w+)/)
98
- lang = match[1]
99
- node.content = node.content.gsub(/\A\s*:::\s*\w+\n/, '')
100
- node['class'] = lang
101
- end
102
- end
103
- doc
104
- end
105
-
106
- def self.maruku_underscores_to_dashes_in_subheader_anchors(doc)
107
- doc.css("h2,h3,h4,h5,h6").each do |node|
108
- node['id'] = subheader_id(node.content)
109
- end
110
- doc
111
- end
112
-
113
- def self.github_underscores_to_dashes_in_subheader_anchors(doc)
114
- doc.css("h2,h3,h4,h5,h6").each do |node|
115
- node['id'] = subheader_id(node.content)
116
- end
117
- doc
118
- end
119
-
120
- def self.subheader_id(content)
121
- content.to_s.downcase.gsub(/\W+/, '-').gsub(/\A-+|-+\Z/, '')
122
- end
123
-
124
- def self.github_parse_special_blocks(doc)
125
- doc.css('blockquote>p:first').each do |node|
126
- if match = node.inner_html.match(/\A\W*(callout|warning|note)\W/)
127
- node.parent.name = 'div'
128
- node.parent['class'] = match[1]
129
-
130
- new_html = node.inner_html.gsub(/\A\W*(callout|warning|note)\W/, '')
131
-
132
- # Assigning inner_html directly causes encoding issues in old libxml versions,
133
- # workaround from https://github.com/sparklemotion/nokogiri/issues/458#issuecomment-3136620
134
- node.children = Nokogiri::HTML.fragment(new_html, 'utf-8')
135
- end
136
- end
137
- end
138
-
139
88
  def self.convert_to_article_links_all_relative_links_with_missing_initial_slashes(doc)
140
89
  doc.css('a').each do |node|
141
90
  unless node['href'].nil? || node['href'] =~ /\Ahttp|\A\/|\Amailto\:|\A#/
@@ -164,4 +113,4 @@ module DevcenterParser
164
113
  broken_html = html.match(/REXML could not parse this XML\/HTML\:(.+)<\/pre>/m)[1].strip rescue nil
165
114
  broken_html.nil? ? "Contains broken raw HTML." : "This raw HTML is invalid: #{CGI.unescapeHTML(broken_html)}"
166
115
  end
167
- end
116
+ end
@@ -0,0 +1,35 @@
# Parser for the :github markdown flavour.
#
# Renders markdown with Redcarpet (SmartyPants typography, fenced code blocks
# and tables enabled), loads the resulting HTML into a Nokogiri fragment and
# turns blockquotes that start with "callout", "warning" or "note" into divs
# carrying that word as their CSS class.
module GitHubParser
  # Redcarpet HTML renderer with SmartyPants mixed in.
  class HTMLWithPantsRenderer < Redcarpet::Render::HTML
    include Redcarpet::Render::SmartyPants
  end

  # Leading marker of a special block: optional non-word characters, one of
  # the known block kinds (captured) and a single trailing non-word character.
  SPECIAL_BLOCK_PREFIX = /\A\W*(callout|warning|note)\W/

  # Converts markdown into a Nokogiri document fragment.
  #
  # markdown - the markdown source; it is converted with +to_s+ before rendering.
  #
  # Returns a Nokogiri::HTML::DocumentFragment with the special blocks applied.
  def self.parse(markdown)
    html = github_parser.render(markdown.to_s)
    doc = Nokogiri::HTML::DocumentFragment.parse(html)
    special_blocks(doc)
    doc
  end

  # Memoized Redcarpet renderer. It is kept in a module-level instance
  # variable rather than a class variable, so the memo belongs to this module only.
  def self.github_parser
    @github_parser ||= Redcarpet::Markdown.new(HTMLWithPantsRenderer, fenced_code_blocks: true, tables: true)
  end

  # Rewrites, in place, every blockquote whose first paragraph starts with a
  # special block marker: the blockquote becomes a div with the marker as its
  # class, and the marker is removed from the paragraph.
  def self.special_blocks(doc)
    doc.css('blockquote>p:first').each do |node|
      match = node.inner_html.match(SPECIAL_BLOCK_PREFIX)
      next unless match

      node.parent.name = 'div'
      node.parent['class'] = match[1]

      # The pattern is anchored at the start, so what follows the match is
      # exactly the paragraph without its marker.
      new_html = match.post_match

      # Assigning inner_html directly causes encoding issues in old libxml versions,
      # workaround from https://github.com/sparklemotion/nokogiri/issues/458#issuecomment-3136620
      node.children = Nokogiri::HTML.fragment(new_html, 'utf-8')
    end
  end

  # A bare `private` does not apply to `def self.` methods, so the helpers
  # are hidden explicitly.
  private_class_method :github_parser, :special_blocks
end
@@ -0,0 +1,69 @@
# Generates header -> ids in the given doc, calculating the ids from the heading
# text and ensuring that there are no duplicated ids.
#
# Only h2..h6 elements are touched. Ids are resolved in three passes:
#   1. every header gets an id derived from its own text;
#   2. headers whose id clashes get the text of their parent headers prepended;
#   3. headers that still clash get a numeric suffix ("-1", "-2", ...).
class HeaderIdGenerator
  # CSS selector for the headers that receive an id.
  HEADER_SELECTOR = "h2,h3,h4,h5,h6".freeze

  # Sets the ids on the headers of +doc+ in place.
  #
  # doc - an object answering +css(selector)+ with a list of nodes that
  #       respond to +name+, +content+ and +[]=+ (e.g. a Nokogiri fragment).
  #
  # Returns the generator instance.
  def self.apply!(doc)
    new(doc)
  end

  # Computes the ids and writes them to the 'id' attribute of every header
  # found in +doc+. All the work happens here; see .apply! for the argument.
  def initialize(doc)
    @doc = doc
    @header_nodes = @doc.css(HEADER_SELECTOR).to_a

    # { node -> id } hash
    @nodes_ids = @header_nodes.each_with_object({}) { |node, ids| ids[node] = nil }

    add_default_ids
    prepend_parents_on_conflicts
    append_numbers_on_conflicts

    @nodes_ids.each { |node, id| node['id'] = id }
  end

  private

  # Parent != DOM nesting, but in the context of the content <h2></h2> ... <h3></h3>

  # Recomputes the id of every clashing header from the text of its parent
  # headers followed by its own text.
  def prepend_parents_on_conflicts
    conflicts(@nodes_ids).each do |node, _id|
      parent_contents = parent_header_nodes(node).map { |parent_node| parent_node.content }
      content = (parent_contents + [node.content]).join(' ')
      @nodes_ids[node] = subheader_id(content)
    end
  end

  # Returns, from the outermost level inwards, the closest header of each
  # shallower level that appears before +node+. Levels without a preceding
  # header are skipped.
  def parent_header_nodes(node)
    position = @header_nodes.index { |candidate| candidate.equal?(node) }
    # take(position) is empty for the first header, whereas a 0..position-1
    # range would turn into 0..-1 and select the whole list.
    preceding = @header_nodes.take(position).reverse
    parent_tags(node.name).map do |parent_tag|
      preceding.find { |candidate| candidate.name == parent_tag }
    end.compact
  end

  # "h4" -> ["h2", "h3"]
  def parent_tags(tag)
    level = tag.delete('h').to_i
    (2...level).map { |n| "h#{n}" }
  end

  # Appends "-1", "-2", ... to the headers that still share an id, skipping
  # any candidate that is already the id of another header.
  def append_numbers_on_conflicts
    clashing = conflicts(@nodes_ids)

    # Ids owned by exactly one header: a generated id must not reuse them.
    taken = {}
    (@nodes_ids.values - clashing.values).each { |id| taken[id] = true }

    clashing.group_by { |_node, id| id }.each do |id, id_conflicts|
      counter = 0
      id_conflicts.each do |node, _id|
        counter += 1
        counter += 1 while taken["#{id}-#{counter}"]
        new_id = "#{id}-#{counter}"
        taken[new_id] = true
        @nodes_ids[node] = new_id
      end
    end
  end

  # Returns the { node -> id } entries of +hash+ whose id is used by more
  # than one node.
  def conflicts(hash)
    occurrences = Hash.new(0)
    hash.each_value { |id| occurrences[id] += 1 }
    hash.select { |_node, id| occurrences[id] > 1 }
  end

  # Gives every header the id derived from its own text.
  def add_default_ids
    @header_nodes.each { |node| @nodes_ids[node] = subheader_id(node.content) }
  end

  # Lowercases +content+, replaces each run of non-word characters with a
  # single dash and strips leading and trailing dashes.
  def subheader_id(content)
    content.to_s.downcase.gsub(/\W+/, '-').gsub(/\A-+|-+\Z/, '')
  end
end
@@ -0,0 +1,23 @@
# Parser for the :maruku markdown flavour.
#
# Renders markdown with Maruku, loads the resulting HTML into a Nokogiri
# fragment and converts the ":::lang" tag at the top of a code block into a
# class on the <code> element.
module MarukuParser
  # Captures the language of a code block that starts with ":::lang".
  LANGUAGE_TAG = /\A\s*:::\s*(\w+)/
  # The same tag together with the newline that ends it; this is what gets
  # removed from the code.
  LANGUAGE_TAG_LINE = /\A\s*:::\s*\w+\n/

  # Converts markdown into a Nokogiri document fragment.
  #
  # markdown - the markdown source, handed to Maruku as is.
  #
  # Returns a Nokogiri::HTML::DocumentFragment with the code blocks processed.
  # Maruku is created with :on_error => :raise.
  def self.parse(markdown)
    html = Maruku.new(markdown, :on_error => :raise).to_html
    doc = Nokogiri::HTML::DocumentFragment.parse(html)
    code_blocks(doc)
    doc
  end

  # For every "pre>code" node starting with a ":::lang" tag, removes the tag
  # line from the code and stores the language in the node's class.
  # Nodes without the tag are left untouched.
  #
  # Returns the given doc.
  def self.code_blocks(doc)
    doc.css('pre>code').each do |node|
      match = node.content.match(LANGUAGE_TAG)
      next unless match

      # The pattern is anchored at the start, so a single substitution is enough.
      node.content = node.content.sub(LANGUAGE_TAG_LINE, '')
      node['class'] = match[1]
    end
    doc
  end

  # A bare `private` does not apply to `def self.` methods, so the helper is
  # hidden explicitly.
  private_class_method :code_blocks
end
@@ -1,3 +1,3 @@
1
1
  module DevcenterParser
2
- VERSION = '1.3.9'
3
- end
2
+ VERSION = '1.4.0'
3
+ end
@@ -5,7 +5,7 @@ require_relative '../lib/devcenter-parser'
5
5
  describe 'DevcenterParser' do
6
6
 
7
7
  describe '.to_unsanitized_html' do
8
-
8
+
9
9
  it 'returns empty string for nil input' do
10
10
  assert_parsing_unsanitized_result nil, :maruku, ''
11
11
  assert_parsing_unsanitized_result nil, :maruku, ''
@@ -13,7 +13,7 @@ describe 'DevcenterParser' do
13
13
 
14
14
  it 'maintains script tags' do
15
15
  md = '<script>alert("hi")</script>'
16
- assert_parsing_unsanitized_result md, :maruku, '<script><![CDATA[alert("hi")]]></script>'
16
+ assert_parsing_unsanitized_result md, :maruku, '<script>alert("hi")</script>'
17
17
  assert_parsing_unsanitized_result md, :github, '<script>alert("hi")</script>'
18
18
  end
19
19
 
@@ -33,13 +33,6 @@ describe 'DevcenterParser' do
33
33
  end
34
34
  end
35
35
 
36
- it 'respects existing ids' do
37
- md = '<strong id="foo">clean</strong>'
38
- html = '<p><strong id="foo">clean</strong></p>'
39
- assert_maruku_result md, html
40
- assert_github_result md, html
41
- end
42
-
43
36
  it 'removes script tags and their content' do
44
37
  md = '<strong>clean<script>alert("hack!")</script></strong>'
45
38
  html = '<p><strong>clean</strong></p>'
@@ -47,28 +40,6 @@ describe 'DevcenterParser' do
47
40
  assert_github_result md, html
48
41
  end
49
42
 
50
- it 'includes ids in subheaders' do
51
- md = <<-MARKDOWN
52
- ## Foo Bar Header 123
53
-
54
- Foo bar content
55
- MARKDOWN
56
- assert DevcenterParser.to_html(md, :github).include?('<h2 id="foo-bar-header-123">Foo Bar Header 123</h2>')
57
- assert DevcenterParser.to_html(md, :maruku).include?('<h2 id="foo-bar-header-123">Foo Bar Header 123</h2>')
58
- end
59
-
60
- it 'generates ids replacing inner non-alphanum chars with dashes' do
61
- ['Foo Bar', 'Foo-Bar', 'Foo#bar', 'Foo##Bar', 'Foo##Bar', '-$Foo##Bar$-'].each do |title|
62
- md = <<-MARKDOWN
63
- ## #{title}
64
-
65
- Foo bar content
66
- MARKDOWN
67
- assert DevcenterParser.to_html(md, :github).include?("<h2 id=\"foo-bar\">#{title}</h2>"), "GitHub with title #{title}: " + DevcenterParser.to_html(md, :github)
68
- assert DevcenterParser.to_html(md, :maruku).include?("<h2 id=\"foo-bar\">#{title}</h2>"), "Maruku: " + DevcenterParser.to_html(md, :maruku)
69
- end
70
- end
71
-
72
43
  describe 'github markdown' do
73
44
 
74
45
  it 'generates apostrophes from single quotes in plain text' do
@@ -241,7 +212,7 @@ Testing
241
212
 
242
213
  > callout
243
214
  > **strong**
244
- > more callout
215
+ > more callout
245
216
 
246
217
  > normal
247
218
 
@@ -253,7 +224,7 @@ Testing
253
224
 
254
225
  >callout
255
226
  >**strong**
256
- >more callout
227
+ >more callout
257
228
 
258
229
  >normal
259
230
 
@@ -312,7 +283,7 @@ more callout</p>
312
283
  | A | B |
313
284
  | --- | --- |
314
285
  | 1 | 2 |
315
- | 3 | 4 |
286
+ | 3 | 4 |
316
287
  MARKDOWN
317
288
 
318
289
  html = <<-HTML
@@ -430,4 +401,9 @@ more callout</p>
430
401
  assert_equal expected.strip, result.strip, "Failed when parsing on unsanitized mode\n#{md}\nwith the #{flavour} flavour.\n\nExpected:\n#{expected}\n\nActual result:\n#{result}\n\n"
431
402
  end
432
403
 
433
- end
404
+ def assert_header_id(md, header, id)
405
+ assert DevcenterParser.to_html(md, :github).include?("<#{header} id=\"#{id}\">"), "GitHub does not generate a #{header} with id #{id}"
406
+ assert DevcenterParser.to_html(md, :maruku).include?("<#{header} id=\"#{id}\">"), "Maruku does not generate a #{header} with id #{id}"
407
+ end
408
+
409
+ end
@@ -0,0 +1,63 @@
require 'minitest/autorun'
require 'nokogiri'
require_relative '../lib/devcenter-parser/header_id_generator'

# Specs for HeaderIdGenerator, run against HTML fragments parsed with Nokogiri.
describe 'HeaderIdGeneratorTest' do
  it 'respects existing ids in non-header elements' do
    html = '<strong id="foo">clean</strong>'
    assert_equal html, result(html)
  end

  it 'inserts ids in subheaders' do
    html = '<h2>Foo Bar Header 123</h2>'
    assert_id result(html), 'h2', 'foo-bar-header-123'
  end

  it 'generates ids replacing inner non-alphanum chars with dashes' do
    ['Foo Bar', 'Foo-Bar', 'Foo#bar', 'Foo##Bar', 'Foo##Bar', '-$Foo##Bar$-'].each do |title|
      html = "<h2>#{title}</h2>"
      assert_id result(html), 'h2', 'foo-bar'
    end
  end

  describe 'ensures that there are not collisions between ids in subheaders' do
    it 'by prepending the id of the previous H2 if possible' do
      html = <<-HTML
        <h2>A</h2>
        <h3>B</h3>
        <h4>Z</h4>
        <h2>C</h2>
        <h3>B</h3>
        <h4>Z</h4>
      HTML
      # Named differently from the `result` helper so the local does not shadow it.
      generated = result(html)
      %w{ a c }.each { |id| assert_id(generated, 'h2', id) }
      %w{ a-b c-b }.each { |id| assert_id(generated, 'h3', id) }
      %w{ a-b-z c-b-z }.each { |id| assert_id(generated, 'h4', id) }
    end

    it 'by appending numbers for those subheaders with same nesting level and parent header name' do
      html = <<-HTML
        <h2>A</h2>
        <h3>B</h3>
        <h3>B</h3>
        <h2>C</h2>
        <h2>C</h2>
      HTML
      generated = result(html)
      %w{ a c-1 c-2 }.each { |id| assert_id(generated, 'h2', id) }
      %w{ a-b-1 a-b-2 }.each { |id| assert_id(generated, 'h3', id) }
    end
  end

  # Asserts that +html+ contains an opening +tag+ whose only attribute is the given id.
  def assert_id(html, tag, id)
    assert html.include?("<#{tag} id=\"#{id}\">"), "<#{tag} id=\"#{id}\"> not found"
  end

  # Parses +html+ as a fragment, runs the generator on it and returns the
  # serialized fragment.
  def result(html)
    doc = Nokogiri::HTML::DocumentFragment.parse(html)
    HeaderIdGenerator.new(doc)
    doc.to_html
  end
end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: devcenter-parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.9
4
+ version: 1.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Heroku
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-11-07 00:00:00.000000000 Z
11
+ date: 2014-03-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: maruku
@@ -28,14 +28,14 @@ dependencies:
28
28
  name: nokogiri
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - '>='
31
+ - - ! '>='
32
32
  - !ruby/object:Gem::Version
33
33
  version: 1.4.4
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - '>='
38
+ - - ! '>='
39
39
  - !ruby/object:Gem::Version
40
40
  version: 1.4.4
41
41
  - !ruby/object:Gem::Dependency
@@ -70,14 +70,14 @@ dependencies:
70
70
  name: minitest
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
- - - '>'
73
+ - - ! '>'
74
74
  - !ruby/object:Gem::Version
75
75
  version: '2.0'
76
76
  type: :development
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
- - - '>'
80
+ - - ! '>'
81
81
  - !ruby/object:Gem::Version
82
82
  version: '2.0'
83
83
  description: Parser for Heroku Dev Center's content
@@ -92,8 +92,12 @@ files:
92
92
  - README.md
93
93
  - devcenter-parser.gemspec
94
94
  - lib/devcenter-parser.rb
95
+ - lib/devcenter-parser/github_parser.rb
96
+ - lib/devcenter-parser/header_id_generator.rb
97
+ - lib/devcenter-parser/maruku_parser.rb
95
98
  - lib/devcenter-parser/version.rb
96
99
  - test/devcenter-parser_test.rb
100
+ - test/header_id_generator_test.rb
97
101
  homepage: https://devcenter.heroku.com
98
102
  licenses: []
99
103
  metadata: {}
@@ -103,19 +107,20 @@ require_paths:
103
107
  - lib
104
108
  required_ruby_version: !ruby/object:Gem::Requirement
105
109
  requirements:
106
- - - '>='
110
+ - - ! '>='
107
111
  - !ruby/object:Gem::Version
108
112
  version: '0'
109
113
  required_rubygems_version: !ruby/object:Gem::Requirement
110
114
  requirements:
111
- - - '>='
115
+ - - ! '>='
112
116
  - !ruby/object:Gem::Version
113
117
  version: '0'
114
118
  requirements: []
115
119
  rubyforge_project:
116
- rubygems_version: 2.0.3
120
+ rubygems_version: 2.2.2
117
121
  signing_key:
118
122
  specification_version: 4
119
123
  summary: Parser for Heroku Dev Center's content
120
124
  test_files:
121
125
  - test/devcenter-parser_test.rb
126
+ - test/header_id_generator_test.rb