devcenter-parser 1.3.9 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: c86b19b7db5df1fbf3ff75efcf6e8c5e820d4144
4
- data.tar.gz: 005abc3e6957539c3c459748fae88dcc839338c0
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: 731e954adb885153b1d9f5cecb351e93af1c249d
4
+ data.tar.gz: bc8bca7b5de1f10ff4400b43da93c93b299bf352
5
5
  SHA512:
6
- metadata.gz: be7ebb71b4c901ed408ca12a52772abfd4ea2dbd12cf4ba24d8f57c0e4198e04e2685c9e2f484dc06c44ba12a4047d089cf1b7ef9f418cfd27d5cecfa9c3ccea
7
- data.tar.gz: e3440ec9fcd6918dec57d4e16942a4d1ed98a871699ecc1afbe2f027b26f3593f38b30357ca31b496bf399556468639831e1d7ae7f246a92ac4538960adb15df
6
+ metadata.gz: dbfa585bb7106530d36e94166f845acd49c96c952fab64a1a96e89eb0fc3d76772ff6537357047388d40596e264132d96eaa7d2c1a7f479c6d8f70676317b4da
7
+ data.tar.gz: 443d781a6d537f9612f8b8dcb4809a8d766f31c362f9826ecccb4c56892f11654a89a54c7461027405c5598afbe4ad25b012f3eeea41f1a67b40e2f901310627
data/README.md CHANGED
@@ -20,8 +20,14 @@ Usage:
20
20
  end
21
21
  ```
22
22
 
23
+ Test:
24
+
25
+ ```bash
26
+ $ rake
27
+ ```
28
+
23
29
  ## License
24
30
  See the LICENSE file included in the distribution.
25
31
 
26
32
  ## Copyright
27
- Copyright (C) 2013 Heroku <raul@heroku.com>.
33
+ Copyright (C) 2013 Heroku <raul@heroku.com>.
@@ -3,45 +3,29 @@ require 'redcarpet'
3
3
  require 'nokogiri'
4
4
  require 'uri'
5
5
  require 'sanitize'
6
+ require 'ostruct'
7
+ require_relative './devcenter-parser/header_id_generator'
8
+ require_relative './devcenter-parser/github_parser'
9
+ require_relative './devcenter-parser/maruku_parser'
6
10
 
7
11
  module DevcenterParser
8
-
9
12
  AVAILABLE_FLAVOURS = [:github, :maruku]
10
13
 
11
14
  class InvalidMarkdownError < Exception; end
12
15
  class InvalidRawHTMLError < Exception; end
13
16
  class UnknownFlavourError < Exception; end
14
17
 
15
- class HTMLWithPantsRenderer < Redcarpet::Render::HTML
16
- include Redcarpet::Render::SmartyPants
17
- end
18
-
19
18
  def self.to_html(markdown, flavour)
20
19
  html = to_unsanitized_html(markdown, flavour.to_sym)
21
20
  sanitize(html)
22
21
  end
23
22
 
24
23
  def self.to_unsanitized_html(markdown, flavour)
25
- markdown = normalize_newlines(markdown.to_s)
26
- markdown = separate_consecutive_blockquote_blocks(markdown)
27
- doc = case flavour.to_sym
28
- when :maruku
29
- html = Maruku.new(markdown, :on_error => :raise).to_html
30
- doc = Nokogiri::HTML::DocumentFragment.parse(html)
31
- maruku_code_blocks(doc)
32
- maruku_underscores_to_dashes_in_subheader_anchors(doc)
33
- when :github
34
- html = github_parser.render(markdown.to_s)
35
- doc = Nokogiri::HTML::DocumentFragment.parse(html)
36
- github_parse_special_blocks(doc)
37
- github_underscores_to_dashes_in_subheader_anchors(doc)
38
- else
39
- raise UnknownFlavourError, "Markdown flavour '#{flavour}' not supported"
40
- end
41
- convert_to_article_links_all_relative_links_with_missing_initial_slashes(doc)
42
- html = doc.to_html(:encoding => 'utf-8')
43
- verify_raw_html(html)
44
- html
24
+ raise(UnknownFlavourError, "Markdown flavour '#{flavour}' not supported") unless %w{ maruku github }.include?(flavour.to_s)
25
+ markdown = normalize_markdown(markdown)
26
+ markdown_parser = flavour.to_s == 'maruku' ? MarukuParser : GitHubParser
27
+ doc = markdown_parser.parse(markdown)
28
+ doc_to_html(doc)
45
29
  rescue InvalidRawHTMLError => e
46
30
  raise InvalidMarkdownError, e.message
47
31
  rescue => e
@@ -54,9 +38,22 @@ module DevcenterParser
54
38
 
55
39
  private
56
40
 
41
+ def self.doc_to_html(doc)
42
+ HeaderIdGenerator.apply!(doc)
43
+ convert_to_article_links_all_relative_links_with_missing_initial_slashes(doc)
44
+ html = doc.to_html(:encoding => 'utf-8')
45
+ verify_raw_html(html)
46
+ html
47
+ end
48
+
49
+ def self.normalize_markdown(markdown)
50
+ markdown = normalize_newlines(markdown.to_s)
51
+ separate_consecutive_blockquote_blocks(markdown)
52
+ end
53
+
57
54
  # The current parsers consider something like:
58
55
  # > foo
59
- #
56
+ #
60
57
  # > bar
61
58
  # as a single blockquote, while we want it to be two different ones.
62
59
  # This method adds an empty paragraph between consecutive blocks so parsers process them separately
@@ -69,10 +66,6 @@ module DevcenterParser
69
66
  markdown.lines.map{ |l| l.rstrip }.join("\n")
70
67
  end
71
68
 
72
- def self.github_parser
73
- @@github_parser ||= Redcarpet::Markdown.new(HTMLWithPantsRenderer, fenced_code_blocks: true, tables: true)
74
- end
75
-
76
69
  def self.sanitize_config
77
70
  return @@sanitize_config if defined?(@@sanitize_config)
78
71
  config = Sanitize::Config::RELAXED
@@ -92,50 +85,6 @@ module DevcenterParser
92
85
  @@sanitize_config = config.merge({remove_contents: true, allow_comments: true})
93
86
  end
94
87
 
95
- def self.maruku_code_blocks(doc)
96
- doc.css('pre>code').each do |node|
97
- if match = node.content.match(/\A\s*:::\s*(\w+)/)
98
- lang = match[1]
99
- node.content = node.content.gsub(/\A\s*:::\s*\w+\n/, '')
100
- node['class'] = lang
101
- end
102
- end
103
- doc
104
- end
105
-
106
- def self.maruku_underscores_to_dashes_in_subheader_anchors(doc)
107
- doc.css("h2,h3,h4,h5,h6").each do |node|
108
- node['id'] = subheader_id(node.content)
109
- end
110
- doc
111
- end
112
-
113
- def self.github_underscores_to_dashes_in_subheader_anchors(doc)
114
- doc.css("h2,h3,h4,h5,h6").each do |node|
115
- node['id'] = subheader_id(node.content)
116
- end
117
- doc
118
- end
119
-
120
- def self.subheader_id(content)
121
- content.to_s.downcase.gsub(/\W+/, '-').gsub(/\A-+|-+\Z/, '')
122
- end
123
-
124
- def self.github_parse_special_blocks(doc)
125
- doc.css('blockquote>p:first').each do |node|
126
- if match = node.inner_html.match(/\A\W*(callout|warning|note)\W/)
127
- node.parent.name = 'div'
128
- node.parent['class'] = match[1]
129
-
130
- new_html = node.inner_html.gsub(/\A\W*(callout|warning|note)\W/, '')
131
-
132
- # Assigning inner_html directly causes encoding issues in old libxml versions,
133
- # workaround from https://github.com/sparklemotion/nokogiri/issues/458#issuecomment-3136620
134
- node.children = Nokogiri::HTML.fragment(new_html, 'utf-8')
135
- end
136
- end
137
- end
138
-
139
88
  def self.convert_to_article_links_all_relative_links_with_missing_initial_slashes(doc)
140
89
  doc.css('a').each do |node|
141
90
  unless node['href'].nil? || node['href'] =~ /\Ahttp|\A\/|\Amailto\:|\A#/
@@ -164,4 +113,4 @@ module DevcenterParser
164
113
  broken_html = html.match(/REXML could not parse this XML\/HTML\:(.+)<\/pre>/m)[1].strip rescue nil
165
114
  broken_html.nil? ? "Contains broken raw HTML." : "This raw HTML is invalid: #{CGI.unescapeHTML(broken_html)}"
166
115
  end
167
- end
116
+ end
@@ -0,0 +1,35 @@
1
+ module GitHubParser
2
+ extend self
3
+
4
+ class HTMLWithPantsRenderer < Redcarpet::Render::HTML
5
+ include Redcarpet::Render::SmartyPants
6
+ end
7
+
8
+ def self.parse(markdown)
9
+ html = github_parser.render(markdown.to_s)
10
+ doc = Nokogiri::HTML::DocumentFragment.parse(html)
11
+ special_blocks(doc)
12
+ doc
13
+ end
14
+
15
+ private
16
+
17
+ def self.github_parser
18
+ @@github_parser ||= Redcarpet::Markdown.new(HTMLWithPantsRenderer, fenced_code_blocks: true, tables: true)
19
+ end
20
+
21
+ def self.special_blocks(doc)
22
+ doc.css('blockquote>p:first').each do |node|
23
+ if match = node.inner_html.match(/\A\W*(callout|warning|note)\W/)
24
+ node.parent.name = 'div'
25
+ node.parent['class'] = match[1]
26
+
27
+ new_html = node.inner_html.gsub(/\A\W*(callout|warning|note)\W/, '')
28
+
29
+ # Assigning inner_html directly causes encoding issues in old libxml versions,
30
+ # workaround from https://github.com/sparklemotion/nokogiri/issues/458#issuecomment-3136620
31
+ node.children = Nokogiri::HTML.fragment(new_html, 'utf-8')
32
+ end
33
+ end
34
+ end
35
+ end
@@ -0,0 +1,69 @@
1
+ # Generates header -> ids in the given doc, calculating the ids from the heading
2
+ # text and ensuring that there are no duplicated ids
3
+ class HeaderIdGenerator
4
+
5
+ def self.apply!(doc)
6
+ self.new(doc)
7
+ end
8
+
9
+ def initialize(doc)
10
+ @doc = doc
11
+ @header_nodes = @doc.css("h2,h3,h4,h5,h6").to_a
12
+
13
+ # { node -> id } hash
14
+ @nodes_ids = @header_nodes.inject({}){ |hash, node| hash[node] = nil; hash }
15
+
16
+ add_default_ids
17
+ prepend_parents_on_conflicts
18
+ append_numbers_on_conflicts
19
+
20
+ @nodes_ids.each{ |node, id| node['id'] = id }
21
+ end
22
+
23
+ private
24
+
25
+ # Parent != DOM nesting, but in the context of the content <h2></h2> ... <h3></h3>
26
+
27
+ def prepend_parents_on_conflicts
28
+ conflicts(@nodes_ids).each do |node, id|
29
+ parent_contents = parent_header_nodes(node).map{ |parent_node| parent_node.content }
30
+ content = (parent_contents + [node.content]).join(' ')
31
+ @nodes_ids[node] = subheader_id(content.to_s)
32
+ end
33
+ end
34
+
35
+ def parent_header_nodes(node)
36
+ parent_tags(node.name).map do |parent_tag|
37
+ @header_nodes[0..@header_nodes.index(node)-1].select{ |sibling| sibling.name == parent_tag }.last
38
+ end
39
+ end
40
+
41
+ # "h4" -> ["h2", "h3"]
42
+ def parent_tags(tag)
43
+ level = tag.gsub('h','').to_i
44
+ (2..level-1).map{ |n| "h#{n}" }
45
+ end
46
+
47
+
48
+ def append_numbers_on_conflicts
49
+ conflicts(@nodes_ids).group_by{ |node, id| id }.each do |id, id_conflicts|
50
+ id_conflicts.each_with_index do |conflict, n|
51
+ node = conflict[0]
52
+ new_id = "#{id}-#{n+1}"
53
+ @nodes_ids[node] = new_id
54
+ end
55
+ end
56
+ end
57
+
58
+ def conflicts(hash)
59
+ hash.select{ |node1, id1| hash.select{ |node1, id2| id1 == id2 }.size > 1 }
60
+ end
61
+
62
+ def add_default_ids
63
+ @nodes_ids.each{ |node, id| @nodes_ids[node] = subheader_id(node.content) }
64
+ end
65
+
66
+ def subheader_id(content)
67
+ content.to_s.downcase.gsub(/\W+/, '-').gsub(/\A-+|-+\Z/, '')
68
+ end
69
+ end
@@ -0,0 +1,23 @@
1
+ module MarukuParser
2
+ extend self
3
+
4
+ def self.parse(markdown)
5
+ html = Maruku.new(markdown, :on_error => :raise).to_html
6
+ doc = Nokogiri::HTML::DocumentFragment.parse(html)
7
+ code_blocks(doc)
8
+ doc
9
+ end
10
+
11
+ private
12
+
13
+ def self.code_blocks(doc)
14
+ doc.css('pre>code').each do |node|
15
+ if match = node.content.match(/\A\s*:::\s*(\w+)/)
16
+ lang = match[1]
17
+ node.content = node.content.gsub(/\A\s*:::\s*\w+\n/, '')
18
+ node['class'] = lang
19
+ end
20
+ end
21
+ doc
22
+ end
23
+ end
@@ -1,3 +1,3 @@
1
1
  module DevcenterParser
2
- VERSION = '1.3.9'
3
- end
2
+ VERSION = '1.4.0'
3
+ end
@@ -5,7 +5,7 @@ require_relative '../lib/devcenter-parser'
5
5
  describe 'DevcenterParser' do
6
6
 
7
7
  describe '.to_unsanitized_html' do
8
-
8
+
9
9
  it 'returns empty string for nil input' do
10
10
  assert_parsing_unsanitized_result nil, :maruku, ''
11
11
  assert_parsing_unsanitized_result nil, :maruku, ''
@@ -13,7 +13,7 @@ describe 'DevcenterParser' do
13
13
 
14
14
  it 'maintains script tags' do
15
15
  md = '<script>alert("hi")</script>'
16
- assert_parsing_unsanitized_result md, :maruku, '<script><![CDATA[alert("hi")]]></script>'
16
+ assert_parsing_unsanitized_result md, :maruku, '<script>alert("hi")</script>'
17
17
  assert_parsing_unsanitized_result md, :github, '<script>alert("hi")</script>'
18
18
  end
19
19
 
@@ -33,13 +33,6 @@ describe 'DevcenterParser' do
33
33
  end
34
34
  end
35
35
 
36
- it 'respects existing ids' do
37
- md = '<strong id="foo">clean</strong>'
38
- html = '<p><strong id="foo">clean</strong></p>'
39
- assert_maruku_result md, html
40
- assert_github_result md, html
41
- end
42
-
43
36
  it 'removes script tags and their content' do
44
37
  md = '<strong>clean<script>alert("hack!")</script></strong>'
45
38
  html = '<p><strong>clean</strong></p>'
@@ -47,28 +40,6 @@ describe 'DevcenterParser' do
47
40
  assert_github_result md, html
48
41
  end
49
42
 
50
- it 'includes ids in subheaders' do
51
- md = <<-MARKDOWN
52
- ## Foo Bar Header 123
53
-
54
- Foo bar content
55
- MARKDOWN
56
- assert DevcenterParser.to_html(md, :github).include?('<h2 id="foo-bar-header-123">Foo Bar Header 123</h2>')
57
- assert DevcenterParser.to_html(md, :maruku).include?('<h2 id="foo-bar-header-123">Foo Bar Header 123</h2>')
58
- end
59
-
60
- it 'generates ids replacing inner non-alphanum chars with dashes' do
61
- ['Foo Bar', 'Foo-Bar', 'Foo#bar', 'Foo##Bar', 'Foo##Bar', '-$Foo##Bar$-'].each do |title|
62
- md = <<-MARKDOWN
63
- ## #{title}
64
-
65
- Foo bar content
66
- MARKDOWN
67
- assert DevcenterParser.to_html(md, :github).include?("<h2 id=\"foo-bar\">#{title}</h2>"), "GitHub with title #{title}: " + DevcenterParser.to_html(md, :github)
68
- assert DevcenterParser.to_html(md, :maruku).include?("<h2 id=\"foo-bar\">#{title}</h2>"), "Maruku: " + DevcenterParser.to_html(md, :maruku)
69
- end
70
- end
71
-
72
43
  describe 'github markdown' do
73
44
 
74
45
  it 'generates apostrophes from single quotes in plain text' do
@@ -241,7 +212,7 @@ Testing
241
212
 
242
213
  > callout
243
214
  > **strong**
244
- > more callout
215
+ > more callout
245
216
 
246
217
  > normal
247
218
 
@@ -253,7 +224,7 @@ Testing
253
224
 
254
225
  >callout
255
226
  >**strong**
256
- >more callout
227
+ >more callout
257
228
 
258
229
  >normal
259
230
 
@@ -312,7 +283,7 @@ more callout</p>
312
283
  | A | B |
313
284
  | --- | --- |
314
285
  | 1 | 2 |
315
- | 3 | 4 |
286
+ | 3 | 4 |
316
287
  MARKDOWN
317
288
 
318
289
  html = <<-HTML
@@ -430,4 +401,9 @@ more callout</p>
430
401
  assert_equal expected.strip, result.strip, "Failed when parsing on unsanitized mode\n#{md}\nwith the #{flavour} flavour.\n\nExpected:\n#{expected}\n\nActual result:\n#{result}\n\n"
431
402
  end
432
403
 
433
- end
404
+ def assert_header_id(md, header, id)
405
+ assert DevcenterParser.to_html(md, :github).include?("<#{header} id=\"#{id}\">"), "GitHub does not generate a #{header} with id #{id}"
406
+ assert DevcenterParser.to_html(md, :maruku).include?("<#{header} id=\"#{id}\">"), "Maruku does not generate a #{header} with id #{id}"
407
+ end
408
+
409
+ end
@@ -0,0 +1,63 @@
1
+ require 'minitest/autorun'
2
+ require 'nokogiri'
3
+ require_relative '../lib/devcenter-parser/header_id_generator'
4
+
5
+ describe 'HeaderIdGeneratorTest' do
6
+ it 'respects existing ids in non-header elements' do
7
+ html = '<strong id="foo">clean</strong>'
8
+ assert_equal html, result(html)
9
+ end
10
+
11
+ it 'inserts ids in subheaders' do
12
+ html = '<h2>Foo Bar Header 123</h2>'
13
+ assert_id result(html), 'h2', 'foo-bar-header-123'
14
+ end
15
+
16
+ it 'generates ids replacing inner non-alphanum chars with dashes' do
17
+ ['Foo Bar', 'Foo-Bar', 'Foo#bar', 'Foo##Bar', 'Foo##Bar', '-$Foo##Bar$-'].each do |title|
18
+ html = "<h2>#{title}</h2>"
19
+ assert_id result(html), 'h2', 'foo-bar'
20
+ end
21
+ end
22
+
23
+
24
+ describe 'ensures that there are not collisions between ids in subheaders' do
25
+ it 'by prepending the id of the previous H2 if possible' do
26
+ html = <<-HTML
27
+ <h2>A</h2>
28
+ <h3>B</h3>
29
+ <h4>Z</h4>
30
+ <h2>C</h2>
31
+ <h3>B</h3>
32
+ <h4>Z</h4>
33
+ HTML
34
+ result = result(html)
35
+ %w{ a c }.each{ |id| assert_id(result, 'h2', id) }
36
+ %w{ a-b c-b }.each{ |id| assert_id(result, 'h3', id) }
37
+ %w{ a-b-z c-b-z }.each{ |id| assert_id(result, 'h4', id) }
38
+ end
39
+
40
+ it 'by appending numbers for those subheaders with same nesting level and parent header name' do
41
+ html = <<-HTML
42
+ <h2>A</h2>
43
+ <h3>B</h3>
44
+ <h3>B</h3>
45
+ <h2>C</h2>
46
+ <h2>C</h2>
47
+ HTML
48
+ result = result(html)
49
+ %w{ a c-1 c-2 }.each{ |id| assert_id(result, 'h2', id) }
50
+ %w{ a-b-1 a-b-2 }.each{ |id| assert_id(result, 'h3', id) }
51
+ end
52
+ end
53
+
54
+ def assert_id(html, tag, id)
55
+ assert html.include?("<#{tag} id=\"#{id}\">"), "<#{tag} id=\"#{id}\"> not found"
56
+ end
57
+
58
+ def result(html)
59
+ doc = Nokogiri::HTML::DocumentFragment.parse(html)
60
+ HeaderIdGenerator.new(doc)
61
+ doc.to_html
62
+ end
63
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: devcenter-parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.9
4
+ version: 1.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Heroku
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-11-07 00:00:00.000000000 Z
11
+ date: 2014-03-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: maruku
@@ -28,14 +28,14 @@ dependencies:
28
28
  name: nokogiri
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - '>='
31
+ - - ! '>='
32
32
  - !ruby/object:Gem::Version
33
33
  version: 1.4.4
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - '>='
38
+ - - ! '>='
39
39
  - !ruby/object:Gem::Version
40
40
  version: 1.4.4
41
41
  - !ruby/object:Gem::Dependency
@@ -70,14 +70,14 @@ dependencies:
70
70
  name: minitest
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
- - - '>'
73
+ - - ! '>'
74
74
  - !ruby/object:Gem::Version
75
75
  version: '2.0'
76
76
  type: :development
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
- - - '>'
80
+ - - ! '>'
81
81
  - !ruby/object:Gem::Version
82
82
  version: '2.0'
83
83
  description: Parser for Heroku Dev Center's content
@@ -92,8 +92,12 @@ files:
92
92
  - README.md
93
93
  - devcenter-parser.gemspec
94
94
  - lib/devcenter-parser.rb
95
+ - lib/devcenter-parser/github_parser.rb
96
+ - lib/devcenter-parser/header_id_generator.rb
97
+ - lib/devcenter-parser/maruku_parser.rb
95
98
  - lib/devcenter-parser/version.rb
96
99
  - test/devcenter-parser_test.rb
100
+ - test/header_id_generator_test.rb
97
101
  homepage: https://devcenter.heroku.com
98
102
  licenses: []
99
103
  metadata: {}
@@ -103,19 +107,20 @@ require_paths:
103
107
  - lib
104
108
  required_ruby_version: !ruby/object:Gem::Requirement
105
109
  requirements:
106
- - - '>='
110
+ - - ! '>='
107
111
  - !ruby/object:Gem::Version
108
112
  version: '0'
109
113
  required_rubygems_version: !ruby/object:Gem::Requirement
110
114
  requirements:
111
- - - '>='
115
+ - - ! '>='
112
116
  - !ruby/object:Gem::Version
113
117
  version: '0'
114
118
  requirements: []
115
119
  rubyforge_project:
116
- rubygems_version: 2.0.3
120
+ rubygems_version: 2.2.2
117
121
  signing_key:
118
122
  specification_version: 4
119
123
  summary: Parser for Heroku Dev Center's content
120
124
  test_files:
121
125
  - test/devcenter-parser_test.rb
126
+ - test/header_id_generator_test.rb