upmark 0.2.0 → 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 05a88fea05d73c8e0e6b6283687fde275582f030
4
- data.tar.gz: ca3431e72f14140a5e558dc74d275390c010a423
3
+ metadata.gz: ebfa7a03ebe86bf8d70709fe99002518ee9f7237
4
+ data.tar.gz: be21325fe8ab8cd94d9ba4a4df1fcb0025bfb64b
5
5
  SHA512:
6
- metadata.gz: 190e387ab1335b918dc81d002b14acd97ffe1c42b8181066018908cefc4fe9729fc29081e64908d2f5f94ead4836d22abb316078c3b74c8929ce0f1dab6dcaa4
7
- data.tar.gz: a200693de5b909b23d4e385fa769dfd0588531621587e071b2bde4c4fe126033e66293cd02b93b8c5e05a61df6294b779f2403d83c45212c8a37a002a40648ee
6
+ metadata.gz: d4eab42d3640a2f9011daf71244c97036f9e92bce80cf351fe9eeda00b1535f0ef5a0471440ec1c4dd9d5c0ac8c76d9c0263661f9c2c6f9d1425a3845296c901
7
+ data.tar.gz: 3ed9e855887dbab065c586517669657e7c4cb56311b1e5237161111e64853043c2e44097f2bff95b5841ecd5fe3424cd29811940fc07fe8e860472a4ad77b92c
data/README.md CHANGED
@@ -31,7 +31,7 @@ Upmark will convert the following (arbitrarily nested) HTML elements to Markdown
31
31
  * `em`
32
32
  * `p`
33
33
  * `a`
34
- * `h1`, `h2`, `h3`
34
+ * `h1`, `h2`, `h3`, `h4`, `h5`, `h6`
35
35
  * `ul`
36
36
  * `ol`
37
37
  * `br`
data/lib/upmark.rb CHANGED
@@ -24,11 +24,20 @@ module Upmark
24
24
  # The result is either a String or an Array.
25
25
  ast = ast.join if ast.is_a?(Array)
26
26
 
27
+ # Remove trailing whitespace
28
+ ast.gsub!(/ +$/,'')
29
+
30
+ # Compress bullet point lists
31
+ ast.gsub!(/^•\s*([^•\n]*)\n+(?=•)/,"* #{'\1'}\n")
32
+
27
33
  # Any more than two consecutive newline characters is superfluous.
28
- ast = ast.gsub(/\n(\s*\n)+/, "\n\n")
34
+ ast.gsub!(/\n(\s*\n)+/, "\n\n")
35
+
36
+ # Remove other bullet points
37
+ ast.gsub!(/^•\s*/,"* ")
29
38
 
30
39
  ast.strip
31
- rescue Parslet::ParseFailed
32
- raise Upmark::ParseFailed
40
+ rescue Parslet::ParseFailed => e
41
+ raise Upmark::ParseFailed.new('Parse failed', e)
33
42
  end
34
43
  end
data/lib/upmark/errors.rb CHANGED
@@ -1,3 +1,14 @@
1
1
  module Upmark
2
- ParseFailed = Class.new(StandardError)
2
+ class ParseFailed < StandardError
3
+
4
+ def initialize(message, cause)
5
+ @cause = cause
6
+ super(message)
7
+ end
8
+
9
+ def cause
10
+ @cause
11
+ end
12
+
13
+ end
3
14
  end
@@ -10,68 +10,81 @@ module Upmark
10
10
  class XML < Parslet::Parser
11
11
  root(:node)
12
12
 
13
- rule(:node) {
13
+ rule(:node) do
14
14
  (
15
+ empty_element.as(:empty) |
15
16
  element.as(:element) |
16
17
  text.as(:text)
17
18
  ).repeat(0)
18
- }
19
+ end
19
20
 
20
- rule(:element) {
21
+ rule(:empty_element) do
22
+ start_tag.as(:start_tag) >>
23
+ match(/\s+/) >>
24
+ end_tag.as(:end_tag)
25
+ end
26
+
27
+ rule(:element) do
28
+ empty_br.as(:empty_tag) |
21
29
  (
22
30
  start_tag.as(:start_tag) >>
23
31
  node.as(:children) >>
24
32
  end_tag.as(:end_tag)
25
33
  ) |
26
34
  empty_tag.as(:empty_tag)
27
- }
35
+ end
28
36
 
29
- rule(:text) {
37
+ rule(:text) do
38
+ match(/\A[\s\n\t ]+\Z/m).absent? >> # ignore entirely empty strings
30
39
  match(/[^<>]/).repeat(1)
31
- }
40
+ end
32
41
 
33
- rule(:start_tag) {
42
+ rule(:start_tag) do
34
43
  str('<') >>
35
44
  name.as(:name) >>
36
45
  (space >> attribute).repeat.as(:attributes) >>
37
46
  space? >>
38
47
  str('>')
39
- }
48
+ end
40
49
 
41
- rule(:end_tag) {
50
+ rule(:end_tag) do
42
51
  str('</') >>
43
52
  name.as(:name) >>
44
53
  space? >>
45
54
  str('>')
46
- }
55
+ end
56
+
57
+ rule(:empty_br) do
58
+ str('<') >> space? >> str('br').as(:name) >> space? >> str('>')
59
+ end
47
60
 
48
- rule(:empty_tag) {
61
+ rule(:empty_tag) do
49
62
  str('<') >>
50
63
  name.as(:name) >>
51
64
  (space >> attribute).repeat.as(:attributes) >>
52
65
  space? >>
53
66
  str('/>')
54
- }
67
+ end
55
68
 
56
- rule(:name) {
69
+ rule(:name) do
57
70
  match(/[a-zA-Z_:]/) >> match(/[\w:\.-]/).repeat
58
- }
71
+ end
59
72
 
60
- rule(:attribute) {
73
+ rule(:attribute) do
61
74
  name.as(:name) >>
62
75
  str('=') >> (
63
76
  (str('"') >> double_quoted_attribute_value.as(:value) >> str('"')) | # double quotes
64
77
  (str("'") >> single_quoted_attribute_value.as(:value) >> str("'")) # single quotes
65
78
  )
66
- }
79
+ end
67
80
 
68
- rule(:double_quoted_attribute_value) {
81
+ rule(:double_quoted_attribute_value) do
69
82
  (str('"').absent? >> (match(/[^<&]/) | entity_ref)).repeat
70
- }
83
+ end
71
84
 
72
- rule(:single_quoted_attribute_value) {
85
+ rule(:single_quoted_attribute_value) do
73
86
  (str("'").absent? >> (match(/[^<&]/) | entity_ref)).repeat
74
- }
87
+ end
75
88
 
76
89
  rule(:entity_ref) { match("&") >> name >> match(";") }
77
90
 
@@ -7,19 +7,49 @@ module Upmark
7
7
 
8
8
  rule(text: simple(:value)) { value.to_s }
9
9
 
10
+ # Pass all unmatched elements through.
11
+ rule(
12
+ element: {
13
+ name: simple(:name),
14
+ attributes: subtree(:attributes),
15
+ children: sequence(:children),
16
+ ignore: simple(:ignore)
17
+ }
18
+ ) do |element|
19
+ attributes = map_attributes_subtree(element[:attributes])
20
+ children = element[:children].join
21
+ name = element[:name]
22
+
23
+ attributes_list =
24
+ if attributes.any?
25
+ " " + attributes.map {|name, value| %Q{#{name}="#{value}"} }.join(" ")
26
+ else
27
+ ""
28
+ end
29
+
30
+ if children.empty?
31
+ %Q{<#{name}#{attributes_list} />}
32
+ else
33
+ %Q{<#{name}#{attributes_list}>#{children}</#{name}>}
34
+ end
35
+ end
36
+
10
37
  def self.text(element)
11
- element[:children].join.gsub(/(\n)+/, '\1')
38
+ element[:children].join.gsub(/(\n)[\n ]+/, '\1')
12
39
  end
13
40
 
14
41
  element(:p) {|element| "#{text(element)}\n\n" }
15
42
  element(:h1) {|element| "# #{text(element)}" }
16
43
  element(:h2) {|element| "## #{text(element)}" }
17
44
  element(:h3) {|element| "### #{text(element)}" }
45
+ element(:h4) {|element| "#### #{text(element)}" }
46
+ element(:h5) {|element| "##### #{text(element)}" }
47
+ element(:h6) {|element| "###### #{text(element)}" }
18
48
  element(:li) {|element| "#{text(element)}" }
19
49
 
20
50
  element(:ul) do |element|
21
51
  children = element[:children].map {|value| value.strip != "" ? value : nil }.compact
22
- children.map {|value| "* #{value}\n" }
52
+ children.map {|value| "* #{value.gsub(/^\s*•\s*/,'')}\n" }
23
53
  end
24
54
 
25
55
  element(:ol) do |element|
@@ -32,7 +62,11 @@ module Upmark
32
62
  href = attributes[:href]
33
63
  title = attributes[:title]
34
64
 
35
- %Q{[#{text(element)}](#{href} "#{title}")}
65
+ if /^(?:http|mailto)/ =~ href
66
+ %Q{[#{text(element)}](#{href} "#{title}")}
67
+ else
68
+ text(element)
69
+ end
36
70
  end
37
71
 
38
72
  element(:img) do |element|
@@ -41,40 +75,19 @@ module Upmark
41
75
  title = attributes[:title]
42
76
  alt_text = attributes[:alt]
43
77
 
44
- %Q{![#{alt_text}](#{href} "#{title}")}
78
+ if /^http/ =~ href
79
+ %Q{![#{alt_text}](#{href} "#{title}")}
80
+ else
81
+ "#{alt_text || title}"
82
+ end
45
83
  end
46
84
 
47
85
  element(:b, :strong) {|element| "**#{text(element)}**" }
48
86
  element(:i, :em) {|element| "*#{text(element)}*" }
49
87
 
50
88
  element(:br) { "\n" }
89
+ rule(element: { name: "br"}) { "\n" }
51
90
 
52
- # Pass all unmatched elements through.
53
- rule(
54
- element: {
55
- name: simple(:name),
56
- attributes: subtree(:attributes),
57
- children: sequence(:children),
58
- ignore: simple(:ignore)
59
- }
60
- ) do |element|
61
- attributes = map_attributes_subtree(element[:attributes])
62
- children = element[:children].join
63
- name = element[:name]
64
-
65
- attributes_list =
66
- if attributes.any?
67
- " " + attributes.map {|name, value| %Q{#{name}="#{value}"} }.join(" ")
68
- else
69
- ""
70
- end
71
-
72
- if children.empty?
73
- %Q{<#{name}#{attributes_list} />}
74
- else
75
- %Q{<#{name}#{attributes_list}>#{children}</#{name}>}
76
- end
77
- end
78
91
  end
79
92
  end
80
93
  end
@@ -3,13 +3,26 @@ module Upmark
3
3
  # A transform class which normalises start/end/empty tags into the
4
4
  # same structure.
5
5
  class Normalise < Parslet::Transform
6
+
7
+ rule(element: subtree(:invalid)) do
8
+ raise Upmark::ParseFailed.new('Invalid parse result', nil)
9
+ end
10
+
11
+ # Strip empty tags
12
+ rule(empty: subtree(:invalid)) do
13
+ ' '
14
+ end
15
+
6
16
  rule(
7
17
  element: {
8
18
  start_tag: {name: simple(:name), attributes: subtree(:attributes)},
9
- end_tag: {name: simple(:name)},
19
+ end_tag: {name: simple(:end_tag_name)},
10
20
  children: subtree(:children)
11
21
  }
12
22
  ) do
23
+ unless name == end_tag_name
24
+ raise Upmark::ParseFailed.new('Mismatched tags', nil)
25
+ end
13
26
  {
14
27
  element: {
15
28
  name: name,
@@ -20,6 +33,21 @@ module Upmark
20
33
  }
21
34
  end
22
35
 
36
+ rule(
37
+ element: {
38
+ empty_tag: { name: simple(:name) }
39
+ }
40
+ ) do
41
+ {
42
+ element: {
43
+ name: name,
44
+ attributes: [],
45
+ children: [],
46
+ ignore: false
47
+ }
48
+ }
49
+ end
50
+
23
51
  rule(
24
52
  element: {
25
53
  empty_tag: {name: simple(:name), attributes: subtree(:attributes)}
@@ -34,6 +62,7 @@ module Upmark
34
62
  }
35
63
  }
36
64
  end
65
+
37
66
  end
38
67
  end
39
68
  end
@@ -7,7 +7,7 @@ module Upmark
7
7
  class Preprocess < Parslet::Transform
8
8
  include TransformHelpers
9
9
 
10
- element(:div, :table, :pre) do |element|
10
+ element(:div, :pre) do |element|
11
11
  {
12
12
  element: {
13
13
  name: element[:name],
@@ -17,6 +17,37 @@ module Upmark
17
17
  }
18
18
  }
19
19
  end
20
+
21
+ element(:span) do |element|
22
+ element[:children]
23
+ end
24
+
25
+ # table content elements are stripped ignoring their spacing
26
+ element(:table, :thead, :tbody, :tfoot) do |element|
27
+ element[:children].reject! do |c|
28
+ Hash === c && c[:text].to_s =~ /\A[\n ]*\Z/m
29
+ end
30
+ element[:children]
31
+ end
32
+
33
+ # table content elements are stripped
34
+ element(:td, :th) do |element|
35
+ element[:children]
36
+ end
37
+
38
+ # table rows are treated as 'paragraph' blocks
39
+ element(:tr) do |element|
40
+ element[:children]
41
+ .select { |c| Array === c }
42
+ .map do |children|
43
+ children.map do |child|
44
+ if child[:text]
45
+ child[:text].to_s.gsub!(/^\n */,'')
46
+ end
47
+ child
48
+ end + ["\n"]
49
+ end + ["\n"]
50
+ end
20
51
  end
21
52
  end
22
53
  end
@@ -1,57 +1,102 @@
1
- require "spec_helper"
1
+ RSpec.describe Upmark, ".convert" do
2
+ RSpec::Matchers.define :convert_to do |expected|
3
+ match do
4
+ actual == expected
5
+ end
2
6
 
3
- describe Upmark, ".convert" do
4
- subject { Upmark.convert(html) }
7
+ def actual
8
+ @converted_actual ||= Upmark.convert(@actual)
9
+ end
5
10
 
6
- context "<a>" do
7
- let(:html) { <<-HTML.strip }
8
- <p><a href="http://helvetica.com/" title="art party organic">messenger <strong>bag</strong> skateboard</a></p>
9
- HTML
11
+ diffable
12
+ end
10
13
 
11
- it { should == <<-MD.strip }
12
- [messenger **bag** skateboard](http://helvetica.com/ "art party organic")
13
- MD
14
+ context "<a>" do
15
+ specify 'converts to []()' do
16
+ expect(<<-HTML.strip
17
+ <p><a href="http://helvetica.com/" title="art party organic">messenger <strong>bag</strong> skateboard</a></p>
18
+ HTML
19
+ ).to convert_to <<-MD.strip
20
+ [messenger **bag** skateboard](http://helvetica.com/ "art party organic")
21
+ MD
22
+ end
14
23
  end
15
24
 
16
25
  context "<a> hard" do
17
- let(:html) { <<-HTML.strip }
18
- <p><a href="http://jobs.latrobe.edu.au/jobDetails.asp?sJobIDs=545808&amp;sKeywords=business">Manager, Business Solutions</a></p>
19
- HTML
20
-
21
- it { should == <<-MD.strip }
26
+ specify 'converts as []()' do
27
+ expect(<<-HTML.strip
28
+ <p><a href="http://jobs.latrobe.edu.au/jobDetails.asp?sJobIDs=545808&amp;sKeywords=business">Manager, Business Solutions</a></p>
29
+ HTML
30
+ ).to convert_to <<-MD.strip
22
31
  [Manager, Business Solutions](http://jobs.latrobe.edu.au/jobDetails.asp?sJobIDs=545808&amp;sKeywords=business "")
23
- MD
32
+ MD
33
+ end
24
34
  end
25
35
 
26
36
  context "<img>" do
27
- let(:html) { <<-HTML.strip }
28
- <img src="http://helvetica.com/image.gif" title="art party organic" alt="messenger bag skateboard" />
29
-
30
- HTML
31
-
32
- it { should == <<-MD.strip }
37
+ specify 'converts as ![]()' do
38
+ expect(<<-HTML.strip
39
+ <img src="http://helvetica.com/image.gif" title="art party organic" alt="messenger bag skateboard" />
40
+ HTML
41
+ ).to convert_to <<-MD.strip
33
42
  ![messenger bag skateboard](http://helvetica.com/image.gif "art party organic")
34
- MD
43
+ MD
44
+ end
35
45
  end
36
46
 
37
47
  context "<p>" do
38
- let(:html) { <<-HTML.strip }
48
+ specify 'converts as plaintext' do
49
+ expect(<<-HTML.strip
50
+ <p>• Bullet 1</p>
51
+ <p>• Bullet 2</p>
39
52
  <p>messenger <strong>bag</strong> skateboard</p>
40
53
 
41
54
  <p>art party<br />
42
55
  organic</p>
43
- HTML
44
56
 
45
- it { should == <<-MD.strip }
57
+ <p>art party<br>
58
+ organic</p>
59
+
60
+ <p> </p>
61
+ <p><strong> </strong></p>
62
+
63
+ <p>• Bullet 3</p>
64
+ <p>• Bullet 4</p>
65
+ <p>• Bullet 5</p>
66
+ <p>• Bullet 6</p>
67
+ <p>• Bullet 7</p>
68
+ <p>Something else</p>
69
+ HTML
70
+ ).to convert_to <<-MD.strip
71
+ * Bullet 1
72
+ * Bullet 2
73
+
46
74
  messenger **bag** skateboard
47
75
 
48
76
  art party
49
77
  organic
50
- MD
78
+
79
+ art party
80
+ organic
81
+
82
+ * Bullet 3
83
+ * Bullet 4
84
+ * Bullet 5
85
+ * Bullet 6
86
+ * Bullet 7
87
+
88
+ Something else
89
+ MD
90
+ end
91
+
92
+ it 'converts paragraph utf-8 bullet points to a markdown list' do
93
+ expect("<p>• Bullet 1</p><p>• Bullet 2</p>").to convert_to "* Bullet 1\n* Bullet 2"
94
+ end
51
95
  end
52
96
 
53
97
  context "<ul>" do
54
- let(:html) { <<-HTML.strip }
98
+ specify 'converts as list' do
99
+ expect(<<-HTML.strip
55
100
  <ul>
56
101
  <li>messenger</li>
57
102
  <li><strong>bag</strong></li>
@@ -63,9 +108,13 @@ organic
63
108
  <li><p><strong>bag</strong></p></li>
64
109
  <li><p>skateboard</p></li>
65
110
  </ul>
66
- HTML
67
111
 
68
- it { should == <<-MD.strip }
112
+ <ul>
113
+ <li>• Bullet 1</li>
114
+ <li>• Bullet 2</li>
115
+ </ul>
116
+ HTML
117
+ ).to convert_to <<-MD.strip
69
118
  * messenger
70
119
  * **bag**
71
120
  * skateboard
@@ -75,11 +124,16 @@ organic
75
124
  * **bag**
76
125
 
77
126
  * skateboard
78
- MD
127
+
128
+ * Bullet 1
129
+ * Bullet 2
130
+ MD
131
+ end
79
132
  end
80
133
 
81
134
  context "<ol>" do
82
- let(:html) { <<-HTML.strip }
135
+ specify 'converts as numbered list' do
136
+ expect(<<-HTML.strip
83
137
  <ol>
84
138
  <li>messenger</li>
85
139
  <li><strong>bag</strong></li>
@@ -91,9 +145,8 @@ organic
91
145
  <li><p><strong>bag</strong></p></li>
92
146
  <li><p>skateboard</p></li>
93
147
  </ol>
94
- HTML
95
-
96
- it { should == <<-MD.strip }
148
+ HTML
149
+ ).to convert_to <<-MD.strip
97
150
  1. messenger
98
151
  2. **bag**
99
152
  3. skateboard
@@ -103,21 +156,29 @@ organic
103
156
  2. **bag**
104
157
 
105
158
  3. skateboard
106
- MD
159
+ MD
160
+ end
107
161
  end
108
162
 
109
- context "<h1>" do
110
- let(:html) { <<-HTML.strip }
163
+ context "<h1>, <h2>, <h3>, <h4>, <h5>, <h6>" do
164
+ specify 'converts as #' do
165
+ expect(<<-HTML.strip
111
166
  <h1>messenger bag skateboard</h1>
112
167
  <h2>messenger bag skateboard</h2>
113
168
  <h3>messenger bag skateboard</h3>
114
- HTML
115
-
116
- it { should == <<-MD.strip }
169
+ <h4>messenger bag skateboard</h4>
170
+ <h5>messenger bag skateboard</h5>
171
+ <h6>messenger bag skateboard</h6>
172
+ HTML
173
+ ).to convert_to <<-MD.strip
117
174
  # messenger bag skateboard
118
175
  ## messenger bag skateboard
119
176
  ### messenger bag skateboard
120
- MD
177
+ #### messenger bag skateboard
178
+ ##### messenger bag skateboard
179
+ ###### messenger bag skateboard
180
+ MD
181
+ end
121
182
  end
122
183
 
123
184
  context "block-level elements" do
@@ -127,25 +188,52 @@ organic
127
188
  <div id="tofu" class="art party">messenger <strong>bag</strong> skateboard</div>
128
189
  HTML
129
190
 
130
- it { should == html }
191
+ specify 'are left alone' do
192
+ expect(html).to convert_to html
193
+ end
131
194
  end
132
195
 
133
196
  context "<table>" do
134
197
  let(:html) { <<-HTML.strip }
135
198
  <table>
136
199
  <tr>
137
- <td>messenger</td>
200
+ <td><p><strong>messenger</strong></p></td>
201
+ <td><p>bag</p></td>
202
+ </tr>
203
+ <tr>
204
+ <td><p>messenger</p></td>
205
+ <td><p><strong>bag</strong></p></td>
138
206
  </tr>
139
207
  <tr>
140
- <td><strong>bag</strong></td>
208
+ <td>skateboarding</td>
209
+ <td><p>is cool with all the kids<br/>
210
+ or something</p></td>
141
211
  </tr>
142
212
  <tr>
143
- <td>skateboard</td>
213
+ <td><strong>Messenger bags</strong></td>
214
+ <td>are in with the hipsters though.</td>
144
215
  </tr>
145
216
  </table>
146
217
  HTML
147
218
 
148
- it { should == html }
219
+ specify 'is converted to paragraphs' do
220
+ expect(html).to convert_to <<-MD.strip
221
+ **messenger**
222
+
223
+ bag
224
+
225
+ messenger
226
+
227
+ **bag**
228
+
229
+ skateboarding
230
+ is cool with all the kids
231
+ or something
232
+
233
+ **Messenger bags**
234
+ are in with the hipsters though.
235
+ MD
236
+ end
149
237
  end
150
238
 
151
239
  context "<pre>" do
@@ -157,22 +245,31 @@ organic
157
245
  </pre>
158
246
  HTML
159
247
 
160
- it { should == html }
248
+ specify 'are left alone' do
249
+ expect(html).to convert_to html
250
+ end
161
251
  end
162
252
  end
163
253
 
164
- context "span-level elements" do
165
- context "<span>" do
166
- let(:html) { <<-HTML.strip }
254
+ context "<span> elements" do
255
+ specify 'are stripped' do
256
+ expect(<<-HTML.strip
167
257
  <span>messenger <strong>bag</strong> skateboard</span>
168
258
  HTML
169
-
170
- it { should == <<-MD.strip }
171
- <span>messenger **bag** skateboard</span>
259
+ ).to convert_to <<-MD.strip
260
+ messenger **bag** skateboard
172
261
  MD
173
262
  end
174
263
  end
175
264
 
265
+ context "plain text" do
266
+ it 'containing plain bullet points converts to markdown' do
267
+ expect(
268
+ "• Bullet 1\n• Bullet 2\n"
269
+ ).to convert_to "* Bullet 1\n* Bullet 2"
270
+ end
271
+ end
272
+
176
273
  context "unbalanced elements" do
177
274
  let(:html) { "<span><span>foo</span>" }
178
275
 
@@ -182,4 +279,14 @@ organic
182
279
  }.to raise_error(Upmark::ParseFailed)
183
280
  end
184
281
  end
282
+
283
+ context "unbalanced elements" do
284
+ let(:html) { "<p>foo</b>" }
285
+
286
+ it "should raise an exception" do
287
+ expect {
288
+ Upmark.convert(html)
289
+ }.to raise_error(Upmark::ParseFailed)
290
+ end
291
+ end
185
292
  end