html2md 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- data/features/assets/test.html +0 -1
- data/features/assets/test.md +1 -0
- data/features/markdown.feature +31 -11
- data/features/step_definitions/markdown_steps.rb +2 -1
- data/lib/html2md.rb +2 -1
- data/lib/html2md/document.rb +170 -26
- metadata +2 -2
data/features/assets/test.html
CHANGED
data/features/assets/test.md
CHANGED
data/features/markdown.feature
CHANGED
@@ -4,12 +4,12 @@ Feature: Markdown
|
|
4
4
|
Scenario: Create a H Rule (HR) element
|
5
5
|
* HTML <hr/>
|
6
6
|
* I say parse
|
7
|
-
* The markdown should be (********\n)
|
7
|
+
* The markdown should be (\n********\n\n)
|
8
8
|
|
9
9
|
Scenario: Create a hard break (BR) element
|
10
10
|
* HTML <br/>
|
11
11
|
* I say parse
|
12
|
-
* The markdown should be (
|
12
|
+
* The markdown should be (\n \n)
|
13
13
|
|
14
14
|
Scenario: Paragraph (P) elements should be a single hard return
|
15
15
|
* HTML <p>
|
@@ -24,17 +24,17 @@ Feature: Markdown
|
|
24
24
|
Scenario: Other ancors should be ignored
|
25
25
|
* HTML <a name="link"> Link </a>
|
26
26
|
* I say parse
|
27
|
-
* The markdown should be ( Link
|
27
|
+
* The markdown should be ( Link)
|
28
28
|
|
29
29
|
Scenario: Ancors should reset after being used once
|
30
30
|
* HTML <a href="/some/link.html"> Link </a> <a name="link"> Link </a>
|
31
31
|
* I say parse
|
32
|
-
* The markdown should be ([ Link ](/some/link.html)
|
32
|
+
* The markdown should be ([ Link ](/some/link.html) Link)
|
33
33
|
|
34
34
|
Scenario: Other (a) elements should be ignored
|
35
|
-
* HTML <a> Text </a>
|
35
|
+
* HTML <a> Text Text </a>
|
36
36
|
* I say parse
|
37
|
-
* The markdown should be ( Text )
|
37
|
+
* The markdown should be ( Text Text)
|
38
38
|
|
39
39
|
Scenario: An order list
|
40
40
|
* HTML <ol><li>First</li><li>Second</li><ol>
|
@@ -47,9 +47,9 @@ Feature: Markdown
|
|
47
47
|
* The markdown should be (\n - First\n - Second\n\n)
|
48
48
|
|
49
49
|
Scenario: Complex List
|
50
|
-
* HTML <ul
|
50
|
+
* HTML <ul> <li>First</li> <li> <ol> <li> Some Text <ul> <li>First</li> <li>Second</li> </ul> </li> <li>Second</li> </ol> </li> <li>Second</li> <ul>
|
51
51
|
* I say parse
|
52
|
-
* The markdown should be (\n - First\n
|
52
|
+
* The markdown should be (\n - First\n 1. Some Text\n - First\n - Second\n 2. Second\n - Second\n\n)
|
53
53
|
|
54
54
|
Scenario: Emphasis (em) element
|
55
55
|
* HTML <em>Emphasis</em>
|
@@ -77,9 +77,19 @@ Feature: Markdown
|
|
77
77
|
* The markdown should be (This is in a span)
|
78
78
|
|
79
79
|
Scenario: Character data should not have new lines
|
80
|
-
* HTML <p>This is character data
|
80
|
+
* HTML <p>This is character data \n\n\n\n</p>
|
81
81
|
* I say parse
|
82
|
-
* The markdown should be (This is character data
|
82
|
+
* The markdown should be (This is character data\n\n)
|
83
|
+
|
84
|
+
Scenario: Character data should not have new lines
|
85
|
+
* HTML <em><p> This is emphasized </p><br/></em>
|
86
|
+
* I say parse
|
87
|
+
* The markdown should be (_This is emphasized_)
|
88
|
+
|
89
|
+
Scenario: HR Followed by em should not fold
|
90
|
+
* HTML <em><p> This is emphasized </p><br/></em><hr/>
|
91
|
+
* I say parse
|
92
|
+
* The markdown should be (_This is emphasized_\n********\n\n)
|
83
93
|
|
84
94
|
Scenario: First level headers
|
85
95
|
* HTML <h1>This is a H1 Element</h1>
|
@@ -91,6 +101,11 @@ Feature: Markdown
|
|
91
101
|
* I say parse
|
92
102
|
* The markdown should be (\nThis is a H2 Element\n--------------------\n\n)
|
93
103
|
|
104
|
+
Scenario: New lines should be treated as space
|
105
|
+
* HTML <body>Word 1\nWord 2</body>
|
106
|
+
* I say parse
|
107
|
+
* The markdown should be (Word 1 Word 2)
|
108
|
+
|
94
109
|
Scenario: Third level headers
|
95
110
|
* HTML <h3>This is a H3 Element</h3>
|
96
111
|
* I say parse
|
@@ -99,4 +114,9 @@ Feature: Markdown
|
|
99
114
|
Scenario: Full File Conversion
|
100
115
|
* File (./features/assets/test.html)
|
101
116
|
* I say parse
|
102
|
-
* The mardown should be equal to (./features/assets/test.md)
|
117
|
+
* The mardown should be equal to (./features/assets/test.md)
|
118
|
+
|
119
|
+
Scenario: Strike Through
|
120
|
+
* HTML <strike><p> This is striken </p><br/></strike>
|
121
|
+
* I say parse
|
122
|
+
* The markdown should be (~~This is striken~~)
|
data/lib/html2md.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'nokogiri'
|
2
2
|
require 'html2md/document'
|
3
|
+
require 'cgi'
|
3
4
|
|
4
5
|
class Html2Md
|
5
6
|
attr_accessor :options, :source
|
@@ -13,7 +14,7 @@ class Html2Md
|
|
13
14
|
doc = Html2Md::Document.new()
|
14
15
|
doc.relative_url = options[:relative_url]
|
15
16
|
parser = Nokogiri::HTML::SAX::Parser.new(doc)
|
16
|
-
parser.parse(source)
|
17
|
+
parser.parse( CGI.unescapeHTML(source).gsub(/\r/," ") )
|
17
18
|
parser.document.markdown
|
18
19
|
end
|
19
20
|
end
|
data/lib/html2md/document.rb
CHANGED
@@ -7,12 +7,33 @@ class Html2Md
|
|
7
7
|
attr_reader :markdown
|
8
8
|
attr_accessor :relative_url
|
9
9
|
|
10
|
+
def is_newline?(line)
|
11
|
+
if line.is_a? String
|
12
|
+
if /^\s+$/ =~ line
|
13
|
+
true
|
14
|
+
elsif /^\[\[::HARD_BREAK::\]\]$/ =~ line
|
15
|
+
true
|
16
|
+
#elsif line.empty?
|
17
|
+
# true
|
18
|
+
else
|
19
|
+
false
|
20
|
+
end
|
21
|
+
else
|
22
|
+
false
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
def new_line
|
27
|
+
@markdown << "\n" unless is_newline?( @markdown[-1] ) and is_newline?( @markdown[-2] )
|
28
|
+
end
|
29
|
+
|
10
30
|
def start_document
|
11
|
-
@markdown =
|
31
|
+
@markdown = []
|
12
32
|
@last_href = nil
|
13
33
|
@allowed_tags = ['tr','td','th','table']
|
14
34
|
@list_tree = []
|
15
35
|
@last_cdata_length = 0
|
36
|
+
@pre_block = false
|
16
37
|
|
17
38
|
end
|
18
39
|
|
@@ -47,7 +68,6 @@ class Html2Md
|
|
47
68
|
end
|
48
69
|
|
49
70
|
def end_element name, attributes = []
|
50
|
-
#@markdown << name
|
51
71
|
end_name = "end_#{name}".to_sym
|
52
72
|
both_name = "start_and_end_#{name}".to_sym
|
53
73
|
if self.respond_to?(both_name)
|
@@ -59,16 +79,83 @@ class Html2Md
|
|
59
79
|
end
|
60
80
|
end
|
61
81
|
|
82
|
+
def start_strike(attributes)
|
83
|
+
@markdown << "~~"
|
84
|
+
end
|
85
|
+
|
86
|
+
def end_strike(attributes)
|
87
|
+
|
88
|
+
#Collapse Breaks
|
89
|
+
while is_newline?( @markdown[-1] )
|
90
|
+
@markdown.delete_at(-1)
|
91
|
+
end
|
92
|
+
|
93
|
+
#Collapse Space Before the emphasis
|
94
|
+
@markdown.reverse!
|
95
|
+
|
96
|
+
@markdown.each_index do |index|
|
97
|
+
if @markdown[index].eql? '~~'
|
98
|
+
|
99
|
+
count = 1
|
100
|
+
while is_newline?(@markdown[index-count])
|
101
|
+
@markdown.delete_at(index-count)
|
102
|
+
end
|
103
|
+
|
104
|
+
@markdown[index-1].gsub!(/^\s+/,'')
|
105
|
+
end
|
106
|
+
end
|
107
|
+
@markdown.reverse!
|
108
|
+
|
109
|
+
@markdown[-1].gsub!(/\s+$/,'')
|
110
|
+
|
111
|
+
@markdown << '~~'
|
112
|
+
|
113
|
+
|
114
|
+
end
|
62
115
|
def start_hr(attributes)
|
63
|
-
|
116
|
+
new_line
|
117
|
+
@markdown << "********"
|
118
|
+
new_line
|
119
|
+
new_line
|
64
120
|
end
|
65
121
|
|
66
122
|
def end_hr(attributes)
|
67
123
|
|
68
124
|
end
|
69
125
|
|
70
|
-
def
|
126
|
+
def start_em(attributes)
|
127
|
+
@markdown << "_"
|
128
|
+
end
|
129
|
+
|
130
|
+
def end_em(attributes)
|
131
|
+
|
132
|
+
#Collapse Breaks
|
133
|
+
while is_newline?( @markdown[-1] )
|
134
|
+
@markdown.delete_at(-1)
|
135
|
+
end
|
136
|
+
|
137
|
+
|
138
|
+
#Collapse Space Before the emphasis
|
139
|
+
@markdown.reverse!
|
140
|
+
|
141
|
+
@markdown.each_index do |index|
|
142
|
+
|
143
|
+
if @markdown[index].eql? '_' and not @markdown[index+1] =~ /\\$/
|
144
|
+
|
145
|
+
count = 1
|
146
|
+
while is_newline?(@markdown[index-count])
|
147
|
+
@markdown.delete_at(index-count)
|
148
|
+
end
|
149
|
+
|
150
|
+
@markdown[index-1].gsub!(/^\s+/,'')
|
151
|
+
end
|
152
|
+
end
|
153
|
+
@markdown.reverse!
|
154
|
+
|
155
|
+
@markdown[-1].gsub!(/\s+$/,'')
|
71
156
|
@markdown << '_'
|
157
|
+
|
158
|
+
###@markdown.gsub!(/((\[\[::HARD_BREAK::\]\])?(\s+)?)*_$/,'_')
|
72
159
|
end
|
73
160
|
|
74
161
|
def start_and_end_strong(attributes)
|
@@ -76,7 +163,8 @@ class Html2Md
|
|
76
163
|
end
|
77
164
|
|
78
165
|
def start_br(attributes)
|
79
|
-
|
166
|
+
new_line
|
167
|
+
@markdown << "[[::HARD_BREAK::]]"
|
80
168
|
end
|
81
169
|
|
82
170
|
def end_br(attributes)
|
@@ -88,39 +176,44 @@ class Html2Md
|
|
88
176
|
end
|
89
177
|
|
90
178
|
def end_p(attributes)
|
91
|
-
|
179
|
+
new_line unless @list_tree[-1]
|
180
|
+
new_line unless @list_tree[-1]
|
92
181
|
end
|
93
182
|
|
94
183
|
def start_h1(attributes)
|
95
|
-
|
184
|
+
new_line
|
96
185
|
end
|
97
186
|
|
98
187
|
def end_h1(attributes)
|
99
|
-
|
188
|
+
new_line
|
100
189
|
@last_cdata_length.times do
|
101
190
|
@markdown << "="
|
102
191
|
end
|
103
|
-
|
192
|
+
new_line
|
193
|
+
new_line
|
104
194
|
end
|
105
195
|
|
106
196
|
def start_h2(attributes)
|
107
|
-
|
197
|
+
new_line
|
108
198
|
end
|
109
199
|
|
110
200
|
def end_h2(attributes)
|
111
|
-
|
201
|
+
new_line
|
112
202
|
@last_cdata_length.times do
|
113
203
|
@markdown << "-"
|
114
204
|
end
|
115
|
-
|
205
|
+
new_line
|
206
|
+
new_line
|
116
207
|
end
|
117
208
|
|
118
209
|
def start_h3(attributes)
|
119
|
-
|
210
|
+
new_line
|
211
|
+
@markdown << "### "
|
120
212
|
end
|
121
213
|
|
122
214
|
def end_h3(attributes)
|
123
|
-
|
215
|
+
new_line
|
216
|
+
new_line
|
124
217
|
end
|
125
218
|
|
126
219
|
def start_a(attributes)
|
@@ -133,15 +226,22 @@ class Html2Md
|
|
133
226
|
end
|
134
227
|
|
135
228
|
def start_pre(attributes)
|
136
|
-
@
|
229
|
+
@pre_block = true;
|
230
|
+
new_line
|
231
|
+
@markdown << "```"
|
232
|
+
new_line
|
137
233
|
end
|
138
234
|
|
139
235
|
def end_pre(attributes)
|
140
|
-
@
|
236
|
+
@pre_block = false;
|
237
|
+
new_line
|
238
|
+
@markdown << "```"
|
239
|
+
new_line
|
141
240
|
end
|
142
241
|
|
143
242
|
def end_a(attributes)
|
144
|
-
|
243
|
+
begin
|
244
|
+
if @last_href and not (['http','https'].include? URI(URI.escape(@last_href)).scheme)
|
145
245
|
begin
|
146
246
|
rp = URI(relative_url)
|
147
247
|
rp.path = @last_href
|
@@ -152,21 +252,24 @@ class Html2Md
|
|
152
252
|
|
153
253
|
@markdown << "](#{@last_href})" if @last_href
|
154
254
|
@last_href = nil if @last_href
|
255
|
+
rescue
|
256
|
+
|
257
|
+
end
|
155
258
|
|
156
259
|
end
|
157
260
|
|
158
261
|
def start_ul(attributes)
|
159
|
-
|
262
|
+
new_line
|
160
263
|
@list_tree.push( { :type => :ul, :current_element => 0 } )
|
161
264
|
end
|
162
265
|
|
163
266
|
def end_ul(attributes)
|
164
267
|
@list_tree.pop
|
165
|
-
|
268
|
+
new_line unless @list_tree[-1]
|
166
269
|
end
|
167
270
|
|
168
271
|
def start_ol(attributes)
|
169
|
-
|
272
|
+
new_line
|
170
273
|
@list_tree.push( { :type => :ol, :current_element => 0 } )
|
171
274
|
end
|
172
275
|
|
@@ -177,12 +280,21 @@ class Html2Md
|
|
177
280
|
|
178
281
|
def start_li(attributes)
|
179
282
|
|
283
|
+
if /^(-|\d+.)\s+$/ =~ @markdown[-2]
|
284
|
+
@markdown.delete_at(-2)
|
285
|
+
@markdown.delete_at(-3)
|
286
|
+
end
|
287
|
+
|
288
|
+
@markdown[-2].gsub! /^\s+(-|\d+.)\s+$/,''
|
289
|
+
#Add Whitespace before the list item
|
180
290
|
@list_tree.length.times do
|
181
291
|
@markdown << " "
|
182
292
|
end
|
183
293
|
|
294
|
+
#Increment the Current Element to start at one
|
184
295
|
@list_tree[-1][:current_element] += 1
|
185
296
|
|
297
|
+
|
186
298
|
case @list_tree[-1][:type]
|
187
299
|
when :ol
|
188
300
|
@markdown << "#{ @list_tree[-1][:current_element] }. "
|
@@ -193,19 +305,51 @@ class Html2Md
|
|
193
305
|
end
|
194
306
|
|
195
307
|
def end_li(attributes)
|
196
|
-
|
308
|
+
new_line if @markdown[-1] != "\n" and @markdown[-1] != 10
|
197
309
|
end
|
198
310
|
|
199
311
|
def characters c
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
312
|
+
#Escape character data with _
|
313
|
+
c.gsub!('_','\_') unless @pre_block
|
314
|
+
|
315
|
+
#Collapse all whitespace into spaces
|
316
|
+
c.gsub!(/(\s+|\n|\r\n|\t)/, " ")
|
317
|
+
|
318
|
+
|
319
|
+
if c.rstrip.lstrip.chomp != ""
|
320
|
+
if @list_tree[-1]
|
321
|
+
|
322
|
+
#Strip whitespace at the start of the character data
|
323
|
+
c.gsub!(/\A(\r|\n|\s|\t)/,'')
|
324
|
+
|
325
|
+
c.chomp!
|
326
|
+
|
327
|
+
@last_cdata_length = c.chomp.length
|
328
|
+
|
329
|
+
@markdown << c
|
330
|
+
else
|
331
|
+
@last_cdata_length = c.chomp.length
|
332
|
+
@markdown << c
|
333
|
+
end
|
205
334
|
end
|
206
335
|
end
|
207
336
|
|
208
337
|
def end_document
|
338
|
+
|
339
|
+
@markdown = @markdown.join('')
|
340
|
+
#Replace All Ancor Links
|
341
|
+
@markdown.gsub!(/\[.*\]\(#.*\)/,'')
|
342
|
+
|
343
|
+
#Remove all extra space at the end of a line
|
344
|
+
@markdown.gsub!(/ +$/,'')
|
345
|
+
|
346
|
+
#Add Hard Breaks
|
347
|
+
@markdown.gsub!(/\[\[::HARD_BREAK::\]\]/," \n")
|
348
|
+
|
349
|
+
#Collapse Superfulious Hard Line Breaks
|
350
|
+
#@markdown.gsub!(/( \n+){1,}/," \n")
|
351
|
+
|
352
|
+
#Collapse Superfulious Line Breaks
|
209
353
|
@markdown.gsub!(/\n{2,}/,"\n\n")
|
210
354
|
end
|
211
355
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2md
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
4
|
+
version: 0.1.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-03-
|
12
|
+
date: 2012-03-28 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: ! ' Converts Basic HTML to markdown
|
15
15
|
|