html2md 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/features/assets/test.html +0 -1
- data/features/assets/test.md +1 -0
- data/features/markdown.feature +31 -11
- data/features/step_definitions/markdown_steps.rb +2 -1
- data/lib/html2md.rb +2 -1
- data/lib/html2md/document.rb +170 -26
- metadata +2 -2
data/features/assets/test.html
CHANGED
data/features/assets/test.md
CHANGED
data/features/markdown.feature
CHANGED
@@ -4,12 +4,12 @@ Feature: Markdown
|
|
4
4
|
Scenario: Create a H Rule (HR) element
|
5
5
|
* HTML <hr/>
|
6
6
|
* I say parse
|
7
|
-
* The markdown should be (********\n)
|
7
|
+
* The markdown should be (\n********\n\n)
|
8
8
|
|
9
9
|
Scenario: Create a hard break (BR) element
|
10
10
|
* HTML <br/>
|
11
11
|
* I say parse
|
12
|
-
* The markdown should be (
|
12
|
+
* The markdown should be (\n \n)
|
13
13
|
|
14
14
|
Scenario: Paragraph (P) elements should be a single hard return
|
15
15
|
* HTML <p>
|
@@ -24,17 +24,17 @@ Feature: Markdown
|
|
24
24
|
Scenario: Other ancors should be ignored
|
25
25
|
* HTML <a name="link"> Link </a>
|
26
26
|
* I say parse
|
27
|
-
* The markdown should be ( Link
|
27
|
+
* The markdown should be ( Link)
|
28
28
|
|
29
29
|
Scenario: Ancors should reset after being used once
|
30
30
|
* HTML <a href="/some/link.html"> Link </a> <a name="link"> Link </a>
|
31
31
|
* I say parse
|
32
|
-
* The markdown should be ([ Link ](/some/link.html)
|
32
|
+
* The markdown should be ([ Link ](/some/link.html) Link)
|
33
33
|
|
34
34
|
Scenario: Other (a) elements should be ignored
|
35
|
-
* HTML <a> Text </a>
|
35
|
+
* HTML <a> Text Text </a>
|
36
36
|
* I say parse
|
37
|
-
* The markdown should be ( Text )
|
37
|
+
* The markdown should be ( Text Text)
|
38
38
|
|
39
39
|
Scenario: An order list
|
40
40
|
* HTML <ol><li>First</li><li>Second</li><ol>
|
@@ -47,9 +47,9 @@ Feature: Markdown
|
|
47
47
|
* The markdown should be (\n - First\n - Second\n\n)
|
48
48
|
|
49
49
|
Scenario: Complex List
|
50
|
-
* HTML <ul
|
50
|
+
* HTML <ul> <li>First</li> <li> <ol> <li> Some Text <ul> <li>First</li> <li>Second</li> </ul> </li> <li>Second</li> </ol> </li> <li>Second</li> <ul>
|
51
51
|
* I say parse
|
52
|
-
* The markdown should be (\n - First\n
|
52
|
+
* The markdown should be (\n - First\n 1. Some Text\n - First\n - Second\n 2. Second\n - Second\n\n)
|
53
53
|
|
54
54
|
Scenario: Emphasis (em) element
|
55
55
|
* HTML <em>Emphasis</em>
|
@@ -77,9 +77,19 @@ Feature: Markdown
|
|
77
77
|
* The markdown should be (This is in a span)
|
78
78
|
|
79
79
|
Scenario: Character data should not have new lines
|
80
|
-
* HTML <p>This is character data
|
80
|
+
* HTML <p>This is character data \n\n\n\n</p>
|
81
81
|
* I say parse
|
82
|
-
* The markdown should be (This is character data
|
82
|
+
* The markdown should be (This is character data\n\n)
|
83
|
+
|
84
|
+
Scenario: Character data should not have new lines
|
85
|
+
* HTML <em><p> This is emphasized </p><br/></em>
|
86
|
+
* I say parse
|
87
|
+
* The markdown should be (_This is emphasized_)
|
88
|
+
|
89
|
+
Scenario: HR Followed by em should not fold
|
90
|
+
* HTML <em><p> This is emphasized </p><br/></em><hr/>
|
91
|
+
* I say parse
|
92
|
+
* The markdown should be (_This is emphasized_\n********\n\n)
|
83
93
|
|
84
94
|
Scenario: First level headers
|
85
95
|
* HTML <h1>This is a H1 Element</h1>
|
@@ -91,6 +101,11 @@ Feature: Markdown
|
|
91
101
|
* I say parse
|
92
102
|
* The markdown should be (\nThis is a H2 Element\n--------------------\n\n)
|
93
103
|
|
104
|
+
Scenario: New lines should be treated as space
|
105
|
+
* HTML <body>Word 1\nWord 2</body>
|
106
|
+
* I say parse
|
107
|
+
* The markdown should be (Word 1 Word 2)
|
108
|
+
|
94
109
|
Scenario: Third level headers
|
95
110
|
* HTML <h3>This is a H3 Element</h3>
|
96
111
|
* I say parse
|
@@ -99,4 +114,9 @@ Feature: Markdown
|
|
99
114
|
Scenario: Full File Conversion
|
100
115
|
* File (./features/assets/test.html)
|
101
116
|
* I say parse
|
102
|
-
* The mardown should be equal to (./features/assets/test.md)
|
117
|
+
* The mardown should be equal to (./features/assets/test.md)
|
118
|
+
|
119
|
+
Scenario: Strike Through
|
120
|
+
* HTML <strike><p> This is striken </p><br/></strike>
|
121
|
+
* I say parse
|
122
|
+
* The markdown should be (~~This is striken~~)
|
data/lib/html2md.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'nokogiri'
|
2
2
|
require 'html2md/document'
|
3
|
+
require 'cgi'
|
3
4
|
|
4
5
|
class Html2Md
|
5
6
|
attr_accessor :options, :source
|
@@ -13,7 +14,7 @@ class Html2Md
|
|
13
14
|
doc = Html2Md::Document.new()
|
14
15
|
doc.relative_url = options[:relative_url]
|
15
16
|
parser = Nokogiri::HTML::SAX::Parser.new(doc)
|
16
|
-
parser.parse(source)
|
17
|
+
parser.parse( CGI.unescapeHTML(source).gsub(/\r/," ") )
|
17
18
|
parser.document.markdown
|
18
19
|
end
|
19
20
|
end
|
data/lib/html2md/document.rb
CHANGED
@@ -7,12 +7,33 @@ class Html2Md
|
|
7
7
|
attr_reader :markdown
|
8
8
|
attr_accessor :relative_url
|
9
9
|
|
10
|
+
def is_newline?(line)
|
11
|
+
if line.is_a? String
|
12
|
+
if /^\s+$/ =~ line
|
13
|
+
true
|
14
|
+
elsif /^\[\[::HARD_BREAK::\]\]$/ =~ line
|
15
|
+
true
|
16
|
+
#elsif line.empty?
|
17
|
+
# true
|
18
|
+
else
|
19
|
+
false
|
20
|
+
end
|
21
|
+
else
|
22
|
+
false
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
def new_line
|
27
|
+
@markdown << "\n" unless is_newline?( @markdown[-1] ) and is_newline?( @markdown[-2] )
|
28
|
+
end
|
29
|
+
|
10
30
|
def start_document
|
11
|
-
@markdown =
|
31
|
+
@markdown = []
|
12
32
|
@last_href = nil
|
13
33
|
@allowed_tags = ['tr','td','th','table']
|
14
34
|
@list_tree = []
|
15
35
|
@last_cdata_length = 0
|
36
|
+
@pre_block = false
|
16
37
|
|
17
38
|
end
|
18
39
|
|
@@ -47,7 +68,6 @@ class Html2Md
|
|
47
68
|
end
|
48
69
|
|
49
70
|
def end_element name, attributes = []
|
50
|
-
#@markdown << name
|
51
71
|
end_name = "end_#{name}".to_sym
|
52
72
|
both_name = "start_and_end_#{name}".to_sym
|
53
73
|
if self.respond_to?(both_name)
|
@@ -59,16 +79,83 @@ class Html2Md
|
|
59
79
|
end
|
60
80
|
end
|
61
81
|
|
82
|
+
def start_strike(attributes)
|
83
|
+
@markdown << "~~"
|
84
|
+
end
|
85
|
+
|
86
|
+
def end_strike(attributes)
|
87
|
+
|
88
|
+
#Collapse Breaks
|
89
|
+
while is_newline?( @markdown[-1] )
|
90
|
+
@markdown.delete_at(-1)
|
91
|
+
end
|
92
|
+
|
93
|
+
#Collapse Space Before the emphasis
|
94
|
+
@markdown.reverse!
|
95
|
+
|
96
|
+
@markdown.each_index do |index|
|
97
|
+
if @markdown[index].eql? '~~'
|
98
|
+
|
99
|
+
count = 1
|
100
|
+
while is_newline?(@markdown[index-count])
|
101
|
+
@markdown.delete_at(index-count)
|
102
|
+
end
|
103
|
+
|
104
|
+
@markdown[index-1].gsub!(/^\s+/,'')
|
105
|
+
end
|
106
|
+
end
|
107
|
+
@markdown.reverse!
|
108
|
+
|
109
|
+
@markdown[-1].gsub!(/\s+$/,'')
|
110
|
+
|
111
|
+
@markdown << '~~'
|
112
|
+
|
113
|
+
|
114
|
+
end
|
62
115
|
def start_hr(attributes)
|
63
|
-
|
116
|
+
new_line
|
117
|
+
@markdown << "********"
|
118
|
+
new_line
|
119
|
+
new_line
|
64
120
|
end
|
65
121
|
|
66
122
|
def end_hr(attributes)
|
67
123
|
|
68
124
|
end
|
69
125
|
|
70
|
-
def
|
126
|
+
def start_em(attributes)
|
127
|
+
@markdown << "_"
|
128
|
+
end
|
129
|
+
|
130
|
+
def end_em(attributes)
|
131
|
+
|
132
|
+
#Collapse Breaks
|
133
|
+
while is_newline?( @markdown[-1] )
|
134
|
+
@markdown.delete_at(-1)
|
135
|
+
end
|
136
|
+
|
137
|
+
|
138
|
+
#Collapse Space Before the emphasis
|
139
|
+
@markdown.reverse!
|
140
|
+
|
141
|
+
@markdown.each_index do |index|
|
142
|
+
|
143
|
+
if @markdown[index].eql? '_' and not @markdown[index+1] =~ /\\$/
|
144
|
+
|
145
|
+
count = 1
|
146
|
+
while is_newline?(@markdown[index-count])
|
147
|
+
@markdown.delete_at(index-count)
|
148
|
+
end
|
149
|
+
|
150
|
+
@markdown[index-1].gsub!(/^\s+/,'')
|
151
|
+
end
|
152
|
+
end
|
153
|
+
@markdown.reverse!
|
154
|
+
|
155
|
+
@markdown[-1].gsub!(/\s+$/,'')
|
71
156
|
@markdown << '_'
|
157
|
+
|
158
|
+
###@markdown.gsub!(/((\[\[::HARD_BREAK::\]\])?(\s+)?)*_$/,'_')
|
72
159
|
end
|
73
160
|
|
74
161
|
def start_and_end_strong(attributes)
|
@@ -76,7 +163,8 @@ class Html2Md
|
|
76
163
|
end
|
77
164
|
|
78
165
|
def start_br(attributes)
|
79
|
-
|
166
|
+
new_line
|
167
|
+
@markdown << "[[::HARD_BREAK::]]"
|
80
168
|
end
|
81
169
|
|
82
170
|
def end_br(attributes)
|
@@ -88,39 +176,44 @@ class Html2Md
|
|
88
176
|
end
|
89
177
|
|
90
178
|
def end_p(attributes)
|
91
|
-
|
179
|
+
new_line unless @list_tree[-1]
|
180
|
+
new_line unless @list_tree[-1]
|
92
181
|
end
|
93
182
|
|
94
183
|
def start_h1(attributes)
|
95
|
-
|
184
|
+
new_line
|
96
185
|
end
|
97
186
|
|
98
187
|
def end_h1(attributes)
|
99
|
-
|
188
|
+
new_line
|
100
189
|
@last_cdata_length.times do
|
101
190
|
@markdown << "="
|
102
191
|
end
|
103
|
-
|
192
|
+
new_line
|
193
|
+
new_line
|
104
194
|
end
|
105
195
|
|
106
196
|
def start_h2(attributes)
|
107
|
-
|
197
|
+
new_line
|
108
198
|
end
|
109
199
|
|
110
200
|
def end_h2(attributes)
|
111
|
-
|
201
|
+
new_line
|
112
202
|
@last_cdata_length.times do
|
113
203
|
@markdown << "-"
|
114
204
|
end
|
115
|
-
|
205
|
+
new_line
|
206
|
+
new_line
|
116
207
|
end
|
117
208
|
|
118
209
|
def start_h3(attributes)
|
119
|
-
|
210
|
+
new_line
|
211
|
+
@markdown << "### "
|
120
212
|
end
|
121
213
|
|
122
214
|
def end_h3(attributes)
|
123
|
-
|
215
|
+
new_line
|
216
|
+
new_line
|
124
217
|
end
|
125
218
|
|
126
219
|
def start_a(attributes)
|
@@ -133,15 +226,22 @@ class Html2Md
|
|
133
226
|
end
|
134
227
|
|
135
228
|
def start_pre(attributes)
|
136
|
-
@
|
229
|
+
@pre_block = true;
|
230
|
+
new_line
|
231
|
+
@markdown << "```"
|
232
|
+
new_line
|
137
233
|
end
|
138
234
|
|
139
235
|
def end_pre(attributes)
|
140
|
-
@
|
236
|
+
@pre_block = false;
|
237
|
+
new_line
|
238
|
+
@markdown << "```"
|
239
|
+
new_line
|
141
240
|
end
|
142
241
|
|
143
242
|
def end_a(attributes)
|
144
|
-
|
243
|
+
begin
|
244
|
+
if @last_href and not (['http','https'].include? URI(URI.escape(@last_href)).scheme)
|
145
245
|
begin
|
146
246
|
rp = URI(relative_url)
|
147
247
|
rp.path = @last_href
|
@@ -152,21 +252,24 @@ class Html2Md
|
|
152
252
|
|
153
253
|
@markdown << "](#{@last_href})" if @last_href
|
154
254
|
@last_href = nil if @last_href
|
255
|
+
rescue
|
256
|
+
|
257
|
+
end
|
155
258
|
|
156
259
|
end
|
157
260
|
|
158
261
|
def start_ul(attributes)
|
159
|
-
|
262
|
+
new_line
|
160
263
|
@list_tree.push( { :type => :ul, :current_element => 0 } )
|
161
264
|
end
|
162
265
|
|
163
266
|
def end_ul(attributes)
|
164
267
|
@list_tree.pop
|
165
|
-
|
268
|
+
new_line unless @list_tree[-1]
|
166
269
|
end
|
167
270
|
|
168
271
|
def start_ol(attributes)
|
169
|
-
|
272
|
+
new_line
|
170
273
|
@list_tree.push( { :type => :ol, :current_element => 0 } )
|
171
274
|
end
|
172
275
|
|
@@ -177,12 +280,21 @@ class Html2Md
|
|
177
280
|
|
178
281
|
def start_li(attributes)
|
179
282
|
|
283
|
+
if /^(-|\d+.)\s+$/ =~ @markdown[-2]
|
284
|
+
@markdown.delete_at(-2)
|
285
|
+
@markdown.delete_at(-3)
|
286
|
+
end
|
287
|
+
|
288
|
+
@markdown[-2].gsub! /^\s+(-|\d+.)\s+$/,''
|
289
|
+
#Add Whitespace before the list item
|
180
290
|
@list_tree.length.times do
|
181
291
|
@markdown << " "
|
182
292
|
end
|
183
293
|
|
294
|
+
#Increment the Current Element to start at one
|
184
295
|
@list_tree[-1][:current_element] += 1
|
185
296
|
|
297
|
+
|
186
298
|
case @list_tree[-1][:type]
|
187
299
|
when :ol
|
188
300
|
@markdown << "#{ @list_tree[-1][:current_element] }. "
|
@@ -193,19 +305,51 @@ class Html2Md
|
|
193
305
|
end
|
194
306
|
|
195
307
|
def end_li(attributes)
|
196
|
-
|
308
|
+
new_line if @markdown[-1] != "\n" and @markdown[-1] != 10
|
197
309
|
end
|
198
310
|
|
199
311
|
def characters c
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
312
|
+
#Escape underscores in character data (skipped inside a pre block)
|
313
|
+
c.gsub!('_','\_') unless @pre_block
|
314
|
+
|
315
|
+
#Collapse all whitespace into spaces
|
316
|
+
c.gsub!(/(\s+|\n|\r\n|\t)/, " ")
|
317
|
+
|
318
|
+
|
319
|
+
if c.rstrip.lstrip.chomp != ""
|
320
|
+
if @list_tree[-1]
|
321
|
+
|
322
|
+
#Strip whitespace at the start of the character data
|
323
|
+
c.gsub!(/\A(\r|\n|\s|\t)/,'')
|
324
|
+
|
325
|
+
c.chomp!
|
326
|
+
|
327
|
+
@last_cdata_length = c.chomp.length
|
328
|
+
|
329
|
+
@markdown << c
|
330
|
+
else
|
331
|
+
@last_cdata_length = c.chomp.length
|
332
|
+
@markdown << c
|
333
|
+
end
|
205
334
|
end
|
206
335
|
end
|
207
336
|
|
208
337
|
def end_document
|
338
|
+
|
339
|
+
@markdown = @markdown.join('')
|
340
|
+
#Replace All Anchor Links
|
341
|
+
@markdown.gsub!(/\[.*\]\(#.*\)/,'')
|
342
|
+
|
343
|
+
#Remove all extra space at the end of a line
|
344
|
+
@markdown.gsub!(/ +$/,'')
|
345
|
+
|
346
|
+
#Add Hard Breaks
|
347
|
+
@markdown.gsub!(/\[\[::HARD_BREAK::\]\]/," \n")
|
348
|
+
|
349
|
+
#Collapse Superfluous Hard Line Breaks
|
350
|
+
#@markdown.gsub!(/( \n+){1,}/," \n")
|
351
|
+
|
352
|
+
#Collapse Superfluous Line Breaks
|
209
353
|
@markdown.gsub!(/\n{2,}/,"\n\n")
|
210
354
|
end
|
211
355
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2md
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-03-
|
12
|
+
date: 2012-03-28 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: ! ' Converts Basic HTML to markdown
|
15
15
|
|