html2md 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -31,7 +31,6 @@
31
31
  <ol>
32
32
  <li>Ordered Item 1</li>
33
33
  <li>Ordered Item 2
34
-
35
34
  <ul>
36
35
  <li>Un-Ordered Item 1</li>
37
36
  </ul>
@@ -1,3 +1,4 @@
1
+
1
2
  ********
2
3
 
3
4
  Header 1
@@ -4,12 +4,12 @@ Feature: Markdown
4
4
  Scenario: Create a H Rule (HR) element
5
5
  * HTML <hr/>
6
6
  * I say parse
7
- * The markdown should be (********\n)
7
+ * The markdown should be (\n********\n\n)
8
8
 
9
9
  Scenario: Create a hard break (BR) element
10
10
  * HTML <br/>
11
11
  * I say parse
12
- * The markdown should be ( \n)
12
+ * The markdown should be (\n \n)
13
13
 
14
14
  Scenario: Paragraph (P) elements should be a single hard return
15
15
  * HTML <p>
@@ -24,17 +24,17 @@ Feature: Markdown
24
24
  Scenario: Other ancors should be ignored
25
25
  * HTML <a name="link"> Link </a>
26
26
  * I say parse
27
- * The markdown should be ( Link )
27
+ * The markdown should be ( Link)
28
28
 
29
29
  Scenario: Ancors should reset after being used once
30
30
  * HTML <a href="/some/link.html"> Link </a> <a name="link"> Link </a>
31
31
  * I say parse
32
- * The markdown should be ([ Link ](/some/link.html) Link )
32
+ * The markdown should be ([ Link ](/some/link.html) Link)
33
33
 
34
34
  Scenario: Other (a) elements should be ignored
35
- * HTML <a> Text </a>
35
+ * HTML <a> Text Text </a>
36
36
  * I say parse
37
- * The markdown should be ( Text )
37
+ * The markdown should be ( Text Text)
38
38
 
39
39
  Scenario: An order list
40
40
  * HTML <ol><li>First</li><li>Second</li><ol>
@@ -47,9 +47,9 @@ Feature: Markdown
47
47
  * The markdown should be (\n - First\n - Second\n\n)
48
48
 
49
49
  Scenario: Complex List
50
- * HTML <ul><li>First</li><li> <ol><li>First<ul><li>First</li><li>Second</li></ul></li><li>Second</li> </ol>Second</li><ul>
50
+ * HTML <ul> <li>First</li> <li> <ol> <li> Some Text <ul> <li>First</li> <li>Second</li> </ul> </li> <li>Second</li> </ol> </li> <li>Second</li> <ul>
51
51
  * I say parse
52
- * The markdown should be (\n - First\n - \n 1. First\n - First\n - Second\n 2. Second\nSecond\n\n)
52
+ * The markdown should be (\n - First\n 1. Some Text\n - First\n - Second\n 2. Second\n - Second\n\n)
53
53
 
54
54
  Scenario: Emphasis (em) element
55
55
  * HTML <em>Emphasis</em>
@@ -77,9 +77,19 @@ Feature: Markdown
77
77
  * The markdown should be (This is in a span)
78
78
 
79
79
  Scenario: Character data should not have new lines
80
- * HTML <p>This is character data \n\n\n\n</p>
80
+ * HTML <p>This is character data \n\n\n\n</p>
81
81
  * I say parse
82
- * The markdown should be (This is character data \n\n)
82
+ * The markdown should be (This is character data\n\n)
83
+
84
+ Scenario: Character data should not have new lines
85
+ * HTML <em><p> This is emphasized </p><br/></em>
86
+ * I say parse
87
+ * The markdown should be (_This is emphasized_)
88
+
89
+ Scenario: HR Followed by em should not fold
90
+ * HTML <em><p> This is emphasized </p><br/></em><hr/>
91
+ * I say parse
92
+ * The markdown should be (_This is emphasized_\n********\n\n)
83
93
 
84
94
  Scenario: First level headers
85
95
  * HTML <h1>This is a H1 Element</h1>
@@ -91,6 +101,11 @@ Feature: Markdown
91
101
  * I say parse
92
102
  * The markdown should be (\nThis is a H2 Element\n--------------------\n\n)
93
103
 
104
+ Scenario: New lines should be treated as space
105
+ * HTML <body>Word 1\nWord 2</body>
106
+ * I say parse
107
+ * The markdown should be (Word 1 Word 2)
108
+
94
109
  Scenario: Third level headers
95
110
  * HTML <h3>This is a H3 Element</h3>
96
111
  * I say parse
@@ -99,4 +114,9 @@ Feature: Markdown
99
114
  Scenario: Full File Conversion
100
115
  * File (./features/assets/test.html)
101
116
  * I say parse
102
- * The mardown should be equal to (./features/assets/test.md)
117
+ * The mardown should be equal to (./features/assets/test.md)
118
+
119
+ Scenario: Strike Through
120
+ * HTML <strike><p> This is striken </p><br/></strike>
121
+ * I say parse
122
+ * The markdown should be (~~This is striken~~)
@@ -12,7 +12,8 @@ After do
12
12
  end
13
13
 
14
14
  Given /HTML (.*)/ do |n|
15
- @html2md.source = n.gsub("\\n", "\n")
15
+ @html2md.source = n.gsub('\n', "\n")
16
+ puts n.gsub '\n',"\n"
16
17
  end
17
18
 
18
19
  Given /File \((.*)\)/ do |n|
@@ -1,5 +1,6 @@
1
1
  require 'nokogiri'
2
2
  require 'html2md/document'
3
+ require 'cgi'
3
4
 
4
5
  class Html2Md
5
6
  attr_accessor :options, :source
@@ -13,7 +14,7 @@ class Html2Md
13
14
  doc = Html2Md::Document.new()
14
15
  doc.relative_url = options[:relative_url]
15
16
  parser = Nokogiri::HTML::SAX::Parser.new(doc)
16
- parser.parse(source)
17
+ parser.parse( CGI.unescapeHTML(source).gsub(/\r/," ") )
17
18
  parser.document.markdown
18
19
  end
19
20
  end
@@ -7,12 +7,33 @@ class Html2Md
7
7
  attr_reader :markdown
8
8
  attr_accessor :relative_url
9
9
 
10
+ def is_newline?(line)
11
+ if line.is_a? String
12
+ if /^\s+$/ =~ line
13
+ true
14
+ elsif /^\[\[::HARD_BREAK::\]\]$/ =~ line
15
+ true
16
+ #elsif line.empty?
17
+ # true
18
+ else
19
+ false
20
+ end
21
+ else
22
+ false
23
+ end
24
+ end
25
+
26
+ def new_line
27
+ @markdown << "\n" unless is_newline?( @markdown[-1] ) and is_newline?( @markdown[-2] )
28
+ end
29
+
10
30
  def start_document
11
- @markdown = ''
31
+ @markdown = []
12
32
  @last_href = nil
13
33
  @allowed_tags = ['tr','td','th','table']
14
34
  @list_tree = []
15
35
  @last_cdata_length = 0
36
+ @pre_block = false
16
37
 
17
38
  end
18
39
 
@@ -47,7 +68,6 @@ class Html2Md
47
68
  end
48
69
 
49
70
  def end_element name, attributes = []
50
- #@markdown << name
51
71
  end_name = "end_#{name}".to_sym
52
72
  both_name = "start_and_end_#{name}".to_sym
53
73
  if self.respond_to?(both_name)
@@ -59,16 +79,83 @@ class Html2Md
59
79
  end
60
80
  end
61
81
 
82
+ def start_strike(attributes)
83
+ @markdown << "~~"
84
+ end
85
+
86
+ def end_strike(attributes)
87
+
88
+ #Collapse Breaks
89
+ while is_newline?( @markdown[-1] )
90
+ @markdown.delete_at(-1)
91
+ end
92
+
93
+ #Collapse Space Before the emphasis
94
+ @markdown.reverse!
95
+
96
+ @markdown.each_index do |index|
97
+ if @markdown[index].eql? '~~'
98
+
99
+ count = 1
100
+ while is_newline?(@markdown[index-count])
101
+ @markdown.delete_at(index-count)
102
+ end
103
+
104
+ @markdown[index-1].gsub!(/^\s+/,'')
105
+ end
106
+ end
107
+ @markdown.reverse!
108
+
109
+ @markdown[-1].gsub!(/\s+$/,'')
110
+
111
+ @markdown << '~~'
112
+
113
+
114
+ end
62
115
  def start_hr(attributes)
63
- @markdown << "********\n"
116
+ new_line
117
+ @markdown << "********"
118
+ new_line
119
+ new_line
64
120
  end
65
121
 
66
122
  def end_hr(attributes)
67
123
 
68
124
  end
69
125
 
70
- def start_and_end_em(attributes)
126
+ def start_em(attributes)
127
+ @markdown << "_"
128
+ end
129
+
130
+ def end_em(attributes)
131
+
132
+ #Collapse Breaks
133
+ while is_newline?( @markdown[-1] )
134
+ @markdown.delete_at(-1)
135
+ end
136
+
137
+
138
+ #Collapse Space Before the emphasis
139
+ @markdown.reverse!
140
+
141
+ @markdown.each_index do |index|
142
+
143
+ if @markdown[index].eql? '_' and not @markdown[index+1] =~ /\\$/
144
+
145
+ count = 1
146
+ while is_newline?(@markdown[index-count])
147
+ @markdown.delete_at(index-count)
148
+ end
149
+
150
+ @markdown[index-1].gsub!(/^\s+/,'')
151
+ end
152
+ end
153
+ @markdown.reverse!
154
+
155
+ @markdown[-1].gsub!(/\s+$/,'')
71
156
  @markdown << '_'
157
+
158
+ ###@markdown.gsub!(/((\[\[::HARD_BREAK::\]\])?(\s+)?)*_$/,'_')
72
159
  end
73
160
 
74
161
  def start_and_end_strong(attributes)
@@ -76,7 +163,8 @@ class Html2Md
76
163
  end
77
164
 
78
165
  def start_br(attributes)
79
- @markdown << " \n"
166
+ new_line
167
+ @markdown << "[[::HARD_BREAK::]]"
80
168
  end
81
169
 
82
170
  def end_br(attributes)
@@ -88,39 +176,44 @@ class Html2Md
88
176
  end
89
177
 
90
178
  def end_p(attributes)
91
- @markdown << "\n\n" unless @list_tree[-1]
179
+ new_line unless @list_tree[-1]
180
+ new_line unless @list_tree[-1]
92
181
  end
93
182
 
94
183
  def start_h1(attributes)
95
- @markdown << "\n"
184
+ new_line
96
185
  end
97
186
 
98
187
  def end_h1(attributes)
99
- @markdown << "\n"
188
+ new_line
100
189
  @last_cdata_length.times do
101
190
  @markdown << "="
102
191
  end
103
- @markdown << "\n\n"
192
+ new_line
193
+ new_line
104
194
  end
105
195
 
106
196
  def start_h2(attributes)
107
- @markdown << "\n"
197
+ new_line
108
198
  end
109
199
 
110
200
  def end_h2(attributes)
111
- @markdown << "\n"
201
+ new_line
112
202
  @last_cdata_length.times do
113
203
  @markdown << "-"
114
204
  end
115
- @markdown << "\n\n"
205
+ new_line
206
+ new_line
116
207
  end
117
208
 
118
209
  def start_h3(attributes)
119
- @markdown << "\n### "
210
+ new_line
211
+ @markdown << "### "
120
212
  end
121
213
 
122
214
  def end_h3(attributes)
123
- @markdown << "\n\n"
215
+ new_line
216
+ new_line
124
217
  end
125
218
 
126
219
  def start_a(attributes)
@@ -133,15 +226,22 @@ class Html2Md
133
226
  end
134
227
 
135
228
  def start_pre(attributes)
136
- @markdown << "\n```\n"
229
+ @pre_block = true;
230
+ new_line
231
+ @markdown << "```"
232
+ new_line
137
233
  end
138
234
 
139
235
  def end_pre(attributes)
140
- @markdown << "\n```\n"
236
+ @pre_block = false;
237
+ new_line
238
+ @markdown << "```"
239
+ new_line
141
240
  end
142
241
 
143
242
  def end_a(attributes)
144
- if @last_href and not (['http','https'].include? URI(@last_href).scheme)
243
+ begin
244
+ if @last_href and not (['http','https'].include? URI(URI.escape(@last_href)).scheme)
145
245
  begin
146
246
  rp = URI(relative_url)
147
247
  rp.path = @last_href
@@ -152,21 +252,24 @@ class Html2Md
152
252
 
153
253
  @markdown << "](#{@last_href})" if @last_href
154
254
  @last_href = nil if @last_href
255
+ rescue
256
+
257
+ end
155
258
 
156
259
  end
157
260
 
158
261
  def start_ul(attributes)
159
- @markdown << "\n" #if @list_tree[-1]
262
+ new_line
160
263
  @list_tree.push( { :type => :ul, :current_element => 0 } )
161
264
  end
162
265
 
163
266
  def end_ul(attributes)
164
267
  @list_tree.pop
165
- @markdown << "\n" unless @list_tree[-1]
268
+ new_line unless @list_tree[-1]
166
269
  end
167
270
 
168
271
  def start_ol(attributes)
169
- @markdown << "\n"# if @list_tree[-1]
272
+ new_line
170
273
  @list_tree.push( { :type => :ol, :current_element => 0 } )
171
274
  end
172
275
 
@@ -177,12 +280,21 @@ class Html2Md
177
280
 
178
281
  def start_li(attributes)
179
282
 
283
+ if /^(-|\d+.)\s+$/ =~ @markdown[-2]
284
+ @markdown.delete_at(-2)
285
+ @markdown.delete_at(-3)
286
+ end
287
+
288
+ @markdown[-2].gsub! /^\s+(-|\d+.)\s+$/,''
289
+ #Add Whitespace before the list item
180
290
  @list_tree.length.times do
181
291
  @markdown << " "
182
292
  end
183
293
 
294
+ #Increment the Current Element to start at one
184
295
  @list_tree[-1][:current_element] += 1
185
296
 
297
+
186
298
  case @list_tree[-1][:type]
187
299
  when :ol
188
300
  @markdown << "#{ @list_tree[-1][:current_element] }. "
@@ -193,19 +305,51 @@ class Html2Md
193
305
  end
194
306
 
195
307
  def end_li(attributes)
196
- @markdown << "\n" if @markdown[-1] != "\n" and @markdown[-1] != 10
308
+ new_line if @markdown[-1] != "\n" and @markdown[-1] != 10
197
309
  end
198
310
 
199
311
  def characters c
200
- @last_cdata_length = c.chomp.length
201
- if @list_tree[-1]
202
- @markdown << c.gsub(/\n(\s*)?/,"").lstrip
203
- else
204
- @markdown << c.gsub(/\n(\s*)?/,"")
312
+ #Escape character data with _
313
+ c.gsub!('_','\_') unless @pre_block
314
+
315
+ #Collapse all whitespace into spaces
316
+ c.gsub!(/(\s+|\n|\r\n|\t)/, " ")
317
+
318
+
319
+ if c.rstrip.lstrip.chomp != ""
320
+ if @list_tree[-1]
321
+
322
+ #Strip whitespace at the start of the character data
323
+ c.gsub!(/\A(\r|\n|\s|\t)/,'')
324
+
325
+ c.chomp!
326
+
327
+ @last_cdata_length = c.chomp.length
328
+
329
+ @markdown << c
330
+ else
331
+ @last_cdata_length = c.chomp.length
332
+ @markdown << c
333
+ end
205
334
  end
206
335
  end
207
336
 
208
337
  def end_document
338
+
339
+ @markdown = @markdown.join('')
340
+ #Replace All Ancor Links
341
+ @markdown.gsub!(/\[.*\]\(#.*\)/,'')
342
+
343
+ #Remove all extra space at the end of a line
344
+ @markdown.gsub!(/ +$/,'')
345
+
346
+ #Add Hard Breaks
347
+ @markdown.gsub!(/\[\[::HARD_BREAK::\]\]/," \n")
348
+
349
+ #Collapse Superfulious Hard Line Breaks
350
+ #@markdown.gsub!(/( \n+){1,}/," \n")
351
+
352
+ #Collapse Superfulious Line Breaks
209
353
  @markdown.gsub!(/\n{2,}/,"\n\n")
210
354
  end
211
355
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2md
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-03-23 00:00:00.000000000 Z
12
+ date: 2012-03-28 00:00:00.000000000 Z
13
13
  dependencies: []
14
14
  description: ! ' Converts Basic HTML to markdown
15
15