cpjolicoeur-ClothBlue 0.2.2 → 0.5.1
Sign up to get free protection for your applications and to get access to all the features.
- data/clothblue.gemspec +4 -4
- data/lib/clothblue.rb +698 -100
- data/lib/parsehtml/parsehtml.rb +452 -0
- metadata +4 -4
- data/README +0 -3
- data/lib/README.rdoc +0 -46
data/clothblue.gemspec
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = "ClothBlue"
|
3
|
-
s.version = "0.
|
4
|
-
s.date = "
|
3
|
+
s.version = "0.5.1"
|
4
|
+
s.date = "2008-11-25"
|
5
5
|
s.summary = "HTML to Markdown converter"
|
6
6
|
s.email = "cpjolicoeur@gmail.com"
|
7
7
|
s.homepage = "http://github.com/cpjolicoeur/clothblue"
|
8
8
|
s.description = "ClothBlue is BlueCloth's evil twin. It converts existing HTML into Markdown format for use with BlueCloth."
|
9
9
|
s.has_rdoc = true
|
10
10
|
s.authors = ["Craig P Jolicoeur"]
|
11
|
-
s.files = ["README", "TODO", "clothblue.gemspec", "lib/clothblue.rb", "lib/
|
11
|
+
s.files = ["README", "TODO", "clothblue.gemspec", "lib/clothblue.rb", "lib/parsehtml/parsehtml.rb", "test/README", "test/test_entities.rb", "test/test_formatting.rb", "test/test_headings.rb", "test/test_lists.rb", "test/test_structure.rb", "test/test_tables.rb"]
|
12
12
|
s.test_files = ["test/test_entities.rb", "test/test_formatting.rb", "test/test_headings.rb", "test/test_lists.rb", "test/test_structure.rb", "test/test_tables.rb"]
|
13
13
|
s.rdoc_options = ["--main", "lib/README.rdoc"]
|
14
|
-
end
|
14
|
+
end
|
data/lib/clothblue.rb
CHANGED
@@ -1,124 +1,722 @@
|
|
1
|
-
|
2
|
-
Provides the methods to convert HTML into Markdown.
|
3
|
-
*Please* *note*: ClothBlue creates UTF-8 output. To do so, it sets $KCODE to UTF-8. This will be globally available!
|
4
|
-
#--
|
5
|
-
TODO: enhance docs, as more methods become available
|
6
|
-
#++
|
1
|
+
require 'parsehtml/parsehtml'
|
7
2
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
=
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
3
|
+
class ClothBlue
|
4
|
+
|
5
|
+
# Constants
|
6
|
+
LINKS_EACH_PARAGRAPH = false
|
7
|
+
BODYWIDTH = false
|
8
|
+
KEEPHTML = true
|
9
|
+
MIN_BODYWIDTH = 25
|
10
|
+
|
11
|
+
# tags which can be handled by markdown
|
12
|
+
IS_MARKDOWNABLE = {
|
13
|
+
'p' => [],
|
14
|
+
'ul' => [],
|
15
|
+
'ol' => [],
|
16
|
+
'li' => [],
|
17
|
+
'br' => [],
|
18
|
+
'blockquote' => [],
|
19
|
+
'code' => [],
|
20
|
+
'pre' => [],
|
21
|
+
'a' => [{'href' => 'required'}, {'title' => 'optional'}],
|
22
|
+
'strong' => [],
|
23
|
+
'b' => [],
|
24
|
+
'em' => [],
|
25
|
+
'i' => [],
|
26
|
+
'img' => [{'src' => 'required'}, {'alt' => 'optional'}, {'title' => 'optional'}],
|
27
|
+
'h1' => [],
|
28
|
+
'h2' => [],
|
29
|
+
'h3' => [],
|
30
|
+
'h4' => [],
|
31
|
+
'h5' => [],
|
32
|
+
'h6' => [],
|
33
|
+
'hr' => []
|
34
|
+
}
|
35
|
+
|
36
|
+
# html tags to be ignored (content will be parsed)
|
37
|
+
IGNORE = %w(html body)
|
38
|
+
|
39
|
+
# html tags to be dropped (content will not be parsed!)
|
40
|
+
DROP = %w(script head style form area object param iframe)
|
40
41
|
|
41
|
-
|
42
|
-
|
42
|
+
# Markdown indents which could be wrapped
|
43
|
+
WRAPPABLE_INDENTS = [
|
44
|
+
'\* ', # ul
|
45
|
+
'\d. ', # ol
|
46
|
+
'\d\d. ', # ol
|
47
|
+
'> ', # blockquote
|
48
|
+
'' # p
|
43
49
|
]
|
44
|
-
|
45
|
-
|
46
|
-
|
50
|
+
|
51
|
+
# list of chars which have to be escaped in normal text
|
52
|
+
# TODO: what's with block chars/ sequences at the beginning of a block?
|
53
|
+
ESCAPE_IN_TEXT = [
|
54
|
+
{'([-*_])([ ]{0,2}\1){2,}' => '\\\\$0|'}, # hr
|
55
|
+
{'\*\*([^*\s]+)\*\*' => '\*\*$1\*\*'}, # strong
|
56
|
+
{'\*([^*\s]+)\*' => '\*$1\*'}, # em
|
57
|
+
{'__(?! |_)(.+)(?!<_| )__' => '\_\_$1\_\_'}, # em
|
58
|
+
{'_(?! |_)(.+)(?!<_| )_' => '\_$1\_'}, # em
|
59
|
+
{'`(.+)`' => '\`$1\`'}, # code
|
60
|
+
{'\[(.+)\](\s*\()' => '\[$1\]$2'}, # links: [text] (url) => [text\] (url)
|
61
|
+
{'\[(.+)\](\s*)\[(.*)\]' => '\[$1\]$2\[$3\]'}, # links: [text][id] => [text\][id\]
|
47
62
|
]
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
63
|
+
|
64
|
+
# parseHTML parser
|
65
|
+
attr_accessor :parser
|
66
|
+
|
67
|
+
# markdown output
|
68
|
+
attr_reader :output
|
69
|
+
|
70
|
+
# stack with tags which were not converted to markdown
|
71
|
+
attr_reader :not_converted
|
72
|
+
|
73
|
+
# skip conversion to markdown
|
74
|
+
attr_reader :skip_conversion
|
75
|
+
|
76
|
+
# keep html tags which cannot be converted to markdown
|
77
|
+
attr_reader :keep_html
|
78
|
+
|
79
|
+
# wrap output, set to 0 to skip wrapping
|
80
|
+
attr_reader :body_width
|
81
|
+
|
82
|
+
# whether last processed node was a block tag or not
|
83
|
+
@last_was_block_tag = false
|
84
|
+
attr_reader :last_was_block_tag
|
85
|
+
|
86
|
+
# name of last closed tag
|
87
|
+
@last_closed_tag = ''
|
88
|
+
attr_reader :last_closed_tag
|
89
|
+
|
90
|
+
# list of chars which have to be escaped in normal text
|
91
|
+
@escape_in_text = {}
|
92
|
+
attr_reader :escape_in_text
|
93
|
+
|
94
|
+
# number of linebreaks before next inline output
|
95
|
+
@linebreaks = 0
|
96
|
+
attr_accessor :linebreaks
|
97
|
+
|
98
|
+
# stores current buffer
|
99
|
+
@buffer = []
|
100
|
+
attr_accessor :buffer
|
101
|
+
|
102
|
+
# current indentation
|
103
|
+
@indent = ''
|
104
|
+
attr_accessor :indent
|
105
|
+
|
106
|
+
# node stack, e.g. for <a> and <abbr> tags
|
107
|
+
@stack = {}
|
108
|
+
attr_accessor :stack
|
109
|
+
|
110
|
+
# Constructor
|
111
|
+
def initialize(text = '', links_after_each_paragraph = LINKS_EACH_PARAGRAPH, body_width = BODYWIDTH, keep_html = KEEPHTML)
|
112
|
+
@links_after_each_paragraph = links_after_each_paragraph
|
113
|
+
@keep_html = keep_html
|
114
|
+
@body_width = (body_width > MIN_BODYWIDTH) ? body_width.to_i : MIN_BODYWIDTH
|
115
|
+
|
116
|
+
@parser = HTML::Tokenizer.new(text)
|
117
|
+
|
118
|
+
@output = ''
|
119
|
+
@not_converted = []
|
120
|
+
@skip_conversion = false
|
121
|
+
|
122
|
+
@search, @replace = [], []
|
123
|
+
ESCAPE_IN_TEXT.each do |s,r|
|
124
|
+
@search << '/(?<!\\\)/' + s + '/U'
|
125
|
+
@replace << r
|
126
|
+
end
|
127
|
+
|
128
|
+
@escape_in_text = {'search' => @search, 'replace' => @replace}
|
52
129
|
end
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
structure(@workingcopy)
|
60
|
-
text_formatting(@workingcopy)
|
61
|
-
lists(@workingcopy)
|
62
|
-
entities(@workingcopy)
|
63
|
-
tables(@workingcopy)
|
64
|
-
@workingcopy = CGI::unescapeHTML(@workingcopy)
|
65
|
-
@workingcopy
|
130
|
+
|
131
|
+
# parse an HTML string
|
132
|
+
def parse_string
|
133
|
+
# @parser.html = html ## -> if we passed it in
|
134
|
+
parse
|
135
|
+
return @output
|
66
136
|
end
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
137
|
+
|
138
|
+
# iterate through the nodes and decide what to do with the current node
|
139
|
+
def parse
|
140
|
+
@output = ''
|
141
|
+
# drop tags that are in the DROP list
|
142
|
+
# TODO: implement dropping of @drop tags
|
143
|
+
|
144
|
+
while token = @parser.next_node
|
145
|
+
case @parser.node_type
|
146
|
+
when 'doctype', 'pi', 'comment'
|
147
|
+
if (@keep_html)
|
148
|
+
flush_linebreaks
|
149
|
+
out(@parser.node)
|
150
|
+
set_linebreaks(2)
|
151
|
+
end
|
152
|
+
when 'text'
|
153
|
+
handle_text
|
154
|
+
when 'tag'
|
155
|
+
next if IGNORE.include?(@parser.tag_name)
|
156
|
+
flush_linebreaks if (@parser.is_start_tag)
|
157
|
+
if (@skip_conversion)
|
158
|
+
is_markdownable # update notConverted
|
159
|
+
handle_tag_to_text
|
160
|
+
next
|
161
|
+
end
|
162
|
+
@parser.html = @parser.html.lstrip if (!@parser.keep_whitespace && @parser.is_block_element && @parser.is_start_tag)
|
163
|
+
if (is_markdownable)
|
164
|
+
if (@parser.is_block_element && @parser.is_start_tag && !@last_was_block_tag && !@output.empty?)
|
165
|
+
if (!@buffer.empty?)
|
166
|
+
str = @buffer[@buffer.size - 1]
|
167
|
+
else
|
168
|
+
str = @output
|
169
|
+
end
|
170
|
+
if (str.slice((@indent.size - 1) * -1) != "\n#{@indent}")
|
171
|
+
str << "\n" + @indent
|
172
|
+
end
|
173
|
+
func = "handle_tag_#{@parser.tag_name}"
|
174
|
+
self.send(func)
|
175
|
+
|
176
|
+
if (@links_after_each_paragraph && @parser.is_block_element && !@parser.is_start_tag)
|
177
|
+
flush_stacked
|
178
|
+
end
|
179
|
+
if(!@parser.is_start_tag)
|
180
|
+
@last_closed_tag = @parser.tag_name
|
181
|
+
end
|
182
|
+
end
|
183
|
+
else
|
184
|
+
handle_tag_to_text
|
185
|
+
@last_closed_tag = ''
|
186
|
+
end
|
187
|
+
else
|
188
|
+
# TODO: trigger error for invalid node type
|
189
|
+
end # end case
|
190
|
+
|
191
|
+
@last_was_block_tag = (@parser.node_type == 'tag' && @parser.is_start_tag && @parser.is_block_element)
|
192
|
+
end # end while
|
193
|
+
|
194
|
+
### cleanup
|
195
|
+
tmp = @output.gsub('>', '>')
|
196
|
+
tmp = tmp.gsub('&', '&')
|
197
|
+
@output = tmp.rstrip
|
198
|
+
# end parsing, flush stacked tags
|
199
|
+
flush_stacked
|
200
|
+
@stack = {}
|
201
|
+
<<<<<<< HEAD:lib/clothblue_rewrite.rb
|
202
|
+
=======
|
203
|
+
end
|
204
|
+
|
205
|
+
# check if current tag can be converted to Markdown
|
206
|
+
def is_markdownable
|
207
|
+
return false unless (IS_MARKDOWNABLE.include?(@parser.tag_name))
|
208
|
+
|
209
|
+
if (@parser.is_start_tag)
|
210
|
+
ret = true
|
211
|
+
if (@keep_html)
|
212
|
+
diff = @parser.tag_attributes.reject { |a| @parser.tag_name.include?(a) }
|
213
|
+
ret = false unless diff.empty? # non markdownable attributes given
|
214
|
+
end
|
215
|
+
if (ret)
|
216
|
+
IS_MARKDOWNABLE.each do |attr, type|
|
217
|
+
if ((type == 'required') && @parser.tag_attributes[attr].nil?)
|
218
|
+
# required Markdown attribute not given
|
219
|
+
ret = false
|
220
|
+
break
|
221
|
+
end
|
222
|
+
end
|
223
|
+
end
|
224
|
+
unless (ret)
|
225
|
+
@not_converted << (@parser.tag_name + '::' + @parser.open_tags.join('/'))
|
226
|
+
end
|
227
|
+
return ret
|
228
|
+
else
|
229
|
+
if (!@not_converted.empty? && (@not_converted.last == (@parser.tag_name + '::' + @parser.open_tags.join('/'))))
|
230
|
+
@not_converted.pop
|
231
|
+
return false
|
232
|
+
end
|
233
|
+
return true
|
75
234
|
end
|
76
|
-
|
235
|
+
end
|
236
|
+
|
237
|
+
# flush enqueued linebreaks
|
238
|
+
def flush_linebreaks
|
239
|
+
if ((@linebreaks > 0) && !@output.empty?)
|
240
|
+
out("\n" * @linebreaks, true)
|
241
|
+
end
|
242
|
+
@linebreaks = 0
|
243
|
+
end
|
244
|
+
|
245
|
+
# output all stacked tags
|
246
|
+
def flush_stacked
|
247
|
+
# # links
|
248
|
+
# foreach ($this->stack as $tag => $a) {
|
249
|
+
# if (!empty($a)) {
|
250
|
+
# call_user_func(array(&$this, 'flushStacked_'.$tag));
|
251
|
+
# }
|
252
|
+
# }
|
77
253
|
end
|
78
254
|
|
79
|
-
|
80
|
-
def
|
81
|
-
|
82
|
-
|
255
|
+
# set number of line breaks before next start tag
|
256
|
+
def set_linebreaks(number)
|
257
|
+
@linebreaks = number if (@linebreaks < number)
|
258
|
+
end
|
259
|
+
|
260
|
+
# append string to the correct var, either directly to
|
261
|
+
# @output or to the current buffers
|
262
|
+
def out(put = '', nowrap = false)
|
263
|
+
return if put.empty?
|
264
|
+
|
265
|
+
if (!@buffer.empty?)
|
266
|
+
@buffer.last << put
|
267
|
+
else
|
268
|
+
if ((@body_width > 0) && !@parser.keep_whitespace) # wrap lines
|
269
|
+
# get last line
|
270
|
+
pos = @output.index("\n")
|
271
|
+
line = pos ? @output.slice(pos, @output.size - pos) : @output
|
272
|
+
end
|
273
|
+
|
274
|
+
if (nowrap)
|
275
|
+
if ((put[0,1] != "\n") && (line.size + put.size) > @body_width)
|
276
|
+
@output << "\n#{indent(put)}"
|
277
|
+
else
|
278
|
+
@output << put
|
279
|
+
end
|
280
|
+
return
|
281
|
+
else
|
282
|
+
put << "\n" # make sure we get all lines in the while below
|
283
|
+
line_len = line.size
|
284
|
+
while (pos = put.index("\n"))
|
285
|
+
put_line = put.slice(1, pos+1)
|
286
|
+
put_len = put_line.size
|
287
|
+
put = put.slice(pos+1, put.size - pos)
|
288
|
+
if (line_len + put_len < @body_width)
|
289
|
+
@output << put_line
|
290
|
+
line_len = put_len
|
291
|
+
else
|
292
|
+
# $split = preg_split('#^(.{0,'.($this->bodyWidth - $lineLen).'})\b#', $putLine, 2, PREG_SPLIT_OFFSET_CAPTURE | PREG_SPLIT_DELIM_CAPTURE);
|
293
|
+
# $this->output .= rtrim($split[1][0])."\n".$this->indent.$this->wordwrap(ltrim($split[2][0]), $this->bodyWidth, "\n".$this->indent, false);
|
294
|
+
end
|
295
|
+
end # end while
|
296
|
+
end
|
297
|
+
@output = @output(0, -1)
|
298
|
+
return
|
299
|
+
else
|
300
|
+
@output << put
|
83
301
|
end
|
84
|
-
|
302
|
+
>>>>>>> b6201584759afcc6f24a557ef9312597bd63f98f:lib/clothblue_rewrite.rb
|
85
303
|
end
|
86
304
|
|
305
|
+
<<<<<<< HEAD:lib/clothblue_rewrite.rb
|
306
|
+
# check if current tag can be converted to Markdown
|
307
|
+
def is_markdownable
|
308
|
+
return false unless (IS_MARKDOWNABLE.include?(@parser.tag_name))
|
309
|
+
=======
|
310
|
+
# indent next output (start tag) or unindent (end tag)
|
311
|
+
def indent(str, output = true)
|
312
|
+
if (@parser.is_start_tag)
|
313
|
+
@indent << str
|
314
|
+
out(str, true) if @output
|
315
|
+
else
|
316
|
+
@indent = @indent.slice(0, (str.size * -1))
|
317
|
+
end
|
318
|
+
end
|
319
|
+
|
320
|
+
# handle plain text
|
321
|
+
def handle_text
|
322
|
+
if (has_parent('pre') && @parser.node.index("\n"))
|
323
|
+
@parser.node.gsub!("\n", "\n#{@indent}")
|
324
|
+
end
|
325
|
+
if (!has_parent('code') && !has_parent('pre'))
|
326
|
+
# entity decode
|
327
|
+
decode(@parser.node)
|
328
|
+
if (!@skip_conversion)
|
329
|
+
# escape some chars in normal text
|
330
|
+
@parser.node.gsub!(@escape_in_text['search'], @escape_in_text['replace'])
|
331
|
+
end
|
332
|
+
else
|
333
|
+
@parser.node.gsub!(['"', '&apos'], ['"', '\''])
|
334
|
+
end
|
335
|
+
out(@parser.node)
|
336
|
+
@last_closed_tag = ''
|
337
|
+
end
|
87
338
|
|
88
|
-
|
89
|
-
|
90
|
-
|
339
|
+
# handle non-Markdownable tags
|
340
|
+
def handle_tag_to_text
|
341
|
+
if (!@keep_html)
|
342
|
+
set_linebreaks(2) if (!@parser.is_start_tag && @parser.is_block_element)
|
343
|
+
else
|
344
|
+
# don't convert to markdown inside this tag
|
345
|
+
# TODO: markdown extra
|
346
|
+
if (!@parser.is_empty_tag)
|
347
|
+
if (@parser.is_start_tag)
|
348
|
+
unless (@skip_conversion)
|
349
|
+
@skip_conversion = @parser.tag_name + '::' + @parser.open_tags.join('/')
|
350
|
+
end
|
351
|
+
else
|
352
|
+
if (@skip_conversion == (@parser.tag_name + '::' + @parser.open_tags.join('/'))
|
353
|
+
@skip_conversion = false
|
354
|
+
end
|
355
|
+
end
|
356
|
+
end # end !@parser.is_empty_tag
|
357
|
+
|
358
|
+
if (@parser.is_block_element)
|
359
|
+
if (@parser.is_start_tag)
|
360
|
+
if (%w(ins del).include?(parent))
|
361
|
+
# looks like ins or del are block elements now
|
362
|
+
out("\n", true)
|
363
|
+
indent(' ')
|
364
|
+
end
|
365
|
+
if (@parser.tag_name != 'pre')
|
366
|
+
out(@parser.node + "\n" + @indent)
|
367
|
+
@parser.is_empty_tag ? set_linebreaks(1) : indent(' ')
|
368
|
+
@parser.html = @parser.html.lstrip
|
369
|
+
else
|
370
|
+
# don't indent inside <pre> tags
|
371
|
+
out(@parser.node)
|
372
|
+
@static_indent = @indent
|
373
|
+
@indent = ''
|
374
|
+
end
|
375
|
+
else
|
376
|
+
@output = rstrip(@output) unless @parser.keep_whitespace
|
377
|
+
if (@parser.tag_name != 'pre')
|
378
|
+
indent(' ')
|
379
|
+
out("\n" + @indent + @parser.node)
|
380
|
+
else
|
381
|
+
# reset indentation
|
382
|
+
out(@parser.node)
|
383
|
+
@indent = @static_indent
|
384
|
+
end
|
385
|
+
|
386
|
+
if (%w(ins del).include?(parent))
|
387
|
+
# ins or del was block element
|
388
|
+
out("\n")
|
389
|
+
indent(' ')
|
390
|
+
end
|
391
|
+
|
392
|
+
@parser.tag_name == 'li' ? set_linebreaks(1) : set_linebreaks(2)
|
393
|
+
end
|
394
|
+
else
|
395
|
+
out(@parser.node)
|
396
|
+
end
|
397
|
+
|
398
|
+
if (%w(code pre).include?(@parser.tag_name))
|
399
|
+
if (@parser.is_start_tag)
|
400
|
+
buffer
|
401
|
+
else
|
402
|
+
# add stuff so cleanup just reverses this
|
403
|
+
tmp = unbugger.gsub('>', '&gt;')
|
404
|
+
out(tmp.gsub('<', '&lt;'))
|
405
|
+
end
|
406
|
+
end
|
91
407
|
end
|
92
|
-
text
|
93
408
|
end
|
94
|
-
|
95
|
-
|
96
|
-
def
|
97
|
-
|
98
|
-
|
409
|
+
|
410
|
+
# get tag name of direct parent tag
|
411
|
+
def parent
|
412
|
+
@parser.open_tags.last
|
413
|
+
end
|
414
|
+
|
415
|
+
# check if current node has a tag as parent (somewhere, not just the direct parent)
|
416
|
+
def has_parent(tag)
|
417
|
+
@parser.open_tags.include?(tag)
|
418
|
+
end
|
419
|
+
|
420
|
+
# add current node to the stack (this only stores the attributes)
|
421
|
+
def stack
|
422
|
+
@stack[@parser.tag_name] = [] if (@stack[@parser.tag_name].nil?)
|
423
|
+
@stack[@parser.tag_name] << @parser.tag_attributes
|
424
|
+
end
|
425
|
+
|
426
|
+
# remove current tag from stack
|
427
|
+
def unstack
|
428
|
+
if (@stack[@parser.tag_name].nil? || !@stack[@parser.tag_name].is_a?(Array))
|
429
|
+
# TODO: trigger an error
|
430
|
+
raise "somebody set us up the bomb"
|
99
431
|
end
|
100
|
-
|
432
|
+
@stack[@parser.tag_name].pop
|
101
433
|
end
|
102
|
-
|
103
|
-
|
104
|
-
def
|
105
|
-
|
106
|
-
|
434
|
+
|
435
|
+
# get last stacked element of type tag
|
436
|
+
def get_stacked(tag)
|
437
|
+
@stack[tag][@stack[tag].size-1]
|
438
|
+
end
|
439
|
+
|
440
|
+
# buffer next parser output until unbuffer is called
|
441
|
+
def buffer
|
442
|
+
@buffer << ''
|
443
|
+
end
|
444
|
+
|
445
|
+
# end current buffer and return buffered output
|
446
|
+
def unbuffer
|
447
|
+
@buffer.pop
|
448
|
+
end
|
449
|
+
|
450
|
+
# wordwrap for utf8 encoded strings
|
451
|
+
def wordwrap(str, width, brk, cut = false)
|
452
|
+
>>>>>>> b6201584759afcc6f24a557ef9312597bd63f98f:lib/clothblue_rewrite.rb
|
453
|
+
|
454
|
+
if (@parser.is_start_tag)
|
455
|
+
ret = true
|
456
|
+
if (@keep_html)
|
457
|
+
diff = @parser.tag_attributes.reject { |a| @parser.tag_name.include?(a) }
|
458
|
+
ret = false unless diff.empty? # non markdownable attributes given
|
459
|
+
end
|
460
|
+
if (ret)
|
461
|
+
IS_MARKDOWNABLE.each do |attr, type|
|
462
|
+
if ((type == 'required') && @parser.tag_attributes[attr].nil?)
|
463
|
+
# required Markdown attribute not given
|
464
|
+
ret = false
|
465
|
+
break
|
466
|
+
end
|
467
|
+
end
|
468
|
+
end
|
469
|
+
unless (ret)
|
470
|
+
@not_converted << (@parser.tag_name + '::' + @parser.open_tags.join('/'))
|
471
|
+
end
|
472
|
+
return ret
|
473
|
+
else
|
474
|
+
if (!@not_converted.empty? && (@not_converted.last == (@parser.tag_name + '::' + @parser.open_tags.join('/'))))
|
475
|
+
@not_converted.pop
|
476
|
+
return false
|
477
|
+
end
|
478
|
+
return true
|
107
479
|
end
|
108
|
-
text
|
109
480
|
end
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
481
|
+
|
482
|
+
<<<<<<< HEAD:lib/clothblue_rewrite.rb
|
483
|
+
# flush enqueued linebreaks
|
484
|
+
def flush_linebreaks
|
485
|
+
if ((@linebreaks > 0) && !@output.empty?)
|
486
|
+
out("\n" * @linebreaks, true)
|
114
487
|
end
|
115
|
-
|
488
|
+
@linebreaks = 0
|
489
|
+
end
|
490
|
+
|
491
|
+
# output all stacked tags
|
492
|
+
def flush_stacked
|
493
|
+
# # links
|
494
|
+
# foreach ($this->stack as $tag => $a) {
|
495
|
+
# if (!empty($a)) {
|
496
|
+
# call_user_func(array(&$this, 'flushStacked_'.$tag));
|
497
|
+
# }
|
498
|
+
# }
|
116
499
|
end
|
117
500
|
|
501
|
+
# set number of line breaks before next start tag
|
502
|
+
def set_linebreaks(number)
|
503
|
+
@linebreaks = number if (@linebreaks < number)
|
504
|
+
end
|
505
|
+
|
506
|
+
# append string to the correct var, either directly to
|
507
|
+
# @output or to the current buffers
|
508
|
+
def out(put = '', nowrap = false)
|
509
|
+
return if put.empty?
|
510
|
+
|
511
|
+
if (!@buffer.empty?)
|
512
|
+
@buffer.last << put
|
513
|
+
else
|
514
|
+
if ((@body_width > 0) && !@parser.keep_whitespace) # wrap lines
|
515
|
+
# get last line
|
516
|
+
pos = @output.index("\n")
|
517
|
+
line = pos ? @output.slice(pos, @output.size - pos) : @output
|
518
|
+
end
|
519
|
+
|
520
|
+
if (nowrap)
|
521
|
+
if ((put[0,1] != "\n") && (line.size + put.size) > @body_width)
|
522
|
+
@output << "\n#{indent(put)}"
|
523
|
+
else
|
524
|
+
@output << put
|
525
|
+
end
|
526
|
+
return
|
527
|
+
else
|
528
|
+
put << "\n" # make sure we get all lines in the while below
|
529
|
+
line_len = line.size
|
530
|
+
while (pos = put.index("\n"))
|
531
|
+
put_line = put.slice(1, pos+1)
|
532
|
+
put_len = put_line.size
|
533
|
+
put = put.slice(pos+1, put.size - pos)
|
534
|
+
if (line_len + put_len < @body_width)
|
535
|
+
@output << put_line
|
536
|
+
line_len = put_len
|
537
|
+
else
|
538
|
+
# $split = preg_split('#^(.{0,'.($this->bodyWidth - $lineLen).'})\b#', $putLine, 2, PREG_SPLIT_OFFSET_CAPTURE | PREG_SPLIT_DELIM_CAPTURE);
|
539
|
+
# $this->output .= rtrim($split[1][0])."\n".$this->indent.$this->wordwrap(ltrim($split[2][0]), $this->bodyWidth, "\n".$this->indent, false);
|
540
|
+
end
|
541
|
+
end # end while
|
542
|
+
end
|
543
|
+
@output = @output(0, -1)
|
544
|
+
return
|
545
|
+
else
|
546
|
+
@output << put
|
547
|
+
end
|
548
|
+
end
|
549
|
+
|
550
|
+
# indent next output (start tag) or unindent (end tag)
|
551
|
+
def indent(str, output = true)
|
552
|
+
if (@parser.is_start_tag)
|
553
|
+
@indent << str
|
554
|
+
out(str, true) if @output
|
555
|
+
else
|
556
|
+
@indent = @indent.slice(0, (str.size * -1))
|
557
|
+
end
|
558
|
+
end
|
559
|
+
|
560
|
+
# handle plain text
|
561
|
+
def handle_text
|
562
|
+
if (has_parent('pre') && @parser.node.index("\n"))
|
563
|
+
@parser.node.gsub!("\n", "\n#{@indent}")
|
564
|
+
end
|
565
|
+
if (!has_parent('code') && !has_parent('pre'))
|
566
|
+
# entity decode
|
567
|
+
decode(@parser.node)
|
568
|
+
if (!@skip_conversion)
|
569
|
+
# escape some chars in normal text
|
570
|
+
@parser.node.gsub!(@escape_in_text['search'], @escape_in_text['replace'])
|
571
|
+
end
|
572
|
+
else
|
573
|
+
@parser.node.gsub!(['"', '&apos'], ['"', '\''])
|
574
|
+
end
|
575
|
+
out(@parser.node)
|
576
|
+
@last_closed_tag = ''
|
577
|
+
end
|
578
|
+
|
579
|
+
# handle non-Markdownable tags
|
580
|
+
def handle_tag_to_text
|
581
|
+
if (!@keep_html)
|
582
|
+
set_linebreaks(2) if (!@parser.is_start_tag && @parser.is_block_element)
|
583
|
+
else
|
584
|
+
# don't convert to markdown inside this tag
|
585
|
+
# TODO: markdown extra
|
586
|
+
if (!@parser.is_empty_tag)
|
587
|
+
if (@parser.is_start_tag)
|
588
|
+
unless (@skip_conversion)
|
589
|
+
@skip_conversion = @parser.tag_name + '::' + @parser.open_tags.join('/')
|
590
|
+
end
|
591
|
+
else
|
592
|
+
if (@skip_conversion == (@parser.tag_name + '::' + @parser.open_tags.join('/'))
|
593
|
+
@skip_conversion = false
|
594
|
+
end
|
595
|
+
end
|
596
|
+
end # end !@parser.is_empty_tag
|
597
|
+
|
598
|
+
if (@parser.is_block_element)
|
599
|
+
if (@parser.is_start_tag)
|
600
|
+
if (%w(ins del).include?(parent))
|
601
|
+
# looks like ins or del are block elements now
|
602
|
+
out("\n", true)
|
603
|
+
indent(' ')
|
604
|
+
end
|
605
|
+
if (@parser.tag_name != 'pre')
|
606
|
+
out(@parser.node + "\n" + @indent)
|
607
|
+
@parser.is_empty_tag ? set_linebreaks(1) : indent(' ')
|
608
|
+
@parser.html = @parser.html.lstrip
|
609
|
+
else
|
610
|
+
# don't indent inside <pre> tags
|
611
|
+
out(@parser.node)
|
612
|
+
@static_indent = @indent
|
613
|
+
@indent = ''
|
614
|
+
end
|
615
|
+
else
|
616
|
+
@output = rstrip(@output) unless @parser.keep_whitespace
|
617
|
+
if (@parser.tag_name != 'pre')
|
618
|
+
indent(' ')
|
619
|
+
out("\n" + @indent + @parser.node)
|
620
|
+
else
|
621
|
+
# reset indentation
|
622
|
+
out(@parser.node)
|
623
|
+
@indent = @static_indent
|
624
|
+
end
|
625
|
+
|
626
|
+
if (%w(ins del).include?(parent))
|
627
|
+
# ins or del was block element
|
628
|
+
out("\n")
|
629
|
+
indent(' ')
|
630
|
+
end
|
631
|
+
|
632
|
+
@parser.tag_name == 'li' ? set_linebreaks(1) : set_linebreaks(2)
|
633
|
+
end
|
634
|
+
else
|
635
|
+
out(@parser.node)
|
636
|
+
end
|
637
|
+
|
638
|
+
if (%w(code pre).include?(@parser.tag_name))
|
639
|
+
if (@parser.is_start_tag)
|
640
|
+
buffer
|
641
|
+
else
|
642
|
+
# add stuff so cleanup just reverses this
|
643
|
+
tmp = unbugger.gsub('>', '&gt;')
|
644
|
+
out(tmp.gsub('<', '&lt;'))
|
645
|
+
end
|
646
|
+
end
|
647
|
+
end
|
648
|
+
end
|
649
|
+
|
650
|
+
# get tag name of direct parent tag
|
651
|
+
def parent
|
652
|
+
@parser.open_tags.last
|
653
|
+
end
|
654
|
+
|
655
|
+
# check if current node has a tag as parent (somewhere, not just the direct parent)
|
656
|
+
def has_parent(tag)
|
657
|
+
@parser.open_tags.include?(tag)
|
658
|
+
end
|
659
|
+
|
660
|
+
# add current node to the stack (this only stores the attributes)
|
661
|
+
def stack
|
662
|
+
@stack[@parser.tag_name] = [] if (@stack[@parser.tag_name].nil?)
|
663
|
+
@stack[@parser.tag_name] << @parser.tag_attributes
|
664
|
+
end
|
665
|
+
|
666
|
+
# remove current tag from stack
|
667
|
+
def unstack
|
668
|
+
if (@stack[@parser.tag_name].nil? || !@stack[@parser.tag_name].is_a?(Array))
|
669
|
+
# TODO: trigger an error
|
670
|
+
raise "somebody set us up the bomb"
|
671
|
+
end
|
672
|
+
@stack[@parser.tag_name].pop
|
673
|
+
end
|
674
|
+
|
675
|
+
# get last stacked element of type tag
|
676
|
+
def get_stacked(tag)
|
677
|
+
@stack[tag][@stack[tag].size-1]
|
678
|
+
end
|
679
|
+
|
680
|
+
# buffer next parser output until unbuffer is called
|
681
|
+
def buffer
|
682
|
+
@buffer << ''
|
683
|
+
end
|
684
|
+
|
685
|
+
# end current buffer and return buffered output
|
686
|
+
def unbuffer
|
687
|
+
@buffer.pop
|
688
|
+
end
|
689
|
+
|
690
|
+
# wordwrap for utf8 encoded strings
|
691
|
+
def wordwrap(str, width, brk, cut = false)
|
692
|
+
# TODO: implement wordwrap for utf8 code
|
693
|
+
end
|
694
|
+
|
695
|
+
=======
|
696
|
+
>>>>>>> b6201584759afcc6f24a557ef9312597bd63f98f:lib/clothblue_rewrite.rb
|
697
|
+
# decode email address
|
698
|
+
def decode(text, quoted_style = '')
|
699
|
+
# TODO: implement decode method
|
118
700
|
|
119
|
-
|
120
|
-
#
|
121
|
-
|
701
|
+
# @author derernst@gmx.ch <http://www.php.net/manual/en/function.html-entity-decode.php#68536>
|
702
|
+
# @author Milian Wolff <http://milianw.de>
|
703
|
+
# if (version_compare(PHP_VERSION, '5', '>=')) {
|
704
|
+
# # UTF-8 is only supported in PHP 5.x.x and above
|
705
|
+
# $text = html_entity_decode($text, $quote_style, 'UTF-8');
|
706
|
+
# } else {
|
707
|
+
# if (function_exists('html_entity_decode')) {
|
708
|
+
# $text = html_entity_decode($text, $quote_style, 'ISO-8859-1');
|
709
|
+
# } else {
|
710
|
+
# static $trans_tbl;
|
711
|
+
# if (!isset($trans_tbl)) {
|
712
|
+
# $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, $quote_style));
|
713
|
+
# }
|
714
|
+
# $text = strtr($text, $trans_tbl);
|
715
|
+
# }
|
716
|
+
# $text = preg_replace_callback('~&#x([0-9a-f]+);~i', array(&$this, '_decode_hex'), $text);
|
717
|
+
# $text = preg_replace_callback('~&#(\d{2,5});~', array(&$this, '_decode_numeric'), $text);
|
718
|
+
# }
|
719
|
+
# return $text;
|
122
720
|
end
|
123
|
-
|
124
|
-
end
|
721
|
+
|
722
|
+
end
|