cpjolicoeur-ClothBlue 0.2.2 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/clothblue.gemspec +4 -4
- data/lib/clothblue.rb +698 -100
- data/lib/parsehtml/parsehtml.rb +452 -0
- metadata +4 -4
- data/README +0 -3
- data/lib/README.rdoc +0 -46
data/clothblue.gemspec
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
Gem::Specification.new do |s|
|
|
2
2
|
s.name = "ClothBlue"
|
|
3
|
-
s.version = "0.
|
|
4
|
-
s.date = "
|
|
3
|
+
s.version = "0.5.1"
|
|
4
|
+
s.date = "2008-11-25"
|
|
5
5
|
s.summary = "HTML to Markdown converter"
|
|
6
6
|
s.email = "cpjolicoeur@gmail.com"
|
|
7
7
|
s.homepage = "http://github.com/cpjolicoeur/clothblue"
|
|
8
8
|
s.description = "ClothBlue is BlueCloth's evil twin. It converts existing HTML into Markdown format for use with BlueCloth."
|
|
9
9
|
s.has_rdoc = true
|
|
10
10
|
s.authors = ["Craig P Jolicoeur"]
|
|
11
|
-
s.files = ["README", "TODO", "clothblue.gemspec", "lib/clothblue.rb", "lib/
|
|
11
|
+
s.files = ["README", "TODO", "clothblue.gemspec", "lib/clothblue.rb", "lib/parsehtml/parsehtml.rb", "test/README", "test/test_entities.rb", "test/test_formatting.rb", "test/test_headings.rb", "test/test_lists.rb", "test/test_structure.rb", "test/test_tables.rb"]
|
|
12
12
|
s.test_files = ["test/test_entities.rb", "test/test_formatting.rb", "test/test_headings.rb", "test/test_lists.rb", "test/test_structure.rb", "test/test_tables.rb"]
|
|
13
13
|
s.rdoc_options = ["--main", "lib/README.rdoc"]
|
|
14
|
-
end
|
|
14
|
+
end
|
data/lib/clothblue.rb
CHANGED
|
@@ -1,124 +1,722 @@
|
|
|
1
|
-
|
|
2
|
-
Provides the methods to convert HTML into Markdown.
|
|
3
|
-
*Please* *note*: ClothBlue creates UTF-8 output. To do so, it sets $KCODE to UTF-8. This will be globally available!
|
|
4
|
-
#--
|
|
5
|
-
TODO: enhance docs, as more methods come availlable
|
|
6
|
-
#++
|
|
1
|
+
require 'parsehtml/parsehtml'
|
|
7
2
|
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
=
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
3
|
+
class ClothBlue
|
|
4
|
+
|
|
5
|
+
# Constants
|
|
6
|
+
LINKS_EACH_PARAGRAPH = false
|
|
7
|
+
BODYWIDTH = false
|
|
8
|
+
KEEPHTML = true
|
|
9
|
+
MIN_BODYWIDTH = 25
|
|
10
|
+
|
|
11
|
+
# tags which can be handled by markdown
|
|
12
|
+
IS_MARKDOWNABLE = {
|
|
13
|
+
'p' => [],
|
|
14
|
+
'ul' => [],
|
|
15
|
+
'ol' => [],
|
|
16
|
+
'li' => [],
|
|
17
|
+
'br' => [],
|
|
18
|
+
'blockquote' => [],
|
|
19
|
+
'code' => [],
|
|
20
|
+
'pre' => [],
|
|
21
|
+
'a' => [{'href' => 'required'}, {'title' => 'optional'}],
|
|
22
|
+
'strong' => [],
|
|
23
|
+
'b' => [],
|
|
24
|
+
'em' => [],
|
|
25
|
+
'i' => [],
|
|
26
|
+
'img' => [{'src' => 'required'}, {'alt' => 'optional'}, {'title' => 'optional'}],
|
|
27
|
+
'h1' => [],
|
|
28
|
+
'h2' => [],
|
|
29
|
+
'h3' => [],
|
|
30
|
+
'h4' => [],
|
|
31
|
+
'h5' => [],
|
|
32
|
+
'h6' => [],
|
|
33
|
+
'hr' => []
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
# html tags to be ignored (content will be parsed)
|
|
37
|
+
IGNORE = %w(html body)
|
|
38
|
+
|
|
39
|
+
# html tags to be dropped (content will not be parsed!)
|
|
40
|
+
DROP = %w(script head style form area object param iframe)
|
|
40
41
|
|
|
41
|
-
|
|
42
|
-
|
|
42
|
+
# Markdown indents which could be wrapped
|
|
43
|
+
WRAPPABLE_INDENTS = [
|
|
44
|
+
'\* ', # ul
|
|
45
|
+
'\d. ', # ol
|
|
46
|
+
'\d\d. ', # ol
|
|
47
|
+
'> ', # blockquote
|
|
48
|
+
'' # p
|
|
43
49
|
]
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
50
|
+
|
|
51
|
+
# list of chars which have to be escaped in normal text
|
|
52
|
+
# TODO: what's with block chars/ sequences at the beginning of a block?
|
|
53
|
+
ESCAPE_IN_TEXT = [
|
|
54
|
+
{'([-*_])([ ]{0,2}\1){2,}' => '\\\\$0|'}, # hr
|
|
55
|
+
{'\*\*([^*\s]+)\*\*' => '\*\*$1\*\*'}, # strong
|
|
56
|
+
{'\*([^*\s]+)\*' => '\*$1\*'}, # em
|
|
57
|
+
{'__(?! |_)(.+)(?!<_| )__' => '\_\_$1\_\_'}, # em
|
|
58
|
+
{'_(?! |_)(.+)(?!<_| )_' => '\_$1\_'}, # em
|
|
59
|
+
{'`(.+)`' => '\`$1\`'}, # code
|
|
60
|
+
{'\[(.+)\](\s*\()' => '\[$1\]$2'}, # links: [text] (url) => [text\] (url)
|
|
61
|
+
{'\[(.+)\](\s*)\[(.*)\]' => '\[$1\]$2\[$3\]'}, # links: [text][id] => [text\][id\]
|
|
47
62
|
]
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
63
|
+
|
|
64
|
+
# parseHTML parser
|
|
65
|
+
attr_accessor :parser
|
|
66
|
+
|
|
67
|
+
# markdown output
|
|
68
|
+
attr_reader :output
|
|
69
|
+
|
|
70
|
+
# stack with tags which were not converted to html
|
|
71
|
+
attr_reader :not_converted
|
|
72
|
+
|
|
73
|
+
# skip conversion to markdown
|
|
74
|
+
attr_reader :skip_conversion
|
|
75
|
+
|
|
76
|
+
# keep html tags which cannot be converted to markdown
|
|
77
|
+
attr_reader :keep_html
|
|
78
|
+
|
|
79
|
+
# wrap output, set to 0 to skip wrapping
|
|
80
|
+
attr_reader :body_width
|
|
81
|
+
|
|
82
|
+
# whether last processed node was a block tag or not
|
|
83
|
+
@last_was_block_tag = false
|
|
84
|
+
attr_reader :last_was_block_tag
|
|
85
|
+
|
|
86
|
+
# name of last closed tag
|
|
87
|
+
@last_closed_tag = ''
|
|
88
|
+
attr_reader :last_closed_tag
|
|
89
|
+
|
|
90
|
+
# list of chars which have to be escaped in normal text
|
|
91
|
+
@escape_in_text = {}
|
|
92
|
+
attr_reader :escape_in_text
|
|
93
|
+
|
|
94
|
+
# number of linebreaks before next inline output
|
|
95
|
+
@linebreaks = 0
|
|
96
|
+
attr_accessor :linebreaks
|
|
97
|
+
|
|
98
|
+
# stores current buffer
|
|
99
|
+
@buffer = []
|
|
100
|
+
attr_accessor :buffer
|
|
101
|
+
|
|
102
|
+
# current indentation
|
|
103
|
+
@indent = ''
|
|
104
|
+
attr_accessor :indent
|
|
105
|
+
|
|
106
|
+
# node stack, e.g. for <a> and <abbr> tags
|
|
107
|
+
@stack = {}
|
|
108
|
+
attr_accessor :stack
|
|
109
|
+
|
|
110
|
+
# Constructor
|
|
111
|
+
def initialize(text = '', links_after_each_paragraph = LINKS_EACH_PARAGRAPH, body_width = BODYWIDTH, keep_html = KEEPHTML)
|
|
112
|
+
@links_after_each_paragraph = links_after_each_paragraph
|
|
113
|
+
@keep_html = keep_html
|
|
114
|
+
@body_width = (body_width > MIN_BODYWIDTH) ? body_width.to_i : MIN_BODYWIDTH
|
|
115
|
+
|
|
116
|
+
@parser = HTML::Tokenizer.new(text)
|
|
117
|
+
|
|
118
|
+
@output = ''
|
|
119
|
+
@not_converted = []
|
|
120
|
+
@skip_conversion = false
|
|
121
|
+
|
|
122
|
+
@search, @replace = [], []
|
|
123
|
+
ESCAPE_IN_TEXT.each do |s,r|
|
|
124
|
+
@search << '/(?<!\\\)/' + s + '/U'
|
|
125
|
+
@replace << r
|
|
126
|
+
end
|
|
127
|
+
|
|
128
|
+
@escape_in_text = {'search' => @search, 'replace' => @replace}
|
|
52
129
|
end
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
structure(@workingcopy)
|
|
60
|
-
text_formatting(@workingcopy)
|
|
61
|
-
lists(@workingcopy)
|
|
62
|
-
entities(@workingcopy)
|
|
63
|
-
tables(@workingcopy)
|
|
64
|
-
@workingcopy = CGI::unescapeHTML(@workingcopy)
|
|
65
|
-
@workingcopy
|
|
130
|
+
|
|
131
|
+
# parse an HTML string
|
|
132
|
+
def parse_string
|
|
133
|
+
# @parser.html = html ## -> if we passed it in
|
|
134
|
+
parse
|
|
135
|
+
return @output
|
|
66
136
|
end
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
137
|
+
|
|
138
|
+
# iterate through the nodes and decide what to do with the current node
|
|
139
|
+
def parse
|
|
140
|
+
@output = ''
|
|
141
|
+
# drop tags that are in the DROP list
|
|
142
|
+
# TODO: implement dropping of @drop tags
|
|
143
|
+
|
|
144
|
+
while token = @parser.next_node
|
|
145
|
+
case @parser.node_type
|
|
146
|
+
when 'doctype', 'pi', 'comment'
|
|
147
|
+
if (@keep_html)
|
|
148
|
+
flush_linebreaks
|
|
149
|
+
out(@parser.node)
|
|
150
|
+
set_linebreaks(2)
|
|
151
|
+
end
|
|
152
|
+
when 'text'
|
|
153
|
+
handle_text
|
|
154
|
+
when 'tag'
|
|
155
|
+
next if IGNORE.include?(@parser.tag_name)
|
|
156
|
+
flush_linebreaks if (@parser.is_start_tag)
|
|
157
|
+
if (@skip_conversion)
|
|
158
|
+
is_markdownable # update notConverted
|
|
159
|
+
handle_tag_to_text
|
|
160
|
+
next
|
|
161
|
+
end
|
|
162
|
+
@parser.html = @parser.html.lstrip if (!@parser.keep_whitespace && @parser.is_block_element && @parser.is_start_tag)
|
|
163
|
+
if (is_markdownable)
|
|
164
|
+
if (@parser.is_block_element && @parser.is_start_tag && !@last_was_block_tag && !@output.empty?)
|
|
165
|
+
if (!@buffer.empty?)
|
|
166
|
+
str = @buffer[@buffer.size - 1]
|
|
167
|
+
else
|
|
168
|
+
str = @output
|
|
169
|
+
end
|
|
170
|
+
if (str.slice((@indent.size - 1) * -1) != "\n#{@indent}")
|
|
171
|
+
str << "\n" + @indent
|
|
172
|
+
end
|
|
173
|
+
func = "handle_tag_#{@parser.tag_name}"
|
|
174
|
+
self.send(func)
|
|
175
|
+
|
|
176
|
+
if (@links_after_each_paragraph && @parser.is_block_element && !@parser.is_start_tag)
|
|
177
|
+
flush_stacked
|
|
178
|
+
end
|
|
179
|
+
if(!@parser.is_start_tag)
|
|
180
|
+
@last_closed_tag = @parser.tag_name
|
|
181
|
+
end
|
|
182
|
+
end
|
|
183
|
+
else
|
|
184
|
+
handle_tag_to_text
|
|
185
|
+
@last_closed_tag = ''
|
|
186
|
+
end
|
|
187
|
+
else
|
|
188
|
+
# TODO: trigger error for invalid node type
|
|
189
|
+
end # end case
|
|
190
|
+
|
|
191
|
+
@last_was_block_tag = (@parser.node_type == 'tag' && @parser.is_start_tag && @parser.is_block_element)
|
|
192
|
+
end # end while
|
|
193
|
+
|
|
194
|
+
### cleanup
|
|
195
|
+
tmp = @output.gsub('>', '>')
|
|
196
|
+
tmp = tmp.gsub('&', '&')
|
|
197
|
+
@output = tmp.rstrip
|
|
198
|
+
# end parsing, flush stacked tags
|
|
199
|
+
flush_stacked
|
|
200
|
+
@stack = {}
|
|
201
|
+
<<<<<<< HEAD:lib/clothblue_rewrite.rb
|
|
202
|
+
=======
|
|
203
|
+
end
|
|
204
|
+
|
|
205
|
+
# check if current tag can be converted to Markdown
|
|
206
|
+
def is_markdownable
|
|
207
|
+
return false unless (IS_MARKDOWNABLE.include?(@parser.tag_name))
|
|
208
|
+
|
|
209
|
+
if (@parser.is_start_tag)
|
|
210
|
+
ret = true
|
|
211
|
+
if (@keep_html)
|
|
212
|
+
diff = @parser.tag_attributes.reject { |a| @parser.tag_name.include?(a) }
|
|
213
|
+
ret = false unless diff.empty? # non markdownable attributes given
|
|
214
|
+
end
|
|
215
|
+
if (ret)
|
|
216
|
+
IS_MARKDOWNABLE.each do |attr, type|
|
|
217
|
+
if ((type == 'required') && @parser.tag_attributes[attr].nil?)
|
|
218
|
+
# required Markdown attribute not given
|
|
219
|
+
ret = false
|
|
220
|
+
break
|
|
221
|
+
end
|
|
222
|
+
end
|
|
223
|
+
end
|
|
224
|
+
unless (ret)
|
|
225
|
+
@not_converted << (@parser.tag_name + '::' + @parser.open_tags.join('/'))
|
|
226
|
+
end
|
|
227
|
+
return ret
|
|
228
|
+
else
|
|
229
|
+
if (!@not_converted.empty? && (@not_converted.last == (@parser.tag_name + '::' + @parser.open_tags.join('/'))))
|
|
230
|
+
@not_converted.pop
|
|
231
|
+
return false
|
|
232
|
+
end
|
|
233
|
+
return true
|
|
75
234
|
end
|
|
76
|
-
|
|
235
|
+
end
|
|
236
|
+
|
|
237
|
+
# flush enqued linebreaks
|
|
238
|
+
def flush_linebreaks
|
|
239
|
+
if ((@linebreaks > 0) && !@output.empty?)
|
|
240
|
+
out("\n" * @linebreaks, true)
|
|
241
|
+
end
|
|
242
|
+
@linebreaks = 0
|
|
243
|
+
end
|
|
244
|
+
|
|
245
|
+
# output all stacked tags
|
|
246
|
+
def flush_stacked
|
|
247
|
+
# # links
|
|
248
|
+
# foreach ($this->stack as $tag => $a) {
|
|
249
|
+
# if (!empty($a)) {
|
|
250
|
+
# call_user_func(array(&$this, 'flushStacked_'.$tag));
|
|
251
|
+
# }
|
|
252
|
+
# }
|
|
77
253
|
end
|
|
78
254
|
|
|
79
|
-
|
|
80
|
-
def
|
|
81
|
-
|
|
82
|
-
|
|
255
|
+
# set number of line breaks before next start tag
|
|
256
|
+
def set_linebreaks(number)
|
|
257
|
+
@linebreaks = number if (@linebreaks < number)
|
|
258
|
+
end
|
|
259
|
+
|
|
260
|
+
# append string to the correct var, either directly to
|
|
261
|
+
# @output or to the current buffers
|
|
262
|
+
def out(put = '', nowrap = false)
|
|
263
|
+
return if put.empty?
|
|
264
|
+
|
|
265
|
+
if (!@buffer.empty?)
|
|
266
|
+
@buffer.last << put
|
|
267
|
+
else
|
|
268
|
+
if ((@body_width > 0) && !@parser.keep_whitespace) # wrap lines
|
|
269
|
+
# get last line
|
|
270
|
+
pos = @output.index("\n")
|
|
271
|
+
line = pos ? @output.slice(pos, @output.size - pos) : @output
|
|
272
|
+
end
|
|
273
|
+
|
|
274
|
+
if (nowrap)
|
|
275
|
+
if ((put[0,1] != "\n") && (line.size + put.size) > @body_width)
|
|
276
|
+
@output << "\n#{indent(put)}"
|
|
277
|
+
else
|
|
278
|
+
@output << put
|
|
279
|
+
end
|
|
280
|
+
return
|
|
281
|
+
else
|
|
282
|
+
put << "\n" # make sure we get all lines in the while below
|
|
283
|
+
line_len = line.size
|
|
284
|
+
while (pos = put.index("\n"))
|
|
285
|
+
put_line = put.slice(1, pos+1)
|
|
286
|
+
put_len = put_line.size
|
|
287
|
+
put = put.slice(pos+1, put.size - pos)
|
|
288
|
+
if (line_len + put_len < @body_width)
|
|
289
|
+
@output << put_line
|
|
290
|
+
line_len = put_len
|
|
291
|
+
else
|
|
292
|
+
# $split = preg_split('#^(.{0,'.($this->bodyWidth - $lineLen).'})\b#', $putLine, 2, PREG_SPLIT_OFFSET_CAPTURE | PREG_SPLIT_DELIM_CAPTURE);
|
|
293
|
+
# $this->output .= rtrim($split[1][0])."\n".$this->indent.$this->wordwrap(ltrim($split[2][0]), $this->bodyWidth, "\n".$this->indent, false);
|
|
294
|
+
end
|
|
295
|
+
end # end while
|
|
296
|
+
end
|
|
297
|
+
@output = @output(0, -1)
|
|
298
|
+
return
|
|
299
|
+
else
|
|
300
|
+
@output << put
|
|
83
301
|
end
|
|
84
|
-
|
|
302
|
+
>>>>>>> b6201584759afcc6f24a557ef9312597bd63f98f:lib/clothblue_rewrite.rb
|
|
85
303
|
end
|
|
86
304
|
|
|
305
|
+
<<<<<<< HEAD:lib/clothblue_rewrite.rb
|
|
306
|
+
# check if current tag can be converted to Markdown
|
|
307
|
+
def is_markdownable
|
|
308
|
+
return false unless (IS_MARKDOWNABLE.include?(@parser.tag_name))
|
|
309
|
+
=======
|
|
310
|
+
# indent next output (start tag) or unindent (end tag)
|
|
311
|
+
def indent(str, output = true)
|
|
312
|
+
if (@parser.is_start_tag)
|
|
313
|
+
@indent << str
|
|
314
|
+
out(str, true) if @output
|
|
315
|
+
else
|
|
316
|
+
@indent = @indent.slice(0, (str.size * -1))
|
|
317
|
+
end
|
|
318
|
+
end
|
|
319
|
+
|
|
320
|
+
# handle plain text
|
|
321
|
+
def handle_text
|
|
322
|
+
if (has_parent('pre') && @parser.node.index("\n"))
|
|
323
|
+
@parser.node.gsub!("\n", "\n#{@indent}")
|
|
324
|
+
end
|
|
325
|
+
if (!has_parent('code') && !has_parent('pre'))
|
|
326
|
+
# entity decode
|
|
327
|
+
decode(@parser.node)
|
|
328
|
+
if (!@skip_conversion)
|
|
329
|
+
# escape some chars in normal text
|
|
330
|
+
@parser.node.gsub!(@escape_in_text['search'], @escape_in_text['replace'])
|
|
331
|
+
end
|
|
332
|
+
else
|
|
333
|
+
@parser.node.gsub!(['"', '&apos'], ['"', '\''])
|
|
334
|
+
end
|
|
335
|
+
out(@parser.node)
|
|
336
|
+
@last_closed_tag = ''
|
|
337
|
+
end
|
|
87
338
|
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
339
|
+
# handle non-Markdownable tags
|
|
340
|
+
def handle_tag_to_text
|
|
341
|
+
if (!@keep_html)
|
|
342
|
+
set_linebreaks(2) if (!@parser.is_start_tag && @parser.is_block_element)
|
|
343
|
+
else
|
|
344
|
+
# dont convert to markdown inside this tag
|
|
345
|
+
# TODO: markdown extra
|
|
346
|
+
if (!@parser.is_empty_tag)
|
|
347
|
+
if (@parser.is_start_tag)
|
|
348
|
+
unless (@skip_conversion)
|
|
349
|
+
@skip_conversion = @parser.tag_name + '::' + @parser.open_tags.join('/')
|
|
350
|
+
end
|
|
351
|
+
else
|
|
352
|
+
if (@skip_conversion == (@parser.tag_name + '::' + @parser.open_tags.join('/'))
|
|
353
|
+
@skip_conversion = false
|
|
354
|
+
end
|
|
355
|
+
end
|
|
356
|
+
end # end !@parser.is_empty_tag
|
|
357
|
+
|
|
358
|
+
if (@parser.is_block_element)
|
|
359
|
+
if (@parser.is_start_tag)
|
|
360
|
+
if (%w(ins del).include?(parent))
|
|
361
|
+
# looks like ins or del are block elements now
|
|
362
|
+
out("\n", true)
|
|
363
|
+
indent(' ')
|
|
364
|
+
end
|
|
365
|
+
if (@parser.tag_name != 'pre')
|
|
366
|
+
out(@parser.node + "\n" + @indent)
|
|
367
|
+
@parser.is_empty_tag ? set_linebreaks(1) : indent(' ')
|
|
368
|
+
@parser.html = @parser.html.lstrip
|
|
369
|
+
else
|
|
370
|
+
# dont indent inside <pre> tags
|
|
371
|
+
out(@parser.node)
|
|
372
|
+
@static_indent = @indent
|
|
373
|
+
@indent = ''
|
|
374
|
+
end
|
|
375
|
+
else
|
|
376
|
+
@output = rstrip(@output) unless @parser.keep_whitespace
|
|
377
|
+
if (@parser.tag_name != 'pre')
|
|
378
|
+
indent(' ')
|
|
379
|
+
out("\n" + @indent + @parser.node)
|
|
380
|
+
else
|
|
381
|
+
# reset indentation
|
|
382
|
+
out(@parser.node)
|
|
383
|
+
@indent = @static_indent
|
|
384
|
+
end
|
|
385
|
+
|
|
386
|
+
if (%w(ins del).include?(parent))
|
|
387
|
+
# ins or del was block element
|
|
388
|
+
out("\n")
|
|
389
|
+
indent(' ')
|
|
390
|
+
end
|
|
391
|
+
|
|
392
|
+
@parser.tag_name == 'li' ? set_linebreaks(1) : set_linebreaks(2)
|
|
393
|
+
end
|
|
394
|
+
else
|
|
395
|
+
out(@parser.node)
|
|
396
|
+
end
|
|
397
|
+
|
|
398
|
+
if (%w(code pre).include?(@parser.tag_name))
|
|
399
|
+
if (@parser.is_start_tag)
|
|
400
|
+
buffer
|
|
401
|
+
else
|
|
402
|
+
# add stuff so cleanup just reverses this
|
|
403
|
+
tmp = unbugger.gsub('>', '&gt;')
|
|
404
|
+
out(tmp.gsub('<', '&lt;'))
|
|
405
|
+
end
|
|
406
|
+
end
|
|
91
407
|
end
|
|
92
|
-
text
|
|
93
408
|
end
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
def
|
|
97
|
-
|
|
98
|
-
|
|
409
|
+
|
|
410
|
+
# get tag name of direct parent tag
|
|
411
|
+
def parent
|
|
412
|
+
@parser.open_tags.last
|
|
413
|
+
end
|
|
414
|
+
|
|
415
|
+
# check if current not has a tag as parent (somewhere, not just the direct parent)
|
|
416
|
+
def has_parent(tag)
|
|
417
|
+
@parser.open_tags.include?(tag)
|
|
418
|
+
end
|
|
419
|
+
|
|
420
|
+
# add current node to the stack (this only stores the attributes)
|
|
421
|
+
def stack
|
|
422
|
+
@stack[@parser.tag_name] = [] if (@stack[@parser.tag_name].nil?)
|
|
423
|
+
@stack[@parser.tag_name] << @parser.tag_attributes
|
|
424
|
+
end
|
|
425
|
+
|
|
426
|
+
# remove current tag from stack
|
|
427
|
+
def unstack
|
|
428
|
+
if (@stack[@parser.tag_name].nil? || !@stack[@parser.tag_name].is_a?(Array))
|
|
429
|
+
# TODO: trigger and error
|
|
430
|
+
raise "somebody set us up the bomb"
|
|
99
431
|
end
|
|
100
|
-
|
|
432
|
+
@stack[@parser.tag_name].pop
|
|
101
433
|
end
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
def
|
|
105
|
-
|
|
106
|
-
|
|
434
|
+
|
|
435
|
+
# get last stacked element of type tag
|
|
436
|
+
def get_stacked(tag)
|
|
437
|
+
@stack[tag][@stack[tag].size-1]
|
|
438
|
+
end
|
|
439
|
+
|
|
440
|
+
# buffer next parser output until unbuffer is called
|
|
441
|
+
def buffer
|
|
442
|
+
@buffer << ''
|
|
443
|
+
end
|
|
444
|
+
|
|
445
|
+
# end current buffer and return buffered output
|
|
446
|
+
def unbuffer
|
|
447
|
+
@buffer.pop
|
|
448
|
+
end
|
|
449
|
+
|
|
450
|
+
# wordwrap for utf8 encoded strings
|
|
451
|
+
def wordwrap(str, width, brk, cut = false)
|
|
452
|
+
>>>>>>> b6201584759afcc6f24a557ef9312597bd63f98f:lib/clothblue_rewrite.rb
|
|
453
|
+
|
|
454
|
+
if (@parser.is_start_tag)
|
|
455
|
+
ret = true
|
|
456
|
+
if (@keep_html)
|
|
457
|
+
diff = @parser.tag_attributes.reject { |a| @parser.tag_name.include?(a) }
|
|
458
|
+
ret = false unless diff.empty? # non markdownable attributes given
|
|
459
|
+
end
|
|
460
|
+
if (ret)
|
|
461
|
+
IS_MARKDOWNABLE.each do |attr, type|
|
|
462
|
+
if ((type == 'required') && @parser.tag_attributes[attr].nil?)
|
|
463
|
+
# required Markdown attribute not given
|
|
464
|
+
ret = false
|
|
465
|
+
break
|
|
466
|
+
end
|
|
467
|
+
end
|
|
468
|
+
end
|
|
469
|
+
unless (ret)
|
|
470
|
+
@not_converted << (@parser.tag_name + '::' + @parser.open_tags.join('/'))
|
|
471
|
+
end
|
|
472
|
+
return ret
|
|
473
|
+
else
|
|
474
|
+
if (!@not_converted.empty? && (@not_converted.last == (@parser.tag_name + '::' + @parser.open_tags.join('/'))))
|
|
475
|
+
@not_converted.pop
|
|
476
|
+
return false
|
|
477
|
+
end
|
|
478
|
+
return true
|
|
107
479
|
end
|
|
108
|
-
text
|
|
109
480
|
end
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
481
|
+
|
|
482
|
+
<<<<<<< HEAD:lib/clothblue_rewrite.rb
|
|
483
|
+
# flush enqued linebreaks
|
|
484
|
+
def flush_linebreaks
|
|
485
|
+
if ((@linebreaks > 0) && !@output.empty?)
|
|
486
|
+
out("\n" * @linebreaks, true)
|
|
114
487
|
end
|
|
115
|
-
|
|
488
|
+
@linebreaks = 0
|
|
489
|
+
end
|
|
490
|
+
|
|
491
|
+
# output all stacked tags
|
|
492
|
+
def flush_stacked
|
|
493
|
+
# # links
|
|
494
|
+
# foreach ($this->stack as $tag => $a) {
|
|
495
|
+
# if (!empty($a)) {
|
|
496
|
+
# call_user_func(array(&$this, 'flushStacked_'.$tag));
|
|
497
|
+
# }
|
|
498
|
+
# }
|
|
116
499
|
end
|
|
117
500
|
|
|
501
|
+
# set number of line breaks before next start tag
|
|
502
|
+
def set_linebreaks(number)
|
|
503
|
+
@linebreaks = number if (@linebreaks < number)
|
|
504
|
+
end
|
|
505
|
+
|
|
506
|
+
# append string to the correct var, either directly to
|
|
507
|
+
# @output or to the current buffers
|
|
508
|
+
def out(put = '', nowrap = false)
|
|
509
|
+
return if put.empty?
|
|
510
|
+
|
|
511
|
+
if (!@buffer.empty?)
|
|
512
|
+
@buffer.last << put
|
|
513
|
+
else
|
|
514
|
+
if ((@body_width > 0) && !@parser.keep_whitespace) # wrap lines
|
|
515
|
+
# get last line
|
|
516
|
+
pos = @output.index("\n")
|
|
517
|
+
line = pos ? @output.slice(pos, @output.size - pos) : @output
|
|
518
|
+
end
|
|
519
|
+
|
|
520
|
+
if (nowrap)
|
|
521
|
+
if ((put[0,1] != "\n") && (line.size + put.size) > @body_width)
|
|
522
|
+
@output << "\n#{indent(put)}"
|
|
523
|
+
else
|
|
524
|
+
@output << put
|
|
525
|
+
end
|
|
526
|
+
return
|
|
527
|
+
else
|
|
528
|
+
put << "\n" # make sure we get all lines in the while below
|
|
529
|
+
line_len = line.size
|
|
530
|
+
while (pos = put.index("\n"))
|
|
531
|
+
put_line = put.slice(1, pos+1)
|
|
532
|
+
put_len = put_line.size
|
|
533
|
+
put = put.slice(pos+1, put.size - pos)
|
|
534
|
+
if (line_len + put_len < @body_width)
|
|
535
|
+
@output << put_line
|
|
536
|
+
line_len = put_len
|
|
537
|
+
else
|
|
538
|
+
# $split = preg_split('#^(.{0,'.($this->bodyWidth - $lineLen).'})\b#', $putLine, 2, PREG_SPLIT_OFFSET_CAPTURE | PREG_SPLIT_DELIM_CAPTURE);
|
|
539
|
+
# $this->output .= rtrim($split[1][0])."\n".$this->indent.$this->wordwrap(ltrim($split[2][0]), $this->bodyWidth, "\n".$this->indent, false);
|
|
540
|
+
end
|
|
541
|
+
end # end while
|
|
542
|
+
end
|
|
543
|
+
@output = @output(0, -1)
|
|
544
|
+
return
|
|
545
|
+
else
|
|
546
|
+
@output << put
|
|
547
|
+
end
|
|
548
|
+
end
|
|
549
|
+
|
|
550
|
+
# indent next output (start tag) or unindent (end tag)
|
|
551
|
+
def indent(str, output = true)
|
|
552
|
+
if (@parser.is_start_tag)
|
|
553
|
+
@indent << str
|
|
554
|
+
out(str, true) if @output
|
|
555
|
+
else
|
|
556
|
+
@indent = @indent.slice(0, (str.size * -1))
|
|
557
|
+
end
|
|
558
|
+
end
|
|
559
|
+
|
|
560
|
+
# handle plain text
|
|
561
|
+
def handle_text
|
|
562
|
+
if (has_parent('pre') && @parser.node.index("\n"))
|
|
563
|
+
@parser.node.gsub!("\n", "\n#{@indent}")
|
|
564
|
+
end
|
|
565
|
+
if (!has_parent('code') && !has_parent('pre'))
|
|
566
|
+
# entity decode
|
|
567
|
+
decode(@parser.node)
|
|
568
|
+
if (!@skip_conversion)
|
|
569
|
+
# escape some chars in normal text
|
|
570
|
+
@parser.node.gsub!(@escape_in_text['search'], @escape_in_text['replace'])
|
|
571
|
+
end
|
|
572
|
+
else
|
|
573
|
+
@parser.node.gsub!(['"', '&apos'], ['"', '\''])
|
|
574
|
+
end
|
|
575
|
+
out(@parser.node)
|
|
576
|
+
@last_closed_tag = ''
|
|
577
|
+
end
|
|
578
|
+
|
|
579
|
+
# handle non-Markdownable tags
|
|
580
|
+
def handle_tag_to_text
|
|
581
|
+
if (!@keep_html)
|
|
582
|
+
set_linebreaks(2) if (!@parser.is_start_tag && @parser.is_block_element)
|
|
583
|
+
else
|
|
584
|
+
# dont convert to markdown inside this tag
|
|
585
|
+
# TODO: markdown extra
|
|
586
|
+
if (!@parser.is_empty_tag)
|
|
587
|
+
if (@parser.is_start_tag)
|
|
588
|
+
unless (@skip_conversion)
|
|
589
|
+
@skip_conversion = @parser.tag_name + '::' + @parser.open_tags.join('/')
|
|
590
|
+
end
|
|
591
|
+
else
|
|
592
|
+
if (@skip_conversion == (@parser.tag_name + '::' + @parser.open_tags.join('/'))
|
|
593
|
+
@skip_conversion = false
|
|
594
|
+
end
|
|
595
|
+
end
|
|
596
|
+
end # end !@parser.is_empty_tag
|
|
597
|
+
|
|
598
|
+
if (@parser.is_block_element)
|
|
599
|
+
if (@parser.is_start_tag)
|
|
600
|
+
if (%w(ins del).include?(parent))
|
|
601
|
+
# looks like ins or del are block elements now
|
|
602
|
+
out("\n", true)
|
|
603
|
+
indent(' ')
|
|
604
|
+
end
|
|
605
|
+
if (@parser.tag_name != 'pre')
|
|
606
|
+
out(@parser.node + "\n" + @indent)
|
|
607
|
+
@parser.is_empty_tag ? set_linebreaks(1) : indent(' ')
|
|
608
|
+
@parser.html = @parser.html.lstrip
|
|
609
|
+
else
|
|
610
|
+
# dont indent inside <pre> tags
|
|
611
|
+
out(@parser.node)
|
|
612
|
+
@static_indent = @indent
|
|
613
|
+
@indent = ''
|
|
614
|
+
end
|
|
615
|
+
else
|
|
616
|
+
@output = rstrip(@output) unless @parser.keep_whitespace
|
|
617
|
+
if (@parser.tag_name != 'pre')
|
|
618
|
+
indent(' ')
|
|
619
|
+
out("\n" + @indent + @parser.node)
|
|
620
|
+
else
|
|
621
|
+
# reset indentation
|
|
622
|
+
out(@parser.node)
|
|
623
|
+
@indent = @static_indent
|
|
624
|
+
end
|
|
625
|
+
|
|
626
|
+
if (%w(ins del).include?(parent))
|
|
627
|
+
# ins or del was block element
|
|
628
|
+
out("\n")
|
|
629
|
+
indent(' ')
|
|
630
|
+
end
|
|
631
|
+
|
|
632
|
+
@parser.tag_name == 'li' ? set_linebreaks(1) : set_linebreaks(2)
|
|
633
|
+
end
|
|
634
|
+
else
|
|
635
|
+
out(@parser.node)
|
|
636
|
+
end
|
|
637
|
+
|
|
638
|
+
if (%w(code pre).include?(@parser.tag_name))
|
|
639
|
+
if (@parser.is_start_tag)
|
|
640
|
+
buffer
|
|
641
|
+
else
|
|
642
|
+
# add stuff so cleanup just reverses this
|
|
643
|
+
tmp = unbugger.gsub('>', '&gt;')
|
|
644
|
+
out(tmp.gsub('<', '&lt;'))
|
|
645
|
+
end
|
|
646
|
+
end
|
|
647
|
+
end
|
|
648
|
+
end
|
|
649
|
+
|
|
650
|
+
# get tag name of direct parent tag
|
|
651
|
+
def parent
|
|
652
|
+
@parser.open_tags.last
|
|
653
|
+
end
|
|
654
|
+
|
|
655
|
+
# check if current not has a tag as parent (somewhere, not just the direct parent)
|
|
656
|
+
def has_parent(tag)
|
|
657
|
+
@parser.open_tags.include?(tag)
|
|
658
|
+
end
|
|
659
|
+
|
|
660
|
+
# add current node to the stack (this only stores the attributes)
|
|
661
|
+
def stack
|
|
662
|
+
@stack[@parser.tag_name] = [] if (@stack[@parser.tag_name].nil?)
|
|
663
|
+
@stack[@parser.tag_name] << @parser.tag_attributes
|
|
664
|
+
end
|
|
665
|
+
|
|
666
|
+
# remove current tag from stack
|
|
667
|
+
def unstack
|
|
668
|
+
if (@stack[@parser.tag_name].nil? || !@stack[@parser.tag_name].is_a?(Array))
|
|
669
|
+
# TODO: trigger and error
|
|
670
|
+
raise "somebody set us up the bomb"
|
|
671
|
+
end
|
|
672
|
+
@stack[@parser.tag_name].pop
|
|
673
|
+
end
|
|
674
|
+
|
|
675
|
+
# get last stacked element of type tag
|
|
676
|
+
def get_stacked(tag)
|
|
677
|
+
@stack[tag][@stack[tag].size-1]
|
|
678
|
+
end
|
|
679
|
+
|
|
680
|
+
# buffer next parser output until unbuffer is called
|
|
681
|
+
def buffer
|
|
682
|
+
@buffer << ''
|
|
683
|
+
end
|
|
684
|
+
|
|
685
|
+
# end current buffer and return buffered output
|
|
686
|
+
def unbuffer
|
|
687
|
+
@buffer.pop
|
|
688
|
+
end
|
|
689
|
+
|
|
690
|
+
# wordwrap for utf8 encoded strings
|
|
691
|
+
def wordwrap(str, width, brk, cut = false)
|
|
692
|
+
# TODO: implement wordwrap for utf8 code
|
|
693
|
+
end
|
|
694
|
+
|
|
695
|
+
=======
|
|
696
|
+
>>>>>>> b6201584759afcc6f24a557ef9312597bd63f98f:lib/clothblue_rewrite.rb
|
|
697
|
+
# decode email address
|
|
698
|
+
def decode(text, quoted_style = '')
|
|
699
|
+
# TODO: implement decode method
|
|
118
700
|
|
|
119
|
-
|
|
120
|
-
#
|
|
121
|
-
|
|
701
|
+
# @author derernst@gmx.ch <http://www.php.net/manual/en/function.html-entity-decode.php#68536>
|
|
702
|
+
# @author Milian Wolff <http://milianw.de>
|
|
703
|
+
# if (version_compare(PHP_VERSION, '5', '>=')) {
|
|
704
|
+
# # UTF-8 is only supported in PHP 5.x.x and above
|
|
705
|
+
# $text = html_entity_decode($text, $quote_style, 'UTF-8');
|
|
706
|
+
# } else {
|
|
707
|
+
# if (function_exists('html_entity_decode')) {
|
|
708
|
+
# $text = html_entity_decode($text, $quote_style, 'ISO-8859-1');
|
|
709
|
+
# } else {
|
|
710
|
+
# static $trans_tbl;
|
|
711
|
+
# if (!isset($trans_tbl)) {
|
|
712
|
+
# $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, $quote_style));
|
|
713
|
+
# }
|
|
714
|
+
# $text = strtr($text, $trans_tbl);
|
|
715
|
+
# }
|
|
716
|
+
# $text = preg_replace_callback('~&#x([0-9a-f]+);~i', array(&$this, '_decode_hex'), $text);
|
|
717
|
+
# $text = preg_replace_callback('~&#(\d{2,5});~', array(&$this, '_decode_numeric'), $text);
|
|
718
|
+
# }
|
|
719
|
+
# return $text;
|
|
122
720
|
end
|
|
123
|
-
|
|
124
|
-
end
|
|
721
|
+
|
|
722
|
+
end
|