parsehtml 1.12.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,4 @@
1
+ == 1.12.0 2008-12-05
2
+
3
+ * No major enhancements
4
+ * Just changing the gem server from github to rubyforge
@@ -0,0 +1,19 @@
1
+ History.txt
2
+ Manifest.txt
3
+ PostInstall.txt
4
+ README.rdoc
5
+ Rakefile
6
+ config/website.yml
7
+ lib/parsehtml.rb
8
+ script/console
9
+ script/destroy
10
+ script/generate
11
+ script/txt2html
12
+ test/test_helper.rb
13
+ test/test_parsehtml.rb
14
+ test/test_parse.rb
15
+ website/index.html
16
+ website/index.txt
17
+ website/javascripts/rounded_corners_lite.inc.js
18
+ website/stylesheets/screen.css
19
+ website/template.html.erb
@@ -0,0 +1,2 @@
1
+
2
+ For more information on parsehtml, see http://parsehtml.rubyforge.org
@@ -0,0 +1,52 @@
1
+ = parsehtml
2
+
3
+ http://parsehtml.rubyforge.org
4
+
5
+ == DESCRIPTION:
6
+
7
+ ParseHTML is an HTML parser which works with Ruby 1.8 and above. ParseHTML will even try to handle invalid HTML to some degree.
8
+
9
+ == SYNOPSIS:
10
+
11
+ parser = ParseHTML.new('<p>Hello</p>'); puts parser.node while parser.next_node
12
+
13
+ == REQUIREMENTS:
14
+
15
+ Ruby 1.8
16
+
17
+ == INSTALL:
18
+
19
+ sudo gem install parsehtml
20
+
21
+ == DEVELOPERS:
22
+
23
+ Craig P Jolicoeur - http://github.com/cpjolicoeur
24
+
25
+ == ACKNOWLEDGEMENTS:
26
+
27
+ ParseHTML is heavily based on the ParseHTML PHP library by Milian Wolf (http://milianw.de)
28
+
29
+ == LICENSE:
30
+
31
+ (The MIT License)
32
+
33
+ Copyright (c) 2008 Craig P Jolicoeur
34
+
35
+ Permission is hereby granted, free of charge, to any person obtaining
36
+ a copy of this software and associated documentation files (the
37
+ 'Software'), to deal in the Software without restriction, including
38
+ without limitation the rights to use, copy, modify, merge, publish,
39
+ distribute, sublicense, and/or sell copies of the Software, and to
40
+ permit persons to whom the Software is furnished to do so, subject to
41
+ the following conditions:
42
+
43
+ The above copyright notice and this permission notice shall be
44
+ included in all copies or substantial portions of the Software.
45
+
46
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
47
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
48
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
49
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
50
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
51
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
52
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,28 @@
1
# Rakefile for the parsehtml gem.
# Loads rake, newgem and rubigen, then lets Hoe generate the packaging tasks.
%w[rubygems rake rake/clean fileutils newgem rubigen].each { |f| require f }
require File.dirname(__FILE__) + '/lib/parsehtml'

# Generate all the Rake tasks
# Run 'rake -T' to see list of generated tasks (from gem root directory)
$hoe = Hoe.new('parsehtml', ParseHTML::VERSION) do |p|
  p.developer('ParseHTML', 'cpjolicoeur@gmail.com')
  p.changes              = p.paragraphs_of("History.txt", 0..1).join("\n\n")
  # NOTE(review): this assigns the file name, not the file contents -- confirm
  # that newgem/hoe expands it before relying on the post-install message.
  p.post_install_message = 'PostInstall.txt' # TODO remove if post-install message not required
  p.rubyforge_name       = p.name # TODO this is default value
  # p.extra_deps         = [
  #   ['activesupport','>= 2.0.2'],
  # ]
  p.extra_dev_deps = [
    ['newgem', ">= #{::Newgem::VERSION}"]
  ]

  p.clean_globs |= %w[**/.DS_Store tmp *.log]
  # The interpolation below was previously escaped (\#{...}), which produced the
  # literal text "#{p.rubyforge_name}/#{p.name}" instead of "<project>/<gem>".
  path = (p.rubyforge_name == p.name) ? p.rubyforge_name : "#{p.rubyforge_name}/#{p.name}"
  p.remote_rdoc_dir = File.join(path.gsub(/^#{p.rubyforge_name}\/?/, ''), 'rdoc')
  p.rsync_args = '-av --delete --ignore-errors'
end

require 'newgem/tasks' # load /tasks/*.rake
Dir['tasks/**/*.rake'].each { |t| load t }

# TODO - want other tests/tasks run by default? Add them to the list
# task :default => [:spec, :features]
@@ -0,0 +1,2 @@
1
+ host: cpjolicoeur@rubyforge.org
2
+ remote_dir: /var/www/gforge-projects/parsehtml
@@ -0,0 +1,452 @@
1
+ $:.unshift(File.dirname(__FILE__)) unless
2
+ $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
3
+
4
# A small pull parser for HTML.
#
# Feed it a string and call #next_node repeatedly; after every successful call
# the accessors (#node_type, #node, #tag_name, ...) describe the node that was
# just consumed from the front of #html.  Markup the parser cannot understand
# is not dropped: its leading '<' is replaced by '&lt;' and it is returned as
# text.
#
#   parser = ParseHTML.new('<p>Hi</p>')
#   while parser.next_node
#     puts "#{parser.node_type}: #{parser.node}"
#   end
class ParseHTML
  VERSION = '1.12.0'

  # tags which are always empty (<br />, etc.)
  EMPTY_TAGS = %w(br hr input img area link meta param).freeze

  # tags with preformatted text - whitespace won't be touched in them
  PREFORMATTED_TAGS = %w(script style pre code).freeze

  # characters allowed in attribute names besides a-z (xml:lang, http-equiv)
  ATTRIBUTE_NAME_EXTRAS = %w(: -).freeze

  # characters treated as whitespace between attributes
  ATTRIBUTE_WHITESPACE = [' ', "\t", "\n", "\r"].freeze

  # list of known elements
  #   - tag_name => bool (is block level)
  # Tags that are not listed here are treated as invalid markup.
  # 'del' and 'ins' used to be listed twice (true, then false); the later
  # entry always won, so only the effective value (inline) is kept.
  BLOCK_ELEMENTS = {
    'address'    => true,
    'blockquote' => true,
    'center'     => true,
    'dir'        => true,
    'div'        => true,
    'dl'         => true,
    'fieldset'   => true,
    'form'       => true,
    'h1'         => true,
    'h2'         => true,
    'h3'         => true,
    'h4'         => true,
    'h5'         => true,
    'h6'         => true,
    'hr'         => true,
    'isindex'    => true,
    'menu'       => true,
    'noframes'   => true,
    'noscript'   => true,
    'ol'         => true,
    'p'          => true,
    'pre'        => true,
    'table'      => true,
    'ul'         => true,
    # set table elements and list items to block as well
    'thead'      => true,
    'tbody'      => true,
    'tfoot'      => true,
    'td'         => true,
    'tr'         => true,
    'th'         => true,
    'li'         => true,
    'dd'         => true,
    'dt'         => true,
    # header items and html / body as well
    'html'       => true,
    'body'       => true,
    'head'       => true,
    'meta'       => true,
    'link'       => true,
    'style'      => true,
    'title'      => true,
    # media tags to render as block
    'map'        => true,
    'object'     => true,
    'param'      => true,
    'embed'      => true,
    'area'       => true,
    # inline elements
    'a'          => false,
    'abbr'       => false,
    'acronym'    => false,
    'applet'     => false,
    'b'          => false,
    'basefont'   => false,
    'bdo'        => false,
    'big'        => false,
    'br'         => false,
    'button'     => false,
    'cite'       => false,
    'code'       => false,
    'del'        => false,
    'dfn'        => false,
    'em'         => false,
    'font'       => false,
    'i'          => false,
    'img'        => false,
    'ins'        => false,
    'input'      => false,
    'iframe'     => false,
    'kbd'        => false,
    'label'      => false,
    'q'          => false,
    'samp'       => false,
    'script'     => false,
    'select'     => false,
    'small'      => false,
    'span'       => false,
    'strong'     => false,
    'sub'        => false,
    'sup'        => false,
    'textarea'   => false,
    'tt'         => false,
    'var'        => false
  }.freeze

  # html still to be parsed (shrinks as nodes are consumed)
  attr_accessor :html

  # node type:
  #   - tag (see is_start_tag)
  #   - text (includes cdata)
  #   - comment
  #   - doctype
  #   - pi (processing instruction)
  attr_reader :node_type

  # current node content
  #   - either a simple string (text node) or something like
  #   - <tag attrib="value"...>
  attr_accessor :node

  # suppress HTML tags inside <code> elements (they are returned as text)
  attr_accessor :no_tags_in_code

  # whether the current node is an opening tag (<a>) or not (</a>)
  #   - false if the current node is not a tag
  #   - NOTE: empty tags (<br />) set this to true as well!
  attr_reader :is_start_tag

  # whether current node is an empty tag (<br />) or not (<a></a>)
  attr_reader :is_empty_tag

  # whether the current tag is a block level element
  attr_reader :is_block_element

  # tag name (nil when the current node is not a tag)
  attr_reader :tag_name

  # attributes of the current tag (hash of name => value)
  attr_reader :tag_attributes

  # number of currently open preformatted tags; whitespace is left
  # untouched while this is greater than zero
  attr_reader :keep_whitespace

  # list of open tags (array)
  #   - count this to get current depth
  attr_reader :open_tags

  # html - the markup to parse (the string itself is never modified in place)
  def initialize(html = '')
    @html             = html
    @open_tags        = []
    @node_type        = ''
    @node             = ''
    @tag_name         = ''
    @is_start_tag     = false
    @is_empty_tag     = false
    @is_block_element = false
    @no_tags_in_code  = false
    @tag_attributes   = nil
    @keep_whitespace  = 0
    # whether a whitespace-only text node at the current position may be
    # dropped; has to survive between calls, hence an instance variable
    @skip_whitespace  = true
  end

  # Consume the next node from the front of #html.
  #
  # Returns true when a node was read (inspect it through the accessors) and
  # false when there is nothing left to parse.
  def next_node
    return false if @html.nil? || @html.empty?

    # the previous node opened an element: register it now
    if @is_start_tag && !@is_empty_tag
      @open_tags << @tag_name
      @keep_whitespace += 1 if PREFORMATTED_TAGS.include?(@tag_name)
    end

    if match?('<')
      if match?('<?')
        # xml prolog or other processing instruction
        set_node('pi', end_after('>'))
        return true
      end

      if match?('<!--')
        # comment; without a closing --> fall back to the next '>'
        set_node('comment', end_after('-->', '>'))
        return true
      end

      if @html[0, 9].upcase == '<!DOCTYPE'
        set_node('doctype', end_after('>'))
        @skip_whitespace = true
        return true
      end

      if match?('<![CDATA[')
        # cdata is handed out as a text node without its delimiters
        @html = @html[9..-1]
        closing = @html.index(']]>')
        if closing
          set_node('text', closing + 3)
          @node = @node[0...-3]
        else
          # unterminated section: everything that is left is its content
          set_node('text', @html.size)
        end
        handle_whitespaces
        @skip_whitespace = true
        return true
      end

      if parse_tag
        # whitespace directly after a block level tag carries no meaning
        @skip_whitespace = @is_block_element ? true : false
        return true
      end
    end

    # never drop whitespace inside preformatted elements
    @skip_whitespace = false unless @keep_whitespace.zero?

    # when we get here it seems to be a text node
    set_node('text', @html.index('<') || @html.size)
    handle_whitespaces
    return next_node if @skip_whitespace && @node == ' '

    @skip_whitespace = false
    true
  end

  # Rebuild #node from the parsed tag name and attributes so that every tag
  # has the same canonical form (double quoted values, ' />' for empty tags).
  # Does nothing unless the current node is a tag.
  def normalize_node
    return @node unless @node_type == 'tag'

    @node = '<'.dup
    unless @is_start_tag
      @node << "/#{@tag_name}>"
      return @node
    end

    @node << @tag_name
    (@tag_attributes || {}).each do |name, value|
      @node << " #{name}=\"" << value.gsub('"', '&quot;') << '"'
    end
    @node << ' /' if @is_empty_tag
    @node << '>'
  end

  # Re-indent +html+: block level tags go on their own line, indented by one
  # +indent+ per nesting level.
  #
  # html            - markup to format
  # indent          - string used for one level of indentation
  # no_tags_in_code - treat tags inside <code> as plain text
  #
  # Returns the formatted markup as a new string.
  def self.indent_html(html, indent = ' ', no_tags_in_code = false)
    parser = ParseHTML.new(html)
    parser.no_tags_in_code = no_tags_in_code
    output = ''.dup
    after_block = true # previous node ended a line
    depth = 0

    while parser.next_node
      is_tag = parser.node_type == 'tag'
      parser.normalize_node if is_tag
      flowing = parser.keep_whitespace.zero? # not inside preformatted text

      if is_tag && parser.is_block_element
        is_pre_or_code = %w[code pre].include?(parser.tag_name)
        output = output.rstrip + "\n" if flowing && !after_block && !is_pre_or_code

        if parser.is_start_tag
          output << indent * depth
          depth += 1 unless parser.is_empty_tag
        else
          depth -= 1 if depth > 0
          output << indent * depth unless is_pre_or_code
        end

        output << parser.node
        output << "\n" if flowing && !(is_pre_or_code && parser.is_start_tag)
        after_block = true
      elsif is_tag && parser.tag_name == 'br'
        output << parser.node << "\n"
        after_block = true
      else
        if after_block && flowing
          output << indent * depth
          parser.node = parser.node.lstrip
        end
        output << parser.node

        if %w[comment pi doctype].include?(parser.node_type)
          output << "\n"
        else
          after_block = false
        end
      end
    end

    output
  end

  private

  # Try to read a tag from the front of @html.
  #
  # On success the tag is consumed, the tag related state is updated and true
  # is returned.  Otherwise the leading '<' is neutralised (see invalid_tag)
  # and false is returned so that the caller falls back to a text node.
  def parse_tag
    size = @html.size
    pos = 1
    is_start_tag = @html[pos, 1] != '/'
    pos += 1 unless is_start_tag

    # tag name: letters, digits allowed after the first character
    tag_name = ''.dup
    while pos < size
      char = @html[pos, 1].downcase
      break unless char.between?('a', 'z') || (!tag_name.empty? && is_numeric?(char))
      tag_name << char
      pos += 1
    end
    pos -= 1 # back onto the last character that was consumed

    return reject_tag if tag_name.empty? || !BLOCK_ELEMENTS.key?(tag_name)

    # suppress all HTML tags inside code tags (except the closing </code>)
    if @no_tags_in_code && @open_tags.last == 'code' && !(tag_name == 'code' && !is_start_tag)
      return reject_tag
    end

    # attributes
    # TODO: in HTML 4 attribute values don't need to be quoted
    is_empty_tag = false
    attributes = {}
    curr_attribute = ''.dup
    name_ended = false # whitespace was seen after the pending attribute name

    while pos + 1 < size
      pos += 1

      # end of tag
      if @html[pos, 1] == '>' || @html[pos, 2] == '/>'
        if @html[pos, 1] == '/'
          is_empty_tag = true
          pos += 1
        end
        break
      end

      char = @html[pos, 1].downcase
      if char.between?('a', 'z') || ATTRIBUTE_NAME_EXTRAS.include?(char)
        if name_ended
          # the pending name had no value (<input disabled checked>)
          attributes[curr_attribute] = curr_attribute
          curr_attribute = ''.dup
          name_ended = false
        end
        curr_attribute << char
      elsif ATTRIBUTE_WHITESPACE.include?(char)
        name_ended = true unless curr_attribute.empty?
      elsif ['="', "='"].include?(@html[pos, 2])
        # quoted attribute value
        quote = @html[pos + 1, 1]
        pos += 2
        closing = @html.index(quote, pos) || size
        attributes[curr_attribute] = @html[pos...closing]
        curr_attribute = ''.dup
        name_ended = false
        pos = closing
      else
        return reject_tag
      end
    end

    return reject_tag if @html[pos, 1] != '>'

    # html4 allows something like <option selected> instead of <option selected="selected">
    attributes[curr_attribute] = curr_attribute unless curr_attribute.empty?

    unless is_start_tag
      # end tags must not contain any attributes
      # and must close the innermost open tag
      return reject_tag if !attributes.empty? || tag_name != @open_tags.last
      @open_tags.pop
      @keep_whitespace -= 1 if PREFORMATTED_TAGS.include?(tag_name)
    end

    pos += 1
    @node = @html[0, pos]
    @html = @html[pos..-1]
    @tag_name = tag_name
    @tag_attributes = attributes
    @is_start_tag = is_start_tag
    @is_empty_tag = is_empty_tag || EMPTY_TAGS.include?(tag_name)
    # might not be well formed
    @node.sub!(/ *\/? *>\z/, ' />') if @is_empty_tag
    @node_type = 'tag'
    @is_block_element = BLOCK_ELEMENTS[tag_name]
    true
  end

  # mark the markup at the front of @html as invalid; always returns false
  def reject_tag
    invalid_tag
    false
  end

  # handle invalid tags: escape the leading '<' so the markup becomes text
  def invalid_tag
    @html = '&lt;' + @html[1..-1]
  end

  # update all variables and make @html shorter
  #   - param type => new @node_type
  #   - param pos  => position to cut @html at
  def set_node(type, pos)
    if @node_type == 'tag'
      # the previous node was a tag: clear the tag specific state
      # (tags themselves are set up in parse_tag, never here)
      @tag_name = nil
      @tag_attributes = nil
      @is_start_tag = false
      @is_empty_tag = false
      @is_block_element = false
    end
    @node_type = type
    @node = @html[0, pos]
    @html = @html[pos..-1]
  end

  # position just behind the first terminator that occurs in @html
  # (terminators are tried in the given order), or the end of @html
  def end_after(*terminators)
    terminators.each do |terminator|
      index = @html.index(terminator)
      return index + terminator.size if index
    end
    @html.size
  end

  # check if @html begins with a specific string
  def match?(str)
    @html[0, str.size] == str
  end

  # collapse runs of whitespace into a single space,
  # except inside preformatted elements
  def handle_whitespaces
    return unless @keep_whitespace.zero?
    @node = @node.gsub(/\s+/, ' ')
  end

  # check if a string is a valid numeric value
  def is_numeric?(val)
    Float(val)
    true
  rescue ArgumentError, TypeError
    false
  end
end # end class ParseHTML