parsehtml 1.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,4 @@
1
+ == 1.12.0 2008-12-05
2
+
3
+ * No major enhancements
4
+ * Just changing the gem server from github to rubyforge
@@ -0,0 +1,19 @@
1
+ History.txt
2
+ Manifest.txt
3
+ PostInstall.txt
4
+ README.rdoc
5
+ Rakefile
6
+ config/website.yml
7
+ lib/parsehtml.rb
8
+ script/console
9
+ script/destroy
10
+ script/generate
11
+ script/txt2html
12
+ test/test_helper.rb
13
+ test/test_parsehtml.rb
14
+ test/test_parse.rb
15
+ website/index.html
16
+ website/index.txt
17
+ website/javascripts/rounded_corners_lite.inc.js
18
+ website/stylesheets/screen.css
19
+ website/template.html.erb
@@ -0,0 +1,2 @@
1
+
2
+ For more information on parsehtml, see http://parsehtml.rubyforge.org
@@ -0,0 +1,52 @@
1
+ = parsehtml
2
+
3
+ http://parsehtml.rubyforge.org
4
+
5
+ == DESCRIPTION:
6
+
7
+ ParseHTML is an HTML parser which works with Ruby 1.8 and above. ParseHTML will even try to handle invalid HTML to some degree.
8
+
9
+ == SYNOPSIS:
10
+
11
+ FIX (code sample of usage)
12
+
13
+ == REQUIREMENTS:
14
+
15
+ Ruby 1.8 or above
16
+
17
+ == INSTALL:
18
+
19
+ sudo gem install parsehtml
20
+
21
+ == DEVELOPERS:
22
+
23
+ Craig P Jolicoeur - http://github.com/cpjolicoeur
24
+
25
+ == ACKNOWLEDGEMENTS:
26
+
27
+ ParseHTML is heavily based on the ParseHTML PHP library by Milian Wolff (http://milianw.de)
28
+
29
+ == LICENSE:
30
+
31
+ (The MIT License)
32
+
33
+ Copyright (c) 2008 Craig P Jolicoeur
34
+
35
+ Permission is hereby granted, free of charge, to any person obtaining
36
+ a copy of this software and associated documentation files (the
37
+ 'Software'), to deal in the Software without restriction, including
38
+ without limitation the rights to use, copy, modify, merge, publish,
39
+ distribute, sublicense, and/or sell copies of the Software, and to
40
+ permit persons to whom the Software is furnished to do so, subject to
41
+ the following conditions:
42
+
43
+ The above copyright notice and this permission notice shall be
44
+ included in all copies or substantial portions of the Software.
45
+
46
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
47
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
48
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
49
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
50
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
51
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
52
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,28 @@
1
# Rakefile for the parsehtml gem.
#
# Loads rake plus the newgem/rubigen helpers, then configures a Hoe project whose
# version comes from ParseHTML::VERSION in lib/parsehtml.rb.
%w[rubygems rake rake/clean fileutils newgem rubigen].each { |f| require f }
require File.dirname(__FILE__) + '/lib/parsehtml'

# Generate all the Rake tasks
# Run 'rake -T' to see list of generated tasks (from gem root directory)
$hoe = Hoe.new('parsehtml', ParseHTML::VERSION) do |p|
  # NOTE(review): the first argument is the developer's display name; README.rdoc
  # lists the developer as "Craig P Jolicoeur" - confirm 'ParseHTML' is intended.
  p.developer('ParseHTML', 'cpjolicoeur@gmail.com')
  # Use the first two paragraphs of History.txt as the release change notes.
  p.changes              = p.paragraphs_of("History.txt", 0..1).join("\n\n")
  # NOTE(review): this assigns the file *name*, not the file's contents - confirm
  # that the newgem tasks resolve it to the text of PostInstall.txt.
  p.post_install_message = 'PostInstall.txt' # TODO remove if post-install message not required
  p.rubyforge_name       = p.name # TODO this is default value
  # p.extra_deps         = [
  #   ['activesupport','>= 2.0.2'],
  # ]
  p.extra_dev_deps = [
    ['newgem', ">= #{::Newgem::VERSION}"]
  ]

  p.clean_globs |= %w[**/.DS_Store tmp *.log]
  # When the RubyForge project name differs from the gem name, the docs live under
  # "<rubyforge_name>/<gem name>". The interpolation must be real (unescaped),
  # otherwise the string is the literal text '#{p.rubyforge_name}/#{p.name}'.
  path = (p.rubyforge_name == p.name) ? p.rubyforge_name : "#{p.rubyforge_name}/#{p.name}"
  # Strip the leading project name so the rdoc dir is relative to the project root.
  p.remote_rdoc_dir = File.join(path.gsub(/^#{p.rubyforge_name}\/?/, ''), 'rdoc')
  p.rsync_args = '-av --delete --ignore-errors'
end

require 'newgem/tasks' # load /tasks/*.rake
Dir['tasks/**/*.rake'].each { |t| load t }

# TODO - want other tests/tasks run by default? Add them to the list
# task :default => [:spec, :features]
@@ -0,0 +1,2 @@
1
+ host: cpjolicoeur@rubyforge.org
2
+ remote_dir: /var/www/gforge-projects/parsehtml
@@ -0,0 +1,452 @@
1
# Put this file's directory at the front of the load path, unless it is already
# listed there either as given or as an expanded (absolute) path.
lib_dir = File.dirname(__FILE__)
already_on_load_path = $:.include?(lib_dir) || $:.include?(File.expand_path(lib_dir))
$:.unshift(lib_dir) unless already_on_load_path
3
+
4
# ParseHTML is a small pull-style HTML tokenizer.
#
# Create a parser with the HTML to read, then call #next_node repeatedly. Each
# call that returns true consumes one node from the front of #html and exposes
# it through the readers below (#node_type, #node, #tag_name, ...). #next_node
# returns false once the input is used up.
#
# Tags that are not listed in BLOCK_ELEMENTS, or that are malformed, are not
# reported as tags: their leading '<' is rewritten to '&lt;' and they are
# returned as part of a text node.
class ParseHTML
  VERSION = '1.12.0'

  # tags which are always empty (<br />, etc.)
  EMPTY_TAGS = %w(br hr input img area link meta param)

  # tags with preformatted text - whitespace won't be touched in them
  PREFORMATTED_TAGS = %w(script style pre code)

  # list of known elements
  # - tag_name => bool (is block level)
  # A tag name that is not a key of this hash is treated as invalid by parse_tag.
  #
  # 'del' and 'ins' used to be listed twice (once as block, once as inline); in a
  # Ruby hash literal the last entry wins, so only the effective inline entries
  # are kept here.
  BLOCK_ELEMENTS = {
    'address'    => true,
    'blockquote' => true,
    'center'     => true,
    'dir'        => true,
    'div'        => true,
    'dl'         => true,
    'fieldset'   => true,
    'form'       => true,
    'h1'         => true,
    'h2'         => true,
    'h3'         => true,
    'h4'         => true,
    'h5'         => true,
    'h6'         => true,
    'hr'         => true,
    'isindex'    => true,
    'menu'       => true,
    'noframes'   => true,
    'noscript'   => true,
    'ol'         => true,
    'p'          => true,
    'pre'        => true,
    'table'      => true,
    'ul'         => true,
    # set table elements and list items to block as well
    'thead'      => true,
    'tbody'      => true,
    'tfoot'      => true,
    'td'         => true,
    'tr'         => true,
    'th'         => true,
    'li'         => true,
    'dd'         => true,
    'dt'         => true,
    # header items and html / body as well
    'html'       => true,
    'body'       => true,
    'head'       => true,
    'meta'       => true,
    'link'       => true,
    'style'      => true,
    'title'      => true,
    # media tags to render as block
    'map'        => true,
    'object'     => true,
    'param'      => true,
    'embed'      => true,
    'area'       => true,
    # inline elements
    'a'          => false,
    'abbr'       => false,
    'acronym'    => false,
    'applet'     => false,
    'b'          => false,
    'basefont'   => false,
    'bdo'        => false,
    'big'        => false,
    'br'         => false,
    'button'     => false,
    'cite'       => false,
    'code'       => false,
    'del'        => false,
    'dfn'        => false,
    'em'         => false,
    'font'       => false,
    'i'          => false,
    'img'        => false,
    'ins'        => false,
    'input'      => false,
    'iframe'     => false,
    'kbd'        => false,
    'label'      => false,
    'q'          => false,
    'samp'       => false,
    'script'     => false,
    'select'     => false,
    'small'      => false,
    'span'       => false,
    'strong'     => false,
    'sub'        => false,
    'sup'        => false,
    'textarea'   => false,
    'tt'         => false,
    'var'        => false
  }

  # html still to be parsed (shrinks as nodes are consumed)
  attr_accessor :html

  # node type:
  # - tag (see is_start_tag)
  # - text (include cdata)
  # - comment
  # - doctype
  # - pi (processing instruction)
  attr_reader :node_type

  # current node context
  # - either a simple string (text node) or something like
  # - <tag attrib="value"...>
  attr_accessor :node

  # supress HTML tags inside <code> tags
  attr_accessor :no_tags_in_code

  # whether the current node is an opening tag (<a>) or not (</a>)
  # - false if current node is not a tag
  # - NOTE: empty tags (<br />) set this to true as well!
  attr_reader :is_start_tag

  # whether current node is an empty tag (<br />) or not (<a></a>)
  attr_reader :is_empty_tag

  # whether the current tag is a block level element
  attr_reader :is_block_element

  # tag name (lower case); nil when the current node is not a tag
  attr_reader :tag_name

  # attributes of current tag (hash of lower-cased name => value);
  # nil when the current node is not a tag
  attr_reader :tag_attributes

  # number of currently open preformatted tags; whitespace is kept while > 0
  attr_reader :keep_whitespace

  # list of open tags (array)
  # - count this to get current depth
  attr_reader :open_tags

  # Creates a parser for +html+ (defaults to an empty string).
  def initialize(html = '')
    @html = html
    @open_tags = []
    @node_type, @node, @tag_name = '', '', ''
    @is_start_tag, @is_empty_tag, @is_block_element, @no_tags_in_code = false, false, false, false
    @tag_attributes = nil
    @keep_whitespace = 0
    # Whether a text node consisting of a single blank should be dropped. This has
    # to survive between calls to next_node, so it is instance state.
    @skip_whitespace = true
  end

  # Consumes the next node from the front of #html.
  #
  # Returns true when a node was read (inspect it through the readers) and
  # false when there is nothing left to parse.
  def next_node
    return false if @html.nil? || @html.empty?

    # The previous node was an opening, non-empty tag: it is now "open".
    if @is_start_tag && !@is_empty_tag
      @open_tags << @tag_name
      @keep_whitespace += 1 if PREFORMATTED_TAGS.include?(@tag_name)
    end

    if @html[0, 1] == '<'
      token = @html[0, 9]

      if token[0, 2] == '<?'
        # xml, prolog, or other pi's
        set_node('pi', closing_gt_offset)
        return true
      end

      if token[0, 4] == '<!--'
        # HTML comment; if there is no closing -->, end at the next '>' instead
        close = @html.index('-->')
        set_node('comment', close ? close + 3 : closing_gt_offset)
        return true
      end

      if token == '<!DOCTYPE'
        set_node('doctype', closing_gt_offset)
        @skip_whitespace = true
        return true
      end

      if token == '<![CDATA['
        # cdata, use text mode: drop the leading <![CDATA[ ...
        @html = @html[9, @html.size - 9]
        close = @html.index(']]>')
        if close
          set_node('text', close + 3)
          # ... and the trailing ]]>
          @node = @node[0, @node.size - 3]
        else
          # unterminated section: everything that is left is its content
          set_node('text', @html.size)
        end
        handle_whitespaces
        @skip_whitespace = true
        return true
      end

      if parse_tag
        # blanks directly after a block level tag carry no meaning
        @skip_whitespace = @is_block_element ? true : false
        return true
      end
      # parse_tag failed and has rewritten the leading '<' to '&lt;', so the
      # input is handled as text below.
    end

    # 0 is truthy in Ruby, so the counter has to be compared explicitly
    @skip_whitespace = false if @keep_whitespace > 0

    # when we get here it seems to be a text node
    pos = @html.index('<') || @html.size
    set_node('text', pos)
    handle_whitespaces
    return next_node if @skip_whitespace && @node == ' '
    @skip_whitespace = false
    true
  end

  # Rebuilds #node from the parsed tag data: lower-case tag name, attributes
  # written as name="value" (double quotes inside values become &quot;), and
  # a trailing " /" for empty tags. Only meaningful while the current node is
  # a tag.
  def normalize_node
    unless @is_start_tag
      @node = "</#{@tag_name}>"
      return
    end

    markup = "<#{@tag_name}"
    (@tag_attributes || {}).each do |name, value|
      markup << " #{name}=\"" << value.gsub(/"/, '&quot;') << '"'
    end
    markup << ' /' if @is_empty_tag
    markup << '>'
    @node = markup
  end

  # Re-indents +html+ and returns the result as a new string.
  #
  # - html: the markup to format
  # - indent: the string pushed on the indent stack for every open block tag
  # - no_tags_in_code: passed to the parser (see #no_tags_in_code)
  #
  # Block level tags are put on their own line; text and inline tags stay on
  # the current line.
  def self.indent_html(html, indent = ' ', no_tags_in_code = false)
    parser = ParseHTML.new(html)
    parser.no_tags_in_code = no_tags_in_code
    output = ''.dup
    last = true # last tag was block element
    indent_a = []

    while parser.next_node
      parser.normalize_node if parser.node_type == 'tag'

      if parser.node_type == 'tag' && parser.is_block_element
        is_pre_or_code = ['code', 'pre'].include?(parser.tag_name)
        if parser.keep_whitespace.zero? && !last && !is_pre_or_code
          # finish the running text line before the block tag starts
          output = output.rstrip + "\n"
        end
        if parser.is_start_tag
          output << indent_a.join(' ')
          indent_a << indent unless parser.is_empty_tag
        else
          indent_a.pop
          output << indent_a.join(' ') unless is_pre_or_code
        end
        output << parser.node
        if parser.keep_whitespace.zero? && !(is_pre_or_code && parser.is_start_tag)
          output << "\n"
        end
        last = true
      else
        if parser.node_type == 'tag' && parser.tag_name == 'br'
          output << (parser.node + "\n")
          last = true
          next
        elsif last && parser.keep_whitespace.zero?
          output << indent_a.join(' ')
          parser.node = parser.node.lstrip
        end
        output << parser.node

        if ['comment', 'pi', 'doctype'].include?(parser.node_type)
          output << "\n"
        else
          last = false
        end
      end
    end
    output
  end

  private

  # Offset just behind the next '>' in @html, or the end of @html when there
  # is none (so an unterminated construct swallows the rest of the input
  # instead of raising on nil).
  def closing_gt_offset
    (@html.index('>') || @html.size - 1) + 1
  end

  # Parses the tag at the start of @html: sets tag name and attributes, checks
  # closing tags against the open tag stack, etc.
  #
  # Returns true when a valid tag was consumed. Otherwise calls invalid_tag
  # and returns false, leaving the input to be read as text.
  def parse_tag
    pos = 1
    is_start_tag = (@html[pos, 1] != '/')
    pos += 1 unless is_start_tag

    # get tag name: a letter followed by letters or digits
    tag_name = (@html[pos..-1][/\A[A-Za-z][A-Za-z0-9]*/] || '').downcase
    # leave pos on the last character of the name
    pos += tag_name.size - 1

    if tag_name.empty? || !BLOCK_ELEMENTS.include?(tag_name)
      # something went wrong, invalid tag
      invalid_tag
      return false
    end

    if @no_tags_in_code && @open_tags.last == 'code' && !(tag_name == 'code' && !is_start_tag)
      # supress all HTML tags inside code tags
      invalid_tag
      return false
    end

    # get tag attributes
    # TODO: in HTML 4 attributes dont need to be quoted
    is_empty_tag = false
    attributes = {}
    curr_attribute = ''.dup
    while pos + 1 < @html.size
      pos += 1
      # close tag
      if @html[pos, 1] == '>' || @html[pos, 2] == '/>'
        if @html[pos, 1] == '/'
          is_empty_tag = true
          pos += 1
        end
        break
      end

      char = @html[pos, 1]
      if char =~ /\A[A-Za-z:\-]\z/
        # attribute name (':' and '-' for xml:lang and http-equiv)
        curr_attribute << char.downcase
      elsif [' ', "\t", "\n"].include?(char)
        # drop whitespace
      elsif ['="', "='"].include?(@html[pos, 2])
        # get attribute value
        pos += 1
        await = @html[pos, 1] # single or double quote
        pos += 1
        close = @html.index(await, pos)
        value = close ? @html[pos...close] : @html[pos..-1]
        # without a closing quote pos ends up behind the input, which makes
        # the '>' check below fail
        pos = close || @html.size
        attributes[curr_attribute] = value
        curr_attribute = ''.dup
      else
        invalid_tag
        return false
      end
    end

    if @html[pos, 1] != '>'
      # the input ended before the tag was closed
      invalid_tag
      return false
    end

    unless curr_attribute.empty?
      # html4 allows something like <option selected> instead of <option selected="selected">
      attributes[curr_attribute] = curr_attribute
    end

    unless is_start_tag
      if !attributes.empty? || tag_name != @open_tags.last
        # end tags must not contain any attributes
        # or maybe we did not expect a different tag to be closed
        invalid_tag
        return false
      end
      @open_tags.pop
      @keep_whitespace -= 1 if PREFORMATTED_TAGS.include?(tag_name)
    end

    pos += 1
    @node = @html[0, pos]
    @html = @html[pos, @html.size - pos]
    @tag_name = tag_name
    @tag_attributes = attributes
    @is_start_tag = is_start_tag
    @is_empty_tag = is_empty_tag || EMPTY_TAGS.include?(tag_name)
    if @is_empty_tag
      # might not be well formed
      @node.gsub!(/ *\/? *>$/, ' />')
    end
    @node_type = 'tag'
    @is_block_element = BLOCK_ELEMENTS[tag_name]
    true
  end

  # handle invalid tags: escape the leading '<' so the tag is read as text
  def invalid_tag
    @html = '&lt;' + @html.slice(1, @html.size - 1)
  end

  # update all variables and make @html shorter
  # - param type => new @node_type
  # - param pos  => which position to cut at
  def set_node(type, pos)
    if @node_type == 'tag'
      # the previous node was a tag: clear the tag specific state
      # (tags themselves are set up in parse_tag, never here)
      @tag_name = nil
      @tag_attributes = nil
      @is_start_tag = false
      @is_empty_tag = false
      @is_block_element = false
    end
    @node_type = type
    @node = @html[0, pos]
    @html = @html[pos, @html.size - pos]
  end

  # check if @html begins with a specific string
  def match?(str)
    @html.slice(0, str.size) == str
  end

  # truncate whitespaces: every run of whitespace in the current node becomes
  # a single blank, except inside preformatted tags
  def handle_whitespaces
    return if @keep_whitespace > 0
    @node.gsub!(/\s+/, ' ')
  end

  # check if a string is a valid numeric value
  def is_numeric?(val)
    Float(val)
    true
  rescue ArgumentError, TypeError
    false
  end

end # end class ParseHTML