htmltools 1.10

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,140 @@
1
+ #!/usr/bin/ruby
2
+ # This is an HTML parser that builds an element tree for further
3
+ # processing. Attributes and data are also stored.
4
+ #
5
+ # Typical usage is:
6
+ # parser = HTMLTree::Parser.new(false, false)
7
+ # parser.parse_file_named('whatever.html')
8
+ # # then you have the tree built..
9
+ # parser.tree.dump
10
+ #
11
+ # Copyright:: Copyright (C) 2003, Johannes Brodwall <johannes@brodwall.com>,
12
+ # Copyright (C) 2002, Ned Konz <ned@bike-nomad.com>
13
+ # License:: Ruby's
14
+ # CVS ID:: $Id: tree.rb,v 1.2 2004/09/24 23:28:55 jhannes Exp $
15
+
16
+ require 'html/tags'
17
+ require 'html/stparser'
18
+ require 'html/element'
19
+
20
+ # This is a tree building HTML parser.
21
module HTMLTree
  # Tree-building parser. It receives the callbacks issued by
  # HTML::StackingParser and turns them into HTMLTree nodes hanging off a
  # Document node.
  class Parser < HTML::StackingParser

    # verbose:: when true, warnings about unknown tags/entities/characters
    #           and about missing or extra end tags go to $stderr.
    # strip_white:: when true, all non-essential whitespace is removed. Some
    #               browser bugs may make this change how the HTML renders,
    #               even though the standard says it should not.
    def initialize(verbose=false, strip_white=true)
      super
      reset
    end

    # Prepare the parser for a new document: after the superclass reset,
    # a fresh Document becomes both the root and the current node.
    def reset
      super
      fresh_document = Document.new
      @rootNode = fresh_document
      @currentNode = fresh_document
    end

    # The tree that was built: the node representing the whole document.
    # The \<html> node is a child of it.
    def tree
      @rootNode
    end

    # The <html> node, if any.
    def html
      @rootNode.html_node
    end

    # Everything below is internal; subclass with care.
    private

    # Create an Element for +tag+ under the current node, give it every
    # entry of +attrs+ (each entry is splatted into add_attribute), and
    # return the new element.
    def add_child_to_current(tag, attrs)
      element = Element.new(@currentNode, tag)
      attrs.each do |name_and_values|
        element.add_attribute(*name_and_values)
      end
      element
    end

    # --- parser callbacks ---

    # Start tag: add a child to the current node and descend into it.
    def handle_start_tag(tag, attrs)
      @currentNode = add_child_to_current(tag, attrs)
      @rootNode ||= @currentNode
      @currentNode
    end

    # End tag: climb back to the parent of the current node.
    def handle_end_tag(tag)
      @currentNode = @currentNode.parent
    end

    # Empty tag: add a child to the current node without descending.
    def handle_empty_tag(tag, attrs)
      add_child_to_current(tag, attrs)
    end

    # Unknown tag: let the superclass handle it first, then treat it as a
    # start tag, on the assumption that it has an end tag.
    def handle_unknown_tag(tag, attrs)
      super
      handle_start_tag(tag, attrs)
    end

    # Missing end tag: let the superclass handle it first, then climb to
    # the parent exactly as for a real end tag.
    def handle_missing_end_tag(tag)
      super
      handle_end_tag(tag)
    end

    # Extra end tag: nothing beyond the superclass behaviour.
    def handle_extra_end_tag(tag)
      super
    end

    # Character data becomes a Data child of the current node.
    def handle_cdata(data)
      Data.new(@currentNode, data)
    end

    # Script content is stored the same way as character data.
    def handle_script(data)
      Data.new(@currentNode, data)
    end

    # An unknown numeric character reference is kept verbatim as text.
    def handle_unknown_character(name)
      Data.new(@currentNode, "&##{name};")
    end

    # An unknown named entity is kept verbatim as text.
    def handle_unknown_entity(name)
      Data.new(@currentNode, "&#{name};")
    end

    # Comment: the superclass runs first (the original note says this is
    # what strips whitespace), then a Comment child is added.
    def handle_comment(data)
      super
      Comment.new(@currentNode, data)
    end

    # Special markup becomes a Special node; when there is no current node
    # to attach it to, that is reported on $stderr.
    def handle_special(data)
      special = HTMLTree::Special.new(@currentNode, data)
      return nil if @currentNode
      $stderr.print('special ', special, ' discarded')
    end

  end
end
127
+
128
# When run directly: parse the file named on the command line (default
# 'ebay.html'), write the resulting tree to 'xx.html', then dump the tree.
if __FILE__ == $0
  $stdout.sync = true

  class TestStackingParser < HTMLTree::Parser
    $DEBUG = false
    input_name = ARGV[0] || 'ebay.html'
    parser = TestStackingParser.new(true, false)
    parser.parse_file_named(input_name)
    File.open('xx.html', 'w') do |out|
      parser.tree.write(out)
    end
    parser.tree.dump
  end
end
@@ -0,0 +1,173 @@
1
+ #!/usr/bin/ruby
2
+ # This is a tree building HTML parser that makes an XML structure
3
+ # using the format of REXML.
4
+ #
5
+ # Typical usage is:
6
+ # parser = HTMLTree::XMLParser.new(false, false)
7
+ # parser.parse_file_named('whatever.html')
8
+ # # then you have the tree built..
9
+ # parser.document # is a REXML::Document
10
+ #
11
+ # Copyright:: Copyright (C) 2003, Johannes Brodwall <johannes@brodwall.com>,
12
+ # Copyright (C) 2002, Ned Konz <ned@bike-nomad.com>
13
+ # License:: Ruby's
14
+ # CVS ID:: $Id: xmltree.rb,v 1.2 2004/09/24 23:28:55 jhannes Exp $
15
+
16
+ require 'html/tags'
17
+ require 'html/stparser'
18
+ require 'rexml/element'
19
+ require 'rexml/document'
20
+
21
+ # REXML::Child
22
+ # REXML::XMLDecl
23
+ # REXML::Instruction
24
+ # REXML::Text
25
+ # REXML::Comment
26
+ # REXML::Entity
27
+ # REXML::Parent
28
+ # REXML::Element (+REXML::Namespace)
29
+ # REXML::Document
30
+ # REXML::DocType
31
+ #
32
+ # This is a tree building HTML parser that makes XML.
33
module HTMLTree
  # Tree-building HTML parser whose output is a REXML::Document. It turns
  # the callbacks issued by HTML::StackingParser into REXML nodes.
  class XMLParser < HTML::StackingParser

    # verbose:: if true, will warn to $stderr on unknown
    #           tags/entities/characters, as well as missing end tags and
    #           extra end tags.
    # strip_white:: if true, remove all non-essential whitespace. Note
    #               that there are browser bugs that may cause this to
    #               change the appearance of HTML (even though it shouldn't
    #               by the standard).
    def initialize(verbose=false, strip_white=true)
      super
      reset
    end

    # Reset this parser so that it can parse a new document: a fresh
    # REXML::Document becomes both the root and the current node.
    def reset
      super
      @rootNode = @currentNode = REXML::Document.new()
    end

    # Return the document that was built. This will be a REXML::Document
    # that represents the whole document. The \<html> node is a child of
    # this.
    def document
      @rootNode
    end

    # Alias for #document.
    def tree
      document
    end

    # Return the root element of the document, if any.
    def root
      @rootNode.root()
    end

    # Return the <html> element, or nil when there is none.
    #
    # The document's root element is returned when it is the <html>
    # element itself; otherwise the root's children are searched. An
    # empty document (no root element) yields nil instead of raising.
    def html
      top = @rootNode.root
      return nil unless top
      return top if top.name.downcase == 'html'
      top.elements['html']
    end

    # no user-serviceable parts inside...
    # though you can subclass carefully.
    private

    # Create a REXML::Element named +tag+ under the current node, copy
    # each [name, value] pair of +attrs+ onto it, and return it.
    def add_child_to_current(tag, attrs)
      node = REXML::Element.new(tag, @currentNode)
      attrs.each { |a| node.attributes[a[0]] = a[1] }
      node
    end

    # callbacks

    # add a child to the current node and descend
    def handle_start_tag(tag, attrs)
      node = add_child_to_current(tag, attrs)
      @rootNode = node unless @rootNode
      @currentNode = node
    end

    # go up to parent
    def handle_end_tag(tag)
      @currentNode = @currentNode.parent
    end

    # add a child to the current node
    def handle_empty_tag(tag, attrs)
      add_child_to_current(tag, attrs)
    end

    # Add a child to the current node and descend.
    # Assume that the unknown tag has an end tag.
    def handle_unknown_tag(tag, attrs)
      super
      handle_start_tag(tag, attrs)
    end

    # go up to parent
    def handle_missing_end_tag(tag)
      super
      handle_end_tag(tag)
    end

    # ignore (nothing beyond the superclass behaviour)
    def handle_extra_end_tag(tag)
      super
    end

    # Character data becomes a REXML::Text under the current node.
    # Whitespace is respected unless @stripWhitespace is set (presumably
    # by the superclass from strip_white -- confirm in html/stparser).
    def handle_cdata(data)
      node = REXML::Text.new(data, !@stripWhitespace, @currentNode)
      node.parent = @currentNode
    end

    # Script content is stored as a REXML::Comment under the current node.
    def handle_script(data)
      node = REXML::Comment.new(data, @currentNode)
      node.parent = @currentNode
    end

    # An unknown numeric character reference is kept verbatim as raw text.
    def handle_unknown_character(name)
      node = REXML::Text.new("&##{name};", false, @currentNode)
      node.raw = true
      node.parent = @currentNode
      node
    end

    # An unknown named entity is kept verbatim as raw text.
    def handle_unknown_entity(name)
      node = REXML::Text.new("&#{name};", false, @currentNode)
      node.raw = true
      node.parent = @currentNode
      node
    end

    # Comment: the superclass runs first (the original note says this
    # strips whitespace), then a REXML::Comment is added.
    def handle_comment(data)
      super
      node = REXML::Comment.new(data, @currentNode)
      node.parent = @currentNode
      node
    end

    # Special markup is re-wrapped in angle brackets and parsed by REXML
    # as a DocType under the current node.
    def handle_special(data)
      source = REXML::SourceFactory::create_from( "<#{data}>" )
      node = REXML::DocType.new(source, @currentNode)
      node.parent = @currentNode
      node
    end

  end
end
161
+
162
# When run directly: parse the file named on the command line (default
# 'ebay.html') and write the resulting REXML document to 'xx.html'.
if __FILE__ == $0
  $stdout.sync = true

  class TestStackingParser < HTMLTree::XMLParser
    $DEBUG = false
    input_name = ARGV[0] || 'ebay.html'
    parser = TestStackingParser.new(true, false)
    parser.parse_file_named(input_name)
    File.open('xx.html', 'w') do |out|
      parser.document.write(out)
    end
  end
end
@@ -0,0 +1,72 @@
1
+ # This module adapts REXML's XPath functionality for use with
2
+ # <tt>HTMLTree::Parser</tt>.
3
+ #
4
+ # Copyright:: Copyright (C) 2003, Johannes Brodwall <johannes@brodwall.com>,
5
+ # Copyright (C) 2002, Ned Konz <ned@bike-nomad.com>
6
+ # License:: Same as Ruby's
7
+ # CVS ID: $Id: xpath.rb,v 1.3 2004/09/24 23:28:55 jhannes Exp $
8
+
9
+ require 'html/tree'
10
+ require 'rexml/element'
11
+ require 'rexml/document'
12
+ require 'rexml/xpath'
13
+
14
module HTMLTree

  # Mixin that runs XPath queries against the REXML conversion of the
  # including object (which must respond to #as_rexml_document).
  module TreeElement
    # Given the XPath path, return an Array of matching sub-elements of
    # the REXML tree.
    def rexml_match(path)
      REXML::XPath.match(as_rexml_document, path)
    end
  end

  class Element
    # Convert this element and its children into a REXML::Element, ready
    # to use REXML::XPath on.
    #
    # rparent:: REXML parent the new element is attached to (may be nil).
    # context:: REXML context hash, passed on to every child conversion.
    #
    # Note that this caches the tree; further changes to my tree will
    # not be reflected in subsequent calls, and a cached result is
    # returned as is, without being re-attached to +rparent+.
    def as_rexml_document(rparent = nil, context = {})
      return @_rexml_tree if @_rexml_tree
      node = REXML::Element.new(tag, rparent, context)
      attribute_order.each do |attr|
        node.add_attribute(attr, attribute(attr).to_s)
      end
      # Each child attaches itself to +node+; the return value is unused.
      children.each { |child| child.as_rexml_document(node, context) }
      @_rexml_tree = node
    end
  end

  class Data
    # Append my content as text to +rparent+. With no parent (the default)
    # a detached REXML::Text is returned instead of failing on nil.
    def as_rexml_document(rparent = nil, context = {})
      return REXML::Text.new(@_content) unless rparent
      rparent.add_text(@_content)
    end
  end

  class Comment
    # Build a REXML::Comment from my content, attached to +rparent+
    # (the REXML parent, not this node's own HTMLTree parent).
    def as_rexml_document(rparent = nil, context = {})
      REXML::Comment.new(@_content, rparent)
    end
  end

  class Special
    # Build a REXML::Instruction whose target is my content and attach it
    # to +rparent+ when one is given. REXML::Instruction.new accepts only
    # (target, content), so no whitespace flag or parent is passed to it.
    def as_rexml_document(rparent = nil, context = {})
      node = REXML::Instruction.new(@_content)
      rparent.add(node) if rparent
      node
    end
  end

  class Document
    # Build a REXML::Document containing the conversion of my <html>
    # node. The result is cached: the <html> element caches its own
    # conversion, so a rebuilt document would otherwise come back empty.
    # A document without an <html> node yields an empty REXML::Document.
    def as_rexml_document(context = {})
      return @_rexml_tree if @_rexml_tree
      node = REXML::Document.new(nil, context)
      # TODO: add DocType
      # add <HTML> node
      top = html_node
      top.as_rexml_document(node, context) if top
      @_rexml_tree = node
    end
  end

end
@@ -0,0 +1,5 @@
1
+ require 'test/tc_html-element.rb'
2
+ require 'test/tc_html-tree.rb'
3
+ require 'test/tc_stacking-parser.rb'
4
+ require 'test/tc_xpath.rb'
5
+
@@ -0,0 +1,73 @@
1
+ #!/usr/bin/ruby
2
+ require 'html/element'
3
+ require 'test/unit'
4
+
5
module HTMLTree
  class Element
    # Compact rendering for test failure messages: the tag in angle
    # brackets, then the inspected attributes and children.
    def inspect
      "<#{@_tag}> #{attributes.inspect}#{children.inspect}"
    end
  end
end
12
+
13
# Unit tests for HTMLTree::Element: construction, attribute handling and
# parent/child bookkeeping.
class HTMLElementTestCase < Test::Unit::TestCase
  # Every test starts with a fresh element built with no arguments.
  def setup
    @e = HTMLTree::Element.new
  end

  attr_reader :e

  # An element built with no arguments has no tag, attributes or children.
  def test_empty
    assert_nil(e.tag)
    assert_equal({}, e.attributes)
    assert_equal([], e.children)
  end

  # The second constructor argument is the tag.
  def test_tag
    e2 = HTMLTree::Element.new(nil, 'sometag')
    assert_equal('sometag', e2.tag)
    assert_equal({}, e2.attributes)
    assert_equal([], e2.children)
  end

  # add_attribute accumulates values (a single value reads back as a
  # scalar, several as an array); []= replaces whatever was there.
  def test_attribute
    e.add_attribute('a', 'b')
    assert_equal('b', e.attribute('a'))
    assert_equal('b', e['a'])
    e.add_attribute('a', 'c')
    assert_equal(['b', 'c'], e.attribute('a'))
    assert_equal(['b', 'c'], e['a'])
    e.add_attribute('a', 'd', 'e')
    assert_equal(['b', 'c', 'd', 'e'], e.attribute('a'))
    e.add_attribute('b', ['c', 'd'])
    assert_equal(['c', 'd'], e.attribute('b'))
    e.add_attribute('b', ['e', 'f'])
    assert_equal(['c', 'd', 'e', 'f'], e.attribute('b'))
    e['b'] = 'aaa'
    assert_equal('aaa', e.attribute('b'))
  end

  # Building an element with a parent registers it as a child, in
  # creation order; remove_child detaches it again.
  def test_parent
    top = HTMLTree::Element.new(nil, 'p')

    c = HTMLTree::Element.new(top, 'c')
    assert_nil(top.parent)
    assert_equal([c], top.children)
    assert_equal(top, c.parent)

    d = HTMLTree::Element.new(top, 'd')
    assert_equal([c, d], top.children)
    assert_equal(top, d.parent)

    top.remove_child(d)
    assert_equal([c], top.children)
    assert_equal(top, c.parent)
    assert_nil(d.parent)
  end

  # TODO: this only builds a small tree and asserts nothing yet; add
  # assertions once the iteration API to be covered is decided.
  def test_iterator
    top = HTMLTree::Element.new(nil, 'p')
    HTMLTree::Element.new(top, 'c')
    HTMLTree::Element.new(top, 'd')
  end
end