htmltools 1.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,140 @@
1
+ #!/usr/bin/ruby
2
+ # This is an HTML parser that builds an element tree for further
3
+ # processing. Attributes and data are also stored.
4
+ #
5
+ # Typical usage is:
6
+ # parser = HTMLTree::Parser.new(false, false)
7
+ # parser.parse_file_named('whatever.html')
8
+ # # then you have the tree built..
9
+ # parser.tree.dump
10
+ #
11
+ # Copyright:: Copyright (C) 2003, Johannes Brodwall <johannes@brodwall.com>,
12
+ # Copyright (C) 2002, Ned Konz <ned@bike-nomad.com>
13
+ # License:: Ruby's
14
+ # CVS ID:: $Id: tree.rb,v 1.2 2004/09/24 23:28:55 jhannes Exp $
15
+
16
+ require 'html/tags'
17
+ require 'html/stparser'
18
+ require 'html/element'
19
+
20
+ # This is a tree building HTML parser.
21
module HTMLTree
  # Tree-building parser: each parser callback appends a node
  # (Element, Data, Comment or Special) beneath the node that is
  # currently open, producing a tree rooted at an HTMLTree::Document.
  class Parser < HTML::StackingParser

    # verbose:: passed through to the superclass; per the original
    #   notes, when true the parser warns on $stderr about unknown
    #   tags/entities/characters and about missing or extra end tags.
    # strip_white:: passed through to the superclass; when true,
    #   non-essential whitespace is removed. Some browsers render the
    #   stripped HTML differently even though the standard says they
    #   should not.
    def initialize(verbose=false, strip_white=true)
      super
      reset
    end

    # Forget any previously built tree and start over with an empty
    # Document that is both the root and the current insertion point.
    def reset
      super
      @rootNode = @currentNode = Document.new
    end

    # The tree built so far: the node that represents the whole
    # document. The \<html> node is a child of it.
    def tree
      @rootNode
    end

    # The <html> node as reported by the root node, if any.
    def html
      @rootNode.html_node
    end

    # Everything below is parser plumbing. Subclass with care.
    private

    # Create an Element named +tag+ under the current node and copy the
    # attributes onto it. Each entry of +attrs+ is splatted into
    # add_attribute. Returns the new element.
    def add_child_to_current(tag, attrs)
      element = Element.new(@currentNode, tag)
      attrs.each do |attr_args|
        element.add_attribute(*attr_args)
      end
      element
    end

    # callbacks

    # Start tag: append a child element and make it the current node.
    def handle_start_tag(tag, attrs)
      opened = add_child_to_current(tag, attrs)
      @rootNode ||= opened
      @currentNode = opened
    end

    # End tag: the parent becomes the current node again.
    def handle_end_tag(tag)
      @currentNode = @currentNode.parent
    end

    # Empty tag: append a child element but stay on the current node.
    def handle_empty_tag(tag, attrs)
      add_child_to_current(tag, attrs)
    end

    # Unknown tag: let the superclass handle it first, then treat it as
    # an ordinary start tag (i.e. assume a matching end tag follows).
    def handle_unknown_tag(tag, attrs)
      super
      handle_start_tag(tag, attrs)
    end

    # Missing end tag: let the superclass handle it first, then close
    # the element as if the end tag had been present.
    def handle_missing_end_tag(tag)
      super
      handle_end_tag(tag)
    end

    # Extra end tag: nothing to do for the tree; defer to the superclass.
    def handle_extra_end_tag(tag)
      super
    end

    # Character data becomes a Data node under the current node.
    def handle_cdata(data)
      Data.new(@currentNode, data)
    end

    # Script content is stored as a Data node, just like character data.
    def handle_script(data)
      Data.new(@currentNode, data)
    end

    # An unknown character reference is kept verbatim as "&#name;".
    def handle_unknown_character(name)
      Data.new(@currentNode, "&##{name};")
    end

    # An unknown entity reference is kept verbatim as "&name;".
    def handle_unknown_entity(name)
      Data.new(@currentNode, "&#{name};")
    end

    # Comments become Comment nodes. The superclass runs first (the
    # original note says this is what strips the whitespace).
    def handle_comment(data)
      super
      Comment.new(@currentNode, data)
    end

    # Special markup becomes a Special node. If there is no current
    # node to hold it, report on $stderr that it was discarded.
    def handle_special(data)
      special = HTMLTree::Special.new(@currentNode, data)
      return if @currentNode
      $stderr.print('special ', special, ' discarded')
    end

  end
end
127
+
128
# Manual smoke test, executed only when this file is run directly:
# parse the file named by the first command-line argument (default
# 'ebay.html'), write the resulting tree to 'xx.html', then dump it.
if __FILE__ == $0
  $stdout.sync = true

  class TestStackingParser < HTMLTree::Parser
    $DEBUG = false
    parser = TestStackingParser.new(true, false)
    parser.parse_file_named(ARGV[0] || 'ebay.html')
    File.open('xx.html', 'w') do |out|
      parser.tree.write(out)
    end
    parser.tree.dump
  end
end
@@ -0,0 +1,173 @@
1
+ #!/usr/bin/ruby
2
+ # This is a tree building HTML parser that makes an XML structure
3
+ # using the format of REXML.
4
+ #
5
+ # Typical usage is:
6
+ # parser = HTMLTree::XMLParser.new(false, false)
7
+ # parser.parse_file_named('whatever.html')
8
+ # # then you have the tree built..
9
+ # parser.document # is a REXML::Document
10
+ #
11
+ # Copyright:: Copyright (C) 2003, Johannes Brodwall <johannes@brodwall.com>,
12
+ # Copyright (C) 2002, Ned Konz <ned@bike-nomad.com>
13
+ # License:: Ruby's
14
+ # CVS ID:: $Id: xmltree.rb,v 1.2 2004/09/24 23:28:55 jhannes Exp $
15
+
16
+ require 'html/tags'
17
+ require 'html/stparser'
18
+ require 'rexml/element'
19
+ require 'rexml/document'
20
+
21
+ # REXML::Child
22
+ # REXML::XMLDecl
23
+ # REXML::Instruction
24
+ # REXML::Text
25
+ # REXML::Comment
26
+ # REXML::Entity
27
+ # REXML::Parent
28
+ # REXML::Element (+REXML::Namespace)
29
+ # REXML::Document
30
+ # REXML::DocType
31
+ #
32
+ # This is a tree building HTML parser that makes XML.
33
module HTMLTree
  # Tree-building parser whose output is a REXML::Document: each parser
  # callback appends a REXML node beneath the node currently open.
  class XMLParser < HTML::StackingParser

    # verbose:: passed through to the superclass; per the original
    #   notes, when true the parser warns on $stderr about unknown
    #   tags/entities/characters and about missing or extra end tags.
    # strip_white:: passed through to the superclass; when true,
    #   non-essential whitespace is removed. Some browsers render the
    #   stripped HTML differently even though the standard says they
    #   should not.
    def initialize(verbose=false, strip_white=true)
      super
      reset
    end

    # Reset this parser so that it can parse a new document: start
    # again from an empty REXML::Document that is both the root and the
    # current insertion point.
    def reset
      super
      @rootNode = @currentNode = REXML::Document.new()
    end

    # Return the document that was built. This will be a
    # REXML::Document that represents the whole document. The \<html>
    # node is a child of this.
    def document
      @rootNode
    end

    # Alias for #document, mirroring HTMLTree::Parser#tree.
    def tree
      document
    end

    # Return the root element of the document, if any.
    def root
      @rootNode.root()
    end

    # Return the <html> node, or nil if there is none.
    #
    # The document's own top-level <html> element is looked up first.
    # The previous implementation only searched the *children* of the
    # root element, so it returned nil for an ordinary document whose
    # root is <html>, and raised NoMethodError on an empty document.
    # That child lookup is kept as a fallback so any caller that relied
    # on it still gets the same node.
    def html
      top_level = @rootNode.elements['html']
      return top_level if top_level
      root_element = @rootNode.root
      root_element && root_element.elements['html']
    end

    # no user-serviceable parts inside...
    # though you can subclass carefully.
    private

    # Create a REXML::Element named +tag+ under the current node and
    # copy the attributes onto it. Each entry of +attrs+ is indexed as
    # a [name, value] pair. Returns the new element.
    def add_child_to_current(tag, attrs)
      node = REXML::Element.new(tag, @currentNode)
      attrs.each { |a| node.attributes[a[0]] = a[1] }
      node
    end

    # callbacks

    # add a child to the current node and descend
    def handle_start_tag(tag, attrs)
      node = add_child_to_current(tag, attrs)
      @rootNode = node unless @rootNode
      @currentNode = node
    end

    # go up to parent
    def handle_end_tag(tag)
      @currentNode = @currentNode.parent
    end

    # add a child to the current node
    def handle_empty_tag(tag, attrs)
      add_child_to_current(tag, attrs)
    end

    # Add a child to the current node and descend.
    # Assume that the unknown tag has an end tag.
    def handle_unknown_tag(tag, attrs)
      super
      handle_start_tag(tag, attrs)
    end

    # go up to parent, as if the end tag had been present
    def handle_missing_end_tag(tag)
      super
      handle_end_tag(tag)
    end

    # ignore: nothing to do for the tree, defer to the superclass
    def handle_extra_end_tag(tag)
      super
    end

    # Character data becomes a REXML::Text under the current node.
    # Whitespace is respected unless @stripWhitespace is set
    # (presumably by the superclass from strip_white -- confirm).
    def handle_cdata(data)
      node = REXML::Text.new(data, !@stripWhitespace, @currentNode)
      node.parent = @currentNode
    end

    # Script content is stored as a REXML::Comment under the current node.
    def handle_script(data)
      node = REXML::Comment.new(data, @currentNode)
      node.parent = @currentNode
    end

    # An unknown character reference is kept verbatim as "&#name;" in a
    # raw text node.
    def handle_unknown_character(name)
      node = REXML::Text.new("&##{name};", false, @currentNode)
      node.raw = true
      node.parent = @currentNode
      node
    end

    # An unknown entity reference is kept verbatim as "&name;" in a raw
    # text node.
    def handle_unknown_entity(name)
      node = REXML::Text.new("&#{name};", false, @currentNode)
      node.raw = true
      node.parent = @currentNode
      node
    end

    # Comments become REXML::Comment nodes. The superclass runs first
    # (the original note says this is what strips the whitespace).
    def handle_comment(data)
      super # strip white
      node = REXML::Comment.new(data, @currentNode)
      node.parent = @currentNode
      node
    end

    # Special markup is re-wrapped in angle brackets and handed to
    # REXML as the source of a DocType node under the current node.
    def handle_special(data)
      source = REXML::SourceFactory::create_from( "<#{data}>" )
      node = REXML::DocType.new(source, @currentNode)
      node.parent = @currentNode
      node
    end

  end
end
161
+
162
# Manual smoke test, executed only when this file is run directly:
# parse the file named by the first command-line argument (default
# 'ebay.html') and write the resulting REXML document to 'xx.html'.
if __FILE__ == $0
  $stdout.sync = true

  class TestStackingParser < HTMLTree::XMLParser
    $DEBUG = false
    parser = TestStackingParser.new(true, false)
    parser.parse_file_named(ARGV[0] || 'ebay.html')
    File.open('xx.html', 'w') do |out|
      parser.document.write(out)
    end
  end
end
@@ -0,0 +1,72 @@
1
+ # This module adapts REXML's XPath functionality for use with
2
+ # <tt>HTMLTree::Parser</tt>.
3
+ #
4
+ # Copyright:: Copyright (C) 2003, Johannes Brodwall <johannes@brodwall.com>,
5
+ # Copyright (C) 2002, Ned Konz <ned@bike-nomad.com>
6
+ # License:: Same as Ruby's
7
+ # CVS ID:: $Id: xpath.rb,v 1.3 2004/09/24 23:28:55 jhannes Exp $
8
+
9
+ require 'html/tree'
10
+ require 'rexml/element'
11
+ require 'rexml/document'
12
+ require 'rexml/xpath'
13
+
14
module HTMLTree

  # Mixin that adds XPath querying on top of #as_rexml_document.
  module TreeElement
    # Given the XPath path, return an Array of matching sub-elements of
    # the REXML tree produced by #as_rexml_document.
    def rexml_match(path)
      REXML::XPath.match(as_rexml_document, path)
    end
  end

  class Element
    # Convert this element and everything below it into a
    # REXML::Element, ready to use REXML::XPath on.
    #
    # rparent:: REXML node the new element is created under (may be nil).
    # context:: REXML context hash handed to every node that is created.
    #
    # The result is cached in @_rexml_tree. Later changes to this tree
    # are not reflected in subsequent calls, and a cached result is
    # returned as-is, without being attached to a new +rparent+.
    def as_rexml_document(rparent = nil, context = {})
      return @_rexml_tree if @_rexml_tree
      node = REXML::Element.new(tag, rparent, context)
      attribute_order.each do |attr|
        node.add_attribute(attr, attribute(attr).to_s)
      end
      # Children attach themselves to +node+; their return values are
      # not needed here.
      children.each { |child| child.as_rexml_document(node, context) }
      @_rexml_tree = node
    end
  end

  class Data
    # Append this node's text to +rparent+ and return what
    # REXML's add_text returns. Without an +rparent+ there is nothing
    # to append to, so a detached REXML::Text is returned instead of
    # failing on nil.
    def as_rexml_document(rparent = nil, context = {})
      return REXML::Text.new(@_content.to_s) unless rparent
      rparent.add_text(@_content)
    end
  end

  class Comment
    # Create a REXML::Comment holding this node's content under
    # +rparent+ and return it.
    #
    # The comment must be given the REXML parent (+rparent+), not this
    # node's own HTMLTree parent: REXML only attaches a child to a
    # REXML parent, so passing anything else drops the comment from
    # the generated tree.
    def as_rexml_document(rparent = nil, context = {})
      REXML::Comment.new(@_content, rparent)
    end
  end

  class Special
    # Represent this node as a REXML::Instruction whose target is the
    # node's content, attach it to +rparent+ (when given) and return it.
    #
    # REXML::Instruction.new accepts (target, content) only, so the
    # earlier three-argument call (content, whitespace flag, parent)
    # could not succeed; the instruction is now attached explicitly.
    # NOTE(review): an Instruction is kept as the target type because
    # that is what the original code constructed -- confirm it is the
    # intended REXML representation for special markup.
    def as_rexml_document(rparent = nil, context = {})
      node = REXML::Instruction.new(@_content.to_s)
      rparent.add(node) if rparent
      node
    end
  end

  class Document
    # Build a fresh REXML::Document containing the converted <html>
    # node and return it. A document without an <html> node yields an
    # empty REXML::Document rather than an error.
    #
    # context:: REXML context hash handed to every node that is created.
    def as_rexml_document(context = {})
      node = REXML::Document.new(nil, context)
      # add DocType
      # add <HTML> node
      html = html_node
      html.as_rexml_document(node, context) if html
      node
    end
  end

end
@@ -0,0 +1,5 @@
1
+ require 'test/tc_html-element.rb'
2
+ require 'test/tc_html-tree.rb'
3
+ require 'test/tc_stacking-parser.rb'
4
+ require 'test/tc_xpath.rb'
5
+
@@ -0,0 +1,73 @@
1
+ #!/usr/bin/ruby
2
+ require 'html/element'
3
+ require 'test/unit'
4
+
5
module HTMLTree
  class Element
    # Compact rendering used in test failure messages: the tag in angle
    # brackets, a space, then the inspected attributes immediately
    # followed by the inspected children.
    def inspect
      "<#{@_tag}> #{attributes.inspect}#{children.inspect}"
    end
  end
end
12
+
13
# Unit tests for HTMLTree::Element: construction, attribute handling
# and parent/child bookkeeping.
class HTMLElementTestCase < Test::Unit::TestCase
  attr_reader :e

  # Every test starts from a fresh element built with no arguments.
  def setup
    @e = HTMLTree::Element.new
  end

  # An element built with no arguments has no tag, attributes or children.
  def test_empty
    assert_nil(e.tag)
    assert_equal({}, e.attributes)
    assert_equal([], e.children)
  end

  # The second constructor argument becomes the tag.
  def test_tag
    tagged = HTMLTree::Element.new(nil, 'sometag')
    assert_equal('sometag', tagged.tag)
    assert_equal({}, tagged.attributes)
    assert_equal([], tagged.children)
  end

  # add_attribute accumulates values for a repeated name (one value
  # reads back as a scalar, several as an array); []= replaces them.
  def test_attribute
    e.add_attribute('a', 'b')
    assert_equal('b', e.attribute('a'))
    assert_equal('b', e['a'])

    e.add_attribute('a', 'c')
    assert_equal(%w[b c], e.attribute('a'))
    assert_equal(%w[b c], e['a'])

    e.add_attribute('a', 'd', 'e')
    assert_equal(%w[b c d e], e.attribute('a'))

    e.add_attribute('b', %w[c d])
    assert_equal(%w[c d], e.attribute('b'))

    e.add_attribute('b', %w[e f])
    assert_equal(%w[c d e f], e.attribute('b'))

    e['b'] = 'aaa'
    assert_equal('aaa', e.attribute('b'))
  end

  # Creating an element with a parent registers it as a child of that
  # parent; remove_child undoes the link on both sides.
  def test_parent
    top = HTMLTree::Element.new(nil, 'p')

    first = HTMLTree::Element.new(top, 'c')
    assert_nil(top.parent)
    assert_equal([first], top.children)
    assert_equal(top, first.parent)

    second = HTMLTree::Element.new(top, 'd')
    assert_equal([first, second], top.children)
    assert_equal(top, second.parent)

    top.remove_child(second)
    assert_equal([first], top.children)
    assert_equal(top, first.parent)
    assert_nil(second.parent)
  end

  # Only builds a small tree; it makes no assertions yet.
  def test_iterator
    top = HTMLTree::Element.new(nil, 'p')
    first = HTMLTree::Element.new(top, 'c')
    second = HTMLTree::Element.new(top, 'd')
  end
end