htmltools 1.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/INSTALL +58 -0
- data/README +162 -0
- data/demo/degolive.rb +89 -0
- data/demo/ebaySearch.rb +93 -0
- data/demo/xpath.rb +62 -0
- data/lib/html/element.rb +323 -0
- data/lib/html/rexml-nodepath.rb +49 -0
- data/lib/html/sgml-parser.rb +372 -0
- data/lib/html/stparser.rb +280 -0
- data/lib/html/tags.rb +288 -0
- data/lib/html/tree.rb +140 -0
- data/lib/html/xmltree.rb +173 -0
- data/lib/html/xpath.rb +72 -0
- data/test/suite.rb +5 -0
- data/test/tc_html-element.rb +73 -0
- data/test/tc_html-tree.rb +201 -0
- data/test/tc_source-parser.rb +160 -0
- data/test/tc_stacking-parser.rb +196 -0
- data/test/tc_xpath.rb +87 -0
- metadata +58 -0
data/lib/html/tree.rb
ADDED
@@ -0,0 +1,140 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
# This is an HTML parser that builds an element tree for further
|
3
|
+
# processing. Attributes and data are also stored.
|
4
|
+
#
|
5
|
+
# Typical usage is:
|
6
|
+
# parser = HTMLTree::Parser.new(false, false)
|
7
|
+
# parser.parse_file_named('whatever.html')
|
8
|
+
# # then you have the tree built..
|
9
|
+
# parser.tree.dump
|
10
|
+
#
|
11
|
+
# Copyright:: Copyright (C) 2003, Johannes Brodwall <johannes@brodwall.com>,
|
12
|
+
# Copyright (C) 2002, Ned Konz <ned@bike-nomad.com>
|
13
|
+
# License:: Ruby's
|
14
|
+
# CVS ID:: $Id: tree.rb,v 1.2 2004/09/24 23:28:55 jhannes Exp $
|
15
|
+
|
16
|
+
require 'html/tags'
|
17
|
+
require 'html/stparser'
|
18
|
+
require 'html/element'
|
19
|
+
|
20
|
+
# This is a tree building HTML parser.
|
21
|
+
module HTMLTree
  # Tree-building parser. Each callback issued by HTML::StackingParser
  # creates an HTMLTree node underneath the node currently being filled.
  class Parser < HTML::StackingParser

    # verbose:: if true, will warn to $stderr on unknown
    #   tags/entities/characters, as well as missing end tags and extra end
    #   tags.
    # strip_white:: if true, remove all non-essential whitespace. Note
    #   that there are browser bugs that may cause this to change the
    #   appearance of HTML (even though it shouldn't by the standard).
    def initialize(verbose=false, strip_white=true)
      # Bare super forwards both arguments unchanged to the base parser.
      super
      reset
    end

    # Reset this parser so that it can parse a new document: a fresh
    # Document becomes both the root and the current insertion point.
    def reset
      super
      @rootNode = Document.new
      @currentNode = @rootNode
    end

    # Return the tree that was built, i.e. the node that represents the
    # whole document. The \<html> node is a child of this.
    def tree
      @rootNode
    end

    # Return the <html> node, if any (delegates to the root's html_node).
    def html
      @rootNode.html_node
    end

    # no user-serviceable parts inside...
    # though you can subclass carefully.
    private

    # Create an Element for +tag+ under the current node, copy every entry
    # of +attrs+ onto it, and return the new element. Each entry is
    # splatted into add_attribute.
    def add_child_to_current(tag, attrs)
      element = Element.new(@currentNode, tag)
      attrs.each do |pair|
        element.add_attribute(*pair)
      end
      element
    end

    # callbacks

    # Add a child to the current node and descend into it.
    def handle_start_tag(tag, attrs)
      element = add_child_to_current(tag, attrs)
      @rootNode ||= element
      @currentNode = element
    end

    # Go back up to the parent of the current node.
    def handle_end_tag(tag)
      @currentNode = @currentNode.parent
    end

    # Add a child to the current node without descending.
    def handle_empty_tag(tag, attrs)
      add_child_to_current(tag, attrs)
    end

    # Add a child to the current node and descend.
    # Assume that the unknown tag has an end tag.
    def handle_unknown_tag(tag, attrs)
      super
      handle_start_tag(tag, attrs)
    end

    # A missing end tag is treated like a real one: go up to the parent.
    def handle_missing_end_tag(tag)
      super
      handle_end_tag(tag)
    end

    # Extra end tags do not change the tree; only the inherited handling runs.
    def handle_extra_end_tag(tag)
      super
    end

    # Store character data as a Data node under the current node.
    def handle_cdata(data)
      Data.new(@currentNode, data)
    end

    # Store script content as a Data node under the current node.
    def handle_script(data)
      Data.new(@currentNode, data)
    end

    # Keep an unknown character reference in its "&#...;" form.
    def handle_unknown_character(name)
      Data.new(@currentNode, "&##{name};")
    end

    # Keep an unknown entity reference in its "&...;" form.
    def handle_unknown_entity(name)
      Data.new(@currentNode, "&#{name};")
    end

    # Store a comment as a Comment node under the current node.
    def handle_comment(data)
      super # make sure and strip whitespace.
      Comment.new(@currentNode, data)
    end

    # Store a special section as a Special node; if there is no current
    # node to hold it, say so on $stderr.
    def handle_special(data)
      special = HTMLTree::Special.new(@currentNode, data)
      unless @currentNode
        $stderr.print('special ', special, ' discarded')
      end
    end

  end
end
|
127
|
+
|
128
|
+
# Manual smoke test, run only when this file is executed directly:
# parse the file named on the command line (default 'ebay.html'),
# write the resulting tree to 'xx.html', then dump it.
if $0 == __FILE__
  $stdout.sync = true

  class TestStackingParser < HTMLTree::Parser
  end

  $DEBUG = false
  parser = TestStackingParser.new(true, false)
  parser.parse_file_named(ARGV[0] || 'ebay.html')
  File.open('xx.html', 'w') do |out|
    parser.tree.write(out)
  end
  parser.tree.dump
end
|
data/lib/html/xmltree.rb
ADDED
@@ -0,0 +1,173 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
# This is a tree building HTML parser that makes an XML structure
|
3
|
+
# using the format of REXML.
|
4
|
+
#
|
5
|
+
# Typical usage is:
|
6
|
+
# parser = HTMLTree::XMLParser.new(false, false)
|
7
|
+
# parser.parse_file_named('whatever.html')
|
8
|
+
# # then you have the tree built..
|
9
|
+
# parser.document # is a REXML::Document
|
10
|
+
#
|
11
|
+
# Copyright:: Copyright (C) 2003, Johannes Brodwall <johannes@brodwall.com>,
|
12
|
+
# Copyright (C) 2002, Ned Konz <ned@bike-nomad.com>
|
13
|
+
# License:: Ruby's
|
14
|
+
# CVS ID:: $Id: xmltree.rb,v 1.2 2004/09/24 23:28:55 jhannes Exp $
|
15
|
+
|
16
|
+
require 'html/tags'
|
17
|
+
require 'html/stparser'
|
18
|
+
require 'rexml/element'
|
19
|
+
require 'rexml/document'
|
20
|
+
|
21
|
+
# REXML::Child
|
22
|
+
# REXML::XMLDecl
|
23
|
+
# REXML::Instruction
|
24
|
+
# REXML::Text
|
25
|
+
# REXML::Comment
|
26
|
+
# REXML::Entity
|
27
|
+
# REXML::Parent
|
28
|
+
# REXML::Element (+REXML::Namespace)
|
29
|
+
# REXML::Document
|
30
|
+
# REXML::DocType
|
31
|
+
#
|
32
|
+
# This is a tree building HTML parser that makes XML.
|
33
|
+
module HTMLTree
  # Tree-building parser that turns the callbacks issued by
  # HTML::StackingParser into a REXML::Document.
  class XMLParser < HTML::StackingParser

    # verbose:: if true, will warn to $stderr on unknown
    #   tags/entities/characters, as well as missing end tags and extra end
    #   tags.
    # strip_white:: if true, remove all non-essential whitespace. Note
    #   that there are browser bugs that may cause this to change the
    #   appearance of HTML (even though it shouldn't by the standard).
    def initialize(verbose=false, strip_white=true)
      # Bare super forwards both arguments unchanged to the base parser.
      super
      reset
    end

    # Reset this parser so that it can parse a new document: a fresh
    # REXML::Document becomes both the root and the insertion point.
    def reset
      super
      @rootNode = @currentNode = REXML::Document.new()
    end

    # Return the document that was built. This will be an
    # REXML::Document that represents the whole document. The \<html>
    # node is a child of this.
    def document
      @rootNode
    end

    # Same object as #document; mirrors the name used by HTMLTree::Parser.
    def tree
      document
    end

    # Return the root of the document, if any.
    def root
      @rootNode.root
    end

    # Return the <html> node, if any.
    #
    # The lookup starts at the document, not at its root element: the root
    # element normally *is* <html>, so searching the root's children would
    # miss it, and an empty document has no root to search at all.
    def html
      @rootNode.elements['html']
    end

    # no user-serviceable parts inside...
    # though you can subclass carefully.
    private

    # Create a REXML::Element for +tag+ under the current node, copy each
    # [name, value] entry of +attrs+ onto it, and return the new element.
    def add_child_to_current(tag, attrs)
      node = REXML::Element.new(tag, @currentNode)
      attrs.each { |a| node.attributes[a[0]] = a[1] }
      node
    end

    # callbacks

    # Add a child to the current node and descend into it.
    def handle_start_tag(tag, attrs)
      node = add_child_to_current(tag, attrs)
      @rootNode = node unless @rootNode
      @currentNode = node
    end

    # Go back up to the parent of the current node.
    def handle_end_tag(tag)
      @currentNode = @currentNode.parent
    end

    # Add a child to the current node without descending.
    def handle_empty_tag(tag, attrs)
      add_child_to_current(tag, attrs)
    end

    # Add a child to the current node and descend.
    # Assume that the unknown tag has an end tag.
    def handle_unknown_tag(tag, attrs)
      super
      handle_start_tag(tag, attrs)
    end

    # A missing end tag is treated like a real one: go up to the parent.
    def handle_missing_end_tag(tag)
      super
      handle_end_tag(tag)
    end

    # Extra end tags do not change the tree; only the inherited handling runs.
    def handle_extra_end_tag(tag)
      super
    end

    # Store character data as a REXML::Text under the current node.
    # NOTE(review): @stripWhitespace is not assigned in this class;
    # presumably HTML::StackingParser sets it from strip_white -- confirm.
    def handle_cdata(data)
      node = REXML::Text.new(data, !@stripWhitespace, @currentNode)
      node.parent = @currentNode
    end

    # Script content is stored as a REXML::Comment under the current node.
    def handle_script(data)
      node = REXML::Comment.new(data, @currentNode)
      node.parent = @currentNode
    end

    # Keep an unknown character reference in its "&#...;" form, marked raw.
    def handle_unknown_character(name)
      node = REXML::Text.new("&##{name};", false, @currentNode)
      node.raw = true
      node.parent = @currentNode
      node
    end

    # Keep an unknown entity reference in its "&...;" form, marked raw.
    def handle_unknown_entity(name)
      node = REXML::Text.new("&#{name};", false, @currentNode)
      node.raw = true
      node.parent = @currentNode
      node
    end

    # Store a comment as a REXML::Comment under the current node.
    def handle_comment(data)
      super # strip white
      node = REXML::Comment.new(data, @currentNode)
      node.parent = @currentNode
      node
    end

    # Re-wrap the special section in angle brackets and hand it to
    # REXML::DocType as a source, attached under the current node.
    def handle_special(data)
      source = REXML::SourceFactory::create_from( "<#{data}>" )
      node = REXML::DocType.new(source, @currentNode)
      node.parent = @currentNode
      node
    end

  end
end
|
161
|
+
|
162
|
+
# Manual smoke test, run only when this file is executed directly:
# parse the file named on the command line (default 'ebay.html') and
# write the resulting REXML document to 'xx.html'.
if $0 == __FILE__
  $stdout.sync = true

  class TestStackingParser < HTMLTree::XMLParser
  end

  $DEBUG = false
  parser = TestStackingParser.new(true, false)
  parser.parse_file_named(ARGV[0] || 'ebay.html')
  File.open('xx.html', 'w') do |out|
    parser.document.write(out)
  end
end
|
data/lib/html/xpath.rb
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
# This module adapts REXML's XPath functionality for use with
|
2
|
+
# <tt>HTMLTree::Parser</tt>.
|
3
|
+
#
|
4
|
+
# Copyright:: Copyright (C) 2003, Johannes Brodwall <johannes@brodwall.com>,
|
5
|
+
# Copyright (C) 2002, Ned Konz <ned@bike-nomad.com>
|
6
|
+
# License:: Same as Ruby's
|
7
|
+
# CVS ID: $Id: xpath.rb,v 1.3 2004/09/24 23:28:55 jhannes Exp $
|
8
|
+
|
9
|
+
require 'html/tree'
|
10
|
+
require 'rexml/element'
|
11
|
+
require 'rexml/document'
|
12
|
+
require 'rexml/xpath'
|
13
|
+
|
14
|
+
module HTMLTree

  module TreeElement
    # Given the XPath path, return an Array of matching sub-elements of
    # the REXML tree built by as_rexml_document.
    def rexml_match(path)
      REXML::XPath.match(as_rexml_document, path)
    end
  end

  class Element
    # Convert the given HTMLTree::Element (or HTMLTree::Document) into
    # a REXML::Element or REXML::Document, ready to use REXML::XPath on.
    # Note that this caches the tree; further changes to my tree will
    # not be reflected in subsequent calls.
    #
    # rparent:: REXML node the new element is created under (may be nil).
    # context:: REXML context hash, passed on to every node created.
    def as_rexml_document(rparent = nil, context = {})
      return @_rexml_tree if @_rexml_tree
      node = REXML::Element.new( tag, rparent, context )
      attribute_order().each { |attr|
        node.add_attribute(attr, attribute(attr).to_s)
      }
      # Children attach themselves to +node+ as they are converted.
      children().each { |child| child.as_rexml_document(node, context) }
      @_rexml_tree = node
    end
  end

  class Data
    # Append my content as text to +rparent+.
    def as_rexml_document(rparent = nil, context = {})
      rparent.add_text(@_content)
    end
  end

  class Comment
    # Create a REXML::Comment holding my content under +rparent+.
    # The REXML parent must be used here, not my own HTMLTree parent,
    # otherwise the comment never becomes part of the REXML tree.
    def as_rexml_document(rparent = nil, context = {})
      REXML::Comment.new(@_content, rparent)
    end
  end

  class Special
    # Create a REXML::Instruction from my content.
    # NOTE(review): current REXML documents Instruction.new(target,
    # content = nil); this three-argument call may raise ArgumentError --
    # confirm against the REXML version in use before relying on it.
    def as_rexml_document(rparent = nil, context = {})
      REXML::Instruction.new(@_content,
        context[:respect_whitespace] || false, rparent)
    end
  end

  class Document
    # Build a REXML::Document and convert my <html> node (if there is
    # one) underneath it. A document without an <html> node yields an
    # empty REXML::Document instead of raising.
    def as_rexml_document(context = {})
      node = REXML::Document.new(nil, context)
      # add DocType
      # add <HTML> node
      top = html_node
      top.as_rexml_document(node, context) if top
      node
    end
  end

end
|
data/test/suite.rb
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
require 'html/element'
|
3
|
+
require 'test/unit'
|
4
|
+
|
5
|
+
module HTMLTree
  class Element
    # Compact rendering: the tag in angle brackets and a space, followed
    # by the inspected attributes and the inspected children.
    def inspect
      "<#{@_tag}> #{attributes.inspect}#{children.inspect}"
    end
  end
end
|
12
|
+
|
13
|
+
# Unit tests for HTMLTree::Element: construction, attribute
# accumulation, and parent/child bookkeeping.
class HTMLElementTestCase < Test::Unit::TestCase
  def setup
    @e = HTMLTree::Element.new
  end

  attr_reader :e

  # A default-constructed element has no tag, attributes or children.
  def test_empty
    assert_nil(e.tag)
    assert_equal({}, e.attributes)
    assert_equal([], e.children)
  end

  # The second constructor argument is the tag name.
  def test_tag
    e2 = HTMLTree::Element.new(nil, 'sometag')
    assert_equal('sometag', e2.tag)
    assert_equal({}, e2.attributes)
    assert_equal([], e2.children)
  end

  # Repeated add_attribute calls accumulate values; []= replaces them.
  def test_attribute
    e.add_attribute('a', 'b')
    assert_equal('b', e.attribute('a'))
    assert_equal('b', e['a'])
    e.add_attribute('a', 'c')
    assert_equal(['b','c'], e.attribute('a'))
    assert_equal(['b','c'], e['a'])
    e.add_attribute('a', 'd', 'e')
    assert_equal(['b','c', 'd', 'e'], e.attribute('a'))
    e.add_attribute('b', ['c','d'])
    assert_equal(['c','d'], e.attribute('b'))
    e.add_attribute('b', ['e','f'])
    assert_equal(['c','d', 'e', 'f'], e.attribute('b'))
    e['b'] = 'aaa'
    assert_equal('aaa', e.attribute('b'))
  end

  # Constructing an element with a parent registers it as a child;
  # remove_child detaches it again.
  def test_parent
    p = HTMLTree::Element.new(nil, 'p')

    c = HTMLTree::Element.new(p, 'c')
    assert_nil(p.parent)
    assert_equal([c], p.children)
    assert_equal(p, c.parent)

    d = HTMLTree::Element.new(p, 'd')
    assert_equal([c,d], p.children)
    assert_equal(p, d.parent)

    p.remove_child(d)
    assert_equal([c], p.children)
    assert_equal(p, c.parent)
    assert_nil(d.parent)
  end

  # TODO: this test only builds a small tree and asserts nothing yet;
  # add iteration checks once the intended iterator API is confirmed.
  def test_iterator
    p = HTMLTree::Element.new(nil, 'p')
    c = HTMLTree::Element.new(p, 'c')
    d = HTMLTree::Element.new(p, 'd')
  end
end
|