htmltools 1.10
Sign up to get free protection for your applications and to get access to all the features.
- data/INSTALL +58 -0
- data/README +162 -0
- data/demo/degolive.rb +89 -0
- data/demo/ebaySearch.rb +93 -0
- data/demo/xpath.rb +62 -0
- data/lib/html/element.rb +323 -0
- data/lib/html/rexml-nodepath.rb +49 -0
- data/lib/html/sgml-parser.rb +372 -0
- data/lib/html/stparser.rb +280 -0
- data/lib/html/tags.rb +288 -0
- data/lib/html/tree.rb +140 -0
- data/lib/html/xmltree.rb +173 -0
- data/lib/html/xpath.rb +72 -0
- data/test/suite.rb +5 -0
- data/test/tc_html-element.rb +73 -0
- data/test/tc_html-tree.rb +201 -0
- data/test/tc_source-parser.rb +160 -0
- data/test/tc_stacking-parser.rb +196 -0
- data/test/tc_xpath.rb +87 -0
- metadata +58 -0
data/lib/html/tree.rb
ADDED
@@ -0,0 +1,140 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
# This is an HTML parser that builds an element tree for further
|
3
|
+
# processing. Attributes and data are also stored.
|
4
|
+
#
|
5
|
+
# Typical usage is:
|
6
|
+
# parser = HTMLTree::Parser.new(false, false)
|
7
|
+
# parser.parse_file_named('whatever.html')
|
8
|
+
# # then you have the tree built..
|
9
|
+
# parser.tree.dump
|
10
|
+
#
|
11
|
+
# Copyright:: Copyright (C) 2003, Johannes Brodwall <johannes@brodwall.com>,
|
12
|
+
# Copyright (C) 2002, Ned Konz <ned@bike-nomad.com>
|
13
|
+
# License:: Ruby's
|
14
|
+
# CVS ID:: $Id: tree.rb,v 1.2 2004/09/24 23:28:55 jhannes Exp $
|
15
|
+
|
16
|
+
require 'html/tags'
|
17
|
+
require 'html/stparser'
|
18
|
+
require 'html/element'
|
19
|
+
|
20
|
+
# This is a tree building HTML parser.
|
21
|
+
module HTMLTree
  # Tree-building HTML parser. The parser callbacks below attach a node to
  # the current position for every tag, text run, comment and special
  # section reported by HTML::StackingParser.
  class Parser < HTML::StackingParser

    # verbose:: if true, will warn to $stderr on unknown
    #   tags/entities/characters, as well as missing end tags and extra
    #   end tags.
    # strip_white:: if true, remove all non-essential whitespace. Note
    #   that there are browser bugs that may cause this to change the
    #   appearance of HTML (even though it shouldn't by the standard).
    def initialize(verbose=false, strip_white=true)
      super(verbose, strip_white)
      reset
    end

    # Reset this parser so that it can parse a new document: a fresh
    # Document becomes both the root and the current insertion point.
    def reset
      super()
      @currentNode = Document.new
      @rootNode = @currentNode
    end

    # Return the tree that was built. This is the node that represents the
    # whole document; the \<html> node is a child of it.
    def tree
      @rootNode
    end

    # Return the <html> node, if any.
    def html
      @rootNode.html_node
    end

    # no user-serviceable parts inside...
    # though you can subclass carefully.
    private

    # Create an Element named +tag+ under the current node and give it
    # every entry of +attrs+ (each entry is splatted into add_attribute).
    # Returns the new element.
    def add_child_to_current(tag, attrs)
      element = Element.new(@currentNode, tag)
      attrs.each { |pair| element.add_attribute(*pair) }
      element
    end

    # callbacks

    # Add a child to the current node and descend into it.
    def handle_start_tag(tag, attrs)
      element = add_child_to_current(tag, attrs)
      @rootNode ||= element
      @currentNode = element
    end

    # Go back up to the parent of the current node.
    def handle_end_tag(tag)
      @currentNode = @currentNode.parent
    end

    # Add a child to the current node without descending.
    def handle_empty_tag(tag, attrs)
      add_child_to_current(tag, attrs)
    end

    # Let the superclass see the unknown tag first, then treat it like a
    # normal start tag (i.e. assume that it has an end tag).
    def handle_unknown_tag(tag, attrs)
      super(tag, attrs)
      handle_start_tag(tag, attrs)
    end

    # Let the superclass see the missing end tag first, then go up to the
    # parent as if the end tag had been present.
    def handle_missing_end_tag(tag)
      super(tag)
      handle_end_tag(tag)
    end

    # Extra end tags do not change the tree; only the superclass handling
    # runs.
    def handle_extra_end_tag(tag)
      super(tag)
    end

    # Store character data as a Data child of the current node.
    def handle_cdata(data)
      Data.new(@currentNode, data)
    end

    # Script content is stored the same way as character data.
    def handle_script(data)
      Data.new(@currentNode, data)
    end

    # Keep an unknown character reference in the tree as "&#name;" text.
    def handle_unknown_character(name)
      Data.new(@currentNode, "&##{name};")
    end

    # Keep an unknown entity reference in the tree as "&name;" text.
    def handle_unknown_entity(name)
      Data.new(@currentNode, "&#{name};")
    end

    # Store a comment under the current node. The superclass is called
    # first (the original note says: make sure and strip whitespace).
    def handle_comment(data)
      super(data)
      Comment.new(@currentNode, data)
    end

    # Store a special section under the current node; when there is no
    # current node a notice is printed to $stderr.
    def handle_special(data)
      special = HTMLTree::Special.new(@currentNode, data)
      $stderr.print('special ', special, ' discarded') unless @currentNode
    end

  end
end
|
127
|
+
|
128
|
+
# Manual driver, only active when this file is executed directly: parse
# the file named on the command line (default 'ebay.html'), write the tree
# to 'xx.html' and dump it.
if __FILE__ == $0
  $stdout.sync = true

  class TestStackingParser < HTMLTree::Parser
    $DEBUG = false
    parser = TestStackingParser.new(true, false)
    parser.parse_file_named(ARGV[0] || 'ebay.html')
    File.open('xx.html', 'w') do |out|
      parser.tree.write(out)
    end
    parser.tree.dump
  end
end
|
data/lib/html/xmltree.rb
ADDED
@@ -0,0 +1,173 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
# This is a tree building HTML parser that makes an XML structure
|
3
|
+
# using the format of REXML.
|
4
|
+
#
|
5
|
+
# Typical usage is:
|
6
|
+
# parser = HTMLTree::XMLParser.new(false, false)
|
7
|
+
# parser.parse_file_named('whatever.html')
|
8
|
+
# # then you have the tree built..
|
9
|
+
# parser.document # is a REXML::Document
|
10
|
+
#
|
11
|
+
# Copyright:: Copyright (C) 2003, Johannes Brodwall <johannes@brodwall.com>,
|
12
|
+
# Copyright (C) 2002, Ned Konz <ned@bike-nomad.com>
|
13
|
+
# License:: Ruby's
|
14
|
+
# CVS ID:: $Id: xmltree.rb,v 1.2 2004/09/24 23:28:55 jhannes Exp $
|
15
|
+
|
16
|
+
require 'html/tags'
|
17
|
+
require 'html/stparser'
|
18
|
+
require 'rexml/element'
|
19
|
+
require 'rexml/document'
|
20
|
+
|
21
|
+
# REXML::Child
|
22
|
+
# REXML::XMLDecl
|
23
|
+
# REXML::Instruction
|
24
|
+
# REXML::Text
|
25
|
+
# REXML::Comment
|
26
|
+
# REXML::Entity
|
27
|
+
# REXML::Parent
|
28
|
+
# REXML::Element (+REXML::Namespace)
|
29
|
+
# REXML::Document
|
30
|
+
# REXML::DocType
|
31
|
+
#
|
32
|
+
# This is a tree building HTML parser that makes XML.
|
33
|
+
module HTMLTree
  # Tree-building HTML parser whose output is a REXML::Document. The
  # parser callbacks below attach a REXML node to the current position for
  # every tag, text run, comment and special section reported by
  # HTML::StackingParser.
  class XMLParser < HTML::StackingParser

    # verbose:: if true, will warn to $stderr on unknown
    #   tags/entities/characters, as well as missing end tags and extra
    #   end tags.
    # strip_white:: if true, remove all non-essential whitespace. Note
    #   that there are browser bugs that may cause this to change the
    #   appearance of HTML (even though it shouldn't by the standard).
    def initialize(verbose=false, strip_white=true)
      super
      reset
    end

    # Reset this parser so that it can parse a new document: a fresh
    # REXML::Document becomes both the root and the current insertion
    # point.
    def reset
      super
      @rootNode = @currentNode = REXML::Document.new()
    end

    # Return the document that was built. This will be an
    # REXML::Document that represents the whole document. The \<html>
    # node is a child of this.
    def document
      @rootNode
    end

    # Alias for #document.
    def tree
      document
    end

    # Return the root element of the document, if any.
    def root
      @rootNode.root()
    end

    # Return the <html> node, if any (nil otherwise).
    #
    # The lookup is made on the document itself: <html> is a child of the
    # document, so searching below the root element (as the previous
    # implementation did) looked for an <html> nested inside the root
    # element and failed with NoMethodError on an empty document.
    def html
      @rootNode.elements['html']
    end

    # no user-serviceable parts inside...
    # though you can subclass carefully.
    private

    # Create a REXML::Element named +tag+ under the current node and copy
    # each [name, value] entry of +attrs+ into its attributes. Returns the
    # new element.
    def add_child_to_current(tag, attrs)
      node = REXML::Element.new(tag, @currentNode)
      attrs.each { |a| node.attributes[a[0]] = a[1] }
      node
    end

    # callbacks

    # Add a child to the current node and descend into it.
    def handle_start_tag(tag, attrs)
      node = add_child_to_current(tag, attrs)
      @rootNode = node unless @rootNode
      @currentNode = node
    end

    # Go back up to the parent of the current node.
    def handle_end_tag(tag)
      @currentNode = @currentNode.parent
    end

    # Add a child to the current node without descending.
    def handle_empty_tag(tag, attrs)
      add_child_to_current(tag, attrs)
    end

    # Let the superclass see the unknown tag first, then treat it like a
    # normal start tag (i.e. assume that it has an end tag).
    def handle_unknown_tag(tag, attrs)
      super
      handle_start_tag(tag, attrs)
    end

    # Let the superclass see the missing end tag first, then go up to the
    # parent as if the end tag had been present.
    def handle_missing_end_tag(tag)
      super
      handle_end_tag(tag)
    end

    # Extra end tags do not change the tree; only the superclass handling
    # runs.
    def handle_extra_end_tag(tag)
      super
    end

    # Store character data as a REXML::Text under the current node.
    # Whitespace is respected unless the parser strips whitespace.
    # The constructor already receives the parent; the explicit parent
    # assignment is kept from the original implementation.
    def handle_cdata(data)
      node = REXML::Text.new(data, !@stripWhitespace, @currentNode)
      node.parent = @currentNode
    end

    # Script content is stored as a REXML::Comment under the current node.
    def handle_script(data)
      node = REXML::Comment.new(data, @currentNode)
      node.parent = @currentNode
    end

    # Keep an unknown character reference in the tree as raw "&#name;"
    # text. Returns the new text node.
    def handle_unknown_character(name)
      node = REXML::Text.new("&##{name};", false, @currentNode)
      node.raw = true
      node.parent = @currentNode
      node
    end

    # Keep an unknown entity reference in the tree as raw "&name;" text.
    # Returns the new text node.
    def handle_unknown_entity(name)
      node = REXML::Text.new("&#{name};", false, @currentNode)
      node.raw = true
      node.parent = @currentNode
      node
    end

    # Store a comment under the current node. The superclass is called
    # first (the original note says: strip white). Returns the new node.
    def handle_comment(data)
      super # strip white
      node = REXML::Comment.new(data, @currentNode)
      node.parent = @currentNode
      node
    end

    # Wrap the special section in angle brackets and hand it to
    # REXML::DocType, attached to the current node. Returns the new node.
    def handle_special(data)
      source = REXML::SourceFactory::create_from( "<#{data}>" )
      node = REXML::DocType.new(source, @currentNode)
      node.parent = @currentNode
      node
    end

  end
end
|
161
|
+
|
162
|
+
# Manual driver, only active when this file is executed directly: parse
# the file named on the command line (default 'ebay.html') and write the
# resulting REXML document to 'xx.html'.
if __FILE__ == $0
  $stdout.sync = true

  class TestStackingParser < HTMLTree::XMLParser
    $DEBUG = false
    parser = TestStackingParser.new(true, false)
    parser.parse_file_named(ARGV[0] || 'ebay.html')
    File.open('xx.html', 'w') do |out|
      parser.document.write(out)
    end
  end
end
|
data/lib/html/xpath.rb
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
# This module adapts REXML's XPath functionality for use with
|
2
|
+
# <tt>HTMLTree::Parser</tt>.
|
3
|
+
#
|
4
|
+
# Copyright:: Copyright (C) 2003, Johannes Brodwall <johannes@brodwall.com>,
|
5
|
+
# Copyright (C) 2002, Ned Konz <ned@bike-nomad.com>
|
6
|
+
# License:: Same as Ruby's
|
7
|
+
# CVS ID: $Id: xpath.rb,v 1.3 2004/09/24 23:28:55 jhannes Exp $
|
8
|
+
|
9
|
+
require 'html/tree'
|
10
|
+
require 'rexml/element'
|
11
|
+
require 'rexml/document'
|
12
|
+
require 'rexml/xpath'
|
13
|
+
|
14
|
+
module HTMLTree

  # Mixin that adds XPath querying (through REXML) to tree nodes that
  # implement as_rexml_document.
  module TreeElement
    # Given the XPath +path+, return an Array of the matching nodes of the
    # REXML tree produced by as_rexml_document.
    def rexml_match(path)
      REXML::XPath.match(as_rexml_document, path)
    end
  end

  class Element
    # Convert this HTMLTree::Element, with its attributes and children,
    # into a REXML::Element ready to use REXML::XPath on.
    #
    # rparent:: REXML parent the new element is attached to (may be nil).
    # context:: REXML context hash handed to every created element.
    #
    # Note that this caches the tree: once converted, the same REXML node
    # is returned (and +rparent+ is ignored), so further changes to my
    # tree will not be reflected in subsequent calls.
    def as_rexml_document(rparent = nil, context = {})
      return @_rexml_tree if @_rexml_tree
      node = REXML::Element.new( tag, rparent, context )
      attribute_order().each { |attr|
        node.add_attribute(attr, attribute(attr).to_s)
      }
      # Each child attaches itself to +node+; the return value is unused.
      children().each { |child| child.as_rexml_document(node, context) }
      @_rexml_tree = node
    end
  end

  class Data
    # Append my text content to +rparent+. Without a parent there is
    # nothing to attach the text to, so nil is returned.
    def as_rexml_document(rparent = nil, context = {})
      rparent.add_text(@_content) if rparent
    end
  end

  class Comment
    # Create a REXML::Comment holding my content, attached to +rparent+
    # (the REXML parent, not my HTMLTree parent). Returns the new comment.
    def as_rexml_document(rparent = nil, context = {})
      REXML::Comment.new(@_content, rparent)
    end
  end

  class Special
    # Create a REXML::Instruction whose target is my content and attach it
    # to +rparent+ when one is given. Returns the new instruction.
    #
    # REXML::Instruction.new only accepts (target, content); it has no
    # parent parameter, so the node is added to the parent explicitly.
    # NOTE(review): special sections presumably hold declarations such as
    # DOCTYPE; confirm that a processing instruction is the wanted mapping.
    def as_rexml_document(rparent = nil, context = {})
      node = REXML::Instruction.new(@_content)
      rparent.add(node) if rparent
      node
    end
  end

  class Document
    # Build a REXML::Document containing the converted <html> node, when
    # there is one. An empty REXML::Document is returned otherwise.
    #
    # context:: REXML context hash handed to every created node.
    def as_rexml_document(context = {})
      node = REXML::Document.new(nil, context)
      # TODO: add DocType
      # add <HTML> node
      top = html_node
      top.as_rexml_document(node, context) if top
      node
    end
  end

end
|
data/test/tc_html-element.rb
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
require 'html/element'
|
3
|
+
require 'test/unit'
|
4
|
+
|
5
|
+
module HTMLTree
  class Element
    # Compact representation used in test failure messages: the tag in
    # angle brackets, a space, then the inspected attributes immediately
    # followed by the inspected children.
    def inspect
      "<#{@_tag}> #{attributes.inspect}#{children.inspect}"
    end
  end
end
|
12
|
+
|
13
|
+
# Unit tests for HTMLTree::Element: construction, attribute handling and
# parent/child bookkeeping.
class HTMLElementTestCase < Test::Unit::TestCase
  # The element created by setup for each test.
  attr_reader :e

  def setup
    @e = HTMLTree::Element.new
  end

  # An element built without arguments has no tag, attributes or children.
  def test_empty
    assert_nil(e.tag)
    assert_equal({}, e.attributes)
    assert_equal([], e.children)
  end

  # The second constructor argument is the tag.
  def test_tag
    tagged = HTMLTree::Element.new(nil, 'sometag')
    assert_equal('sometag', tagged.tag)
    assert_equal({}, tagged.attributes)
    assert_equal([], tagged.children)
  end

  # add_attribute accumulates values under one name, whereas []= replaces
  # whatever was stored.
  def test_attribute
    e.add_attribute('a', 'b')
    assert_equal('b', e.attribute('a'))
    assert_equal('b', e['a'])

    e.add_attribute('a', 'c')
    assert_equal(%w[b c], e.attribute('a'))
    assert_equal(%w[b c], e['a'])

    e.add_attribute('a', 'd', 'e')
    assert_equal(%w[b c d e], e.attribute('a'))

    e.add_attribute('b', %w[c d])
    assert_equal(%w[c d], e.attribute('b'))

    e.add_attribute('b', %w[e f])
    assert_equal(%w[c d e f], e.attribute('b'))

    e['b'] = 'aaa'
    assert_equal('aaa', e.attribute('b'))
  end

  # Creating an element with a parent registers it as a child;
  # remove_child detaches it again.
  def test_parent
    top = HTMLTree::Element.new(nil, 'p')

    first = HTMLTree::Element.new(top, 'c')
    assert_nil(top.parent)
    assert_equal([first], top.children)
    assert_equal(top, first.parent)

    second = HTMLTree::Element.new(top, 'd')
    assert_equal([first, second], top.children)
    assert_equal(top, second.parent)

    top.remove_child(second)
    assert_equal([first], top.children)
    assert_equal(top, first.parent)
    assert_nil(second.parent)
  end

  # Only builds a small tree; this test makes no assertions.
  def test_iterator
    top = HTMLTree::Element.new(nil, 'p')
    HTMLTree::Element.new(top, 'c')
    HTMLTree::Element.new(top, 'd')
  end
end
|