htmltools 1.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/INSTALL +58 -0
- data/README +162 -0
- data/demo/degolive.rb +89 -0
- data/demo/ebaySearch.rb +93 -0
- data/demo/xpath.rb +62 -0
- data/lib/html/element.rb +323 -0
- data/lib/html/rexml-nodepath.rb +49 -0
- data/lib/html/sgml-parser.rb +372 -0
- data/lib/html/stparser.rb +280 -0
- data/lib/html/tags.rb +288 -0
- data/lib/html/tree.rb +140 -0
- data/lib/html/xmltree.rb +173 -0
- data/lib/html/xpath.rb +72 -0
- data/test/suite.rb +5 -0
- data/test/tc_html-element.rb +73 -0
- data/test/tc_html-tree.rb +201 -0
- data/test/tc_source-parser.rb +160 -0
- data/test/tc_stacking-parser.rb +196 -0
- data/test/tc_xpath.rb +87 -0
- metadata +58 -0
data/INSTALL
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
====================
|
2
|
+
How to install:
|
3
|
+
====================
|
4
|
+
|
5
|
+
Configure:
|
6
|
+
|
7
|
+
ruby install.rb config
|
8
|
+
|
9
|
+
Then, set up:
|
10
|
+
|
11
|
+
ruby install.rb setup
|
12
|
+
|
13
|
+
Then, install (this may have to be done as root):
|
14
|
+
|
15
|
+
ruby install.rb install
|
16
|
+
|
17
|
+
You can change paths, etc. using options to install.rb.
|
18
|
+
|
19
|
+
install will automatically run tests and will not install if they failed.
|
20
|
+
|
21
|
+
====================
|
22
|
+
Usage of install.rb:
|
23
|
+
====================
|
24
|
+
ruby install.rb <global option>
|
25
|
+
ruby install.rb [<global options>] <task> [<task options>]
|
26
|
+
|
27
|
+
Global options:
|
28
|
+
-q,--quiet suppress message outputs
|
29
|
+
--verbose output messages verbosely
|
30
|
+
-h,--help print this message
|
31
|
+
-v,--version print version and quit
|
32
|
+
--copyright print copyright and quit
|
33
|
+
|
34
|
+
Tasks:
|
35
|
+
config saves your configurations
|
36
|
+
show shows current configuration
|
37
|
+
setup compiles extension or else
|
38
|
+
install installs files
|
39
|
+
clean does `make clean' for each extension
|
40
|
+
|
41
|
+
Options for config:
|
42
|
+
--prefix=path path prefix of target environment [/usr/local]
|
43
|
+
--std-ruby=path the directory for standard ruby libraries [$prefix/lib/ruby/1.6]
|
44
|
+
--site-ruby-common=path the directory for version-independent non-standard ruby libraries [$prefix/lib/ruby/site_ruby]
|
45
|
+
--site-ruby=path the directory for non-standard ruby libraries [$prefix/lib/ruby/site_ruby/1.6]
|
46
|
+
--bin-dir=path the directory for commands [$prefix/bin]
|
47
|
+
--rb-dir=path the directory for ruby scripts [$site-ruby]
|
48
|
+
--so-dir=path the directory for ruby extensions [$prefix/lib/ruby/site_ruby/1.6/i686-linux]
|
49
|
+
--data-dir=path the directory for shared data [$prefix/share]
|
50
|
+
--ruby-path=path path to set to #! line [/usr/local/bin/ruby]
|
51
|
+
--ruby-prog=name the ruby program used for installation [/usr/local/bin/ruby]
|
52
|
+
--make-prog=name the make program to compile ruby extensions [make]
|
53
|
+
--without-ext does not compile/install ruby extensions [no]
|
54
|
+
--rbconfig=path your rbconfig.rb to load [running ruby's]
|
55
|
+
|
56
|
+
Options for install:
|
57
|
+
--no-harm only display what to do if given [off]
|
58
|
+
|
data/README
ADDED
@@ -0,0 +1,162 @@
|
|
1
|
+
Version: 1.06
|
2
|
+
12. September, 2003
|
3
|
+
|
4
|
+
This is a Ruby library for building trees representing HTML structure.
|
5
|
+
|
6
|
+
See the file INSTALL for installation instructions.
|
7
|
+
|
8
|
+
Copyright (C) 2003, Johannes Brodwall <johannes@brodwall.com>
|
9
|
+
Copyright (C) 2002, Ned Konz <ned@bike-nomad.com>
|
10
|
+
|
11
|
+
License: Ruby's
|
12
|
+
|
13
|
+
See http://rubyforge.org/projects/ruby-htmltools for the most recent version.
|
14
|
+
|
15
|
+
|
16
|
+
|
17
|
+
This project includes SGML-parser, ported from Python by Takahiro Maebashi <maebashi@iij.ad.jp> (see: http://www.jin.gr.jp/~nahi/Ruby/html-parser/README.html)
|
18
|
+
|
19
|
+
|
20
|
+
|
21
|
+
|
22
|
+
=============
|
23
|
+
PREREQUISITES
|
24
|
+
=============
|
25
|
+
Ruby 1.8
|
26
|
+
|
27
|
+
-------------
|
28
|
+
Test::Unit
|
29
|
+
-------------
|
30
|
+
The tests run using Test::Unit. Test::Unit is part of the standard Ruby install
|
31
|
+
as of 1.8
|
32
|
+
|
33
|
+
-------------
|
34
|
+
REXML
|
35
|
+
-------------
|
36
|
+
XPath support requires REXML. REXML is part of the standard Ruby install
|
37
|
+
as of 1.8
|
38
|
+
|
39
|
+
===========
|
40
|
+
CHANGES
|
41
|
+
===========
|
42
|
+
------------------
|
43
|
+
Changes from 1.09:
|
44
|
+
------------------
|
45
|
+
|
46
|
+
* Some minor bugfixes
|
47
|
+
* SGMLParser.src_range makes it very easy to write applications which
|
48
|
+
parse HTML files into components and manipulate the corresponding
|
49
|
+
source code _without_ altering it. (by Philip Dorrell)
|
50
|
+
|
51
|
+
------------------
|
52
|
+
Changes from 1.08:
|
53
|
+
------------------
|
54
|
+
|
55
|
+
* Fixed xpath script and added tests
|
56
|
+
* Fixed bug #681 (xhtml)
|
57
|
+
* Added GemSpec
|
58
|
+
|
59
|
+
------------------
|
60
|
+
Changes from 1.07:
|
61
|
+
------------------
|
62
|
+
|
63
|
+
* Fixed tc_xpath test_match_all after it was broken by upgrade of REXML.
|
64
|
+
* Refactored utility code for printing node paths into rexml-nodepath.rb
|
65
|
+
|
66
|
+
------------------
|
67
|
+
Changes from 1.06:
|
68
|
+
------------------
|
69
|
+
|
70
|
+
* Included stuff that I had forgotten to package into the tarball.
|
71
|
+
|
72
|
+
------------------
|
73
|
+
Changes from 1.05:
|
74
|
+
------------------
|
75
|
+
|
76
|
+
* Updated everything to work with Ruby 1.8.
|
77
|
+
|
78
|
+
|
79
|
+
------------------
|
80
|
+
Changes from 1.04:
|
81
|
+
------------------
|
82
|
+
|
83
|
+
* Made sure that unknown entities and characters are not discarded, in both
|
84
|
+
html/tree.rb and html/xmltree.rb
|
85
|
+
|
86
|
+
* Added handling of DOCTYPE to html/xmltree.rb
|
87
|
+
|
88
|
+
------------------
|
89
|
+
Changes from 1.03:
|
90
|
+
------------------
|
91
|
+
|
92
|
+
* Added HTMLTree::XMLParser, which makes a REXML document from the given HTML.
|
93
|
+
|
94
|
+
* Changed HTMLTree::Element::print_on() to write()
|
95
|
+
|
96
|
+
* Made it so that a string or IO can be passed to HTMLTree::Element::dump()
|
97
|
+
|
98
|
+
* Made it so that a string or IO can be passed to HTMLTree::Element::write()
|
99
|
+
|
100
|
+
------------------
|
101
|
+
Changes from 1.02:
|
102
|
+
------------------
|
103
|
+
|
104
|
+
* added XPath and XML conversion (needs REXML)
|
105
|
+
|
106
|
+
* Wrapped all code in namespaces. The following class names have changed:
|
107
|
+
|
108
|
+
-- in html/element.rb
|
109
|
+
HTMLDocument => HTMLTree::Document
|
110
|
+
HTMLElement => HTMLTree::Element
|
111
|
+
HTMLData => HTMLTree::Data
|
112
|
+
HTMLComment => HTMLTree::Comment
|
113
|
+
HTMLSpecial => HTMLTree::Special
|
114
|
+
|
115
|
+
-- in html/tags.rb
|
116
|
+
HTMLTag => HTML::Tag
|
117
|
+
HTMLBlockTag => HTML::BlockTag
|
118
|
+
HTMLInlineTag => HTML::InlineTag
|
119
|
+
HTMLBlockOrInlineTag => HTML::BlockOrInlineTag
|
120
|
+
HTMLEmptyTag => HTML::EmptyTag
|
121
|
+
|
122
|
+
-- in html/tree.rb
|
123
|
+
HTMLTreeParser => HTMLTree::Parser
|
124
|
+
|
125
|
+
-- in html/stparser.rb
|
126
|
+
StackingParser => HTML::StackingParser
|
127
|
+
|
128
|
+
* added HTMLTree::Element.root()
|
129
|
+
|
130
|
+
------------------
|
131
|
+
Changes from 1.01:
|
132
|
+
------------------
|
133
|
+
|
134
|
+
* documented change to sgml-parser.
|
135
|
+
|
136
|
+
* added bin/ebaySearch.rb example
|
137
|
+
|
138
|
+
------------------
|
139
|
+
Changes from 1.0:
|
140
|
+
------------------
|
141
|
+
|
142
|
+
* attributes now maintain their order. Though this probably isn't
|
143
|
+
strictly necessary under HTML, it may make it easier to compare
|
144
|
+
document versions.
|
145
|
+
|
146
|
+
* the generated tree now has a top-level node for the document itself,
|
147
|
+
so the DTD can be stored. THIS WILL REQUIRE CODE CHANGES if you have
|
148
|
+
code that assumes that the root node is always <html>. To find the
|
149
|
+
<html> node, you can use the new methods HTMLTreeParser#html() or
|
150
|
+
HTMLDocument#html_node():
|
151
|
+
|
152
|
+
html = parser.html()
|
153
|
+
|
154
|
+
Or, querying the tree:
|
155
|
+
|
156
|
+
html = parser.tree.html_node()
|
157
|
+
|
158
|
+
* comments are stored in the tree
|
159
|
+
|
160
|
+
* added HTMLElement#print_on() to print a (sub)tree to an IO stream
|
161
|
+
|
162
|
+
vim: ts=2 sw=2 et
|
data/demo/degolive.rb
ADDED
@@ -0,0 +1,89 @@
|
|
1
|
+
#!c:/ruby-1.8/bin/ruby
|
2
|
+
# This removes GoLive tags and attributes.
|
3
|
+
#
|
4
|
+
# usage:
|
5
|
+
# ruby degolive.rb \<file> > <file>
|
6
|
+
#
|
7
|
+
# or (changes the file in place and saves the originals as .bak):
|
8
|
+
# ruby -i.bak degolive.rb files
|
9
|
+
#
|
10
|
+
# Copyright:: Copyright (C) 2002 Ned Konz
|
11
|
+
# License:: Ruby License
|
12
|
+
# CVS ID:: $Id: degolive.rb,v 1.1 2003/09/12 18:41:04 jhannes Exp $
|
13
|
+
#
|
14
|
+
require 'html/tags'
|
15
|
+
require 'html/stparser'
|
16
|
+
|
17
|
+
# Register GoLive's proprietary tags with the tag table so that they can be
# recognised and removed.
# Columns: name, is_block, is_inline, is_empty, can_omit
[
  ['CSACTIONS',    true,  false, false, false],
  ['CSACTION',     false, true,  true,  false],
  ['CSSCRIPTDICT', true,  false, false, false],
  ['CSACTIONDICT', true,  false, false, false],
].each do |name, is_block, is_inline, is_empty, can_omit|
  HTML::Tag.add_tag(name, is_block, is_inline, is_empty, can_omit)
end
|
23
|
+
|
24
|
+
# Filter built on HTML::StackingParser that re-emits the markup it is fed on
# standard output, leaving out GoLive elements (tag names starting with
# "csa" or "css") and GoLive-specific attributes.
#
# The handle_* methods are presumably the callbacks HTML::StackingParser
# invokes while parsing -- confirm against html/stparser.rb.
class GoLiveRemover < HTML::StackingParser
  # Case-insensitive match for tag names that start with "csa" or "css".
  GOLIVE_TAG = /^cs[as]/i

  # Truthy when output must be suppressed: +tag+ itself (when given), the
  # parser's last_tag, or its parent_tag is a GoLive tag.
  def ignoring(tag=nil)
    (tag && tag =~ GOLIVE_TAG) || last_tag =~ GOLIVE_TAG || parent_tag =~ GOLIVE_TAG
  end

  # Write a start tag (isStart true) or an end tag for +tag+ to stdout.
  # +attrs+, when given, is a list of [name, value] pairs; "csclick"
  # attributes and attributes whose value begins with "CSAction(" are
  # left out.
  def printTag(tag, isStart=false, attrs=nil)
    opener = isStart ? "<" : "</"
    print(opener)
    print(tag)
    if attrs
      attrs.each do |pair|
        attr_name = pair[0]
        attr_value = pair[1]
        next if attr_name == "csclick"
        # a missing value is treated as an empty string for the check
        next if (attr_value || '') =~ /^CSAction\(/
        print " #{attr_name}=\"#{attr_value}\""
      end
    end
    print(">")
  end

  # Echo a start tag unless it is, or sits inside, a GoLive element.
  def handle_start_tag(tag, attrs)
    return if ignoring(tag)
    printTag(tag, true, attrs)
  end

  # Echo an empty (content-less) tag under the same rule as start tags.
  def handle_empty_tag(tag, attrs)
    return if ignoring(tag)
    printTag(tag, true, attrs)
  end

  # Echo an end tag unless it is, or sits inside, a GoLive element.
  def handle_end_tag(tag)
    return if ignoring(tag)
    printTag(tag, false)
  end

  # Warn on stderr and emit the missing end tag, marked with a comment.
  def handle_missing_end_tag(tag)
    warn("warning: inserting missing end tag </#{tag}>\n")
    print("</#{tag}><!-- inserted -->")
  end

  # Echo character data unless inside a GoLive element.
  def handle_data(data)
    return if ignoring
    print(data)
  end

  # Echo script content unless inside a GoLive element.
  def handle_script(data)
    return if ignoring
    print(data)
  end

  # Re-emit an unrecognised character reference as "&#name;".
  def handle_unknown_character(name)
    return if ignoring
    print("&\##{name};")
  end

  # Re-emit an unrecognised entity reference as "&name;".
  def handle_unknown_entity(name)
    return if ignoring
    print("&#{name};")
  end

  # Echo comment text unless inside a GoLive element.
  def handle_comment(data)
    return if ignoring
    print(data)
  end

  # Echo "special" markup as received unless inside a GoLive element.
  def handle_special(data)
    return if ignoring
    print(data)
  end
end
|
87
|
+
|
88
|
+
# Run the filter over the input (files named on the command line, or stdin),
# one line at a time. The meaning of the two constructor flags is defined by
# HTML::StackingParser -- see html/stparser.rb.
remover = GoLiveRemover.new(true, true)
ARGF.each_line { |text| remover.feed(text) }
|
data/demo/ebaySearch.rb
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
require 'html/tree'
|
2
|
+
require 'net/http'
|
3
|
+
|
4
|
+
# A demo script showing HTML parsing after a HTTP request.
|
5
|
+
# This does an eBay search for the given term(s), and displays the
|
6
|
+
# results as a text table, delimited with '|' characters.
|
7
|
+
#
|
8
|
+
# usage:
|
9
|
+
# ruby ebaySearch.rb searchterm [...]
|
10
|
+
#
|
11
|
+
# NOTE: the -d flag (contact http://localhost:8080 instead of eBay,
|
12
|
+
# for testing) is not implemented in this script; only -v is recognized.
|
13
|
+
#
|
14
|
+
# Note that actually using this script is in violation of the
|
15
|
+
# eBay User Agreement.
|
16
|
+
#
|
17
|
+
# A real robot would respect the REP.
|
18
|
+
#
|
19
|
+
# This is just an example.
|
20
|
+
|
21
|
+
# An optional leading -v switches on verbose mode; it is removed from ARGV
# so that only the search terms remain.
verbose = (ARGV[0] == '-v')
ARGV.shift if verbose

# At least one search term is required.
if ARGV.empty?
  puts "usage: #{$0} [-v] searchterm [...]"
  puts " -v turns on verbose error reporting"
  exit 2
end
|
32
|
+
|
33
|
+
require 'cgi'

# Build the value of the "query" URL parameter from the search terms.
# Each term is URL-escaped so that spaces, '&', '#', '+' and similar
# characters cannot corrupt the request; the terms are then joined with '+'.
#
# terms:: array of search term strings
# returns:: the escaped query string ("" for an empty array)
def ebay_query_string(terms)
  terms.map { |term| CGI.escape(term) }.join('+')
end

query = ebay_query_string(ARGV)

queryHost = "search.ebay.com"
queryPort = 80
queryURL = "/search/search.dll" +
  "?MfcISAPICommand=GetResult&ht=1&SortProperty=MetaEndSort&query=#{query}"

# try to look like a real browser (don't know if it matters)...
headers = {
  'User-Agent' => 'Mozilla/5.0 (compatible; Konqueror/3.0.0-10; Linux)',
  'Pragma' => 'no-cache',
  'Cache-control' => 'no-cache',
  'Accept' => 'text/*, image/jpeg, image/png, image/*, */*',
  'Accept-Encoding' => 'x-gzip, gzip, identity',
  'Accept-Charset' => 'ISO-8859-1',
}

# Response body; filled in by the HTTP request.
data = ""
|
51
|
+
|
52
|
+
# Register the non-HTML 4.0 tags that eBay's pages seem to use.
# Columns: name, is_block, is_inline, is_empty, can_omit
[
  ['ilayer',  true, false, true,  true],
  ['layer',   true, false, true,  true],
  ['nolayer', true, false, true,  true],
  ['noframe', true, false, false, false],
].each do |name, is_block, is_inline, is_empty, can_omit|
  HTML::Tag.add_tag(name, is_block, is_inline, is_empty, can_omit)
end
|
58
|
+
|
59
|
+
# Fetch the search results page into +data+. Any failure (connection error,
# timeout, non-2xx response) is reported on stderr with the exception's class
# and message, and the script exits with status 1.
begin
  # Old net/http offers a 1.1 compatibility switch; newer versions do not,
  # so only call it where it exists.
  Net::HTTP.version_1_1 if Net::HTTP.respond_to?(:version_1_1)
  http = Net::HTTP.new(queryHost, queryPort)
  # http.open_timeout = 30
  http.read_timeout = 120
  result = http.get(queryURL, headers)
  # In 1.1 compatibility mode get returns [response, body]; otherwise it
  # returns the response object alone.
  resp = result.is_a?(Array) ? result.first : result
  resp.value # raises unless the response status is 2xx
  # NOTE(review): the request headers advertise gzip in Accept-Encoding; a
  # compressed body is not decoded here -- confirm the server sends plain text.
  data = resp.body
rescue => e
  $stderr.puts "error: #{e.class}: #{e.message}"
  $stderr.puts http.inspect if http
  $stderr.puts resp.inspect if resp
  exit 1
end
|
71
|
+
|
72
|
+
|
73
|
+
# Parse the fetched page into a tree.
parser = HTMLTree::Parser.new(verbose, false)
parser.feed(data)

# Find all ViewItem links. These are in table rows for each item.
itemAnchors = parser.html.select { |ea| ea.tag == 'a' && ea['href'] =~ /ViewItem/ }

# Walk up from each link to its enclosing <tr>.
itemRows = itemAnchors.collect do |node|
  node = node.parent until node.tag == 'tr'
  node
end

# Print each row as one '|'-delimited line of its non-blank text fields.
itemRows.each do |row|
  cdata = row.select { |item| item.data? }            # just look at cdata
  stripped = cdata.collect { |text| text.strip }      # strip it
  texts = stripped.select { |text| text.size > 0 }    # keep the non-blank fields
  puts texts.join('|')
end
|
data/demo/xpath.rb
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
# This is a demo program that takes a given HTML file, parses it,
|
3
|
+
# and allows exploration of XPath queries.
|
4
|
+
require 'html/xmltree'
|
5
|
+
require 'html/rexml-nodepath'
|
6
|
+
require 'rexml/xpath'
|
7
|
+
|
8
|
+
|
9
|
+
# Print every node of +d+ selected by the XPath expression +path+, one per
# line, in the form "<node path> --> <node as string>".
#
# d::    the REXML document (or node) to search
# path:: the XPath expression
# Always returns nil. Relies on the nodes responding to full_path
# (presumably supplied by html/rexml-nodepath -- confirm).
def displayXPath(d, path)
  REXML::XPath.each(d, path) { |match|
    location = match.full_path
    puts location + " --> " + match.to_s
  }
  nil
end
|
15
|
+
|
16
|
+
|
17
|
+
# The HTML file name is mandatory. Any further arguments are XPath
# expressions that are evaluated and printed before the script exits;
# with no expressions the script prompts for them interactively.
if ARGV.empty?
  $stderr.puts("usage: #{$0} file.html                   interactive")
  $stderr.puts("       #{$0} file.html expression [...]  evaluate the given XPath expressions and exit")
  exit(1)
end
|
23
|
+
|
24
|
+
# Decide whether interactive line editing is available. Readline is only
# tried when stderr is a terminal; if the library cannot be loaded the
# script falls back to plain $stdin reads.
$use_readline = false
if $stderr.tty?
  $stdout.sync = true
  begin
    require 'readline'
  rescue LoadError
    $use_readline = false
  else
    $use_readline = true
    $stderr.puts('line editing enabled')
    trap('SIGINT', 'IGNORE')
  end
end
|
36
|
+
|
37
|
+
# Show +prompt+ and read one line of input.
#
# With $use_readline set, the line is read through Readline (and added to its
# history); otherwise the prompt is written to $stdout and the line is read
# from $stdin. Returns the line, or nil at end of input.
def getline(prompt)
  return Readline.readline(prompt, true) if $use_readline

  $stdout.print prompt
  $stdin.gets
end
|
45
|
+
|
46
|
+
# Parse the HTML file named by the first argument into a document.
inputFile = ARGV.shift
parser = HTMLTree::XMLParser.new(true)
parser.parse_file_named(inputFile)
d = parser.document

# Batch mode: remaining arguments are XPath expressions; evaluate and quit.
unless ARGV.empty?
  ARGV.each { |path| displayXPath(d, path) }
  exit
end

# Interactive mode: keep prompting until end of input.
prompt = 'Enter XPath expression on a single line (ctrl-D (unix) or ctrl-Z (win) to quit): '
loop do
  expr = getline(prompt)
  break unless expr
  displayXPath(d, expr)
  prompt = 'expr: '
end
|