htmltools 1.10
Sign up to get free protection for your applications and to get access to all the features.
- data/INSTALL +58 -0
- data/README +162 -0
- data/demo/degolive.rb +89 -0
- data/demo/ebaySearch.rb +93 -0
- data/demo/xpath.rb +62 -0
- data/lib/html/element.rb +323 -0
- data/lib/html/rexml-nodepath.rb +49 -0
- data/lib/html/sgml-parser.rb +372 -0
- data/lib/html/stparser.rb +280 -0
- data/lib/html/tags.rb +288 -0
- data/lib/html/tree.rb +140 -0
- data/lib/html/xmltree.rb +173 -0
- data/lib/html/xpath.rb +72 -0
- data/test/suite.rb +5 -0
- data/test/tc_html-element.rb +73 -0
- data/test/tc_html-tree.rb +201 -0
- data/test/tc_source-parser.rb +160 -0
- data/test/tc_stacking-parser.rb +196 -0
- data/test/tc_xpath.rb +87 -0
- metadata +58 -0
data/INSTALL
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
====================
|
2
|
+
How to install:
|
3
|
+
====================
|
4
|
+
|
5
|
+
Configure:
|
6
|
+
|
7
|
+
ruby install.rb config
|
8
|
+
|
9
|
+
Then, set up:
|
10
|
+
|
11
|
+
ruby install.rb setup
|
12
|
+
|
13
|
+
Then, install (this may have to be done as root):
|
14
|
+
|
15
|
+
ruby install.rb install
|
16
|
+
|
17
|
+
You can change paths, etc. using options to install.rb.
|
18
|
+
|
19
|
+
install will automatically run the tests and will not install if they fail.
|
20
|
+
|
21
|
+
====================
|
22
|
+
Usage of install.rb:
|
23
|
+
====================
|
24
|
+
ruby install.rb <global option>
|
25
|
+
ruby install.rb [<global options>] <task> [<task options>]
|
26
|
+
|
27
|
+
Global options:
|
28
|
+
-q,--quiet suppress message outputs
|
29
|
+
--verbose output messages verbosely
|
30
|
+
-h,--help print this message
|
31
|
+
-v,--version print version and quit
|
32
|
+
--copyright print copyright and quit
|
33
|
+
|
34
|
+
Tasks:
|
35
|
+
config saves your configurations
|
36
|
+
show shows current configuration
|
37
|
+
setup compiles extension or else
|
38
|
+
install installs files
|
39
|
+
clean does `make clean' for each extension
|
40
|
+
|
41
|
+
Options for config:
|
42
|
+
--prefix=path path prefix of target environment [/usr/local]
|
43
|
+
--std-ruby=path the directory for standard ruby libraries [$prefix/lib/ruby/1.6]
|
44
|
+
--site-ruby-common=path the directory for version-independent non-standard ruby libraries [$prefix/lib/ruby/site_ruby]
|
45
|
+
--site-ruby=path the directory for non-standard ruby libraries [$prefix/lib/ruby/site_ruby/1.6]
|
46
|
+
--bin-dir=path the directory for commands [$prefix/bin]
|
47
|
+
--rb-dir=path the directory for ruby scripts [$site-ruby]
|
48
|
+
--so-dir=path the directory for ruby extensions [$prefix/lib/ruby/site_ruby/1.6/i686-linux]
|
49
|
+
--data-dir=path the directory for shared data [$prefix/share]
|
50
|
+
--ruby-path=path path to set to #! line [/usr/local/bin/ruby]
|
51
|
+
--ruby-prog=name the ruby program used for installation [/usr/local/bin/ruby]
|
52
|
+
--make-prog=name the make program to compile ruby extensions [make]
|
53
|
+
--without-ext does not compile/install ruby extensions [no]
|
54
|
+
--rbconfig=path your rbconfig.rb to load [running ruby's]
|
55
|
+
|
56
|
+
Options for install:
|
57
|
+
--no-harm only display what to do if given [off]
|
58
|
+
|
data/README
ADDED
@@ -0,0 +1,162 @@
|
|
1
|
+
Version: 1.10
|
2
|
+
12. September, 2003
|
3
|
+
|
4
|
+
This is a Ruby library for building trees representing HTML structure.
|
5
|
+
|
6
|
+
See the file INSTALL for installation instructions.
|
7
|
+
|
8
|
+
Copyright (C) 2003, Johannes Brodwall <johannes@brodwall.com>
|
9
|
+
Copyright (C) 2002, Ned Konz <ned@bike-nomad.com>
|
10
|
+
|
11
|
+
License: Ruby's
|
12
|
+
|
13
|
+
See http://rubyforge.org/projects/ruby-htmltools for the most recent version.
|
14
|
+
|
15
|
+
|
16
|
+
|
17
|
+
This project includes SGML-parser, ported from Python by Takahiro Maebashi <maebashi@iij.ad.jp> (see: http://www.jin.gr.jp/~nahi/Ruby/html-parser/README.html)
|
18
|
+
|
19
|
+
|
20
|
+
|
21
|
+
|
22
|
+
=============
|
23
|
+
PREREQUISITES
|
24
|
+
=============
|
25
|
+
Ruby 1.8
|
26
|
+
|
27
|
+
-------------
|
28
|
+
Test::Unit
|
29
|
+
-------------
|
30
|
+
The tests run using Test::Unit. Test::Unit is part of the standard Ruby install
|
31
|
+
as of 1.8
|
32
|
+
|
33
|
+
-------------
|
34
|
+
REXML
|
35
|
+
-------------
|
36
|
+
XPath support requires REXML. REXML is part of the standard Ruby install
|
37
|
+
as of 1.8
|
38
|
+
|
39
|
+
===========
|
40
|
+
CHANGES
|
41
|
+
===========
|
42
|
+
------------------
|
43
|
+
Changes from 1.09:
|
44
|
+
------------------
|
45
|
+
|
46
|
+
* Some minor bugfixes
|
47
|
+
* SGMLParser.src_range makes it very easy to write applications which
|
48
|
+
parse HTML files into components and manipulate the corresponding
|
49
|
+
source code _without_ altering it. (by Philip Dorrell)
|
50
|
+
|
51
|
+
------------------
|
52
|
+
Changes from 1.08:
|
53
|
+
------------------
|
54
|
+
|
55
|
+
* Fixed xpath script and added tests
|
56
|
+
* Fixed bug #681 (xhtml)
|
57
|
+
* Added GemSpec
|
58
|
+
|
59
|
+
------------------
|
60
|
+
Changes from 1.07:
|
61
|
+
------------------
|
62
|
+
|
63
|
+
* Fixed tc_xpath test_match_all after it was broken by upgrade of REXML.
|
64
|
+
* Refactored utility code for printing node paths into rexml-nodepath.rb
|
65
|
+
|
66
|
+
------------------
|
67
|
+
Changes from 1.06:
|
68
|
+
------------------
|
69
|
+
|
70
|
+
* Included stuff that I had forgotten to package into the tarball.
|
71
|
+
|
72
|
+
------------------
|
73
|
+
Changes from 1.05:
|
74
|
+
------------------
|
75
|
+
|
76
|
+
* Updated everything to work with Ruby 1.8.
|
77
|
+
|
78
|
+
|
79
|
+
------------------
|
80
|
+
Changes from 1.04:
|
81
|
+
------------------
|
82
|
+
|
83
|
+
* Made sure that unknown entities and characters are not discarded, in both
|
84
|
+
html/tree.rb and html/xmltree.rb
|
85
|
+
|
86
|
+
* Added handling of DOCTYPE to html/xmltree.rb
|
87
|
+
|
88
|
+
------------------
|
89
|
+
Changes from 1.03:
|
90
|
+
------------------
|
91
|
+
|
92
|
+
* Added HTMLTree::XMLParser, which makes a REXML document from the given HTML.
|
93
|
+
|
94
|
+
* Changed HTMLTree::Element::print_on() to write()
|
95
|
+
|
96
|
+
* Made it so that a string or IO can be passed to HTMLTree::Element::dump()
|
97
|
+
|
98
|
+
* Made it so that a string or IO can be passed to HTMLTree::Element::write()
|
99
|
+
|
100
|
+
------------------
|
101
|
+
Changes from 1.02:
|
102
|
+
------------------
|
103
|
+
|
104
|
+
* added XPath and XML conversion (needs REXML)
|
105
|
+
|
106
|
+
* Wrapped all code in namespaces. The following class names have changed:
|
107
|
+
|
108
|
+
-- in html/element.rb
|
109
|
+
HTMLDocument => HTMLTree::Document
|
110
|
+
HTMLElement => HTMLTree::Element
|
111
|
+
HTMLData => HTMLTree::Data
|
112
|
+
HTMLComment => HTMLTree::Comment
|
113
|
+
HTMLSpecial => HTMLTree::Special
|
114
|
+
|
115
|
+
-- in html/tags.rb
|
116
|
+
HTMLTag => HTML::Tag
|
117
|
+
HTMLBlockTag => HTML::BlockTag
|
118
|
+
HTMLInlineTag => HTML::InlineTag
|
119
|
+
HTMLBlockOrInlineTag => HTML::BlockOrInlineTag
|
120
|
+
HTMLEmptyTag => HTML::EmptyTag
|
121
|
+
|
122
|
+
-- in html/tree.rb
|
123
|
+
HTMLTreeParser => HTMLTree::Parser
|
124
|
+
|
125
|
+
-- in html/stparser.rb
|
126
|
+
StackingParser => HTML::StackingParser
|
127
|
+
|
128
|
+
* added HTMLTree::Element.root()
|
129
|
+
|
130
|
+
------------------
|
131
|
+
Changes from 1.01:
|
132
|
+
------------------
|
133
|
+
|
134
|
+
* documented change to sgml-parser.
|
135
|
+
|
136
|
+
* added bin/ebaySearch.rb example
|
137
|
+
|
138
|
+
------------------
|
139
|
+
Changes from 1.0:
|
140
|
+
------------------
|
141
|
+
|
142
|
+
* attributes now maintain their order. Though this probably isn't
|
143
|
+
strictly necessary under HTML, it may make it easier to compare
|
144
|
+
document versions.
|
145
|
+
|
146
|
+
* the generated tree now has a top-level node for the document itself,
|
147
|
+
so the DTD can be stored. THIS WILL REQUIRE CODE CHANGES if you have
|
148
|
+
code that assumes that the root node is always <html>. To find the
|
149
|
+
<html> node, you can use the new methods HTMLTreeParser#html() or
|
150
|
+
HTMLDocument#html_node():
|
151
|
+
|
152
|
+
html = parser.html()
|
153
|
+
|
154
|
+
Or, querying the tree:
|
155
|
+
|
156
|
+
html = parser.tree.html_node()
|
157
|
+
|
158
|
+
* comments are stored in the tree
|
159
|
+
|
160
|
+
* added HTMLElement#print_on() to print a (sub)tree to an IO stream
|
161
|
+
|
162
|
+
vim: ts=2 sw=2 et
|
data/demo/degolive.rb
ADDED
@@ -0,0 +1,89 @@
|
|
1
|
+
#!c:/ruby-1.8/bin/ruby
|
2
|
+
# This removes GoLive tags and attributes.
|
3
|
+
#
|
4
|
+
# usage:
|
5
|
+
# ruby degolive.rb <infile> > <outfile>
|
6
|
+
#
|
7
|
+
# or (changes the file in place and saves the originals as .bak):
|
8
|
+
# ruby -i.bak degolive.rb files
|
9
|
+
#
|
10
|
+
# Copyright:: Copyright (C) 2002 Ned Konz
|
11
|
+
# License:: Ruby License
|
12
|
+
# CVS ID:: $Id: degolive.rb,v 1.1 2003/09/12 18:41:04 jhannes Exp $
|
13
|
+
#
|
14
|
+
require 'html/tags'
|
15
|
+
require 'html/stparser'
|
16
|
+
|
17
|
+
# Add nasty GoLive tags so we can remove them
|
18
|
+
# (name, is_block, is_inline, is_empty, can_omit)
|
19
|
+
HTML::Tag.add_tag('CSACTIONS', true, false, false, false)
|
20
|
+
HTML::Tag.add_tag('CSACTION', false, true, true, false)
|
21
|
+
HTML::Tag.add_tag('CSSCRIPTDICT', true, false, false, false)
|
22
|
+
HTML::Tag.add_tag('CSACTIONDICT', true, false, false, false)
|
23
|
+
|
24
|
+
class GoLiveRemover < HTML::StackingParser
|
25
|
+
# return true if we are in the scope of a bad tag
|
26
|
+
def ignoring(tag=nil)
|
27
|
+
(tag and tag =~ /^cs[as]/i) or
|
28
|
+
last_tag =~ /^cs[as]/i or
|
29
|
+
parent_tag =~ /^cs[as]/i
|
30
|
+
end
|
31
|
+
|
32
|
+
def printTag(tag, isStart=false, attrs=nil)
|
33
|
+
print(isStart ? "<" : "</")
|
34
|
+
print tag
|
35
|
+
if attrs
|
36
|
+
attrs.each { |a|
|
37
|
+
# Also need to remove 'csclick="..."'
|
38
|
+
# and on.*="CSAction(..." attribs
|
39
|
+
print " #{a[0]}=\"#{a[1]}\"" \
|
40
|
+
unless a[0] == "csclick" or (a[1] || '') =~ /^CSAction\(/
|
41
|
+
}
|
42
|
+
end
|
43
|
+
print(">")
|
44
|
+
end
|
45
|
+
|
46
|
+
def handle_start_tag(tag, attrs)
|
47
|
+
printTag(tag, true, attrs) unless ignoring(tag)
|
48
|
+
end
|
49
|
+
|
50
|
+
def handle_empty_tag(tag, attrs)
|
51
|
+
printTag(tag, true, attrs) unless ignoring(tag)
|
52
|
+
end
|
53
|
+
|
54
|
+
def handle_end_tag(tag)
|
55
|
+
printTag(tag, false) unless ignoring(tag)
|
56
|
+
end
|
57
|
+
|
58
|
+
def handle_missing_end_tag(tag)
|
59
|
+
warn("warning: inserting missing end tag </#{tag}>\n")
|
60
|
+
print("</#{tag}><!-- inserted -->")
|
61
|
+
end
|
62
|
+
|
63
|
+
def handle_data(data)
|
64
|
+
print(data) unless ignoring
|
65
|
+
end
|
66
|
+
|
67
|
+
def handle_script(data)
|
68
|
+
print(data) unless ignoring
|
69
|
+
end
|
70
|
+
|
71
|
+
def handle_unknown_character(name)
|
72
|
+
print("&\##{name};") unless ignoring
|
73
|
+
end
|
74
|
+
|
75
|
+
def handle_unknown_entity(name)
|
76
|
+
print("&#{name};") unless ignoring
|
77
|
+
end
|
78
|
+
|
79
|
+
def handle_comment(data)
|
80
|
+
print(data) unless ignoring
|
81
|
+
end
|
82
|
+
|
83
|
+
def handle_special(data)
|
84
|
+
print(data) unless ignoring
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
p = GoLiveRemover.new(true, true)
|
89
|
+
ARGF.each_line { |line| p.feed(line) }
|
data/demo/ebaySearch.rb
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
require 'html/tree'
|
2
|
+
require 'net/http'
|
3
|
+
|
4
|
+
# A demo script showing HTML parsing after an HTTP request.
|
5
|
+
# This does an eBay search for the given term(s), and displays the
|
6
|
+
# results as a text table, delimited with '|' characters.
|
7
|
+
#
|
8
|
+
# usage:
|
9
|
+
# ruby ebaySearch.rb searchterm [...]
|
10
|
+
#
|
11
|
+
# If you give the -v flag, verbose error reporting is turned on
|
12
|
+
# (for testing).
|
13
|
+
#
|
14
|
+
# Note that actually using this script is in violation of the
|
15
|
+
# eBay User Agreement.
|
16
|
+
#
|
17
|
+
# A real robot would respect the Robots Exclusion Protocol (REP).
|
18
|
+
#
|
19
|
+
# This is just an example.
|
20
|
+
|
21
|
+
verbose = false
|
22
|
+
if ARGV[0] == '-v'
|
23
|
+
verbose = true
|
24
|
+
ARGV.shift
|
25
|
+
end
|
26
|
+
|
27
|
+
unless ARGV.size > 0
|
28
|
+
puts "usage: #{$0} [-v] searchterm [...]"
|
29
|
+
puts " -v turns on verbose error reporting"
|
30
|
+
exit 2
|
31
|
+
end
|
32
|
+
|
33
|
+
query = ARGV.join('+')
|
34
|
+
|
35
|
+
queryHost = "search.ebay.com"
|
36
|
+
queryPort = 80
|
37
|
+
queryURL = "/search/search.dll" +
|
38
|
+
"?MfcISAPICommand=GetResult&ht=1&SortProperty=MetaEndSort&query=#{query}"
|
39
|
+
|
40
|
+
# try to look like a real browser (don't know if it matters)...
|
41
|
+
headers = {
|
42
|
+
'User-Agent' => 'Mozilla/5.0 (compatible; Konqueror/3.0.0-10; Linux)',
|
43
|
+
'Pragma' => 'no-cache',
|
44
|
+
'Cache-control' => 'no-cache',
|
45
|
+
'Accept' => 'text/*, image/jpeg, image/png, image/*, */*',
|
46
|
+
'Accept-Encoding' => 'x-gzip, gzip, identity',
|
47
|
+
'Accept-Charset' => 'ISO-8859-1',
|
48
|
+
}
|
49
|
+
|
50
|
+
data = ""
|
51
|
+
|
52
|
+
# add these non-HTML 4.0 tags because eBay seems to use them
|
53
|
+
# (name, is_block, is_inline, is_empty, can_omit)
|
54
|
+
HTML::Tag.add_tag('ilayer', true, false, true, true)
|
55
|
+
HTML::Tag.add_tag('layer', true, false, true, true)
|
56
|
+
HTML::Tag.add_tag('nolayer', true, false, true, true)
|
57
|
+
HTML::Tag.add_tag('noframe', true, false, false, false)
|
58
|
+
|
59
|
+
begin
|
60
|
+
Net::HTTP.version_1_1
|
61
|
+
http = Net::HTTP.new(queryHost, queryPort)
|
62
|
+
# http.open_timeout = 30
|
63
|
+
http.read_timeout = 120
|
64
|
+
resp, data = http.get(queryURL, headers)
|
65
|
+
rescue
|
66
|
+
print 'error:'
|
67
|
+
puts http.inspect
|
68
|
+
puts resp.inspect
|
69
|
+
exit 1
|
70
|
+
end
|
71
|
+
|
72
|
+
|
73
|
+
p = HTMLTree::Parser.new(verbose, false)
|
74
|
+
p.feed(data)
|
75
|
+
|
76
|
+
# Find all ViewItem links. These are in table rows for each item.
|
77
|
+
itemAnchors = p.html.select { |ea| ea.tag == 'a' && ea['href'] =~ /ViewItem/ }
|
78
|
+
|
79
|
+
# Now find their rows by going up to the first <tr>
|
80
|
+
itemRows = itemAnchors.collect { |ea|
|
81
|
+
while ea.tag != 'tr'
|
82
|
+
ea = ea.parent
|
83
|
+
end
|
84
|
+
ea
|
85
|
+
}
|
86
|
+
|
87
|
+
# print the text from them
|
88
|
+
itemRows.each { |row|
|
89
|
+
texts = row.select { |item| item.data? }. # just look at cdata
|
90
|
+
collect { |data| data.strip }. # strip it
|
91
|
+
select { |data| data.size > 0 } # and keep the non-blank fields
|
92
|
+
puts texts.join('|')
|
93
|
+
}
|
data/demo/xpath.rb
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
# This is a demo program that takes a given HTML file, parses it,
|
3
|
+
# and allows exploration of XPath queries.
|
4
|
+
require 'html/xmltree'
|
5
|
+
require 'html/rexml-nodepath'
|
6
|
+
require 'rexml/xpath'
|
7
|
+
|
8
|
+
|
9
|
+
def displayXPath(d, path)
|
10
|
+
REXML::XPath.each(d, path) do |node|
|
11
|
+
puts node.full_path + " --> " + node.to_s
|
12
|
+
end
|
13
|
+
nil
|
14
|
+
end
|
15
|
+
|
16
|
+
|
17
|
+
unless ARGV.size >= 1
|
18
|
+
$stderr.puts(%Q{usage: #{$0} file.html interactive
|
19
|
+
#{$0} file.html expressions read expressions from expressions
|
20
|
+
})
|
21
|
+
exit(1)
|
22
|
+
end
|
23
|
+
|
24
|
+
$use_readline = false
|
25
|
+
if $stderr.isatty
|
26
|
+
$stdout.sync = true
|
27
|
+
begin
|
28
|
+
require 'readline'
|
29
|
+
$use_readline = true
|
30
|
+
$stderr.puts('line editing enabled')
|
31
|
+
trap('SIGINT', 'IGNORE')
|
32
|
+
rescue LoadError
|
33
|
+
$use_readline = false
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
def getline(prompt)
|
38
|
+
if $use_readline
|
39
|
+
Readline.readline(prompt, true)
|
40
|
+
else
|
41
|
+
$stdout.print prompt
|
42
|
+
$stdin.gets
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
inputFile = ARGV.shift
|
47
|
+
p = HTMLTree::XMLParser.new(true)
|
48
|
+
p.parse_file_named(inputFile)
|
49
|
+
d = p.document
|
50
|
+
|
51
|
+
if ARGV.size > 0 then
|
52
|
+
for path in ARGV
|
53
|
+
displayXPath(d, path)
|
54
|
+
end
|
55
|
+
exit
|
56
|
+
end
|
57
|
+
|
58
|
+
prompt = 'Enter XPath expression on a single line (ctrl-D (unix) or ctrl-Z (win) to quit): '
|
59
|
+
while expr = getline(prompt)
|
60
|
+
displayXPath(d, expr)
|
61
|
+
prompt = 'expr: '
|
62
|
+
end
|