htmltools 1.10

Sign up to get free protection for your applications and to get access to all the features.
data/INSTALL ADDED
@@ -0,0 +1,58 @@
1
+ ====================
2
+ How to install:
3
+ ====================
4
+
5
+ Configure:
6
+
7
+ ruby install.rb config
8
+
9
+ Then, set up:
10
+
11
+ ruby install.rb setup
12
+
13
+ Then, install (this may have to be done as root):
14
+
15
+ ruby install.rb install
16
+
17
+ You can change paths, etc. using options to install.rb.
18
+
19
+ The install task automatically runs the tests and will not install if they fail.
20
+
21
+ ====================
22
+ Usage of install.rb:
23
+ ====================
24
+ ruby install.rb <global option>
25
+ ruby install.rb [<global options>] <task> [<task options>]
26
+
27
+ Global options:
28
+ -q,--quiet suppress message outputs
29
+ --verbose output messages verbosely
30
+ -h,--help print this message
31
+ -v,--version print version and quit
32
+ --copyright print copyright and quit
33
+
34
+ Tasks:
35
+ config saves your configurations
36
+ show shows current configuration
37
+ setup compiles extension or else
38
+ install installs files
39
+ clean does `make clean' for each extension
40
+
41
+ Options for config:
42
+ --prefix=path path prefix of target environment [/usr/local]
43
+ --std-ruby=path the directory for standard ruby libraries [$prefix/lib/ruby/1.6]
44
+ --site-ruby-common=path the directory for version-independent non-standard ruby libraries [$prefix/lib/ruby/site_ruby]
45
+ --site-ruby=path the directory for non-standard ruby libraries [$prefix/lib/ruby/site_ruby/1.6]
46
+ --bin-dir=path the directory for commands [$prefix/bin]
47
+ --rb-dir=path the directory for ruby scripts [$site-ruby]
48
+ --so-dir=path the directory for ruby extensions [$prefix/lib/ruby/site_ruby/1.6/i686-linux]
49
+ --data-dir=path the directory for shared data [$prefix/share]
50
+ --ruby-path=path path to set to #! line [/usr/local/bin/ruby]
51
+ --ruby-prog=name the ruby program used for installation [/usr/local/bin/ruby]
52
+ --make-prog=name the make program to compile ruby extensions [make]
53
+ --without-ext does not compile/install ruby extensions [no]
54
+ --rbconfig=path your rbconfig.rb to load [running ruby's]
55
+
56
+ Options for install:
57
+ --no-harm only display what to do if given [off]
58
+
data/README ADDED
@@ -0,0 +1,162 @@
1
+ Version: 1.10
2
+ 12. September, 2003
3
+
4
+ This is a Ruby library for building trees representing HTML structure.
5
+
6
+ See the file INSTALL for installation instructions.
7
+
8
+ Copyright (C) 2003, Johannes Brodwall <johannes@brodwall.com>
9
+ Copyright (C) 2002, Ned Konz <ned@bike-nomad.com>
10
+
11
+ License: Ruby's
12
+
13
+ See http://rubyforge.org/projects/ruby-htmltools for the most recent version.
14
+
15
+
16
+
17
+ This project includes SGML-parser, ported from Python by Takahiro Maebashi <maebashi@iij.ad.jp> (see: http://www.jin.gr.jp/~nahi/Ruby/html-parser/README.html)
18
+
19
+
20
+
21
+
22
+ =============
23
+ PREREQUISITES
24
+ =============
25
+ Ruby 1.8
26
+
27
+ -------------
28
+ Test::Unit
29
+ -------------
30
+ The tests run using Test::Unit. Test::Unit is part of the standard Ruby install
31
+ as of 1.8
32
+
33
+ -------------
34
+ REXML
35
+ -------------
36
+ XPath support requires REXML. REXML is part of the standard Ruby install
37
+ as of 1.8
38
+
39
+ ===========
40
+ CHANGES
41
+ ===========
42
+ ------------------
43
+ Changes from 1.09:
44
+ ------------------
45
+
46
+ * Some minor bugfixes
47
+ * SGMLParser.src_range makes it very easy to write applications which
48
+ parse HTML files into components and manipulate the corresponding
49
+ source code _without_ altering it. (by Philip Dorrell)
50
+
51
+ ------------------
52
+ Changes from 1.08:
53
+ ------------------
54
+
55
+ * Fixed xpath script and added tests
56
+ * Fixed bug #681 (xhtml)
57
+ * Added GemSpec
58
+
59
+ ------------------
60
+ Changes from 1.07:
61
+ ------------------
62
+
63
+ * Fixed tc_xpath test_match_all after it was broken by upgrade of REXML.
64
+ * Refactored utility code for printing node paths into rexml-nodepath.rb
65
+
66
+ ------------------
67
+ Changes from 1.06:
68
+ ------------------
69
+
70
+ * Included stuff that I had forgotten to package into the tarball.
71
+
72
+ ------------------
73
+ Changes from 1.05:
74
+ ------------------
75
+
76
+ * Updated everything to work with Ruby 1.8.
77
+
78
+
79
+ ------------------
80
+ Changes from 1.04:
81
+ ------------------
82
+
83
+ * Made sure that unknown entities and characters are not discarded, in both
84
+ html/tree.rb and html/xmltree.rb
85
+
86
+ * Added handling of DOCTYPE to html/xmltree.rb
87
+
88
+ ------------------
89
+ Changes from 1.03:
90
+ ------------------
91
+
92
+ * Added HTMLTree::XMLParser, which makes a REXML document from the given HTML.
93
+
94
+ * Changed HTMLTree::Element::print_on() to write()
95
+
96
+ * Made it so that a string or IO can be passed to HTMLTree::Element::dump()
97
+
98
+ * Made it so that a string or IO can be passed to HTMLTree::Element::write()
99
+
100
+ ------------------
101
+ Changes from 1.02:
102
+ ------------------
103
+
104
+ * added XPath and XML conversion (needs REXML)
105
+
106
+ * Wrapped all code in namespaces. The following class names have changed:
107
+
108
+ -- in html/element.rb
109
+ HTMLDocument => HTMLTree::Document
110
+ HTMLElement => HTMLTree::Element
111
+ HTMLData => HTMLTree::Data
112
+ HTMLComment => HTMLTree::Comment
113
+ HTMLSpecial => HTMLTree::Special
114
+
115
+ -- in html/tags.rb
116
+ HTMLTag => HTML::Tag
117
+ HTMLBlockTag => HTML::BlockTag
118
+ HTMLInlineTag => HTML::InlineTag
119
+ HTMLBlockOrInlineTag => HTML::BlockOrInlineTag
120
+ HTMLEmptyTag => HTML::EmptyTag
121
+
122
+ -- in html/tree.rb
123
+ HTMLTreeParser => HTMLTree::Parser
124
+
125
+ -- in html/stparser.rb
126
+ StackingParser => HTML::StackingParser
127
+
128
+ * added HTMLTree::Element.root()
129
+
130
+ ------------------
131
+ Changes from 1.01:
132
+ ------------------
133
+
134
+ * documented change to sgml-parser.
135
+
136
+ * added bin/ebaySearch.rb example
137
+
138
+ ------------------
139
+ Changes from 1.0:
140
+ ------------------
141
+
142
+ * attributes now maintain their order. Though this probably isn't
143
+ strictly necessary under HTML, it may make it easier to compare
144
+ document versions.
145
+
146
+ * the generated tree now has a top-level node for the document itself,
147
+ so the DTD can be stored. THIS WILL REQUIRE CODE CHANGES if you have
148
+ code that assumes that the root node is always <html>. To find the
149
+ <html> node, you can use the new methods HTMLTreeParser#html() or
150
+ HTMLDocument#html_node():
151
+
152
+ html = parser.html()
153
+
154
+ Or, querying the tree:
155
+
156
+ html = parser.tree.html_node()
157
+
158
+ * comments are stored in the tree
159
+
160
+ * added HTMLElement#print_on() to print a (sub)tree to an IO stream
161
+
162
+ vim: ts=2 sw=2 et
@@ -0,0 +1,89 @@
1
+ #!c:/ruby-1.8/bin/ruby
2
+ # This removes GoLive tags and attributes.
3
+ #
4
+ # usage:
5
+ # ruby degolive.rb <infile> > <outfile>
6
+ #
7
+ # or (changes the file in place and saves the originals as .bak):
8
+ # ruby -i.bak degolive.rb files
9
+ #
10
+ # Copyright:: Copyright (C) 2002 Ned Konz
11
+ # License:: Ruby License
12
+ # CVS ID:: $Id: degolive.rb,v 1.1 2003/09/12 18:41:04 jhannes Exp $
13
+ #
14
+ require 'html/tags'
15
+ require 'html/stparser'
16
+
17
# Register the GoLive-specific tags so we can remove them.
# Each row is (name, is_block, is_inline, is_empty, can_omit).
[
  ['CSACTIONS',    true,  false, false, false],
  ['CSACTION',     false, true,  true,  false],
  ['CSSCRIPTDICT', true,  false, false, false],
  ['CSACTIONDICT', true,  false, false, false]
].each { |tag_spec| HTML::Tag.add_tag(*tag_spec) }
23
+
24
# Parser that echoes the markup it is fed to standard output while leaving
# out GoLive-specific content: tags whose names match /^cs[as]/i, output
# produced while the last or parent tag is such a tag, 'csclick' attributes,
# and attributes whose value starts with 'CSAction('.
class GoLiveRemover < HTML::StackingParser
  # Truthy when output should be suppressed, i.e. when +tag+ (if given),
  # the last tag, or the parent tag is one of the GoLive tags.
  def ignoring(tag=nil)
    (tag && tag =~ /^cs[as]/i) ||
      last_tag =~ /^cs[as]/i ||
      parent_tag =~ /^cs[as]/i
  end

  # Prints a start tag (+isStart+ true) or an end tag for +tag+.
  # +attrs+, when given, is a list of [name, value] pairs; GoLive-specific
  # attributes are dropped.
  def printTag(tag, isStart=false, attrs=nil)
    opener = isStart ? "<" : "</"
    print(opener)
    print tag
    (attrs || []).each do |attr|
      attr_name = attr[0]
      attr_value = attr[1]
      # Also need to remove 'csclick="..."'
      # and on.*="CSAction(..." attribs
      next if attr_name == "csclick"
      next if (attr_value || '') =~ /^CSAction\(/

      print " #{attr_name}=\"#{attr_value}\""
    end
    print(">")
  end

  # Echo a start tag with its attributes unless it is being ignored.
  def handle_start_tag(tag, attrs)
    return if ignoring(tag)

    printTag(tag, true, attrs)
  end

  # Echo an empty tag (written as a start tag) unless it is being ignored.
  def handle_empty_tag(tag, attrs)
    return if ignoring(tag)

    printTag(tag, true, attrs)
  end

  # Echo an end tag unless it is being ignored.
  def handle_end_tag(tag)
    return if ignoring(tag)

    printTag(tag, false)
  end

  # Warn about a missing end tag and emit it, marked with a comment.
  def handle_missing_end_tag(tag)
    warn("warning: inserting missing end tag </#{tag}>\n")
    print("</#{tag}><!-- inserted -->")
  end

  # Echo character data unless inside ignored markup.
  def handle_data(data)
    return if ignoring

    print(data)
  end

  # Echo script content unless inside ignored markup.
  def handle_script(data)
    return if ignoring

    print(data)
  end

  # Re-emit an unknown character reference as "&#name;".
  def handle_unknown_character(name)
    return if ignoring

    print("&\##{name};")
  end

  # Re-emit an unknown entity reference as "&name;".
  def handle_unknown_entity(name)
    return if ignoring

    print("&#{name};")
  end

  # Echo comment data unless inside ignored markup.
  def handle_comment(data)
    return if ignoring

    print(data)
  end

  # Echo special data unless inside ignored markup.
  def handle_special(data)
    return if ignoring

    print(data)
  end
end
87
+
88
# Feed every input line (from the files named on the command line, or from
# standard input) to the remover, which prints the filtered markup.
remover = GoLiveRemover.new(true, true)
ARGF.each_line do |line|
  remover.feed(line)
end
@@ -0,0 +1,93 @@
1
+ require 'html/tree'
2
+ require 'net/http'
3
+
4
+ # A demo script showing HTML parsing after a HTTP request.
5
+ # This does an eBay search for the given term(s), and displays the
6
+ # results as a text table, delimited with '|' characters.
7
+ #
8
+ # usage:
9
+ # ruby ebaySearch.rb [-v] searchterm [...]
10
+ #
11
+ # If you give the -d flag, it contacts http://localhost:8080 instead
12
+ # (for testing).
13
+ #
14
+ # Note that actually using this script is in violation of the
15
+ # eBay User Agreement.
16
+ #
17
+ # A real robot would respect the REP.
18
+ #
19
+ # This is just an example.
20
+
21
# Leading command-line flags (accepted in any order):
#   -v  verbose error reporting (passed on to the HTML parser)
#   -d  contact http://localhost:8080 instead of eBay (for testing)
verbose = false
debug = false
while ARGV.first == '-v' || ARGV.first == '-d'
  case ARGV.shift
  when '-v' then verbose = true
  when '-d' then debug = true
  end
end

if ARGV.empty?
  puts "usage: #{$0} [-v] [-d] searchterm [...]"
  puts " -v turns on verbose error reporting"
  puts " -d contacts http://localhost:8080 instead (for testing)"
  exit 2
end

# Search terms are joined with '+'; they are not otherwise URL-escaped.
query = ARGV.join('+')

queryHost = debug ? "localhost" : "search.ebay.com"
queryPort = debug ? 8080 : 80
queryURL = "/search/search.dll" +
  "?MfcISAPICommand=GetResult&ht=1&SortProperty=MetaEndSort&query=#{query}"

# try to look like a real browser (don't know if it matters)...
headers = {
  'User-Agent' => 'Mozilla/5.0 (compatible; Konqueror/3.0.0-10; Linux)',
  'Pragma' => 'no-cache',
  'Cache-control' => 'no-cache',
  'Accept' => 'text/*, image/jpeg, image/png, image/*, */*',
  'Accept-Encoding' => 'x-gzip, gzip, identity',
  'Accept-Charset' => 'ISO-8859-1',
}

data = ""

# add these non-HTML 4.0 tags because eBay seems to use them
# (name, is_block, is_inline, is_empty, can_omit)
HTML::Tag.add_tag('ilayer', true, false, true, true)
HTML::Tag.add_tag('layer', true, false, true, true)
HTML::Tag.add_tag('nolayer', true, false, true, true)
HTML::Tag.add_tag('noframe', true, false, false, false)

http = nil
resp = nil
begin
  http = Net::HTTP.new(queryHost, queryPort)
  # http.open_timeout = 30
  http.read_timeout = 120
  # Net::HTTP#get returns just the response object, so the body has to be
  # read from it. Net::HTTP.version_1_1 (which made get return a
  # [response, body] pair and raise on failure) is gone from current Ruby;
  # resp.value keeps the "raise unless 2xx" part of that behaviour.
  resp = http.get(queryURL, headers)
  resp.value
  data = resp.body || ""
rescue StandardError => e
  print 'error:'
  puts http.inspect
  puts resp.inspect
  # Say what actually went wrong instead of only dumping the objects.
  $stderr.puts "#{e.class}: #{e.message}"
  exit 1
end

parser = HTMLTree::Parser.new(verbose, false)
parser.feed(data)

# Find all ViewItem links. These are in table rows for each item.
itemAnchors = parser.html.select { |ea| ea.tag == 'a' && ea['href'] =~ /ViewItem/ }

# Now find their rows by going up to the first <tr>.
# An anchor with no <tr> ancestor is skipped rather than crashing on a
# missing parent (assumes the topmost node's parent is nil).
itemRows = itemAnchors.map { |ea|
  ea = ea.parent while ea && ea.tag != 'tr'
  ea
}.compact

# print the text from them
itemRows.each { |row|
  texts = row.select { |item| item.data? }.  # just look at cdata
    map { |text| text.strip }.               # strip it
    select { |text| text.size > 0 }          # and keep the non-blank fields
  puts texts.join('|')
}
@@ -0,0 +1,62 @@
1
+ #!/usr/bin/ruby
2
+ # This is a demo program that takes a given HTML file, parses it,
3
+ # and allows exploration of XPath queries.
4
+ require 'html/xmltree'
5
+ require 'html/rexml-nodepath'
6
+ require 'rexml/xpath'
7
+
8
+
9
# Prints one line for every node of +d+ matched by the XPath expression
# +path+, in the form "<node path> --> <node text>". Always returns nil.
def displayXPath(d, path)
  REXML::XPath.each(d, path) { |match|
    line = match.full_path + " --> "
    line += match.to_s
    puts line
  }
  nil
end
15
+
16
+
17
# At least the HTML file name is required; otherwise print the usage text
# to standard error and exit with status 1.
if ARGV.size < 1
  $stderr.puts(
    "usage: #{$0} file.html interactive\n" \
    "#{$0} file.html expressions read expressions from expressions\n"
  )
  exit(1)
end
23
+
24
# Decide whether prompts can use Readline line editing. This is only tried
# when standard error is a terminal; if the readline library cannot be
# loaded, plain reading is used instead.
$use_readline = false
if $stderr.tty?
  $stdout.sync = true
  $use_readline =
    begin
      require 'readline'
      true
    rescue LoadError
      false
    end
  if $use_readline
    $stderr.puts('line editing enabled')
    trap('SIGINT', 'IGNORE')
  end
end
36
+
37
# Shows +prompt+ and reads one line of input.
# With line editing enabled ($use_readline) the line comes from Readline
# and is added to its history; otherwise the prompt is written to $stdout
# and the line is read from $stdin. Returns nil at end of input.
def getline(prompt)
  return Readline.readline(prompt, true) if $use_readline

  $stdout.print prompt
  $stdin.gets
end
45
+
46
# The first argument names the HTML file; parse it and keep the document.
inputFile = ARGV.shift
parser = HTMLTree::XMLParser.new(true)
parser.parse_file_named(inputFile)
document = parser.document

# Any remaining arguments are evaluated as XPath expressions, then we quit.
unless ARGV.empty?
  ARGV.each { |path| displayXPath(document, path) }
  exit
end

# Otherwise prompt for expressions until end of input; the long prompt is
# shown only the first time.
prompt = 'Enter XPath expression on a single line (ctrl-D (unix) or ctrl-Z (win) to quit): '
while (expr = getline(prompt))
  displayXPath(document, expr)
  prompt = 'expr: '
end