htmltools 1.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/INSTALL ADDED
@@ -0,0 +1,58 @@
1
+ ====================
2
+ How to install:
3
+ ====================
4
+
5
+ Configure:
6
+
7
+ ruby install.rb config
8
+
9
+ Then, set up:
10
+
11
+ ruby install.rb setup
12
+
13
+ Then, install (this may have to be done as root):
14
+
15
+ ruby install.rb install
16
+
17
+ You can change paths, etc. using options to install.rb.
18
+
19
+ The install task automatically runs the tests and will not install if they fail.
20
+
21
+ ====================
22
+ Usage of install.rb:
23
+ ====================
24
+ ruby install.rb <global option>
25
+ ruby install.rb [<global options>] <task> [<task options>]
26
+
27
+ Global options:
28
+ -q,--quiet suppress message outputs
29
+ --verbose output messages verbosely
30
+ -h,--help print this message
31
+ -v,--version print version and quit
32
+ --copyright print copyright and quit
33
+
34
+ Tasks:
35
+ config saves your configurations
36
+ show shows current configuration
37
+ setup compiles extensions, if any
38
+ install installs files
39
+ clean does `make clean' for each extension
40
+
41
+ Options for config:
42
+ --prefix=path path prefix of target environment [/usr/local]
43
+ --std-ruby=path the directory for standard ruby libraries [$prefix/lib/ruby/1.6]
44
+ --site-ruby-common=path the directory for version-independent non-standard ruby libraries [$prefix/lib/ruby/site_ruby]
45
+ --site-ruby=path the directory for non-standard ruby libraries [$prefix/lib/ruby/site_ruby/1.6]
46
+ --bin-dir=path the directory for commands [$prefix/bin]
47
+ --rb-dir=path the directory for ruby scripts [$site-ruby]
48
+ --so-dir=path the directory for ruby extensions [$prefix/lib/ruby/site_ruby/1.6/i686-linux]
49
+ --data-dir=path the directory for shared data [$prefix/share]
50
+ --ruby-path=path path to set in the #! line [/usr/local/bin/ruby]
51
+ --ruby-prog=name the ruby program used for installation [/usr/local/bin/ruby]
52
+ --make-prog=name the make program to compile ruby extensions [make]
53
+ --without-ext do not compile/install ruby extensions [no]
54
+ --rbconfig=path your rbconfig.rb to load [running ruby's]
55
+
56
+ Options for install:
57
+ --no-harm only display what to do if given [off]
58
+
data/README ADDED
@@ -0,0 +1,162 @@
1
+ Version: 1.10
2
+ 12. September, 2003
3
+
4
+ This is a Ruby library for building trees representing HTML structure.
5
+
6
+ See the file INSTALL for installation instructions.
7
+
8
+ Copyright (C) 2003, Johannes Brodwall <johannes@brodwall.com>
9
+ Copyright (C) 2002, Ned Konz <ned@bike-nomad.com>
10
+
11
+ License: Ruby's
12
+
13
+ See http://rubyforge.org/projects/ruby-htmltools for the most recent version.
14
+
15
+
16
+
17
+ This project includes SGML-parser, ported from Python by Takahiro Maebashi <maebashi@iij.ad.jp> (see: http://www.jin.gr.jp/~nahi/Ruby/html-parser/README.html)
18
+
19
+
20
+
21
+
22
+ =============
23
+ PREREQUISITES
24
+ =============
25
+ Ruby 1.8
26
+
27
+ -------------
28
+ Test::Unit
29
+ -------------
30
+ The tests run using Test::Unit. Test::Unit is part of the standard Ruby install
31
+ as of 1.8
32
+
33
+ -------------
34
+ REXML
35
+ -------------
36
+ XPath support requires REXML. REXML is part of the standard Ruby install
37
+ as of 1.8
38
+
39
+ ===========
40
+ CHANGES
41
+ ===========
42
+ ------------------
43
+ Changes from 1.09:
44
+ ------------------
45
+
46
+ * Some minor bugfixes
47
+ * SGMLParser.src_range makes it very easy to write applications which
48
+ parse HTML files into components and manipulate the corresponding
49
+ source code _without_ altering it. (by Philip Dorrell)
50
+
51
+ ------------------
52
+ Changes from 1.08:
53
+ ------------------
54
+
55
+ * Fixed xpath script and added tests
56
+ * Fixed bug #681 (xhtml)
57
+ * Added GemSpec
58
+
59
+ ------------------
60
+ Changes from 1.07:
61
+ ------------------
62
+
63
+ * Fixed tc_xpath test_match_all after it was broken by upgrade of REXML.
64
+ * Refactored utility code for printing node paths into rexml-nodepath.rb
65
+
66
+ ------------------
67
+ Changes from 1.06:
68
+ ------------------
69
+
70
+ * Included stuff that I had forgotten to package into the tarball.
71
+
72
+ ------------------
73
+ Changes from 1.05:
74
+ ------------------
75
+
76
+ * Updated everything to work with Ruby 1.8.
77
+
78
+
79
+ ------------------
80
+ Changes from 1.04:
81
+ ------------------
82
+
83
+ * Made sure that unknown entities and characters are not discarded, in both
84
+ html/tree.rb and html/xmltree.rb
85
+
86
+ * Added handling of DOCTYPE to html/xmltree.rb
87
+
88
+ ------------------
89
+ Changes from 1.03:
90
+ ------------------
91
+
92
+ * Added HTMLTree::XMLParser, which makes a REXML document from the given HTML.
93
+
94
+ * Changed HTMLTree::Element::print_on() to write()
95
+
96
+ * Made it so that a string or IO can be passed to HTMLTree::Element::dump()
97
+
98
+ * Made it so that a string or IO can be passed to HTMLTree::Element::write()
99
+
100
+ ------------------
101
+ Changes from 1.02:
102
+ ------------------
103
+
104
+ * added XPath and XML conversion (needs REXML)
105
+
106
+ * Wrapped all code in namespaces. The following class names have changed:
107
+
108
+ -- in html/element.rb
109
+ HTMLDocument => HTMLTree::Document
110
+ HTMLElement => HTMLTree::Element
111
+ HTMLData => HTMLTree::Data
112
+ HTMLComment => HTMLTree::Comment
113
+ HTMLSpecial => HTMLTree::Special
114
+
115
+ -- in html/tags.rb
116
+ HTMLTag => HTML::Tag
117
+ HTMLBlockTag => HTML::BlockTag
118
+ HTMLInlineTag => HTML::InlineTag
119
+ HTMLBlockOrInlineTag => HTML::BlockOrInlineTag
120
+ HTMLEmptyTag => HTML::EmptyTag
121
+
122
+ -- in html/tree.rb
123
+ HTMLTreeParser => HTMLTree::Parser
124
+
125
+ -- in html/stparser.rb
126
+ StackingParser => HTML::StackingParser
127
+
128
+ * added HTMLTree::Element.root()
129
+
130
+ ------------------
131
+ Changes from 1.01:
132
+ ------------------
133
+
134
+ * documented change to sgml-parser.
135
+
136
+ * added bin/ebaySearch.rb example
137
+
138
+ ------------------
139
+ Changes from 1.0:
140
+ ------------------
141
+
142
+ * attributes now maintain their order. Though this probably isn't
143
+ strictly necessary under HTML, it may make it easier to compare
144
+ document versions.
145
+
146
+ * the generated tree now has a top-level node for the document itself,
147
+ so the DTD can be stored. THIS WILL REQUIRE CODE CHANGES if you have
148
+ code that assumes that the root node is always <html>. To find the
149
+ <html> node, you can use the new methods HTMLTreeParser#html() or
150
+ HTMLDocument#html_node():
151
+
152
+ html = parser.html()
153
+
154
+ Or, querying the tree:
155
+
156
+ html = parser.tree.html_node()
157
+
158
+ * comments are stored in the tree
159
+
160
+ * added HTMLElement#print_on() to print a (sub)tree to an IO stream
161
+
162
+ vim: ts=2 sw=2 et
@@ -0,0 +1,89 @@
1
+ #!c:/ruby-1.8/bin/ruby
2
+ # This removes GoLive tags and attributes.
3
+ #
4
+ # usage:
5
+ # ruby degolive.rb \<infile> > <outfile>
6
+ #
7
+ # or (changes the file in place and saves the originals as .bak):
8
+ # ruby -i.bak degolive.rb files
9
+ #
10
+ # Copyright:: Copyright (C) 2002 Ned Konz
11
+ # License:: Ruby License
12
+ # CVS ID:: $Id: degolive.rb,v 1.1 2003/09/12 18:41:04 jhannes Exp $
13
+ #
14
+ require 'html/tags'
15
+ require 'html/stparser'
16
+
17
# Register the GoLive-specific tags so that they can be removed.
# Columns: name, is_block, is_inline, is_empty, can_omit
[
  ['CSACTIONS',    true,  false, false, false],
  ['CSACTION',     false, true,  true,  false],
  ['CSSCRIPTDICT', true,  false, false, false],
  ['CSACTIONDICT', true,  false, false, false]
].each { |spec| HTML::Tag.add_tag(*spec) }
23
+
24
# Parser that prints the HTML it is fed while leaving out GoLive markup:
# tags whose name starts with "csa" or "css" (case-insensitive), anything
# seen while last_tag or parent_tag is such a tag, and the GoLive action
# attributes found on ordinary tags.
class GoLiveRemover < HTML::StackingParser
  # Matches the names of the GoLive tags (CSACTIONS, CSACTION, CSSCRIPTDICT, ...).
  GOLIVE_TAG = /^cs[as]/i

  # Returns a true value if we are in the scope of a bad tag, i.e. if +tag+
  # (when given), last_tag or parent_tag names a GoLive tag.
  def ignoring(tag=nil)
    (tag && tag =~ GOLIVE_TAG) ||
      last_tag =~ GOLIVE_TAG ||
      parent_tag =~ GOLIVE_TAG
  end

  # Prints "<tag ...>" when +isStart+ is true, "</tag>" otherwise.
  # +attrs+, when given, is a list of [name, value] pairs. Attributes named
  # "csclick" and attributes whose value starts with "CSAction(" are dropped.
  def printTag(tag, isStart=false, attrs=nil)
    print(isStart ? "<" : "</")
    print tag
    (attrs || []).each do |a|
      name  = a[0]
      value = a[1].to_s
      # Also need to remove 'csclick="..."'
      # and on.*="CSAction(..." attribs
      next if name == "csclick" || value =~ /^CSAction\(/
      # The value is written inside double quotes, so a literal double quote
      # in it has to be escaped or it would end the attribute early.
      print " #{name}=\"#{value.gsub('"', '&quot;')}\""
    end
    print(">")
  end

  # Echoes a start tag with its attributes unless it is being ignored.
  def handle_start_tag(tag, attrs)
    printTag(tag, true, attrs) unless ignoring(tag)
  end

  # Echoes an empty tag with its attributes unless it is being ignored.
  def handle_empty_tag(tag, attrs)
    printTag(tag, true, attrs) unless ignoring(tag)
  end

  # Echoes an end tag unless it is being ignored.
  def handle_end_tag(tag)
    printTag(tag, false) unless ignoring(tag)
  end

  # Warns about a missing end tag and prints it, marked with a comment.
  def handle_missing_end_tag(tag)
    warn("warning: inserting missing end tag </#{tag}>\n")
    print("</#{tag}><!-- inserted -->")
  end

  # Echoes character data unless we are inside a GoLive tag.
  def handle_data(data)
    print(data) unless ignoring
  end

  # Echoes script content unless we are inside a GoLive tag.
  def handle_script(data)
    print(data) unless ignoring
  end

  # Re-emits an unknown character reference as "&#name;".
  def handle_unknown_character(name)
    print("&\##{name};") unless ignoring
  end

  # Re-emits an unknown entity reference as "&name;".
  def handle_unknown_entity(name)
    print("&#{name};") unless ignoring
  end

  # Echoes the comment data unless we are inside a GoLive tag.
  def handle_comment(data)
    print(data) unless ignoring
  end

  # Echoes the special data unless we are inside a GoLive tag.
  def handle_special(data)
    print(data) unless ignoring
  end
end
87
+
88
# Feed every input line (from the files named on the command line, or from
# standard input) to the remover.
remover = GoLiveRemover.new(true, true)
ARGF.each_line do |line|
  remover.feed(line)
end
@@ -0,0 +1,93 @@
1
+ require 'html/tree'
2
+ require 'net/http'
3
+
4
+ # A demo script showing HTML parsing after a HTTP request.
5
+ # This does an eBay search for the given term(s), and displays the
6
+ # results as a text table, delimited with '|' characters.
7
+ #
8
+ # usage:
9
+ # ruby ebaySearch.rb [-v] searchterm [...]
10
+ #
11
+ # If you give the -v flag, verbose error reporting is turned on.
12
+ # (A -d flag for testing against http://localhost:8080 is not implemented.)
13
+ #
14
+ # Note that actually using this script is in violation of the
15
+ # eBay User Agreement.
16
+ #
17
+ # A real robot would respect the Robots Exclusion Protocol (robots.txt).
18
+ #
19
+ # This is just an example.
20
+
21
+ verbose = false
22
+ if ARGV[0] == '-v'
23
+ verbose = true
24
+ ARGV.shift
25
+ end
26
+
27
+ unless ARGV.size > 0
28
+ puts "usage: #{$0} [-v] searchterm [...]"
29
+ puts " -v turns on verbose error reporting"
30
+ exit 2
31
+ end
32
+
33
+ query = ARGV.join('+')
34
+
35
+ queryHost = "search.ebay.com"
36
+ queryPort = 80
37
+ queryURL = "/search/search.dll" +
38
+ "?MfcISAPICommand=GetResult&ht=1&SortProperty=MetaEndSort&query=#{query}"
39
+
40
+ # try to look like a real browser (don't know if it matters)...
41
+ headers = {
42
+ 'User-Agent' => 'Mozilla/5.0 (compatible; Konqueror/3.0.0-10; Linux)',
43
+ 'Pragma' => 'no-cache',
44
+ 'Cache-control' => 'no-cache',
45
+ 'Accept' => 'text/*, image/jpeg, image/png, image/*, */*',
46
+ 'Accept-Encoding' => 'x-gzip, gzip, identity',
47
+ 'Accept-Charset' => 'ISO-8859-1',
48
+ }
49
+
50
+ data = ""
51
+
52
+ # add these non-HTML 4.0 tags because eBay seems to use them
53
+ # (name, is_block, is_inline, is_empty, can_omit)
54
+ HTML::Tag.add_tag('ilayer', true, false, true, true)
55
+ HTML::Tag.add_tag('layer', true, false, true, true)
56
+ HTML::Tag.add_tag('nolayer', true, false, true, true)
57
+ HTML::Tag.add_tag('noframe', true, false, false, false)
58
+
59
+ begin
60
+ Net::HTTP.version_1_1
61
+ http = Net::HTTP.new(queryHost, queryPort)
62
+ # http.open_timeout = 30
63
+ http.read_timeout = 120
64
+ resp, data = http.get(queryURL, headers)
65
+ rescue
66
+ print 'error:'
67
+ puts http.inspect
68
+ puts resp.inspect
69
+ exit 1
70
+ end
71
+
72
+
73
+ p = HTMLTree::Parser.new(verbose, false)
74
+ p.feed(data)
75
+
76
+ # Find all ViewItem links. These are in table rows for each item.
77
+ itemAnchors = p.html.select { |ea| ea.tag == 'a' && ea['href'] =~ /ViewItem/ }
78
+
79
+ # Now find their rows by going up to the first <tr>
80
+ itemRows = itemAnchors.collect { |ea|
81
+ while ea.tag != 'tr'
82
+ ea = ea.parent
83
+ end
84
+ ea
85
+ }
86
+
87
+ # print the text from them
88
+ itemRows.each { |row|
89
+ texts = row.select { |item| item.data? }. # just look at cdata
90
+ collect { |data| data.strip }. # strip it
91
+ select { |data| data.size > 0 } # and keep the non-blank fields
92
+ puts texts.join('|')
93
+ }
@@ -0,0 +1,62 @@
1
+ #!/usr/bin/ruby
2
+ # This is a demo program that takes a given HTML file, parses it,
3
+ # and allows exploration of XPath queries.
4
+ require 'html/xmltree'
5
+ require 'html/rexml-nodepath'
6
+ require 'rexml/xpath'
7
+
8
+
9
# Evaluates the XPath expression +path+ against +d+ and prints one line per
# match: the node's full_path, then " --> ", then the node as a string.
# Always returns nil.
def displayXPath(d, path)
  REXML::XPath.each(d, path) { |match|
    location = match.full_path
    puts location + " --> " + match.to_s
  }
  nil
end
15
+
16
+
17
# The HTML file name is mandatory. Any further arguments are evaluated as
# XPath expressions; without them the script runs interactively.
if ARGV.empty?
  $stderr.puts("usage: #{$0} file.html                  interactive")
  $stderr.puts("       #{$0} file.html expression [...] evaluate the given XPath expressions, then exit")
  exit(1)
end
23
+
24
# Decide whether getline may use Readline. Only attempted when $stderr is a
# terminal; $stdout is switched to sync mode in that case as well.
$use_readline = false
if $stderr.isatty
  $stdout.sync = true
  begin
    require 'readline'
  rescue LoadError
    # readline is not available: $use_readline stays false
    $use_readline = false
  else
    $use_readline = true
    $stderr.puts('line editing enabled')
    trap('SIGINT', 'IGNORE')
  end
end
36
+
37
+ def getline(prompt)
38
+ if $use_readline
39
+ Readline.readline(prompt, true)
40
+ else
41
+ $stdout.print prompt
42
+ $stdin.gets
43
+ end
44
+ end
45
+
46
# Parse the HTML file named by the first argument.
input_file = ARGV.shift
parser = HTMLTree::XMLParser.new(true)
parser.parse_file_named(input_file)
d = parser.document

# Batch mode: every remaining argument is evaluated as an XPath expression.
unless ARGV.empty?
  ARGV.each { |path| displayXPath(d, path) }
  exit
end

# Interactive mode: read expressions until end of input.
prompt = 'Enter XPath expression on a single line (ctrl-D (unix) or ctrl-Z (win) to quit): '
while (expr = getline(prompt))
  prompt = 'expr: '
  # Drop the line terminator and ignore empty input.
  expr = expr.strip
  next if expr.empty?
  begin
    displayXPath(d, expr)
  rescue StandardError => e
    # A mistyped expression should be reported, not end the session.
    $stderr.puts "#{e.class}: #{e.message}"
  end
end