hpricot 0.8.3-i386-mswin32
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +104 -0
- data/COPYING +18 -0
- data/README.md +276 -0
- data/Rakefile +234 -0
- data/ext/fast_xs/FastXsService.java +1123 -0
- data/ext/fast_xs/extconf.rb +4 -0
- data/ext/fast_xs/fast_xs.c +210 -0
- data/ext/hpricot_scan/HpricotCss.java +850 -0
- data/ext/hpricot_scan/HpricotScanService.java +2099 -0
- data/ext/hpricot_scan/extconf.rb +9 -0
- data/ext/hpricot_scan/hpricot_common.rl +76 -0
- data/ext/hpricot_scan/hpricot_css.c +3511 -0
- data/ext/hpricot_scan/hpricot_css.java.rl +155 -0
- data/ext/hpricot_scan/hpricot_css.rl +120 -0
- data/ext/hpricot_scan/hpricot_scan.c +7039 -0
- data/ext/hpricot_scan/hpricot_scan.h +79 -0
- data/ext/hpricot_scan/hpricot_scan.java.rl +1161 -0
- data/ext/hpricot_scan/hpricot_scan.rl +896 -0
- data/extras/hpricot.png +0 -0
- data/lib/fast_xs.rb +1 -0
- data/lib/fast_xs/1.8/fast_xs.so +0 -0
- data/lib/fast_xs/1.9/fast_xs.so +0 -0
- data/lib/hpricot.rb +26 -0
- data/lib/hpricot/blankslate.rb +63 -0
- data/lib/hpricot/builder.rb +216 -0
- data/lib/hpricot/elements.rb +514 -0
- data/lib/hpricot/htmlinfo.rb +691 -0
- data/lib/hpricot/inspect.rb +103 -0
- data/lib/hpricot/modules.rb +40 -0
- data/lib/hpricot/parse.rb +38 -0
- data/lib/hpricot/tag.rb +219 -0
- data/lib/hpricot/tags.rb +164 -0
- data/lib/hpricot/traverse.rb +839 -0
- data/lib/hpricot/xchar.rb +94 -0
- data/lib/hpricot_scan.rb +1 -0
- data/lib/hpricot_scan/1.8/hpricot_scan.so +0 -0
- data/lib/hpricot_scan/1.9/hpricot_scan.so +0 -0
- data/test/files/basic.xhtml +17 -0
- data/test/files/boingboing.html +2266 -0
- data/test/files/cy0.html +3653 -0
- data/test/files/immob.html +400 -0
- data/test/files/pace_application.html +1320 -0
- data/test/files/tenderlove.html +16 -0
- data/test/files/uswebgen.html +220 -0
- data/test/files/utf8.html +1054 -0
- data/test/files/week9.html +1723 -0
- data/test/files/why.xml +19 -0
- data/test/load_files.rb +7 -0
- data/test/nokogiri-bench.rb +64 -0
- data/test/test_alter.rb +96 -0
- data/test/test_builder.rb +37 -0
- data/test/test_parser.rb +457 -0
- data/test/test_paths.rb +25 -0
- data/test/test_preserved.rb +88 -0
- data/test/test_xml.rb +28 -0
- metadata +128 -0
data/CHANGELOG
ADDED
@@ -0,0 +1,104 @@
|
|
1
|
+
= 0.8.3
|
2
|
+
=== 3 November, 2010
|
3
|
+
* GH#8: Nil-check before downcasing attribute key
|
4
|
+
* GH#25: Proper ruby 1.9 encoding support
|
5
|
+
* GH#28. Use integers instead of ?? on 1.9, which is just a string.
|
6
|
+
* including noscript in ElementInclusions, so that hpricot won't fail
|
7
|
+
when trying to parse a meta tag inside head section when noscript is
|
8
|
+
present.
|
9
|
+
* latest changes from fast_xs mainline
|
10
|
+
* Fixes to get Hpricot running on Rubinius:
|
11
|
+
* Use free, not XFREE
|
12
|
+
* Remove RSTRUCT craziness, don't break Array#at
|
13
|
+
|
14
|
+
= 0.8.2
|
15
|
+
=== 5 November, 2009
|
16
|
+
* Bring JRuby support up to speed, including Java-based hpricot_css support
|
17
|
+
* Change JRuby fast_xs to have same escaping behavior as C fast_xs
|
18
|
+
* fix for issue #2, downcasing of html attributes inside the parser.
|
19
|
+
* solve issue #3 with bogus etags being preserved in `to_s` rather than just `to_original_html`.
|
20
|
+
* fix error when attempting to reparent cleared node. (issue #5)
|
21
|
+
* Hpricot::Attributes proxy object for using `ele.attributes[k] = v` directly.
|
22
|
+
however, it is preferred to use the jquery-like `elements.attr(k, v)`.
|
23
|
+
|
24
|
+
= 0.8.1
|
25
|
+
=== 3 April, 2009
|
26
|
+
* big problems on Ruby 1.8.6, use INT2FIX instead of INT2NUM. hashes were being cast to bignums.
|
27
|
+
* patch for 1.8.5 to define RARRAY_PTR. thanks, mike perham!
|
28
|
+
* inspecting empty document bug, courtesy of @TalLevAmi.
|
29
|
+
|
30
|
+
= 0.8
|
31
|
+
=== 31st March, 2009
|
32
|
+
* Saving memory and speed by using RStruct-based elements in the C extension.
|
33
|
+
* Bug in tag parsing, causing runaway <script> and <style> tags in HTML.
|
34
|
+
* Problem compiling under Ruby 1.9, due to our_rb_hash_lookup function meant for Ruby 1.8.
|
35
|
+
* CData was missing inner_text method.
|
36
|
+
|
37
|
+
= 0.7
|
38
|
+
=== 17th March, 2009
|
39
|
+
* Rewritten parser routine, much lighter on memory, quite a bit faster.
|
40
|
+
* Friendlier with Ruby 1.9.
|
41
|
+
* Fixes to nth-child and text() selectors.
|
42
|
+
|
43
|
+
= 0.6
|
44
|
+
=== 15th June, 2007
|
45
|
+
* Hpricot for JRuby -- nice work Ola Bini!
|
46
|
+
* Inline Markaby for Hpricot documents.
|
47
|
+
* XML tags and attributes are no longer downcased like HTML is.
|
48
|
+
* new syntax for grabbing everything between two elements using a Range in the search method: (doc/("font".."font/br")) or in nodes_at like so: (doc/"font").nodes_at("*".."br"). Only works with either a pair of siblings or a set of a parent and a sibling.
|
49
|
+
* Ignore self-closing endings on tags (such as form) which are containers. Treat them like open parent tags. Reported by Jonathan Nichols on the hpricot list.
|
50
|
+
* Escaping of attributes, yanked from Jim Weirich and Sam Ruby's work in Builder.
|
51
|
+
* Element#raw_attributes gives unescaped data. Element#attributes gives escaped.
|
52
|
+
* Added: Elements#attr, Elements#remove_attr, Elements#remove_class.
|
53
|
+
* Added: Traverse#preceding, Traverse#following, Traverse#previous, Traverse#next.
|
54
|
+
|
55
|
+
= 0.5
|
56
|
+
=== 31st January, 2007
|
57
|
+
|
58
|
+
* support for a[text()="Click Me!"] and h3[text()*="space"] and the like.
|
59
|
+
* Hpricot.buffer_size accessor for increasing Hpricot's buffer if you're encountering huge ASP.NET viewstate attribs.
|
60
|
+
* some support for colons in tag names (not full namespace support yet.)
|
61
|
+
* Element.to_original_html will attempt to preserve the original HTML while merging your changes.
|
62
|
+
* Element.to_plain_text converts an element's contents to a simple text format.
|
63
|
+
* Element.inner_text removes all tags and returns text nodes concatenated into a single string.
|
64
|
+
* no @raw_string variable kept for comments, text, and cdata -- as it's redundant.
|
65
|
+
* xpath-style indices (//p/a[1]) but keep in mind that they aren't zero-based.
|
66
|
+
* node_position is the index among all sibling nodes, while position is the position among children of identical type.
|
67
|
+
* comment() and text() search criteria, like: //p/text(), which selects all text inside paragraph tags.
|
68
|
+
* every element has css_path and xpath methods which return respective absolute paths.
|
69
|
+
* more flexibility all around: in parsing attributes, tags, comments and cdata.
|
70
|
+
|
71
|
+
= 0.4
|
72
|
+
=== 11th August, 2006
|
73
|
+
|
74
|
+
* The :fixup_tags option will try to sort out the hierarchy so elements end up with the right parents.
|
75
|
+
* Elements such as *script* and *style* (identified as having CDATA contents) receive a single text node as their children now. Previously, Hpricot was parsing out tags found in scripts.
|
76
|
+
* Better scanning of partially quoted attributes (found by Brent Beardsly on http://uswebgen.com/)
|
77
|
+
* Better scanning of unquoted attributes -- thanks to Aaron Patterson for the test cases!
|
78
|
+
* Some tags were being output in the empty tag style, although browsers hated that. FIXED!
|
79
|
+
* Added Elements#at for finding single elements.
|
80
|
+
* Added Elem::Trav#[] and Elem::Trav#[]= for reading and writing attributes.
|
81
|
+
|
82
|
+
= 0.3
|
83
|
+
=== 7th July, 2006
|
84
|
+
|
85
|
+
* Fixed negative string size error on empty tokens. (news.bbc.co.uk)
|
86
|
+
* Allow the parser to accept just text nodes. (such as: <tt>Hpricot.parse('TEXT')</tt>)
|
87
|
+
* from JQuery to Hpricot::Elements: remove, empty, append, prepend, before, after, wrap, set,
|
88
|
+
html(...), to_html, to_s.
|
89
|
+
* on containers: to_html, replace_child, insert_before, insert_after, innerHTML=.
|
90
|
+
* Hpricot(...) is an alias for parse.
|
91
|
+
* open up all properties to setters, let people do as they may.
|
92
|
+
* use to_html for the full html of a node or set of elements.
|
93
|
+
* doctypes were messed.
|
94
|
+
|
95
|
+
= 0.2
|
96
|
+
=== 4th July, 2006
|
97
|
+
|
98
|
+
* Rewrote the HTree parser to be simpler, more adequate for the common man. Will add encoding back in later.
|
99
|
+
|
100
|
+
= 0.1
|
101
|
+
=== 3rd July, 2006
|
102
|
+
|
103
|
+
* For whatever reason, wrote this HTML parser in C.
|
104
|
+
I guess Ragel is addictive and I want to improve HTree.
|
data/COPYING
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
Copyright (c) 2006 why the lucky stiff
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
of this software and associated documentation files (the "Software"), to
|
5
|
+
deal in the Software without restriction, including without limitation the
|
6
|
+
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
7
|
+
sell copies of the Software, and to permit persons to whom the Software is
|
8
|
+
furnished to do so, subject to the following conditions:
|
9
|
+
|
10
|
+
The above copyright notice and this permission notice shall be included in
|
11
|
+
all copies or substantial portions of the Software.
|
12
|
+
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
16
|
+
THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
17
|
+
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
18
|
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,276 @@
|
|
1
|
+
# Hpricot, Read Any HTML
|
2
|
+
|
3
|
+
Hpricot is a fast, flexible HTML parser written in C. It's designed to be very
|
4
|
+
accommodating (like Tanaka Akira's HTree) and to have a very helpful library
|
5
|
+
(like some JavaScript libs -- JQuery, Prototype -- give you.) The XPath and CSS
|
6
|
+
parser, in fact, is based on John Resig's JQuery.
|
7
|
+
|
8
|
+
Also, Hpricot can be handy for reading broken XML files, since many of the same
|
9
|
+
techniques can be used. If a quote is missing, Hpricot tries to figure it out.
|
10
|
+
If tags overlap, Hpricot works on sorting them out. You know, that sort of
|
11
|
+
thing.
|
12
|
+
|
13
|
+
*Please read this entire document* before making assumptions about how this
|
14
|
+
software works.
|
15
|
+
|
16
|
+
## An Overview
|
17
|
+
|
18
|
+
Let's clear up what Hpricot is.
|
19
|
+
|
20
|
+
* Hpricot is *a standalone library*. It requires no other libraries. Just Ruby!
|
21
|
+
* While priding itself on speed, Hpricot *works hard to sort out bad HTML* and
|
22
|
+
pays a small penalty in order to get that right. So that's slightly more important
|
23
|
+
to me than speed.
|
24
|
+
* *If you can see it in Firefox, then Hpricot should parse it.* That's
|
25
|
+
how it should be! Let me know the minute it's otherwise.
|
26
|
+
* Primarily, Hpricot is used for reading HTML and tries to sort out troubled
|
27
|
+
HTML by having some idea of what good HTML is. Some people still like to use
|
28
|
+
Hpricot for XML reading, but *remember to use the Hpricot::XML() method* for that!
|
29
|
+
|
30
|
+
## The Hpricot Kingdom
|
31
|
+
|
32
|
+
First, here are all the links you need to know:
|
33
|
+
|
34
|
+
* http://wiki.github.com/hpricot/hpricot is the Hpricot wiki and
|
35
|
+
http://github.com/hpricot/hpricot/issues is the bug tracker.
|
36
|
+
Go there for news and recipes and patches. It's the center of activity.
|
37
|
+
* http://github.com/hpricot/hpricot is the main Git
|
38
|
+
repository for Hpricot. You can get the latest code there.
|
39
|
+
* See COPYING for the terms of this software. (Spoiler: it's absolutely free.)
|
40
|
+
|
41
|
+
If you have any trouble, don't hesitate to contact the author. As always, I'm
|
42
|
+
not going to say "Use at your own risk" because I don't want this library to be
|
43
|
+
risky. If you trip on something, I'll share the liability by repairing things
|
44
|
+
as quickly as I can. Your responsibility is to report the inadequacies.
|
45
|
+
|
46
|
+
## Installing Hpricot
|
47
|
+
|
48
|
+
You may get the latest stable version from Rubyforge. Win32 binaries,
|
49
|
+
Java binaries (for JRuby), and source gems are available.
|
50
|
+
|
51
|
+
$ gem install hpricot
|
52
|
+
|
53
|
+
## An Hpricot Showcase
|
54
|
+
|
55
|
+
We're going to run through a big pile of examples to get you jump-started.
|
56
|
+
Many of these examples are also found at
|
57
|
+
http://wiki.github.com/hpricot/hpricot/hpricot-basics, in case you
|
58
|
+
want to add some of your own.
|
59
|
+
|
60
|
+
### Loading Hpricot Itself
|
61
|
+
|
62
|
+
You have probably got the gem, right? To load Hpricot:
|
63
|
+
|
64
|
+
require 'rubygems'
|
65
|
+
require 'hpricot'
|
66
|
+
|
67
|
+
If you've installed the plain source distribution, go ahead and just:
|
68
|
+
|
69
|
+
require 'hpricot'
|
70
|
+
|
71
|
+
### Load an HTML Page
|
72
|
+
|
73
|
+
The <tt>Hpricot()</tt> method takes a string or any IO object and loads the
|
74
|
+
contents into a document object.
|
75
|
+
|
76
|
+
doc = Hpricot("<p>A simple <b>test</b> string.</p>")
|
77
|
+
|
78
|
+
To load from a file, just get the stream open:
|
79
|
+
|
80
|
+
doc = open("index.html") { |f| Hpricot(f) }
|
81
|
+
|
82
|
+
To load from a web URL, use <tt>open-uri</tt>, which comes with Ruby:
|
83
|
+
|
84
|
+
require 'open-uri'
|
85
|
+
doc = open("http://qwantz.com/") { |f| Hpricot(f) }
|
86
|
+
|
87
|
+
Hpricot uses an internal buffer to parse the file, so the IO will stream
|
88
|
+
properly and large documents won't be loaded into memory all at once. However,
|
89
|
+
the parsed document object will be present in memory, in its entirety.
|
90
|
+
|
91
|
+
### Search for Elements
|
92
|
+
|
93
|
+
Use <tt>Doc.search</tt>:
|
94
|
+
|
95
|
+
doc.search("//p[@class='posted']")
|
96
|
+
#=> #<Hpricot::Elements[{p ...}, {p ...}]>
|
97
|
+
|
98
|
+
<tt>Doc.search</tt> can take an XPath or CSS expression. In the above example,
|
99
|
+
all paragraph <tt>&lt;p&gt;</tt> elements are grabbed which have a <tt>class</tt>
|
100
|
+
attribute of <tt>"posted"</tt>.
|
101
|
+
|
102
|
+
A shortcut is to use the divisor:
|
103
|
+
|
104
|
+
(doc/"p.posted")
|
105
|
+
#=> #<Hpricot::Elements[{p ...}, {p ...}]>
|
106
|
+
|
107
|
+
### Finding Just One Element
|
108
|
+
|
109
|
+
If you're looking for a single element, the <tt>at</tt> method will return the
|
110
|
+
first element matched by the expression. In this case, you'll get back the
|
111
|
+
element itself rather than the <tt>Hpricot::Elements</tt> array.
|
112
|
+
|
113
|
+
doc.at("body")['onload']
|
114
|
+
|
115
|
+
The above code will find the body tag and give you back the <tt>onload</tt>
|
116
|
+
attribute. This is the most common reason to use the element directly: when
|
117
|
+
reading and writing HTML attributes.
|
118
|
+
|
119
|
+
### Fetching the Contents of an Element
|
120
|
+
|
121
|
+
Just as with browser scripting, the <tt>inner_html</tt> property can be used to
|
122
|
+
get the inner contents of an element.
|
123
|
+
|
124
|
+
(doc/"#elementID").inner_html
|
125
|
+
#=> "..contents.."
|
126
|
+
|
127
|
+
If your expression matches more than one element, you'll get back the contents
|
128
|
+
of *all the matched elements*. So you may want to use <tt>first</tt> to be
|
129
|
+
sure you get back only one.
|
130
|
+
|
131
|
+
(doc/"#elementID").first.inner_html
|
132
|
+
#=> "..contents.."
|
133
|
+
|
134
|
+
### Fetching the HTML for an Element
|
135
|
+
|
136
|
+
If you want the HTML for the whole element (not just the contents), use
|
137
|
+
<tt>to_html</tt>:
|
138
|
+
|
139
|
+
(doc/"#elementID").to_html
|
140
|
+
#=> "<div id='elementID'>...</div>"
|
141
|
+
|
142
|
+
### Looping
|
143
|
+
|
144
|
+
All searches return a set of <tt>Hpricot::Elements</tt>. Go ahead and loop
|
145
|
+
through them like you would an array.
|
146
|
+
|
147
|
+
(doc/"p/a/img").each do |img|
|
148
|
+
puts img.attributes['class']
|
149
|
+
end
|
150
|
+
|
151
|
+
### Continuing Searches
|
152
|
+
|
153
|
+
Searches can be continued from a collection of elements, in order to search deeper.
|
154
|
+
|
155
|
+
# find all paragraphs.
|
156
|
+
elements = doc.search("/html/body//p")
|
157
|
+
# continue the search by finding any images within those paragraphs.
|
158
|
+
(elements/"img")
|
159
|
+
#=> #<Hpricot::Elements[{img ...}, {img ...}]>
|
160
|
+
|
161
|
+
Searches can also be continued by searching within container elements.
|
162
|
+
|
163
|
+
# find all images within paragraphs.
|
164
|
+
doc.search("/html/body//p").each do |para|
|
165
|
+
puts "== Found a paragraph =="
|
166
|
+
pp para
|
167
|
+
|
168
|
+
imgs = para.search("img")
|
169
|
+
if imgs.any?
|
170
|
+
puts "== Found #{imgs.length} images inside =="
|
171
|
+
end
|
172
|
+
end
|
173
|
+
|
174
|
+
Of course, the most succinct ways to do the above are using CSS or XPath.
|
175
|
+
|
176
|
+
# the xpath version
|
177
|
+
(doc/"/html/body//p//img")
|
178
|
+
# the css version
|
179
|
+
(doc/"html > body > p img")
|
180
|
+
# ..or symbols work, too!
|
181
|
+
(doc/:html/:body/:p/:img)
|
182
|
+
|
183
|
+
### Looping Edits
|
184
|
+
|
185
|
+
You may certainly edit objects from within your search loops. Then, when you
|
186
|
+
spit out the HTML, the altered elements will show.
|
187
|
+
|
188
|
+
|
189
|
+
(doc/"span.entryPermalink").each do |span|
|
190
|
+
span.attributes['class'] = 'newLinks'
|
191
|
+
end
|
192
|
+
puts doc
|
193
|
+
|
194
|
+
This changes all <tt>span.entryPermalink</tt> elements to
|
195
|
+
<tt>span.newLinks</tt>. Keep in mind that there are often more convenient ways
|
196
|
+
of doing this. Such as the <tt>set</tt> method:
|
197
|
+
|
198
|
+
(doc/"span.entryPermalink").set(:class => 'newLinks')
|
199
|
+
|
200
|
+
### Figuring Out Paths
|
201
|
+
|
202
|
+
Every element can tell you its unique path (either XPath or CSS) to get to the
|
203
|
+
element from the root tag.
|
204
|
+
|
205
|
+
The <tt>css_path</tt> method:
|
206
|
+
|
207
|
+
doc.at("div > div:nth(1)").css_path
|
208
|
+
#=> "div > div:nth(1)"
|
209
|
+
doc.at("#header").css_path
|
210
|
+
#=> "#header"
|
211
|
+
|
212
|
+
Or, the <tt>xpath</tt> method:
|
213
|
+
|
214
|
+
doc.at("div > div:nth(1)").xpath
|
215
|
+
#=> "/div/div:eq(1)"
|
216
|
+
doc.at("#header").xpath
|
217
|
+
#=> "//div[@id='header']"
|
218
|
+
|
219
|
+
## Hpricot Fixups
|
220
|
+
|
221
|
+
When loading HTML documents, you have a few settings that can make Hpricot more
|
222
|
+
or less intense about how it gets involved.
|
223
|
+
|
224
|
+
## :fixup_tags
|
225
|
+
|
226
|
+
Really, there are so many ways to clean up HTML and your intentions may be to
|
227
|
+
keep the HTML as-is. So Hpricot's default behavior is to keep things flexible.
|
228
|
+
It makes sure to open and close all the tags, but ignores any validation problems.
|
229
|
+
|
230
|
+
As of Hpricot 0.4, there's a new <tt>:fixup_tags</tt> option which will attempt
|
231
|
+
to shift the document's tags to meet XHTML 1.0 Strict.
|
232
|
+
|
233
|
+
doc = open("index.html") { |f| Hpricot f, :fixup_tags => true }
|
234
|
+
|
235
|
+
This doesn't quite meet the XHTML 1.0 Strict standard, it just tries to follow
|
236
|
+
the rules a bit better. Like: say Hpricot finds a paragraph in a link, it's
|
237
|
+
going to move the paragraph below the link. Or up and out of other elements
|
238
|
+
where paragraphs don't belong.
|
239
|
+
|
240
|
+
If an unknown element is found, it is ignored. Again, <tt>:fixup_tags</tt>.
|
241
|
+
|
242
|
+
## :xhtml_strict
|
243
|
+
|
244
|
+
So, let's go beyond just trying to fix the hierarchy. The
|
245
|
+
<tt>:xhtml_strict</tt> option really tries to force the document to be an XHTML
|
246
|
+
1.0 Strict document. Even at the cost of removing elements that get in the way.
|
247
|
+
|
248
|
+
doc = open("index.html") { |f| Hpricot f, :xhtml_strict => true }
|
249
|
+
|
250
|
+
What measures does <tt>:xhtml_strict</tt> take?
|
251
|
+
|
252
|
+
1. Shift elements into their proper containers just like :fixup_tags.
|
253
|
+
2. Remove unknown elements.
|
254
|
+
3. Remove unknown attributes.
|
255
|
+
4. Remove illegal content.
|
256
|
+
5. Alter the doctype to XHTML 1.0 Strict.
|
257
|
+
|
258
|
+
## Hpricot.XML()
|
259
|
+
|
260
|
+
The last option is the <tt>:xml</tt> option, which makes some slight variations
|
261
|
+
on the standard mode. The main difference is that :xml mode won't try to output
|
262
|
+
tags which are friendlier for browsers. For example, if an opening and closing
|
263
|
+
<tt>br</tt> tag is found, XML mode won't try to turn that into an empty element.
|
264
|
+
|
265
|
+
XML mode also doesn't downcase the tags and attributes for you. So pay attention
|
266
|
+
to case, friends.
|
267
|
+
|
268
|
+
The primary way to use Hpricot's XML mode is to call the Hpricot.XML method:
|
269
|
+
|
270
|
+
doc = open("http://redhanded.hobix.com/index.xml") do |f|
|
271
|
+
Hpricot.XML(f)
|
272
|
+
end
|
273
|
+
|
274
|
+
*Also, :fixup_tags is canceled out by the :xml option.* This is because
|
275
|
+
:fixup_tags makes assumptions based on how HTML is structured. Specifically, how
|
276
|
+
tags are defined in the XHTML 1.0 DTD.
|
data/Rakefile
ADDED
@@ -0,0 +1,234 @@
|
|
1
|
+
require 'rake/clean'
|
2
|
+
require 'rake/gempackagetask'
|
3
|
+
require 'rake/rdoctask'
|
4
|
+
require 'rake/testtask'
|
5
|
+
begin
|
6
|
+
require 'rake/extensiontask'
|
7
|
+
rescue LoadError
|
8
|
+
abort "To build, please first gem install rake-compiler"
|
9
|
+
end
|
10
|
+
|
11
|
+
RbConfig = Config unless defined?(RbConfig)
|
12
|
+
|
13
|
+
NAME = "hpricot"
|
14
|
+
REV = (`#{ENV['GIT'] || "git"} rev-list HEAD`.split.length + 1).to_s
|
15
|
+
VERS = ENV['VERSION'] || "0.8" + (REV ? ".#{REV}" : "")
|
16
|
+
PKG = "#{NAME}-#{VERS}"
|
17
|
+
BIN = "*.{bundle,jar,so,o,obj,pdb,lib,def,exp,class,rbc}"
|
18
|
+
CLEAN.include ["#{BIN}", "ext/**/#{BIN}", "lib/**/#{BIN}", "test/**/#{BIN}",
|
19
|
+
'ext/fast_xs/Makefile', 'ext/hpricot_scan/Makefile',
|
20
|
+
'**/.*.sw?', '*.gem', '.config', 'pkg', 'lib/hpricot_scan.rb', 'lib/fast_xs.rb']
|
21
|
+
RDOC_OPTS = ['--quiet', '--title', 'The Hpricot Reference', '--main', 'README.md', '--inline-source']
|
22
|
+
PKG_FILES = %w(CHANGELOG COPYING README.md Rakefile) +
|
23
|
+
Dir.glob("{bin,doc,test,extras}/**/*") +
|
24
|
+
(Dir.glob("lib/**/*.rb") - %w(lib/hpricot_scan.rb lib/fast_xs.rb)) +
|
25
|
+
Dir.glob("ext/**/*.{h,java,c,rb,rl}") +
|
26
|
+
%w[ext/hpricot_scan/hpricot_scan.c ext/hpricot_scan/hpricot_css.c ext/hpricot_scan/HpricotScanService.java] # needed because they are generated later
|
27
|
+
RAGEL_C_CODE_GENERATION_STYLES = {
|
28
|
+
"table_driven" => 'T0',
|
29
|
+
"faster_table_driven" => 'T1',
|
30
|
+
"flat_table_driven" => 'F0',
|
31
|
+
"faster_flat_table_driven" => 'F1',
|
32
|
+
"goto_driven" => 'G0',
|
33
|
+
"faster_goto_driven" => 'G1',
|
34
|
+
"really_fast goto_driven" => 'G2'
|
35
|
+
# "n_way_split_really_fast_goto_driven" => 'P<N>'
|
36
|
+
}
|
37
|
+
DEFAULT_RAGEL_C_CODE_GENERATION = "really_fast goto_driven"
|
38
|
+
SPEC =
|
39
|
+
Gem::Specification.new do |s|
|
40
|
+
s.name = NAME
|
41
|
+
s.version = VERS
|
42
|
+
s.platform = Gem::Platform::RUBY
|
43
|
+
s.has_rdoc = true
|
44
|
+
s.rdoc_options += RDOC_OPTS
|
45
|
+
s.extra_rdoc_files = ["README.md", "CHANGELOG", "COPYING"]
|
46
|
+
s.summary = "a swift, liberal HTML parser with a fantastic library"
|
47
|
+
s.description = s.summary
|
48
|
+
s.author = "why the lucky stiff"
|
49
|
+
s.email = 'why@ruby-lang.org'
|
50
|
+
s.homepage = 'http://code.whytheluckystiff.net/hpricot/'
|
51
|
+
s.rubyforge_project = 'hobix'
|
52
|
+
s.files = PKG_FILES
|
53
|
+
s.require_paths = ["lib"]
|
54
|
+
s.extensions = FileList["ext/**/extconf.rb"].to_a
|
55
|
+
s.bindir = "bin"
|
56
|
+
end
|
57
|
+
|
58
|
+
# FAT cross-compile
|
59
|
+
# Pass RUBY_CC_VERSION=1.8.7:1.9.2 when packaging for 1.8+1.9 mswin32 binaries
|
60
|
+
%w(hpricot_scan fast_xs).each do |target|
|
61
|
+
Rake::ExtensionTask.new(target, SPEC) do |ext|
|
62
|
+
ext.lib_dir = File.join('lib', target) if ENV['RUBY_CC_VERSION']
|
63
|
+
ext.cross_compile = true # enable cross compilation (requires cross compile toolchain)
|
64
|
+
ext.cross_platform = 'i386-mswin32' # forces the Windows platform instead of the default one
|
65
|
+
end
|
66
|
+
|
67
|
+
# HACK around 1.9.2 cross .def file creation
|
68
|
+
def_file = "tmp/i386-mswin32/#{target}/1.9.2/#{target}-i386-mingw32.def"
|
69
|
+
directory File.dirname(def_file)
|
70
|
+
file def_file => File.dirname(def_file) do |t|
|
71
|
+
File.open(t.name, "w") do |f|
|
72
|
+
f << "EXPORTS\nInit_#{target}\n"
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
task File.join(File.dirname(def_file), "Makefile") => def_file
|
77
|
+
# END HACK
|
78
|
+
file "lib/#{target}.rb" do |t|
|
79
|
+
File.open(t.name, "w") do |f|
|
80
|
+
f.puts %{require "#{target}/\#{RUBY_VERSION.sub(/\\.\\d+$/, '')}/#{target}"}
|
81
|
+
end
|
82
|
+
end
|
83
|
+
end
|
84
|
+
file 'ext/hpricot_scan/extconf.rb' => :ragel
|
85
|
+
|
86
|
+
desc "set environment variables to build and/or test with debug options"
|
87
|
+
task :debug do
|
88
|
+
ENV['CFLAGS'] ||= ""
|
89
|
+
ENV['CFLAGS'] += " -g -DDEBUG"
|
90
|
+
end
|
91
|
+
|
92
|
+
desc "Does a full compile, test run"
|
93
|
+
if defined?(JRUBY_VERSION)
|
94
|
+
task :default => [:compile_java, :clean_fat_rb, :test]
|
95
|
+
else
|
96
|
+
task :default => [:compile, :clean_fat_rb, :test]
|
97
|
+
end
|
98
|
+
|
99
|
+
task :clean_fat_rb do
|
100
|
+
rm_f "lib/hpricot_scan.rb"
|
101
|
+
rm_f "lib/fast_xs.rb"
|
102
|
+
end
|
103
|
+
|
104
|
+
desc "Packages up Hpricot for all platforms."
|
105
|
+
task :package => [:clean]
|
106
|
+
|
107
|
+
desc "Run all the tests"
|
108
|
+
Rake::TestTask.new do |t|
|
109
|
+
t.libs << "test"
|
110
|
+
t.test_files = FileList['test/test_*.rb']
|
111
|
+
t.verbose = true
|
112
|
+
end
|
113
|
+
|
114
|
+
Rake::RDocTask.new do |rdoc|
|
115
|
+
rdoc.rdoc_dir = 'doc/rdoc'
|
116
|
+
rdoc.options += RDOC_OPTS
|
117
|
+
rdoc.main = "README.md"
|
118
|
+
rdoc.rdoc_files.add ['README.md', 'CHANGELOG', 'COPYING', 'lib/**/*.rb']
|
119
|
+
end
|
120
|
+
|
121
|
+
Rake::GemPackageTask.new(SPEC) do |p|
|
122
|
+
p.need_tar = true
|
123
|
+
p.gem_spec = SPEC
|
124
|
+
end
|
125
|
+
|
126
|
+
### Win32 Packages ###
|
127
|
+
Win32Spec = SPEC.dup
|
128
|
+
Win32Spec.platform = 'i386-mswin32'
|
129
|
+
Win32Spec.files = PKG_FILES + %w(hpricot_scan fast_xs).map do |t|
|
130
|
+
unless ENV['RUBY_CC_VERSION']
|
131
|
+
file "lib/#{t}/1.8/#{t}.so" do
|
132
|
+
abort "ERROR while packaging: re-run for fat win32 gems:\nrake #{ARGV.join(' ')} RUBY_CC_VERSION=1.8.7:1.9.2"
|
133
|
+
end
|
134
|
+
end
|
135
|
+
["lib/#{t}.rb", "lib/#{t}/1.8/#{t}.so", "lib/#{t}/1.9/#{t}.so"]
|
136
|
+
end.flatten
|
137
|
+
Win32Spec.extensions = []
|
138
|
+
|
139
|
+
Rake::GemPackageTask.new(Win32Spec) do |p|
|
140
|
+
p.need_tar = false
|
141
|
+
p.gem_spec = Win32Spec
|
142
|
+
end
|
143
|
+
|
144
|
+
JRubySpec = SPEC.dup
|
145
|
+
JRubySpec.platform = 'java'
|
146
|
+
JRubySpec.files = PKG_FILES + ["lib/hpricot_scan.jar", "lib/fast_xs.jar"]
|
147
|
+
JRubySpec.extensions = []
|
148
|
+
|
149
|
+
Rake::GemPackageTask.new(JRubySpec) do |p|
|
150
|
+
p.need_tar = false
|
151
|
+
p.gem_spec = JRubySpec
|
152
|
+
end
|
153
|
+
|
154
|
+
desc "Determines the Ragel version and displays it on the console along with the location of the Ragel binary."
|
155
|
+
task :ragel_version do
|
156
|
+
@ragel_v = `ragel -v`[/(version )(\S*)/,2].to_f
|
157
|
+
puts "Using ragel version: #{@ragel_v}, location: #{`which ragel`}"
|
158
|
+
@ragel_v
|
159
|
+
end
|
160
|
+
|
161
|
+
desc "Generates the C scanner code with Ragel."
|
162
|
+
task :ragel => [:ragel_version] do
|
163
|
+
if @ragel_v >= 6.1
|
164
|
+
@ragel_c_code_generation_style = RAGEL_C_CODE_GENERATION_STYLES[DEFAULT_RAGEL_C_CODE_GENERATION]
|
165
|
+
Dir.chdir("ext/hpricot_scan") do
|
166
|
+
sh %{ragel hpricot_scan.rl -#{@ragel_c_code_generation_style} -o hpricot_scan.c}
|
167
|
+
sh %{ragel hpricot_css.rl -#{@ragel_c_code_generation_style} -o hpricot_css.c}
|
168
|
+
end
|
169
|
+
else
|
170
|
+
STDERR.puts "Ragel 6.1 or greater is required."
|
171
|
+
exit(1)
|
172
|
+
end
|
173
|
+
end
|
174
|
+
|
175
|
+
# Java only supports the table-driven code
|
176
|
+
# generation style at this point.
|
177
|
+
desc "Generates the Java scanner code using the Ragel table-driven code generation style."
|
178
|
+
task :ragel_java => [:ragel_version] do
|
179
|
+
if @ragel_v >= 6.1
|
180
|
+
puts "compiling with ragel version #{@ragel_v}"
|
181
|
+
Dir.chdir("ext/hpricot_scan") do
|
182
|
+
sh %{ragel -J -o HpricotCss.java hpricot_css.java.rl}
|
183
|
+
sh %{ragel -J -o HpricotScanService.java hpricot_scan.java.rl}
|
184
|
+
end
|
185
|
+
else
|
186
|
+
STDERR.puts "Ragel 6.1 or greater is required."
|
187
|
+
exit(1)
|
188
|
+
end
|
189
|
+
end
|
190
|
+
|
191
|
+
### JRuby Compile ###
|
192
|
+
|
193
|
+
# Builds the classpath argument handed to javac when compiling the JRuby
# extensions.
#
# The JRuby classpath is looked up in this order:
#   1. the JVM system properties java.class.path and sun.boot.class.path
#      (only reachable when the Java constant is available);
#   2. the JRUBY_PARENT_CLASSPATH environment variable;
#   3. every jar matching $JRUBY_HOME/lib/*.jar.
# When none of those yields anything, a CLASSPATH environment variable that
# mentions jruby is accepted instead; otherwise the build aborts.
#
# Returns a String: `-cp "<classpath>"`, or "" when javac should rely on the
# CLASSPATH environment variable.
def java_classpath_arg
  jruby_cpath =
    begin
      entries = []
      %w(java.class.path sun.boot.class.path).each do |property|
        value = Java::java.lang.System.getProperty(property)
        # getProperty returns nil for an unset property; skip it rather than
        # letting nil.split discard the properties that *are* set.
        entries.concat(value.split(File::PATH_SEPARATOR)) if value
      end
      entries.empty? ? nil : entries.join(File::PATH_SEPARATOR)
    rescue StandardError
      # The Java constant could not be resolved; fall back to the environment.
      nil
    end

  jruby_cpath ||= ENV['JRUBY_PARENT_CLASSPATH'] ||
    (ENV['JRUBY_HOME'] &&
      FileList["#{ENV['JRUBY_HOME']}/lib/*.jar"].join(File::PATH_SEPARATOR))

  unless jruby_cpath || ENV['CLASSPATH'] =~ /jruby/
    abort %{WARNING: No JRuby classpath has been set up.
Define JRUBY_HOME=/path/to/jruby on the command line or in the environment}
  end

  # Only pass -cp when a classpath was actually found: an explicit `-cp ""`
  # would override the CLASSPATH variable accepted just above.
  jruby_cpath ? "-cp \"#{jruby_cpath}\"" : ""
end
|
210
|
+
|
211
|
+
# Compiles Java sources with javac, then packs the class files into a jar.
#
# filenames - Array of .java file names, passed to javac in the given order.
# jarname   - name of the jar file to create.
#
# Both commands are run through +sh+; the jar step picks up every *.class
# file matched by the shell glob, not only the ones just compiled.
def compile_java(filenames, jarname)
  javac_command = %{javac -source 1.5 -target 1.5 #{java_classpath_arg} #{filenames.join(" ")}}
  sh javac_command

  jar_command = %{jar cf #{jarname} *.class}
  sh jar_command
end
|
215
|
+
|
216
|
+
task :hpricot_scan_java => [:ragel_java] do
|
217
|
+
Dir.chdir "ext/hpricot_scan" do
|
218
|
+
compile_java(["HpricotScanService.java", "HpricotCss.java"], "hpricot_scan.jar")
|
219
|
+
end
|
220
|
+
end
|
221
|
+
|
222
|
+
task :fast_xs_java do
|
223
|
+
Dir.chdir "ext/fast_xs" do
|
224
|
+
compile_java(["FastXsService.java"], "fast_xs.jar")
|
225
|
+
end
|
226
|
+
end
|
227
|
+
|
228
|
+
%w(hpricot_scan fast_xs).each do |ext|
|
229
|
+
file "lib/#{ext}.jar" => "#{ext}_java" do |t|
|
230
|
+
mv "ext/#{ext}/#{ext}.jar", "lib"
|
231
|
+
end
|
232
|
+
task :compile_java => "lib/#{ext}.jar"
|
233
|
+
end
|
234
|
+
|