hpricot 0.8.3-i386-mswin32
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +104 -0
- data/COPYING +18 -0
- data/README.md +276 -0
- data/Rakefile +234 -0
- data/ext/fast_xs/FastXsService.java +1123 -0
- data/ext/fast_xs/extconf.rb +4 -0
- data/ext/fast_xs/fast_xs.c +210 -0
- data/ext/hpricot_scan/HpricotCss.java +850 -0
- data/ext/hpricot_scan/HpricotScanService.java +2099 -0
- data/ext/hpricot_scan/extconf.rb +9 -0
- data/ext/hpricot_scan/hpricot_common.rl +76 -0
- data/ext/hpricot_scan/hpricot_css.c +3511 -0
- data/ext/hpricot_scan/hpricot_css.java.rl +155 -0
- data/ext/hpricot_scan/hpricot_css.rl +120 -0
- data/ext/hpricot_scan/hpricot_scan.c +7039 -0
- data/ext/hpricot_scan/hpricot_scan.h +79 -0
- data/ext/hpricot_scan/hpricot_scan.java.rl +1161 -0
- data/ext/hpricot_scan/hpricot_scan.rl +896 -0
- data/extras/hpricot.png +0 -0
- data/lib/fast_xs.rb +1 -0
- data/lib/fast_xs/1.8/fast_xs.so +0 -0
- data/lib/fast_xs/1.9/fast_xs.so +0 -0
- data/lib/hpricot.rb +26 -0
- data/lib/hpricot/blankslate.rb +63 -0
- data/lib/hpricot/builder.rb +216 -0
- data/lib/hpricot/elements.rb +514 -0
- data/lib/hpricot/htmlinfo.rb +691 -0
- data/lib/hpricot/inspect.rb +103 -0
- data/lib/hpricot/modules.rb +40 -0
- data/lib/hpricot/parse.rb +38 -0
- data/lib/hpricot/tag.rb +219 -0
- data/lib/hpricot/tags.rb +164 -0
- data/lib/hpricot/traverse.rb +839 -0
- data/lib/hpricot/xchar.rb +94 -0
- data/lib/hpricot_scan.rb +1 -0
- data/lib/hpricot_scan/1.8/hpricot_scan.so +0 -0
- data/lib/hpricot_scan/1.9/hpricot_scan.so +0 -0
- data/test/files/basic.xhtml +17 -0
- data/test/files/boingboing.html +2266 -0
- data/test/files/cy0.html +3653 -0
- data/test/files/immob.html +400 -0
- data/test/files/pace_application.html +1320 -0
- data/test/files/tenderlove.html +16 -0
- data/test/files/uswebgen.html +220 -0
- data/test/files/utf8.html +1054 -0
- data/test/files/week9.html +1723 -0
- data/test/files/why.xml +19 -0
- data/test/load_files.rb +7 -0
- data/test/nokogiri-bench.rb +64 -0
- data/test/test_alter.rb +96 -0
- data/test/test_builder.rb +37 -0
- data/test/test_parser.rb +457 -0
- data/test/test_paths.rb +25 -0
- data/test/test_preserved.rb +88 -0
- data/test/test_xml.rb +28 -0
- metadata +128 -0
data/CHANGELOG
ADDED
@@ -0,0 +1,104 @@
|
|
1
|
+
= 0.8.3
|
2
|
+
=== 3 November, 2010
|
3
|
+
* GH#8: Nil-check before downcasing attribute key
|
4
|
+
* GH#25: Proper ruby 1.9 encoding support
|
5
|
+
* GH#28. Use integers instead of ?? on 1.9, which is just a string.
|
6
|
+
* including noscript in ElementInclusions, so that hpricot won't fail
|
7
|
+
when trying to parse a meta tag inside head section when noscript is
|
8
|
+
present.
|
9
|
+
* latest changes from fast_xs mainline
|
10
|
+
* Fixes to get Hpricot running on Rubinius:
|
11
|
+
* Use free, not XFREE
|
12
|
+
* Remove RSTRUCT craziness, don't break Array#at
|
13
|
+
|
14
|
+
= 0.8.2
|
15
|
+
=== 5 November, 2009
|
16
|
+
* Bring JRuby support up to speed, including Java-based hpricot_css support
|
17
|
+
* Change JRuby fast_xs to have same escaping behavior as C fast_xs
|
18
|
+
* fix for issue #2, downcasing of html attributes inside the parser.
|
19
|
+
* solve issue #3 with bogus etags being preserved in `to_s` rather than just `to_original_html`.
|
20
|
+
* fix error when attempting to reparent cleared node. (issue #5)
|
21
|
+
* Hpricot::Attributes proxy object for using `ele.attributes[k] = v` directly.
|
22
|
+
however, it is preferred to use the jquery-like `elements.attr(k, v)`.
|
23
|
+
|
24
|
+
= 0.8.1
|
25
|
+
=== 3 April, 2009
|
26
|
+
* big problems on Ruby 1.8.6, use INT2FIX instead of INT2NUM. hashes were being cast to bignums.
|
27
|
+
* patch for 1.8.5 to define RARRAY_PTR. thanks, mike perham!
|
28
|
+
* inspecting empty document bug, courtesy of @TalLevAmi.
|
29
|
+
|
30
|
+
= 0.8
|
31
|
+
=== 31st March, 2009
|
32
|
+
* Saving memory and speed by using RStruct-based elements in the C extension.
|
33
|
+
* Bug in tag parsing, causing runaway <script> and <style> tags in HTML.
|
34
|
+
* Problem compiling under Ruby 1.9, due to our_rb_hash_lookup function meant for Ruby 1.8.
|
35
|
+
* CData was missing inner_text method.
|
36
|
+
|
37
|
+
= 0.7
|
38
|
+
=== 17th March, 2009
|
39
|
+
* Rewritten parser routine, much lighter on memory, quite a bit faster.
|
40
|
+
* Friendlier with Ruby 1.9.
|
41
|
+
* Fixes to nth-child and text() selectors.
|
42
|
+
|
43
|
+
= 0.6
|
44
|
+
=== 15th June, 2007
|
45
|
+
* Hpricot for JRuby -- nice work Ola Bini!
|
46
|
+
* Inline Markaby for Hpricot documents.
|
47
|
+
* XML tags and attributes are no longer downcased like HTML is.
|
48
|
+
* new syntax for grabbing everything between two elements using a Range in the search method: (doc/("font".."font/br")) or in nodes_at like so: (doc/"font").nodes_at("*".."br"). Only works with either a pair of siblings or a set of a parent and a sibling.
|
49
|
+
* Ignore self-closing endings on tags (such as form) which are containers. Treat them like open parent tags. Reported by Jonathan Nichols on the hpricot list.
|
50
|
+
* Escaping of attributes, yanked from Jim Weirich and Sam Ruby's work in Builder.
|
51
|
+
* Element#raw_attributes gives unescaped data. Element#attributes gives escaped.
|
52
|
+
* Added: Elements#attr, Elements#remove_attr, Elements#remove_class.
|
53
|
+
* Added: Traverse#preceding, Traverse#following, Traverse#previous, Traverse#next.
|
54
|
+
|
55
|
+
= 0.5
|
56
|
+
=== 31st January, 2007
|
57
|
+
|
58
|
+
* support for a[text()="Click Me!"] and h3[text()*="space"] and the like.
|
59
|
+
* Hpricot.buffer_size accessor for increasing Hpricot's buffer if you're encountering huge ASP.NET viewstate attribs.
|
60
|
+
* some support for colons in tag names (not full namespace support yet.)
|
61
|
+
* Element.to_original_html will attempt to preserve the original HTML while merging your changes.
|
62
|
+
* Element.to_plain_text converts an element's contents to a simple text format.
|
63
|
+
* Element.inner_text removes all tags and returns text nodes concatenated into a single string.
|
64
|
+
* no @raw_string variable kept for comments, text, and cdata -- as it's redundant.
|
65
|
+
* xpath-style indices (//p/a[1]) but keep in mind that they aren't zero-based.
|
66
|
+
* node_position is the index among all sibling nodes, while position is the position among children of identical type.
|
67
|
+
* comment() and text() search criteria, like: //p/text(), which selects all text inside paragraph tags.
|
68
|
+
* every element has css_path and xpath methods which return respective absolute paths.
|
69
|
+
* more flexibility all around: in parsing attributes, tags, comments and cdata.
|
70
|
+
|
71
|
+
= 0.4
|
72
|
+
=== 11th August, 2006
|
73
|
+
|
74
|
+
* The :fixup_tags option will try to sort out the hierarchy so elements end up with the right parents.
|
75
|
+
* Elements such as *script* and *style* (identified as having CDATA contents) receive a single text node as their children now. Previously, Hpricot was parsing out tags found in scripts.
|
76
|
+
* Better scanning of partially quoted attributes (found by Brent Beardsly on http://uswebgen.com/)
|
77
|
+
* Better scanning of unquoted attributes -- thanks to Aaron Patterson for the test cases!
|
78
|
+
* Some tags were being output in the empty tag style, although browsers hated that. FIXED!
|
79
|
+
* Added Elements#at for finding single elements.
|
80
|
+
* Added Elem::Trav#[] and Elem::Trav#[]= for reading and writing attributes.
|
81
|
+
|
82
|
+
= 0.3
|
83
|
+
=== 7th July, 2006
|
84
|
+
|
85
|
+
* Fixed negative string size error on empty tokens. (news.bbc.co.uk)
|
86
|
+
* Allow the parser to accept just text nodes. (such as: <tt>Hpricot.parse('TEXT')</tt>)
|
87
|
+
* from JQuery to Hpricot::Elements: remove, empty, append, prepend, before, after, wrap, set,
|
88
|
+
html(...), to_html, to_s.
|
89
|
+
* on containers: to_html, replace_child, insert_before, insert_after, innerHTML=.
|
90
|
+
* Hpricot(...) is an alias for parse.
|
91
|
+
* open up all properties to setters, let people do as they may.
|
92
|
+
* use to_html for the full html of a node or set of elements.
|
93
|
+
* doctypes were messed.
|
94
|
+
|
95
|
+
= 0.2
|
96
|
+
=== 4th July, 2006
|
97
|
+
|
98
|
+
* Rewrote the HTree parser to be simpler, more adequate for the common man. Will add encoding back in later.
|
99
|
+
|
100
|
+
= 0.1
|
101
|
+
=== 3rd July, 2006
|
102
|
+
|
103
|
+
* For whatever reason, wrote this HTML parser in C.
|
104
|
+
I guess Ragel is addictive and I want to improve HTree.
|
data/COPYING
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
Copyright (c) 2006 why the lucky stiff
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
of this software and associated documentation files (the "Software"), to
|
5
|
+
deal in the Software without restriction, including without limitation the
|
6
|
+
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
7
|
+
sell copies of the Software, and to permit persons to whom the Software is
|
8
|
+
furnished to do so, subject to the following conditions:
|
9
|
+
|
10
|
+
The above copyright notice and this permission notice shall be included in
|
11
|
+
all copies or substantial portions of the Software.
|
12
|
+
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
16
|
+
THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
17
|
+
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
18
|
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,276 @@
|
|
1
|
+
# Hpricot, Read Any HTML
|
2
|
+
|
3
|
+
Hpricot is a fast, flexible HTML parser written in C. It's designed to be very
|
4
|
+
accommodating (like Tanaka Akira's HTree) and to have a very helpful library
|
5
|
+
(like some JavaScript libs -- JQuery, Prototype -- give you.) The XPath and CSS
|
6
|
+
parser, in fact, is based on John Resig's JQuery.
|
7
|
+
|
8
|
+
Also, Hpricot can be handy for reading broken XML files, since many of the same
|
9
|
+
techniques can be used. If a quote is missing, Hpricot tries to figure it out.
|
10
|
+
If tags overlap, Hpricot works on sorting them out. You know, that sort of
|
11
|
+
thing.
|
12
|
+
|
13
|
+
*Please read this entire document* before making assumptions about how this
|
14
|
+
software works.
|
15
|
+
|
16
|
+
## An Overview
|
17
|
+
|
18
|
+
Let's clear up what Hpricot is.
|
19
|
+
|
20
|
+
* Hpricot is *a standalone library*. It requires no other libraries. Just Ruby!
|
21
|
+
* While priding itself on speed, Hpricot *works hard to sort out bad HTML* and
|
22
|
+
pays a small penalty in order to get that right. So that's slightly more important
|
23
|
+
to me than speed.
|
24
|
+
* *If you can see it in Firefox, then Hpricot should parse it.* That's
|
25
|
+
how it should be! Let me know the minute it's otherwise.
|
26
|
+
* Primarily, Hpricot is used for reading HTML and tries to sort out troubled
|
27
|
+
HTML by having some idea of what good HTML is. Some people still like to use
|
28
|
+
Hpricot for XML reading, but *remember to use the Hpricot::XML() method* for that!
|
29
|
+
|
30
|
+
## The Hpricot Kingdom
|
31
|
+
|
32
|
+
First, here are all the links you need to know:
|
33
|
+
|
34
|
+
* http://wiki.github.com/hpricot/hpricot is the Hpricot wiki and
|
35
|
+
http://github.com/hpricot/hpricot/issues is the bug tracker.
|
36
|
+
Go there for news and recipes and patches. It's the center of activity.
|
37
|
+
* http://github.com/hpricot/hpricot is the main Git
|
38
|
+
repository for Hpricot. You can get the latest code there.
|
39
|
+
* See COPYING for the terms of this software. (Spoiler: it's absolutely free.)
|
40
|
+
|
41
|
+
If you have any trouble, don't hesitate to contact the author. As always, I'm
|
42
|
+
not going to say "Use at your own risk" because I don't want this library to be
|
43
|
+
risky. If you trip on something, I'll share the liability by repairing things
|
44
|
+
as quickly as I can. Your responsibility is to report the inadequacies.
|
45
|
+
|
46
|
+
## Installing Hpricot
|
47
|
+
|
48
|
+
You may get the latest stable version from Rubyforge. Win32 binaries,
|
49
|
+
Java binaries (for JRuby), and source gems are available.
|
50
|
+
|
51
|
+
$ gem install hpricot
|
52
|
+
|
53
|
+
## An Hpricot Showcase
|
54
|
+
|
55
|
+
We're going to run through a big pile of examples to get you jump-started.
|
56
|
+
Many of these examples are also found at
|
57
|
+
http://wiki.github.com/hpricot/hpricot/hpricot-basics, in case you
|
58
|
+
want to add some of your own.
|
59
|
+
|
60
|
+
### Loading Hpricot Itself
|
61
|
+
|
62
|
+
You have probably got the gem, right? To load Hpricot:
|
63
|
+
|
64
|
+
require 'rubygems'
|
65
|
+
require 'hpricot'
|
66
|
+
|
67
|
+
If you've installed the plain source distribution, go ahead and just:
|
68
|
+
|
69
|
+
require 'hpricot'
|
70
|
+
|
71
|
+
### Load an HTML Page
|
72
|
+
|
73
|
+
The <tt>Hpricot()</tt> method takes a string or any IO object and loads the
|
74
|
+
contents into a document object.
|
75
|
+
|
76
|
+
doc = Hpricot("<p>A simple <b>test</b> string.</p>")
|
77
|
+
|
78
|
+
To load from a file, just get the stream open:
|
79
|
+
|
80
|
+
doc = open("index.html") { |f| Hpricot(f) }
|
81
|
+
|
82
|
+
To load from a web URL, use <tt>open-uri</tt>, which comes with Ruby:
|
83
|
+
|
84
|
+
require 'open-uri'
|
85
|
+
doc = open("http://qwantz.com/") { |f| Hpricot(f) }
|
86
|
+
|
87
|
+
Hpricot uses an internal buffer to parse the file, so the IO will stream
|
88
|
+
properly and large documents won't be loaded into memory all at once. However,
|
89
|
+
the parsed document object will be present in memory, in its entirety.
|
90
|
+
|
91
|
+
### Search for Elements
|
92
|
+
|
93
|
+
Use <tt>Doc.search</tt>:
|
94
|
+
|
95
|
+
doc.search("//p[@class='posted']")
|
96
|
+
#=> #<Hpricot::Elements[{p ...}, {p ...}]>
|
97
|
+
|
98
|
+
<tt>Doc.search</tt> can take an XPath or CSS expression. In the above example,
|
99
|
+
all paragraph <tt><p></tt> elements are grabbed which have a <tt>class</tt>
|
100
|
+
attribute of <tt>"posted"</tt>.
|
101
|
+
|
102
|
+
A shortcut is to use the divisor:
|
103
|
+
|
104
|
+
(doc/"p.posted")
|
105
|
+
#=> #<Hpricot::Elements[{p ...}, {p ...}]>
|
106
|
+
|
107
|
+
### Finding Just One Element
|
108
|
+
|
109
|
+
If you're looking for a single element, the <tt>at</tt> method will return the
|
110
|
+
first element matched by the expression. In this case, you'll get back the
|
111
|
+
element itself rather than the <tt>Hpricot::Elements</tt> array.
|
112
|
+
|
113
|
+
doc.at("body")['onload']
|
114
|
+
|
115
|
+
The above code will find the body tag and give you back the <tt>onload</tt>
|
116
|
+
attribute. This is the most common reason to use the element directly: when
|
117
|
+
reading and writing HTML attributes.
|
118
|
+
|
119
|
+
### Fetching the Contents of an Element
|
120
|
+
|
121
|
+
Just as with browser scripting, the <tt>inner_html</tt> property can be used to
|
122
|
+
get the inner contents of an element.
|
123
|
+
|
124
|
+
(doc/"#elementID").inner_html
|
125
|
+
#=> "..contents.."
|
126
|
+
|
127
|
+
If your expression matches more than one element, you'll get back the contents
|
128
|
+
of *all the matched elements*. So you may want to use <tt>first</tt> to be
|
129
|
+
sure you get back only one.
|
130
|
+
|
131
|
+
(doc/"#elementID").first.inner_html
|
132
|
+
#=> "..contents.."
|
133
|
+
|
134
|
+
### Fetching the HTML for an Element
|
135
|
+
|
136
|
+
If you want the HTML for the whole element (not just the contents), use
|
137
|
+
<tt>to_html</tt>:
|
138
|
+
|
139
|
+
(doc/"#elementID").to_html
|
140
|
+
#=> "<div id='elementID'>...</div>"
|
141
|
+
|
142
|
+
### Looping
|
143
|
+
|
144
|
+
All searches return a set of <tt>Hpricot::Elements</tt>. Go ahead and loop
|
145
|
+
through them like you would an array.
|
146
|
+
|
147
|
+
(doc/"p/a/img").each do |img|
|
148
|
+
puts img.attributes['class']
|
149
|
+
end
|
150
|
+
|
151
|
+
### Continuing Searches
|
152
|
+
|
153
|
+
Searches can be continued from a collection of elements, in order to search deeper.
|
154
|
+
|
155
|
+
# find all paragraphs.
|
156
|
+
elements = doc.search("/html/body//p")
|
157
|
+
# continue the search by finding any images within those paragraphs.
|
158
|
+
(elements/"img")
|
159
|
+
#=> #<Hpricot::Elements[{img ...}, {img ...}]>
|
160
|
+
|
161
|
+
Searches can also be continued by searching within container elements.
|
162
|
+
|
163
|
+
# find all images within paragraphs.
|
164
|
+
doc.search("/html/body//p").each do |para|
|
165
|
+
puts "== Found a paragraph =="
|
166
|
+
pp para
|
167
|
+
|
168
|
+
imgs = para.search("img")
|
169
|
+
if imgs.any?
|
170
|
+
puts "== Found #{imgs.length} images inside =="
|
171
|
+
end
|
172
|
+
end
|
173
|
+
|
174
|
+
Of course, the most succinct ways to do the above are using CSS or XPath.
|
175
|
+
|
176
|
+
# the xpath version
|
177
|
+
(doc/"/html/body//p//img")
|
178
|
+
# the css version
|
179
|
+
(doc/"html > body > p img")
|
180
|
+
# ..or symbols work, too!
|
181
|
+
(doc/:html/:body/:p/:img)
|
182
|
+
|
183
|
+
### Looping Edits
|
184
|
+
|
185
|
+
You may certainly edit objects from within your search loops. Then, when you
|
186
|
+
spit out the HTML, the altered elements will show.
|
187
|
+
|
188
|
+
|
189
|
+
(doc/"span.entryPermalink").each do |span|
|
190
|
+
span.attributes['class'] = 'newLinks'
|
191
|
+
end
|
192
|
+
puts doc
|
193
|
+
|
194
|
+
This changes all <tt>span.entryPermalink</tt> elements to
|
195
|
+
<tt>span.newLinks</tt>. Keep in mind that there are often more convenient ways
|
196
|
+
of doing this. Such as the <tt>set</tt> method:
|
197
|
+
|
198
|
+
(doc/"span.entryPermalink").set(:class => 'newLinks')
|
199
|
+
|
200
|
+
### Figuring Out Paths
|
201
|
+
|
202
|
+
Every element can tell you its unique path (either XPath or CSS) to get to the
|
203
|
+
element from the root tag.
|
204
|
+
|
205
|
+
The <tt>css_path</tt> method:
|
206
|
+
|
207
|
+
doc.at("div > div:nth(1)").css_path
|
208
|
+
#=> "div > div:nth(1)"
|
209
|
+
doc.at("#header").css_path
|
210
|
+
#=> "#header"
|
211
|
+
|
212
|
+
Or, the <tt>xpath</tt> method:
|
213
|
+
|
214
|
+
doc.at("div > div:nth(1)").xpath
|
215
|
+
#=> "/div/div:eq(1)"
|
216
|
+
doc.at("#header").xpath
|
217
|
+
#=> "//div[@id='header']"
|
218
|
+
|
219
|
+
## Hpricot Fixups
|
220
|
+
|
221
|
+
When loading HTML documents, you have a few settings that can make Hpricot more
|
222
|
+
or less intense about how it gets involved.
|
223
|
+
|
224
|
+
## :fixup_tags
|
225
|
+
|
226
|
+
Really, there are so many ways to clean up HTML and your intentions may be to
|
227
|
+
keep the HTML as-is. So Hpricot's default behavior is to keep things flexible.
|
228
|
+
Making sure to open and close all the tags, but ignore any validation problems.
|
229
|
+
|
230
|
+
As of Hpricot 0.4, there's a new <tt>:fixup_tags</tt> option which will attempt
|
231
|
+
to shift the document's tags to meet XHTML 1.0 Strict.
|
232
|
+
|
233
|
+
doc = open("index.html") { |f| Hpricot f, :fixup_tags => true }
|
234
|
+
|
235
|
+
This doesn't quite meet the XHTML 1.0 Strict standard, it just tries to follow
|
236
|
+
the rules a bit better. Like: say Hpricot finds a paragraph in a link, it's
|
237
|
+
going to move the paragraph below the link. Or up and out of other elements
|
238
|
+
where paragraphs don't belong.
|
239
|
+
|
240
|
+
If an unknown element is found, it is ignored. Again, <tt>:fixup_tags</tt>.
|
241
|
+
|
242
|
+
## :xhtml_strict
|
243
|
+
|
244
|
+
So, let's go beyond just trying to fix the hierarchy. The
|
245
|
+
<tt>:xhtml_strict</tt> option really tries to force the document to be an XHTML
|
246
|
+
1.0 Strict document. Even at the cost of removing elements that get in the way.
|
247
|
+
|
248
|
+
doc = open("index.html") { |f| Hpricot f, :xhtml_strict => true }
|
249
|
+
|
250
|
+
What measures does <tt>:xhtml_strict</tt> take?
|
251
|
+
|
252
|
+
1. Shift elements into their proper containers just like :fixup_tags.
|
253
|
+
2. Remove unknown elements.
|
254
|
+
3. Remove unknown attributes.
|
255
|
+
4. Remove illegal content.
|
256
|
+
5. Alter the doctype to XHTML 1.0 Strict.
|
257
|
+
|
258
|
+
## Hpricot.XML()
|
259
|
+
|
260
|
+
The last option is the <tt>:xml</tt> option, which makes some slight variations
|
261
|
+
on the standard mode. The main difference is that :xml mode won't try to output
|
262
|
+
tags which are friendlier for browsers. For example, if an opening and closing
|
263
|
+
<tt>br</tt> tag is found, XML mode won't try to turn that into an empty element.
|
264
|
+
|
265
|
+
XML mode also doesn't downcase the tags and attributes for you. So pay attention
|
266
|
+
to case, friends.
|
267
|
+
|
268
|
+
The primary way to use Hpricot's XML mode is to call the Hpricot.XML method:
|
269
|
+
|
270
|
+
doc = open("http://redhanded.hobix.com/index.xml") do |f|
|
271
|
+
Hpricot.XML(f)
|
272
|
+
end
|
273
|
+
|
274
|
+
*Also, :fixup_tags is canceled out by the :xml option.* This is because
|
275
|
+
:fixup_tags makes assumptions based on how HTML is structured. Specifically, how
|
276
|
+
tags are defined in the XHTML 1.0 DTD.
|
data/Rakefile
ADDED
@@ -0,0 +1,234 @@
|
|
1
|
+
require 'rake/clean'
|
2
|
+
require 'rake/gempackagetask'
|
3
|
+
require 'rake/rdoctask'
|
4
|
+
require 'rake/testtask'
|
5
|
+
begin
|
6
|
+
require 'rake/extensiontask'
|
7
|
+
rescue LoadError
|
8
|
+
abort "To build, please first gem install rake-compiler"
|
9
|
+
end
|
10
|
+
|
11
|
+
RbConfig = Config unless defined?(RbConfig)
|
12
|
+
|
13
|
+
NAME = "hpricot"
|
14
|
+
REV = (`#{ENV['GIT'] || "git"} rev-list HEAD`.split.length + 1).to_s
|
15
|
+
VERS = ENV['VERSION'] || "0.8" + (REV ? ".#{REV}" : "")
|
16
|
+
PKG = "#{NAME}-#{VERS}"
|
17
|
+
BIN = "*.{bundle,jar,so,o,obj,pdb,lib,def,exp,class,rbc}"
|
18
|
+
CLEAN.include ["#{BIN}", "ext/**/#{BIN}", "lib/**/#{BIN}", "test/**/#{BIN}",
|
19
|
+
'ext/fast_xs/Makefile', 'ext/hpricot_scan/Makefile',
|
20
|
+
'**/.*.sw?', '*.gem', '.config', 'pkg', 'lib/hpricot_scan.rb', 'lib/fast_xs.rb']
|
21
|
+
RDOC_OPTS = ['--quiet', '--title', 'The Hpricot Reference', '--main', 'README.md', '--inline-source']
|
22
|
+
PKG_FILES = %w(CHANGELOG COPYING README.md Rakefile) +
|
23
|
+
Dir.glob("{bin,doc,test,extras}/**/*") +
|
24
|
+
(Dir.glob("lib/**/*.rb") - %w(lib/hpricot_scan.rb lib/fast_xs.rb)) +
|
25
|
+
Dir.glob("ext/**/*.{h,java,c,rb,rl}") +
|
26
|
+
%w[ext/hpricot_scan/hpricot_scan.c ext/hpricot_scan/hpricot_css.c ext/hpricot_scan/HpricotScanService.java] # needed because they are generated later
|
27
|
+
RAGEL_C_CODE_GENERATION_STYLES = {
|
28
|
+
"table_driven" => 'T0',
|
29
|
+
"faster_table_driven" => 'T1',
|
30
|
+
"flat_table_driven" => 'F0',
|
31
|
+
"faster_flat_table_driven" => 'F1',
|
32
|
+
"goto_driven" => 'G0',
|
33
|
+
"faster_goto_driven" => 'G1',
|
34
|
+
"really_fast goto_driven" => 'G2'
|
35
|
+
# "n_way_split_really_fast_goto_driven" => 'P<N>'
|
36
|
+
}
|
37
|
+
DEFAULT_RAGEL_C_CODE_GENERATION = "really_fast goto_driven"
|
38
|
+
SPEC =
|
39
|
+
Gem::Specification.new do |s|
|
40
|
+
s.name = NAME
|
41
|
+
s.version = VERS
|
42
|
+
s.platform = Gem::Platform::RUBY
|
43
|
+
s.has_rdoc = true
|
44
|
+
s.rdoc_options += RDOC_OPTS
|
45
|
+
s.extra_rdoc_files = ["README.md", "CHANGELOG", "COPYING"]
|
46
|
+
s.summary = "a swift, liberal HTML parser with a fantastic library"
|
47
|
+
s.description = s.summary
|
48
|
+
s.author = "why the lucky stiff"
|
49
|
+
s.email = 'why@ruby-lang.org'
|
50
|
+
s.homepage = 'http://code.whytheluckystiff.net/hpricot/'
|
51
|
+
s.rubyforge_project = 'hobix'
|
52
|
+
s.files = PKG_FILES
|
53
|
+
s.require_paths = ["lib"]
|
54
|
+
s.extensions = FileList["ext/**/extconf.rb"].to_a
|
55
|
+
s.bindir = "bin"
|
56
|
+
end
|
57
|
+
|
58
|
+
# FAT cross-compile
|
59
|
+
# Pass RUBY_CC_VERSION=1.8.7:1.9.2 when packaging for 1.8+1.9 mswin32 binaries
|
60
|
+
%w(hpricot_scan fast_xs).each do |target|
|
61
|
+
Rake::ExtensionTask.new(target, SPEC) do |ext|
|
62
|
+
ext.lib_dir = File.join('lib', target) if ENV['RUBY_CC_VERSION']
|
63
|
+
ext.cross_compile = true # enable cross compilation (requires cross compile toolchain)
|
64
|
+
ext.cross_platform = 'i386-mswin32' # forces the Windows platform instead of the default one
|
65
|
+
end
|
66
|
+
|
67
|
+
# HACK around 1.9.2 cross .def file creation
|
68
|
+
def_file = "tmp/i386-mswin32/#{target}/1.9.2/#{target}-i386-mingw32.def"
|
69
|
+
directory File.dirname(def_file)
|
70
|
+
file def_file => File.dirname(def_file) do |t|
|
71
|
+
File.open(t.name, "w") do |f|
|
72
|
+
f << "EXPORTS\nInit_#{target}\n"
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
task File.join(File.dirname(def_file), "Makefile") => def_file
|
77
|
+
# END HACK
|
78
|
+
file "lib/#{target}.rb" do |t|
|
79
|
+
File.open(t.name, "w") do |f|
|
80
|
+
f.puts %{require "#{target}/\#{RUBY_VERSION.sub(/\\.\\d+$/, '')}/#{target}"}
|
81
|
+
end
|
82
|
+
end
|
83
|
+
end
|
84
|
+
file 'ext/hpricot_scan/extconf.rb' => :ragel
|
85
|
+
|
86
|
+
desc "set environment variables to build and/or test with debug options"
|
87
|
+
task :debug do
|
88
|
+
ENV['CFLAGS'] ||= ""
|
89
|
+
ENV['CFLAGS'] += " -g -DDEBUG"
|
90
|
+
end
|
91
|
+
|
92
|
+
desc "Does a full compile, test run"
|
93
|
+
if defined?(JRUBY_VERSION)
|
94
|
+
task :default => [:compile_java, :clean_fat_rb, :test]
|
95
|
+
else
|
96
|
+
task :default => [:compile, :clean_fat_rb, :test]
|
97
|
+
end
|
98
|
+
|
99
|
+
task :clean_fat_rb do
|
100
|
+
rm_f "lib/hpricot_scan.rb"
|
101
|
+
rm_f "lib/fast_xs.rb"
|
102
|
+
end
|
103
|
+
|
104
|
+
desc "Packages up Hpricot for all platforms."
|
105
|
+
task :package => [:clean]
|
106
|
+
|
107
|
+
desc "Run all the tests"
|
108
|
+
Rake::TestTask.new do |t|
|
109
|
+
t.libs << "test"
|
110
|
+
t.test_files = FileList['test/test_*.rb']
|
111
|
+
t.verbose = true
|
112
|
+
end
|
113
|
+
|
114
|
+
Rake::RDocTask.new do |rdoc|
|
115
|
+
rdoc.rdoc_dir = 'doc/rdoc'
|
116
|
+
rdoc.options += RDOC_OPTS
|
117
|
+
rdoc.main = "README.md"
|
118
|
+
rdoc.rdoc_files.add ['README.md', 'CHANGELOG', 'COPYING', 'lib/**/*.rb']
|
119
|
+
end
|
120
|
+
|
121
|
+
Rake::GemPackageTask.new(SPEC) do |p|
|
122
|
+
p.need_tar = true
|
123
|
+
p.gem_spec = SPEC
|
124
|
+
end
|
125
|
+
|
126
|
+
### Win32 Packages ###
|
127
|
+
Win32Spec = SPEC.dup
|
128
|
+
Win32Spec.platform = 'i386-mswin32'
|
129
|
+
Win32Spec.files = PKG_FILES + %w(hpricot_scan fast_xs).map do |t|
|
130
|
+
unless ENV['RUBY_CC_VERSION']
|
131
|
+
file "lib/#{t}/1.8/#{t}.so" do
|
132
|
+
abort "ERROR while packaging: re-run for fat win32 gems:\nrake #{ARGV.join(' ')} RUBY_CC_VERSION=1.8.7:1.9.2"
|
133
|
+
end
|
134
|
+
end
|
135
|
+
["lib/#{t}.rb", "lib/#{t}/1.8/#{t}.so", "lib/#{t}/1.9/#{t}.so"]
|
136
|
+
end.flatten
|
137
|
+
Win32Spec.extensions = []
|
138
|
+
|
139
|
+
Rake::GemPackageTask.new(Win32Spec) do |p|
|
140
|
+
p.need_tar = false
|
141
|
+
p.gem_spec = Win32Spec
|
142
|
+
end
|
143
|
+
|
144
|
+
JRubySpec = SPEC.dup
|
145
|
+
JRubySpec.platform = 'java'
|
146
|
+
JRubySpec.files = PKG_FILES + ["lib/hpricot_scan.jar", "lib/fast_xs.jar"]
|
147
|
+
JRubySpec.extensions = []
|
148
|
+
|
149
|
+
Rake::GemPackageTask.new(JRubySpec) do |p|
|
150
|
+
p.need_tar = false
|
151
|
+
p.gem_spec = JRubySpec
|
152
|
+
end
|
153
|
+
|
154
|
+
desc "Determines the Ragel version and displays it on the console along with the location of the Ragel binary."
|
155
|
+
task :ragel_version do
|
156
|
+
@ragel_v = `ragel -v`[/(version )(\S*)/,2].to_f
|
157
|
+
puts "Using ragel version: #{@ragel_v}, location: #{`which ragel`}"
|
158
|
+
@ragel_v
|
159
|
+
end
|
160
|
+
|
161
|
+
desc "Generates the C scanner code with Ragel."
|
162
|
+
task :ragel => [:ragel_version] do
|
163
|
+
if @ragel_v >= 6.1
|
164
|
+
@ragel_c_code_generation_style = RAGEL_C_CODE_GENERATION_STYLES[DEFAULT_RAGEL_C_CODE_GENERATION]
|
165
|
+
Dir.chdir("ext/hpricot_scan") do
|
166
|
+
sh %{ragel hpricot_scan.rl -#{@ragel_c_code_generation_style} -o hpricot_scan.c}
|
167
|
+
sh %{ragel hpricot_css.rl -#{@ragel_c_code_generation_style} -o hpricot_css.c}
|
168
|
+
end
|
169
|
+
else
|
170
|
+
STDERR.puts "Ragel 6.1 or greater is required."
|
171
|
+
exit(1)
|
172
|
+
end
|
173
|
+
end
|
174
|
+
|
175
|
+
# Java only supports the table-driven code
|
176
|
+
# generation style at this point.
|
177
|
+
desc "Generates the Java scanner code using the Ragel table-driven code generation style."
|
178
|
+
task :ragel_java => [:ragel_version] do
|
179
|
+
if @ragel_v >= 6.1
|
180
|
+
puts "compiling with ragel version #{@ragel_v}"
|
181
|
+
Dir.chdir("ext/hpricot_scan") do
|
182
|
+
sh %{ragel -J -o HpricotCss.java hpricot_css.java.rl}
|
183
|
+
sh %{ragel -J -o HpricotScanService.java hpricot_scan.java.rl}
|
184
|
+
end
|
185
|
+
else
|
186
|
+
STDERR.puts "Ragel 6.1 or greater is required."
|
187
|
+
exit(1)
|
188
|
+
end
|
189
|
+
end
|
190
|
+
|
191
|
+
### JRuby Compile ###
|
192
|
+
|
193
|
+
# Builds the classpath argument handed to javac when compiling the JRuby
# extensions.
#
# Discovery order:
# 1. the java.class.path / sun.boot.class.path system properties, when this
#    Rakefile is itself running under JRuby;
# 2. the JRUBY_PARENT_CLASSPATH environment variable;
# 3. every jar found under $JRUBY_HOME/lib.
#
# When none of these yields a classpath but the CLASSPATH environment variable
# mentions jruby, an empty string is returned so that javac picks CLASSPATH up
# by itself (emitting `-cp ""` here would override it).
#
# Returns a String: either `-cp "<classpath>"` or "".
# Aborts the rake run when no JRuby classpath can be found at all.
def java_classpath_arg # myriad of ways to discover JRuby classpath
  jruby_cpath = nil
  begin
    system_props = Java::java.lang.System
    # Read each property on its own: a missing one (nil) must not throw away
    # the entries contributed by the other.
    entries = %w(java.class.path sun.boot.class.path).map do |prop|
      value = system_props.getProperty(prop)
      value ? value.split(File::PATH_SEPARATOR) : []
    end.flatten
    jruby_cpath = entries.join(File::PATH_SEPARATOR) unless entries.empty?
  rescue StandardError
    # Not running under JRuby (the Java constant is undefined), so fall back
    # to the environment variables below.
  end

  jruby_cpath ||= ENV['JRUBY_PARENT_CLASSPATH'] ||
    (ENV['JRUBY_HOME'] &&
      FileList["#{ENV['JRUBY_HOME']}/lib/*.jar"].join(File::PATH_SEPARATOR))

  return "-cp \"#{jruby_cpath}\"" if jruby_cpath

  unless ENV['CLASSPATH'] =~ /jruby/
    abort %{WARNING: No JRuby classpath has been set up.
Define JRUBY_HOME=/path/to/jruby on the command line or in the environment}
  end

  # CLASSPATH already points at JRuby; pass no -cp so javac honours it.
  ""
end
|
210
|
+
|
211
|
+
# Compiles Java sources with javac and packs the resulting classes into a jar.
#
# filenames    - Array of .java file names, relative to the current directory.
# jarname      - name of the jar file to create.
# java_version - value passed to both javac -source and -target. Defaults to
#                "1.5", the level this Rakefile has always used; pass a newer
#                level when the installed JDK no longer accepts 1.5.
#
# Both commands run in the current working directory, and every *.class file
# found there is added to the jar.
def compile_java(filenames, jarname, java_version = "1.5")
  sh %{javac -source #{java_version} -target #{java_version} #{java_classpath_arg} #{filenames.join(" ")}}
  sh %{jar cf #{jarname} *.class}
end
|
215
|
+
|
216
|
+
task :hpricot_scan_java => [:ragel_java] do
|
217
|
+
Dir.chdir "ext/hpricot_scan" do
|
218
|
+
compile_java(["HpricotScanService.java", "HpricotCss.java"], "hpricot_scan.jar")
|
219
|
+
end
|
220
|
+
end
|
221
|
+
|
222
|
+
task :fast_xs_java do
|
223
|
+
Dir.chdir "ext/fast_xs" do
|
224
|
+
compile_java(["FastXsService.java"], "fast_xs.jar")
|
225
|
+
end
|
226
|
+
end
|
227
|
+
|
228
|
+
%w(hpricot_scan fast_xs).each do |ext|
|
229
|
+
file "lib/#{ext}.jar" => "#{ext}_java" do |t|
|
230
|
+
mv "ext/#{ext}/#{ext}.jar", "lib"
|
231
|
+
end
|
232
|
+
task :compile_java => "lib/#{ext}.jar"
|
233
|
+
end
|
234
|
+
|