hpricot 0.8.3-i386-mswin32

Sign up to get free protection for your applications and to get access to all the features.
Files changed (56) hide show
  1. data/CHANGELOG +104 -0
  2. data/COPYING +18 -0
  3. data/README.md +276 -0
  4. data/Rakefile +234 -0
  5. data/ext/fast_xs/FastXsService.java +1123 -0
  6. data/ext/fast_xs/extconf.rb +4 -0
  7. data/ext/fast_xs/fast_xs.c +210 -0
  8. data/ext/hpricot_scan/HpricotCss.java +850 -0
  9. data/ext/hpricot_scan/HpricotScanService.java +2099 -0
  10. data/ext/hpricot_scan/extconf.rb +9 -0
  11. data/ext/hpricot_scan/hpricot_common.rl +76 -0
  12. data/ext/hpricot_scan/hpricot_css.c +3511 -0
  13. data/ext/hpricot_scan/hpricot_css.java.rl +155 -0
  14. data/ext/hpricot_scan/hpricot_css.rl +120 -0
  15. data/ext/hpricot_scan/hpricot_scan.c +7039 -0
  16. data/ext/hpricot_scan/hpricot_scan.h +79 -0
  17. data/ext/hpricot_scan/hpricot_scan.java.rl +1161 -0
  18. data/ext/hpricot_scan/hpricot_scan.rl +896 -0
  19. data/extras/hpricot.png +0 -0
  20. data/lib/fast_xs.rb +1 -0
  21. data/lib/fast_xs/1.8/fast_xs.so +0 -0
  22. data/lib/fast_xs/1.9/fast_xs.so +0 -0
  23. data/lib/hpricot.rb +26 -0
  24. data/lib/hpricot/blankslate.rb +63 -0
  25. data/lib/hpricot/builder.rb +216 -0
  26. data/lib/hpricot/elements.rb +514 -0
  27. data/lib/hpricot/htmlinfo.rb +691 -0
  28. data/lib/hpricot/inspect.rb +103 -0
  29. data/lib/hpricot/modules.rb +40 -0
  30. data/lib/hpricot/parse.rb +38 -0
  31. data/lib/hpricot/tag.rb +219 -0
  32. data/lib/hpricot/tags.rb +164 -0
  33. data/lib/hpricot/traverse.rb +839 -0
  34. data/lib/hpricot/xchar.rb +94 -0
  35. data/lib/hpricot_scan.rb +1 -0
  36. data/lib/hpricot_scan/1.8/hpricot_scan.so +0 -0
  37. data/lib/hpricot_scan/1.9/hpricot_scan.so +0 -0
  38. data/test/files/basic.xhtml +17 -0
  39. data/test/files/boingboing.html +2266 -0
  40. data/test/files/cy0.html +3653 -0
  41. data/test/files/immob.html +400 -0
  42. data/test/files/pace_application.html +1320 -0
  43. data/test/files/tenderlove.html +16 -0
  44. data/test/files/uswebgen.html +220 -0
  45. data/test/files/utf8.html +1054 -0
  46. data/test/files/week9.html +1723 -0
  47. data/test/files/why.xml +19 -0
  48. data/test/load_files.rb +7 -0
  49. data/test/nokogiri-bench.rb +64 -0
  50. data/test/test_alter.rb +96 -0
  51. data/test/test_builder.rb +37 -0
  52. data/test/test_parser.rb +457 -0
  53. data/test/test_paths.rb +25 -0
  54. data/test/test_preserved.rb +88 -0
  55. data/test/test_xml.rb +28 -0
  56. metadata +128 -0
@@ -0,0 +1,104 @@
1
+ = 0.8.3
2
+ === 3 November, 2010
3
+ * GH#8: Nil-check before downcasing attribute key
4
+ * GH#25: Proper ruby 1.9 encoding support
5
+ * GH#28. Use integers instead of ?? on 1.9, which is just a string.
6
+ * including noscript in ElementInclusions, so that hpricot won't fail
7
+ when trying to parse a meta tag inside head section when noscript is
8
+ present.
9
+ * latest changes from fast_xs mainline
10
+ * Fixes to get Hpricot running on Rubinius:
11
+ * Use free, not XFREE
12
+ * Remove RSTRUCT craziness, don't break Array#at
13
+
14
+ = 0.8.2
15
+ === 5 November, 2009
16
+ * Bring JRuby support up to speed, including Java-based hpricot_css support
17
+ * Change JRuby fast_xs to have same escaping behavior as C fast_xs
18
+ * fix for issue #2, downcasing of html attributes inside the parser.
19
+ * solve issue #3 with bogus etags being preserved in `to_s` rather than just `to_original_html`.
20
+ * fix error when attempting to reparent cleared node. (issue #5)
21
+ * Hpricot::Attributes proxy object for using `ele.attributes[k] = v` directly.
22
+ however, it is preferred to use the jquery-like `elements.attr(k, v)`.
23
+
24
+ = 0.8.1
25
+ === 3 April, 2009
26
+ * big problems on Ruby 1.8.6, use INT2FIX instead of INT2NUM. hashes were being cast to bignums.
27
+ * patch for 1.8.5 to define RARRAY_PTR. thanks, mike perham!
28
+ * inspecting empty document bug, courtesy of @TalLevAmi.
29
+
30
+ = 0.8
31
+ === 31st March, 2009
32
+ * Saving memory and speed by using RStruct-based elements in the C extension.
33
+ * Bug in tag parsing, causing runaway <script> and <style> tags in HTML.
34
+ * Problem compiling under Ruby 1.9, due to our_rb_hash_lookup function meant for Ruby 1.8.
35
+ * CData was missing inner_text method.
36
+
37
+ = 0.7
38
+ === 17th March, 2009
39
+ * Rewritten parser routine, much lighter on memory, quite a bit faster.
40
+ * Friendlier with Ruby 1.9.
41
+ * Fixes to nth-child and text() selectors.
42
+
43
+ = 0.6
44
+ === 15th June, 2007
45
+ * Hpricot for JRuby -- nice work Ola Bini!
46
+ * Inline Markaby for Hpricot documents.
47
+ * XML tags and attributes are no longer downcased like HTML is.
48
+ * new syntax for grabbing everything between two elements using a Range in the search method: (doc/("font".."font/br")) or in nodes_at like so: (doc/"font").nodes_at("*".."br"). Only works with either a pair of siblings or a set of a parent and a sibling.
49
+ * Ignore self-closing endings on tags (such as form) which are containers. Treat them like open parent tags. Reported by Jonathan Nichols on the hpricot list.
50
+ * Escaping of attributes, yanked from Jim Weirich and Sam Ruby's work in Builder.
51
+ * Element#raw_attributes gives unescaped data. Element#attributes gives escaped.
52
+ * Added: Elements#attr, Elements#remove_attr, Elements#remove_class.
53
+ * Added: Traverse#preceding, Traverse#following, Traverse#previous, Traverse#next.
54
+
55
+ = 0.5
56
+ === 31st January, 2007
57
+
58
+ * support for a[text()="Click Me!"] and h3[text()*="space"] and the like.
59
+ * Hpricot.buffer_size accessor for increasing Hpricot's buffer if you're encountering huge ASP.NET viewstate attribs.
60
+ * some support for colons in tag names (not full namespace support yet.)
61
+ * Element.to_original_html will attempt to preserve the original HTML while merging your changes.
62
+ * Element.to_plain_text converts an element's contents to a simple text format.
63
+ * Element.inner_text removes all tags and returns text nodes concatenated into a single string.
64
+ * no @raw_string variable kept for comments, text, and cdata -- as it's redundant.
65
+ * xpath-style indices (//p/a[1]) but keep in mind that they aren't zero-based.
66
+ * node_position is the index among all sibling nodes, while position is the position among children of identical type.
67
+ * comment() and text() search criteria, like: //p/text(), which selects all text inside paragraph tags.
68
+ * every element has css_path and xpath methods which return respective absolute paths.
69
+ * more flexibility all around: in parsing attributes, tags, comments and cdata.
70
+
71
+ = 0.4
72
+ === 11th August, 2006
73
+
74
+ * The :fixup_tags option will try to sort out the hierarchy so elements end up with the right parents.
75
+ * Elements such as *script* and *style* (identified as having CDATA contents) receive a single text node as their children now. Previously, Hpricot was parsing out tags found in scripts.
76
+ * Better scanning of partially quoted attributes (found by Brent Beardsly on http://uswebgen.com/)
77
+ * Better scanning of unquoted attributes -- thanks to Aaron Patterson for the test cases!
78
+ * Some tags were being output in the empty tag style, although browsers hated that. FIXED!
79
+ * Added Elements#at for finding single elements.
80
+ * Added Elem::Trav#[] and Elem::Trav#[]= for reading and writing attributes.
81
+
82
+ = 0.3
83
+ === 7th July, 2006
84
+
85
+ * Fixed negative string size error on empty tokens. (news.bbc.co.uk)
86
+ * Allow the parser to accept just text nodes. (such as: <tt>Hpricot.parse('TEXT')</tt>)
87
+ * from JQuery to Hpricot::Elements: remove, empty, append, prepend, before, after, wrap, set,
88
+ html(...), to_html, to_s.
89
+ * on containers: to_html, replace_child, insert_before, insert_after, innerHTML=.
90
+ * Hpricot(...) is an alias for parse.
91
+ * open up all properties to setters, let people do as they may.
92
+ * use to_html for the full html of a node or set of elements.
93
+ * doctypes were messed.
94
+
95
+ = 0.2
96
+ === 4th July, 2006
97
+
98
+ * Rewrote the HTree parser to be simpler, more adequate for the common man. Will add encoding back in later.
99
+
100
+ = 0.1
101
+ === 3rd July, 2006
102
+
103
+ * For whatever reason, wrote this HTML parser in C.
104
+ I guess Ragel is addictive and I want to improve HTree.
data/COPYING ADDED
@@ -0,0 +1,18 @@
1
+ Copyright (c) 2006 why the lucky stiff
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to
5
+ deal in the Software without restriction, including without limitation the
6
+ rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
7
+ sell copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in
11
+ all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
16
+ THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
17
+ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
18
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,276 @@
1
+ # Hpricot, Read Any HTML
2
+
3
+ Hpricot is a fast, flexible HTML parser written in C. It's designed to be very
4
+ accommodating (like Tanaka Akira's HTree) and to have a very helpful library
5
+ (like some JavaScript libs -- JQuery, Prototype -- give you.) The XPath and CSS
6
+ parser, in fact, is based on John Resig's JQuery.
7
+
8
+ Also, Hpricot can be handy for reading broken XML files, since many of the same
9
+ techniques can be used. If a quote is missing, Hpricot tries to figure it out.
10
+ If tags overlap, Hpricot works on sorting them out. You know, that sort of
11
+ thing.
12
+
13
+ *Please read this entire document* before making assumptions about how this
14
+ software works.
15
+
16
+ ## An Overview
17
+
18
+ Let's clear up what Hpricot is.
19
+
20
+ * Hpricot is *a standalone library*. It requires no other libraries. Just Ruby!
21
+ * While priding itself on speed, Hpricot *works hard to sort out bad HTML* and
22
+ pays a small penalty in order to get that right. So that's slightly more important
23
+ to me than speed.
24
+ * *If you can see it in Firefox, then Hpricot should parse it.* That's
25
+ how it should be! Let me know the minute it's otherwise.
26
+ * Primarily, Hpricot is used for reading HTML and tries to sort out troubled
27
+ HTML by having some idea of what good HTML is. Some people still like to use
28
+ Hpricot for XML reading, but *remember to use the Hpricot::XML() method* for that!
29
+
30
+ ## The Hpricot Kingdom
31
+
32
+ First, here are all the links you need to know:
33
+
34
+ * http://wiki.github.com/hpricot/hpricot is the Hpricot wiki and
35
+ http://github.com/hpricot/hpricot/issues is the bug tracker.
36
+ Go there for news and recipes and patches. It's the center of activity.
37
+ * http://github.com/hpricot/hpricot is the main Git
38
+ repository for Hpricot. You can get the latest code there.
39
+ * See COPYING for the terms of this software. (Spoiler: it's absolutely free.)
40
+
41
+ If you have any trouble, don't hesitate to contact the author. As always, I'm
42
+ not going to say "Use at your own risk" because I don't want this library to be
43
+ risky. If you trip on something, I'll share the liability by repairing things
44
+ as quickly as I can. Your responsibility is to report the inadequacies.
45
+
46
+ ## Installing Hpricot
47
+
48
+ You may get the latest stable version from Rubyforge. Win32 binaries,
49
+ Java binaries (for JRuby), and source gems are available.
50
+
51
+ $ gem install hpricot
52
+
53
+ ## An Hpricot Showcase
54
+
55
+ We're going to run through a big pile of examples to get you jump-started.
56
+ Many of these examples are also found at
57
+ http://wiki.github.com/hpricot/hpricot/hpricot-basics, in case you
58
+ want to add some of your own.
59
+
60
+ ### Loading Hpricot Itself
61
+
62
+ You have probably got the gem, right? To load Hpricot:
63
+
64
+ require 'rubygems'
65
+ require 'hpricot'
66
+
67
+ If you've installed the plain source distribution, go ahead and just:
68
+
69
+ require 'hpricot'
70
+
71
+ ### Load an HTML Page
72
+
73
+ The <tt>Hpricot()</tt> method takes a string or any IO object and loads the
74
+ contents into a document object.
75
+
76
+ doc = Hpricot("<p>A simple <b>test</b> string.</p>")
77
+
78
+ To load from a file, just get the stream open:
79
+
80
+ doc = open("index.html") { |f| Hpricot(f) }
81
+
82
+ To load from a web URL, use <tt>open-uri</tt>, which comes with Ruby:
83
+
84
+ require 'open-uri'
85
+ doc = open("http://qwantz.com/") { |f| Hpricot(f) }
86
+
87
+ Hpricot uses an internal buffer to parse the file, so the IO will stream
88
+ properly and large documents won't be loaded into memory all at once. However,
89
+ the parsed document object will be present in memory, in its entirety.
90
+
91
+ ### Search for Elements
92
+
93
+ Use <tt>Doc.search</tt>:
94
+
95
+ doc.search("//p[@class='posted']")
96
+ #=> #<Hpricot::Elements[{p ...}, {p ...}]>
97
+
98
+ <tt>Doc.search</tt> can take an XPath or CSS expression. In the above example,
99
+ all paragraph <tt><p></tt> elements are grabbed which have a <tt>class</tt>
100
+ attribute of <tt>"posted"</tt>.
101
+
102
+ A shortcut is to use the divisor:
103
+
104
+ (doc/"p.posted")
105
+ #=> #<Hpricot::Elements[{p ...}, {p ...}]>
106
+
107
+ ### Finding Just One Element
108
+
109
+ If you're looking for a single element, the <tt>at</tt> method will return the
110
+ first element matched by the expression. In this case, you'll get back the
111
+ element itself rather than the <tt>Hpricot::Elements</tt> array.
112
+
113
+ doc.at("body")['onload']
114
+
115
+ The above code will find the body tag and give you back the <tt>onload</tt>
116
+ attribute. This is the most common reason to use the element directly: when
117
+ reading and writing HTML attributes.
118
+
119
+ ### Fetching the Contents of an Element
120
+
121
+ Just as with browser scripting, the <tt>inner_html</tt> property can be used to
122
+ get the inner contents of an element.
123
+
124
+ (doc/"#elementID").inner_html
125
+ #=> "..contents.."
126
+
127
+ If your expression matches more than one element, you'll get back the contents
128
+ of *all the matched elements*. So you may want to use <tt>first</tt> to be
129
+ sure you get back only one.
130
+
131
+ (doc/"#elementID").first.inner_html
132
+ #=> "..contents.."
133
+
134
+ ### Fetching the HTML for an Element
135
+
136
+ If you want the HTML for the whole element (not just the contents), use
137
+ <tt>to_html</tt>:
138
+
139
+ (doc/"#elementID").to_html
140
+ #=> "<div id='elementID'>...</div>"
141
+
142
+ ### Looping
143
+
144
+ All searches return a set of <tt>Hpricot::Elements</tt>. Go ahead and loop
145
+ through them like you would an array.
146
+
147
+ (doc/"p/a/img").each do |img|
148
+ puts img.attributes['class']
149
+ end
150
+
151
+ ### Continuing Searches
152
+
153
+ Searches can be continued from a collection of elements, in order to search deeper.
154
+
155
+ # find all paragraphs.
156
+ elements = doc.search("/html/body//p")
157
+ # continue the search by finding any images within those paragraphs.
158
+ (elements/"img")
159
+ #=> #<Hpricot::Elements[{img ...}, {img ...}]>
160
+
161
+ Searches can also be continued by searching within container elements.
162
+
163
+ # find all images within paragraphs.
164
+ doc.search("/html/body//p").each do |para|
165
+ puts "== Found a paragraph =="
166
+ pp para
167
+
168
+ imgs = para.search("img")
169
+ if imgs.any?
170
+ puts "== Found #{imgs.length} images inside =="
171
+ end
172
+ end
173
+
174
+ Of course, the most succinct ways to do the above are using CSS or XPath.
175
+
176
+ # the xpath version
177
+ (doc/"/html/body//p//img")
178
+ # the css version
179
+ (doc/"html > body > p img")
180
+ # ..or symbols work, too!
181
+ (doc/:html/:body/:p/:img)
182
+
183
+ ### Looping Edits
184
+
185
+ You may certainly edit objects from within your search loops. Then, when you
186
+ spit out the HTML, the altered elements will show.
187
+
188
+
189
+ (doc/"span.entryPermalink").each do |span|
190
+ span.attributes['class'] = 'newLinks'
191
+ end
192
+ puts doc
193
+
194
+ This changes all <tt>span.entryPermalink</tt> elements to
195
+ <tt>span.newLinks</tt>. Keep in mind that there are often more convenient ways
196
+ of doing this. Such as the <tt>set</tt> method:
197
+
198
+ (doc/"span.entryPermalink").set(:class => 'newLinks')
199
+
200
+ ### Figuring Out Paths
201
+
202
+ Every element can tell you its unique path (either XPath or CSS) to get to the
203
+ element from the root tag.
204
+
205
+ The <tt>css_path</tt> method:
206
+
207
+ doc.at("div > div:nth(1)").css_path
208
+ #=> "div > div:nth(1)"
209
+ doc.at("#header").css_path
210
+ #=> "#header"
211
+
212
+ Or, the <tt>xpath</tt> method:
213
+
214
+ doc.at("div > div:nth(1)").xpath
215
+ #=> "/div/div:eq(1)"
216
+ doc.at("#header").xpath
217
+ #=> "//div[@id='header']"
218
+
219
+ ## Hpricot Fixups
220
+
221
+ When loading HTML documents, you have a few settings that can make Hpricot more
222
+ or less intense about how it gets involved.
223
+
224
+ ## :fixup_tags
225
+
226
+ Really, there are so many ways to clean up HTML and your intentions may be to
227
+ keep the HTML as-is. So Hpricot's default behavior is to keep things flexible.
228
+ Making sure to open and close all the tags, but ignore any validation problems.
229
+
230
+ As of Hpricot 0.4, there's a new <tt>:fixup_tags</tt> option which will attempt
231
+ to shift the document's tags to meet XHTML 1.0 Strict.
232
+
233
+ doc = open("index.html") { |f| Hpricot f, :fixup_tags => true }
234
+
235
+ This doesn't quite meet the XHTML 1.0 Strict standard; it just tries to follow
236
+ the rules a bit better. Like: say Hpricot finds a paragraph in a link, it's
237
+ going to move the paragraph below the link. Or up and out of other elements
238
+ where paragraphs don't belong.
239
+
240
+ If an unknown element is found, it is ignored. Again, <tt>:fixup_tags</tt>.
241
+
242
+ ## :xhtml_strict
243
+
244
+ So, let's go beyond just trying to fix the hierarchy. The
245
+ <tt>:xhtml_strict</tt> option really tries to force the document to be an XHTML
246
+ 1.0 Strict document. Even at the cost of removing elements that get in the way.
247
+
248
+ doc = open("index.html") { |f| Hpricot f, :xhtml_strict => true }
249
+
250
+ What measures does <tt>:xhtml_strict</tt> take?
251
+
252
+ 1. Shift elements into their proper containers just like :fixup_tags.
253
+ 2. Remove unknown elements.
254
+ 3. Remove unknown attributes.
255
+ 4. Remove illegal content.
256
+ 5. Alter the doctype to XHTML 1.0 Strict.
257
+
258
+ ## Hpricot.XML()
259
+
260
+ The last option is the <tt>:xml</tt> option, which makes some slight variations
261
+ on the standard mode. The main difference is that :xml mode won't try to output
262
+ tags which are friendlier for browsers. For example, if an opening and closing
263
+ <tt>br</tt> tag is found, XML mode won't try to turn that into an empty element.
264
+
265
+ XML mode also doesn't downcase the tags and attributes for you. So pay attention
266
+ to case, friends.
267
+
268
+ The primary way to use Hpricot's XML mode is to call the Hpricot.XML method:
269
+
270
+ doc = open("http://redhanded.hobix.com/index.xml") do |f|
271
+ Hpricot.XML(f)
272
+ end
273
+
274
+ *Also, :fixup_tags is canceled out by the :xml option.* This is because
275
+ :fixup_tags makes assumptions based on how HTML is structured. Specifically, how
276
+ tags are defined in the XHTML 1.0 DTD.
@@ -0,0 +1,234 @@
1
+ require 'rake/clean'
2
+ require 'rake/gempackagetask'
3
+ require 'rake/rdoctask'
4
+ require 'rake/testtask'
5
+ begin
6
+ require 'rake/extensiontask'
7
+ rescue LoadError
8
+ abort "To build, please first gem install rake-compiler"
9
+ end
10
+
11
+ RbConfig = Config unless defined?(RbConfig)
12
+
13
+ NAME = "hpricot"
14
+ REV = (`#{ENV['GIT'] || "git"} rev-list HEAD`.split.length + 1).to_s
15
+ VERS = ENV['VERSION'] || "0.8" + (REV ? ".#{REV}" : "")
16
+ PKG = "#{NAME}-#{VERS}"
17
+ BIN = "*.{bundle,jar,so,o,obj,pdb,lib,def,exp,class,rbc}"
18
+ CLEAN.include ["#{BIN}", "ext/**/#{BIN}", "lib/**/#{BIN}", "test/**/#{BIN}",
19
+ 'ext/fast_xs/Makefile', 'ext/hpricot_scan/Makefile',
20
+ '**/.*.sw?', '*.gem', '.config', 'pkg', 'lib/hpricot_scan.rb', 'lib/fast_xs.rb']
21
+ RDOC_OPTS = ['--quiet', '--title', 'The Hpricot Reference', '--main', 'README.md', '--inline-source']
22
+ PKG_FILES = %w(CHANGELOG COPYING README.md Rakefile) +
23
+ Dir.glob("{bin,doc,test,extras}/**/*") +
24
+ (Dir.glob("lib/**/*.rb") - %w(lib/hpricot_scan.rb lib/fast_xs.rb)) +
25
+ Dir.glob("ext/**/*.{h,java,c,rb,rl}") +
26
+ %w[ext/hpricot_scan/hpricot_scan.c ext/hpricot_scan/hpricot_css.c ext/hpricot_scan/HpricotScanService.java] # needed because they are generated later
27
+ RAGEL_C_CODE_GENERATION_STYLES = {
28
+ "table_driven" => 'T0',
29
+ "faster_table_driven" => 'T1',
30
+ "flat_table_driven" => 'F0',
31
+ "faster_flat_table_driven" => 'F1',
32
+ "goto_driven" => 'G0',
33
+ "faster_goto_driven" => 'G1',
34
+ "really_fast goto_driven" => 'G2'
35
+ # "n_way_split_really_fast_goto_driven" => 'P<N>'
36
+ }
37
+ DEFAULT_RAGEL_C_CODE_GENERATION = "really_fast goto_driven"
38
+ SPEC =
39
+ Gem::Specification.new do |s|
40
+ s.name = NAME
41
+ s.version = VERS
42
+ s.platform = Gem::Platform::RUBY
43
+ s.has_rdoc = true
44
+ s.rdoc_options += RDOC_OPTS
45
+ s.extra_rdoc_files = ["README.md", "CHANGELOG", "COPYING"]
46
+ s.summary = "a swift, liberal HTML parser with a fantastic library"
47
+ s.description = s.summary
48
+ s.author = "why the lucky stiff"
49
+ s.email = 'why@ruby-lang.org'
50
+ s.homepage = 'http://code.whytheluckystiff.net/hpricot/'
51
+ s.rubyforge_project = 'hobix'
52
+ s.files = PKG_FILES
53
+ s.require_paths = ["lib"]
54
+ s.extensions = FileList["ext/**/extconf.rb"].to_a
55
+ s.bindir = "bin"
56
+ end
57
+
58
+ # FAT cross-compile
59
+ # Pass RUBY_CC_VERSION=1.8.7:1.9.2 when packaging for 1.8+1.9 mswin32 binaries
60
+ %w(hpricot_scan fast_xs).each do |target|
61
+ Rake::ExtensionTask.new(target, SPEC) do |ext|
62
+ ext.lib_dir = File.join('lib', target) if ENV['RUBY_CC_VERSION']
63
+ ext.cross_compile = true # enable cross compilation (requires cross compile toolchain)
64
+ ext.cross_platform = 'i386-mswin32' # forces the Windows platform instead of the default one
65
+ end
66
+
67
+ # HACK around 1.9.2 cross .def file creation
68
+ def_file = "tmp/i386-mswin32/#{target}/1.9.2/#{target}-i386-mingw32.def"
69
+ directory File.dirname(def_file)
70
+ file def_file => File.dirname(def_file) do |t|
71
+ File.open(t.name, "w") do |f|
72
+ f << "EXPORTS\nInit_#{target}\n"
73
+ end
74
+ end
75
+
76
+ task File.join(File.dirname(def_file), "Makefile") => def_file
77
+ # END HACK
78
+ file "lib/#{target}.rb" do |t|
79
+ File.open(t.name, "w") do |f|
80
+ f.puts %{require "#{target}/\#{RUBY_VERSION.sub(/\\.\\d+$/, '')}/#{target}"}
81
+ end
82
+ end
83
+ end
84
+ file 'ext/hpricot_scan/extconf.rb' => :ragel
85
+
86
+ desc "set environment variables to build and/or test with debug options"
87
+ task :debug do
88
+ ENV['CFLAGS'] ||= ""
89
+ ENV['CFLAGS'] += " -g -DDEBUG"
90
+ end
91
+
92
+ desc "Does a full compile, test run"
93
+ if defined?(JRUBY_VERSION)
94
+ task :default => [:compile_java, :clean_fat_rb, :test]
95
+ else
96
+ task :default => [:compile, :clean_fat_rb, :test]
97
+ end
98
+
99
+ task :clean_fat_rb do
100
+ rm_f "lib/hpricot_scan.rb"
101
+ rm_f "lib/fast_xs.rb"
102
+ end
103
+
104
+ desc "Packages up Hpricot for all platforms."
105
+ task :package => [:clean]
106
+
107
+ desc "Run all the tests"
108
+ Rake::TestTask.new do |t|
109
+ t.libs << "test"
110
+ t.test_files = FileList['test/test_*.rb']
111
+ t.verbose = true
112
+ end
113
+
114
+ Rake::RDocTask.new do |rdoc|
115
+ rdoc.rdoc_dir = 'doc/rdoc'
116
+ rdoc.options += RDOC_OPTS
117
+ rdoc.main = "README.md"
118
+ rdoc.rdoc_files.add ['README.md', 'CHANGELOG', 'COPYING', 'lib/**/*.rb']
119
+ end
120
+
121
+ Rake::GemPackageTask.new(SPEC) do |p|
122
+ p.need_tar = true
123
+ p.gem_spec = SPEC
124
+ end
125
+
126
+ ### Win32 Packages ###
127
+ Win32Spec = SPEC.dup
128
+ Win32Spec.platform = 'i386-mswin32'
129
+ Win32Spec.files = PKG_FILES + %w(hpricot_scan fast_xs).map do |t|
130
+ unless ENV['RUBY_CC_VERSION']
131
+ file "lib/#{t}/1.8/#{t}.so" do
132
+ abort "ERROR while packaging: re-run for fat win32 gems:\nrake #{ARGV.join(' ')} RUBY_CC_VERSION=1.8.7:1.9.2"
133
+ end
134
+ end
135
+ ["lib/#{t}.rb", "lib/#{t}/1.8/#{t}.so", "lib/#{t}/1.9/#{t}.so"]
136
+ end.flatten
137
+ Win32Spec.extensions = []
138
+
139
+ Rake::GemPackageTask.new(Win32Spec) do |p|
140
+ p.need_tar = false
141
+ p.gem_spec = Win32Spec
142
+ end
143
+
144
+ JRubySpec = SPEC.dup
145
+ JRubySpec.platform = 'java'
146
+ JRubySpec.files = PKG_FILES + ["lib/hpricot_scan.jar", "lib/fast_xs.jar"]
147
+ JRubySpec.extensions = []
148
+
149
+ Rake::GemPackageTask.new(JRubySpec) do |p|
150
+ p.need_tar = false
151
+ p.gem_spec = JRubySpec
152
+ end
153
+
154
+ desc "Determines the Ragel version and displays it on the console along with the location of the Ragel binary."
155
+ task :ragel_version do
156
+ @ragel_v = `ragel -v`[/(version )(\S*)/,2].to_f
157
+ puts "Using ragel version: #{@ragel_v}, location: #{`which ragel`}"
158
+ @ragel_v
159
+ end
160
+
161
+ desc "Generates the C scanner code with Ragel."
162
+ task :ragel => [:ragel_version] do
163
+ if @ragel_v >= 6.1
164
+ @ragel_c_code_generation_style = RAGEL_C_CODE_GENERATION_STYLES[DEFAULT_RAGEL_C_CODE_GENERATION]
165
+ Dir.chdir("ext/hpricot_scan") do
166
+ sh %{ragel hpricot_scan.rl -#{@ragel_c_code_generation_style} -o hpricot_scan.c}
167
+ sh %{ragel hpricot_css.rl -#{@ragel_c_code_generation_style} -o hpricot_css.c}
168
+ end
169
+ else
170
+ STDERR.puts "Ragel 6.1 or greater is required."
171
+ exit(1)
172
+ end
173
+ end
174
+
175
+ # Java only supports the table-driven code
176
+ # generation style at this point.
177
+ desc "Generates the Java scanner code using the Ragel table-driven code generation style."
178
+ task :ragel_java => [:ragel_version] do
179
+ if @ragel_v >= 6.1
180
+ puts "compiling with ragel version #{@ragel_v}"
181
+ Dir.chdir("ext/hpricot_scan") do
182
+ sh %{ragel -J -o HpricotCss.java hpricot_css.java.rl}
183
+ sh %{ragel -J -o HpricotScanService.java hpricot_scan.java.rl}
184
+ end
185
+ else
186
+ STDERR.puts "Ragel 6.1 or greater is required."
187
+ exit(1)
188
+ end
189
+ end
190
+
191
+ ### JRuby Compile ###
192
+
193
+ def java_classpath_arg # myriad of ways to discover JRuby classpath
194
+ begin
195
+ cpath = Java::java.lang.System.getProperty('java.class.path').split(File::PATH_SEPARATOR)
196
+ cpath += Java::java.lang.System.getProperty('sun.boot.class.path').split(File::PATH_SEPARATOR)
197
+ jruby_cpath = cpath.compact.join(File::PATH_SEPARATOR)
198
+ rescue => e
199
+ end
200
+ unless jruby_cpath
201
+ jruby_cpath = ENV['JRUBY_PARENT_CLASSPATH'] || ENV['JRUBY_HOME'] &&
202
+ FileList["#{ENV['JRUBY_HOME']}/lib/*.jar"].join(File::PATH_SEPARATOR)
203
+ end
204
+ unless jruby_cpath || ENV['CLASSPATH'] =~ /jruby/
205
+ abort %{WARNING: No JRuby classpath has been set up.
206
+ Define JRUBY_HOME=/path/to/jruby on the command line or in the environment}
207
+ end
208
+ "-cp \"#{jruby_cpath}\""
209
+ end
210
+
211
+ def compile_java(filenames, jarname)
212
+ sh %{javac -source 1.5 -target 1.5 #{java_classpath_arg} #{filenames.join(" ")}}
213
+ sh %{jar cf #{jarname} *.class}
214
+ end
215
+
216
+ task :hpricot_scan_java => [:ragel_java] do
217
+ Dir.chdir "ext/hpricot_scan" do
218
+ compile_java(["HpricotScanService.java", "HpricotCss.java"], "hpricot_scan.jar")
219
+ end
220
+ end
221
+
222
+ task :fast_xs_java do
223
+ Dir.chdir "ext/fast_xs" do
224
+ compile_java(["FastXsService.java"], "fast_xs.jar")
225
+ end
226
+ end
227
+
228
+ %w(hpricot_scan fast_xs).each do |ext|
229
+ file "lib/#{ext}.jar" => "#{ext}_java" do |t|
230
+ mv "ext/#{ext}/#{ext}.jar", "lib"
231
+ end
232
+ task :compile_java => "lib/#{ext}.jar"
233
+ end
234
+