hpricot 0.6-jruby

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43) hide show
  1. data/CHANGELOG +62 -0
  2. data/COPYING +18 -0
  3. data/README +284 -0
  4. data/Rakefile +211 -0
  5. data/ext/hpricot_scan/HpricotScanService.java +1340 -0
  6. data/ext/hpricot_scan/extconf.rb +6 -0
  7. data/ext/hpricot_scan/hpricot_common.rl +76 -0
  8. data/ext/hpricot_scan/hpricot_scan.c +5976 -0
  9. data/ext/hpricot_scan/hpricot_scan.h +79 -0
  10. data/ext/hpricot_scan/hpricot_scan.java.rl +363 -0
  11. data/ext/hpricot_scan/hpricot_scan.rl +273 -0
  12. data/extras/mingw-rbconfig.rb +176 -0
  13. data/lib/hpricot.rb +26 -0
  14. data/lib/hpricot/blankslate.rb +63 -0
  15. data/lib/hpricot/builder.rb +200 -0
  16. data/lib/hpricot/elements.rb +510 -0
  17. data/lib/hpricot/htmlinfo.rb +672 -0
  18. data/lib/hpricot/inspect.rb +107 -0
  19. data/lib/hpricot/modules.rb +37 -0
  20. data/lib/hpricot/parse.rb +297 -0
  21. data/lib/hpricot/tag.rb +228 -0
  22. data/lib/hpricot/tags.rb +164 -0
  23. data/lib/hpricot/traverse.rb +821 -0
  24. data/lib/hpricot/xchar.rb +94 -0
  25. data/lib/i686-linux/hpricot_scan.jar +0 -0
  26. data/test/files/basic.xhtml +17 -0
  27. data/test/files/boingboing.html +2266 -0
  28. data/test/files/cy0.html +3653 -0
  29. data/test/files/immob.html +400 -0
  30. data/test/files/pace_application.html +1320 -0
  31. data/test/files/tenderlove.html +16 -0
  32. data/test/files/uswebgen.html +220 -0
  33. data/test/files/utf8.html +1054 -0
  34. data/test/files/week9.html +1723 -0
  35. data/test/files/why.xml +19 -0
  36. data/test/load_files.rb +7 -0
  37. data/test/test_alter.rb +65 -0
  38. data/test/test_builder.rb +24 -0
  39. data/test/test_parser.rb +379 -0
  40. data/test/test_paths.rb +16 -0
  41. data/test/test_preserved.rb +66 -0
  42. data/test/test_xml.rb +28 -0
  43. metadata +98 -0
@@ -0,0 +1,62 @@
1
+ = 0.6
2
+ === 15th June, 2007
3
+ * Hpricot for JRuby -- nice work Ola Bini!
4
+ * Inline Markaby for Hpricot documents.
5
+ * XML tags and attributes are no longer downcased like HTML is.
6
+ * new syntax for grabbing everything between two elements using a Range in the search method: (doc/("font".."font/br")) or in nodes_at like so: (doc/"font").nodes_at("*".."br"). Only works with either a pair of siblings or a set of a parent and a sibling.
7
+ * Ignore self-closing endings on tags (such as form) which are containers. Treat them like open parent tags. Reported by Jonathan Nichols on the hpricot list.
8
+ * Escaping of attributes, yanked from Jim Weirich and Sam Ruby's work in Builder.
9
+ * Element#raw_attributes gives unescaped data. Element#attributes gives escaped.
10
+ * Added: Elements#attr, Elements#remove_attr, Elements#remove_class.
11
+ * Added: Traverse#preceding, Traverse#following, Traverse#previous, Traverse#next.
12
+
13
+ = 0.5
14
+ === 31st January, 2007
15
+
16
+ * support for a[text()="Click Me!"] and h3[text()*="space"] and the like.
17
+ * Hpricot.buffer_size accessor for increasing Hpricot's buffer if you're encountering huge ASP.NET viewstate attribs.
18
+ * some support for colons in tag names (not full namespace support yet.)
19
+ * Element.to_original_html will attempt to preserve the original HTML while merging your changes.
20
+ * Element.to_plain_text converts an element's contents to a simple text format.
21
+ * Element.inner_text removes all tags and returns text nodes concatenated into a single string.
22
+ * no @raw_string variable kept for comments, text, and cdata -- as it's redundant.
23
+ * xpath-style indices (//p/a[1]) but keep in mind that they aren't zero-based.
24
+ * node_position is the index among all sibling nodes, while position is the position among children of identical type.
25
+ * comment() and text() search criteria, like: //p/text(), which selects all text inside paragraph tags.
26
+ * every element has css_path and xpath methods which return respective absolute paths.
27
+ * more flexibility all around: in parsing attributes, tags, comments and cdata.
28
+
29
+ = 0.4
30
+ === 11th August, 2006
31
+
32
+ * The :fixup_tags option will try to sort out the hierarchy so elements end up with the right parents.
33
+ * Elements such as *script* and *style* (identified as having CDATA contents) receive a single text node as their children now. Previously, Hpricot was parsing out tags found in scripts.
34
+ * Better scanning of partially quoted attributes (found by Brent Beardsly on http://uswebgen.com/)
35
+ * Better scanning of unquoted attributes -- thanks to Aaron Patterson for the test cases!
36
+ * Some tags were being output in the empty tag style, although browsers hated that. FIXED!
37
+ * Added Elements#at for finding single elements.
38
+ * Added Elem::Trav#[] and Elem::Trav#[]= for reading and writing attributes.
39
+
40
+ = 0.3
41
+ === 7th July, 2006
42
+
43
+ * Fixed negative string size error on empty tokens. (news.bbc.co.uk)
44
+ * Allow the parser to accept just text nodes. (such as: <tt>Hpricot.parse('TEXT')</tt>)
45
+ * from JQuery to Hpricot::Elements: remove, empty, append, prepend, before, after, wrap, set,
46
+ html(...), to_html, to_s.
47
+ * on containers: to_html, replace_child, insert_before, insert_after, innerHTML=.
48
+ * Hpricot(...) is an alias for parse.
49
+ * open up all properties to setters, let people do as they may.
50
+ * use to_html for the full html of a node or set of elements.
51
+ * doctypes were messed.
52
+
53
+ = 0.2
54
+ === 4th July, 2006
55
+
56
+ * Rewrote the HTree parser to be simpler, more adequate for the common man. Will add encoding back in later.
57
+
58
+ = 0.1
59
+ === 3rd July, 2006
60
+
61
+ * For whatever reason, wrote this HTML parser in C.
62
+ I guess Ragel is addictive and I want to improve HTree.
data/COPYING ADDED
@@ -0,0 +1,18 @@
1
+ Copyright (c) 2006 why the lucky stiff
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to
5
+ deal in the Software without restriction, including without limitation the
6
+ rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
7
+ sell copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in
11
+ all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
16
+ THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
17
+ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
18
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README ADDED
@@ -0,0 +1,284 @@
1
+ = Hpricot, Read Any HTML
2
+
3
+ Hpricot is a fast, flexible HTML parser written in C. It's designed to be very
4
+ accommodating (like Tanaka Akira's HTree) and to have a very helpful library
5
+ (like some JavaScript libs -- JQuery, Prototype -- give you.) The XPath and CSS
6
+ parser, in fact, is based on John Resig's JQuery.
7
+
8
+ Also, Hpricot can be handy for reading broken XML files, since many of the same
9
+ techniques can be used. If a quote is missing, Hpricot tries to figure it out.
10
+ If tags overlap, Hpricot works on sorting them out. You know, that sort of
11
+ thing.
12
+
13
+ *Please read this entire document* before making assumptions about how this
14
+ software works.
15
+
16
+ == An Overview
17
+
18
+ Let's clear up what Hpricot is.
19
+
20
+ # Hpricot is *a standalone library*. It requires no other libraries. Just Ruby!
21
+ # While priding itself on speed, Hpricot *works hard to sort out bad HTML* and
22
+ pays a small penalty in order to get that right. So that's slightly more important
23
+ to me than speed.
24
+ # *If you can see it in Firefox, then Hpricot should parse it.* That's
25
+ how it should be! Let me know the minute it's otherwise.
26
+ # Primarily, Hpricot is used for reading HTML and tries to sort out troubled
27
+ HTML by having some idea of what good HTML is. Some people still like to use
28
+ Hpricot for XML reading, but *remember to use the Hpricot::XML() method* for that!
29
+
30
+ == The Hpricot Kingdom
31
+
32
+ First, here are all the links you need to know:
33
+
34
+ * http://code.whytheluckystiff.net/hpricot is the Hpricot wiki and bug tracker.
35
+ Go there for news and recipes and patches. It's the center of activity.
36
+ * http://code.whytheluckystiff.net/svn/hpricot/trunk is the main Subversion
37
+ repository for Hpricot. You can get the latest code there.
38
+ * http://code.whytheluckystiff.net/doc/hpricot is the home for the latest copy of
39
+ this reference.
40
+ * See COPYING for the terms of this software. (Spoiler: it's absolutely free.)
41
+
42
+ If you have any trouble, don't hesitate to contact the author. As always, I'm
43
+ not going to say "Use at your own risk" because I don't want this library to be
44
+ risky. If you trip on something, I'll share the liability by repairing things
45
+ as quickly as I can. Your responsibility is to report the inadequacies.
46
+
47
+ == Installing Hpricot
48
+
49
+ You may get the latest stable version from Rubyforge. Win32 binaries and source
50
+ gems are available.
51
+
52
+ $ gem install hpricot
53
+
54
+ As Hpricot is still under active development, you can also try the most recent
55
+ candidate build here:
56
+
57
+ $ gem install hpricot --source http://code.whytheluckystiff.net
58
+
59
+ The development gem is usually in pretty good shape actually. You can also
60
+ get the bleeding edge code or plain Ruby tarballs on the wiki.
61
+
62
+ == An Hpricot Showcase
63
+
64
+ We're going to run through a big pile of examples to get you jump-started.
65
+ Many of these examples are also found at
66
+ http://code.whytheluckystiff.net/hpricot/wiki/HpricotBasics, in case you
67
+ want to add some of your own.
68
+
69
+ === Loading Hpricot Itself
70
+
71
+ You have probably got the gem, right? To load Hpricot:
72
+
73
+ require 'rubygems'
74
+ require 'hpricot'
75
+
76
+ If you've installed the plain source distribution, go ahead and just:
77
+
78
+ require 'hpricot'
79
+
80
+ === Load an HTML Page
81
+
82
+ The <tt>Hpricot()</tt> method takes a string or any IO object and loads the
83
+ contents into a document object.
84
+
85
+ doc = Hpricot("<p>A simple <b>test</b> string.</p>")
86
+
87
+ To load from a file, just get the stream open:
88
+
89
+ doc = open("index.html") { |f| Hpricot(f) }
90
+
91
+ To load from a web URL, use <tt>open-uri</tt>, which comes with Ruby:
92
+
93
+ require 'open-uri'
94
+ doc = open("http://qwantz.com/") { |f| Hpricot(f) }
95
+
96
+ Hpricot uses an internal buffer to parse the file, so the IO will stream
97
+ properly and large documents won't be loaded into memory all at once. However,
98
+ the parsed document object will be present in memory, in its entirety.
99
+
100
+ === Search for Elements
101
+
102
+ Use <tt>Doc.search</tt>:
103
+
104
+ doc.search("//p[@class='posted']")
105
+ #=> #<Hpricot::Elements[{p ...}, {p ...}]>
106
+
107
+ <tt>Doc.search</tt> can take an XPath or CSS expression. In the above example,
108
+ all paragraph <tt><p></tt> elements are grabbed which have a <tt>class</tt>
109
+ attribute of <tt>"posted"</tt>.
110
+
111
+ A shortcut is to use the divisor:
112
+
113
+ (doc/"p.posted")
114
+ #=> #<Hpricot::Elements[{p ...}, {p ...}]>
115
+
116
+ === Finding Just One Element
117
+
118
+ If you're looking for a single element, the <tt>at</tt> method will return the
119
+ first element matched by the expression. In this case, you'll get back the
120
+ element itself rather than the <tt>Hpricot::Elements</tt> array.
121
+
122
+ doc.at("body")['onload']
123
+
124
+ The above code will find the body tag and give you back the <tt>onload</tt>
125
+ attribute. This is the most common reason to use the element directly: when
126
+ reading and writing HTML attributes.
127
+
128
+ === Fetching the Contents of an Element
129
+
130
+ Just as with browser scripting, the <tt>inner_html</tt> property can be used to
131
+ get the inner contents of an element.
132
+
133
+ (doc/"#elementID").inner_html
134
+ #=> "..<b>contents</b>.."
135
+
136
+ If your expression matches more than one element, you'll get back the contents
137
+ of <em>all the matched elements</em>. So you may want to use <tt>first</tt> to be
138
+ sure you get back only one.
139
+
140
+ (doc/"#elementID").first.inner_html
141
+ #=> "..<b>contents</b>.."
142
+
143
+ === Fetching the HTML for an Element
144
+
145
+ If you want the HTML for the whole element (not just the contents), use
146
+ <tt>to_html</tt>:
147
+
148
+ (doc/"#elementID").to_html
149
+ #=> "<div id='elementID'>...</div>"
150
+
151
+ === Looping
152
+
153
+ All searches return a set of <tt>Hpricot::Elements</tt>. Go ahead and loop
154
+ through them like you would an array.
155
+
156
+ (doc/"p/a/img").each do |img|
157
+ puts img.attributes['class']
158
+ end
159
+
160
+ === Continuing Searches
161
+
162
+ Searches can be continued from a collection of elements, in order to search deeper.
163
+
164
+ # find all paragraphs.
165
+ elements = doc.search("/html/body//p")
166
+ # continue the search by finding any images within those paragraphs.
167
+ (elements/"img")
168
+ #=> #<Hpricot::Elements[{img ...}, {img ...}]>
169
+
170
+ Searches can also be continued by searching within container elements.
171
+
172
+ # find all images within paragraphs.
173
+ doc.search("/html/body//p").each do |para|
174
+ puts "== Found a paragraph =="
175
+ pp para
176
+
177
+ imgs = para.search("img")
178
+ if imgs.any?
179
+ puts "== Found #{imgs.length} images inside =="
180
+ end
181
+ end
182
+
183
+ Of course, the most succinct ways to do the above are using CSS or XPath.
184
+
185
+ # the xpath version
186
+ (doc/"/html/body//p//img")
187
+ # the css version
188
+ (doc/"html > body > p img")
189
+ # ..or symbols work, too!
190
+ (doc/:html/:body/:p/:img)
191
+
192
+ === Looping Edits
193
+
194
+ You may certainly edit objects from within your search loops. Then, when you
195
+ spit out the HTML, the altered elements will show.
196
+
197
+ (doc/"span.entryPermalink").each do |span|
198
+ span.attributes['class'] = 'newLinks'
199
+ end
200
+ puts doc
201
+
202
+ This changes all <tt>span.entryPermalink</tt> elements to
203
+ <tt>span.newLinks</tt>. Keep in mind that there are often more convenient ways
204
+ of doing this. Such as the <tt>set</tt> method:
205
+
206
+ (doc/"span.entryPermalink").set(:class => 'newLinks')
207
+
208
+ === Figuring Out Paths
209
+
210
+ Every element can tell you its unique path (either XPath or CSS) to get to the
211
+ element from the root tag.
212
+
213
+ The <tt>css_path</tt> method:
214
+
215
+ doc.at("div > div:nth(1)").css_path
216
+ #=> "div > div:nth(1)"
217
+ doc.at("#header").css_path
218
+ #=> "#header"
219
+
220
+ Or, the <tt>xpath</tt> method:
221
+
222
+ doc.at("div > div:nth(1)").xpath
223
+ #=> "/div/div:eq(1)"
224
+ doc.at("#header").xpath
225
+ #=> "//div[@id='header']"
226
+
227
+ == Hpricot Fixups
228
+
229
+ When loading HTML documents, you have a few settings that can make Hpricot more
230
+ or less intense about how it gets involved.
231
+
232
+ == :fixup_tags
233
+
234
+ Really, there are so many ways to clean up HTML and your intentions may be to
235
+ keep the HTML as-is. So Hpricot's default behavior is to keep things flexible,
236
+ making sure to open and close all the tags, but ignoring any validation problems.
237
+
238
+ As of Hpricot 0.4, there's a new <tt>:fixup_tags</tt> option which will attempt
239
+ to shift the document's tags to meet XHTML 1.0 Strict.
240
+
241
+ doc = open("index.html") { |f| Hpricot f, :fixup_tags => true }
242
+
243
+ This doesn't quite meet the XHTML 1.0 Strict standard, it just tries to follow
244
+ the rules a bit better. Like: say Hpricot finds a paragraph in a link, it's
245
+ going to move the paragraph below the link. Or up and out of other elements
246
+ where paragraphs don't belong.
247
+
248
+ If an unknown element is found, it is ignored. Again, <tt>:fixup_tags</tt>.
249
+
250
+ == :xhtml_strict
251
+
252
+ So, let's go beyond just trying to fix the hierarchy. The
253
+ <tt>:xhtml_strict</tt> option really tries to force the document to be an XHTML
254
+ 1.0 Strict document. Even at the cost of removing elements that get in the way.
255
+
256
+ doc = open("index.html") { |f| Hpricot f, :xhtml_strict => true }
257
+
258
+ What measures does <tt>:xhtml_strict</tt> take?
259
+
260
+ 1. Shift elements into their proper containers just like :fixup_tags.
261
+ 2. Remove unknown elements.
262
+ 3. Remove unknown attributes.
263
+ 4. Remove illegal content.
264
+ 5. Alter the doctype to XHTML 1.0 Strict.
265
+
266
+ == Hpricot.XML()
267
+
268
+ The last option is the <tt>:xml</tt> option, which makes some slight variations
269
+ on the standard mode. The main difference is that :xml mode won't try to output
270
+ tags which are friendlier for browsers. For example, if an opening and closing
271
+ <tt>br</tt> tag is found, XML mode won't try to turn that into an empty element.
272
+
273
+ XML mode also doesn't downcase the tags and attributes for you. So pay attention
274
+ to case, friends.
275
+
276
+ The primary way to use Hpricot's XML mode is to call the Hpricot.XML method:
277
+
278
+ doc = open("http://redhanded.hobix.com/index.xml") do |f|
279
+ Hpricot.XML(f)
280
+ end
281
+
282
+ *Also, :fixup_tags is canceled out by the :xml option.* This is because
283
+ :fixup_tags makes assumptions based on how HTML is structured. Specifically, how
284
+ tags are defined in the XHTML 1.0 DTD.
@@ -0,0 +1,211 @@
1
+ require 'rake'
2
+ require 'rake/clean'
3
+ require 'rake/gempackagetask'
4
+ require 'rake/rdoctask'
5
+ require 'rake/testtask'
6
+ require 'fileutils'
7
+ include FileUtils
8
+
9
+ NAME = "hpricot"
10
+ REV = `svn info`[/Revision: (\d+)/, 1] rescue nil
11
+ VERS = ENV['VERSION'] || "0.6" + (REV ? ".#{REV}" : "")
12
+ PKG = "#{NAME}-#{VERS}"
13
+ BIN = "*.{bundle,jar,so,obj,pdb,lib,def,exp}"
14
+ ARCHLIB = "lib/#{::Config::CONFIG['arch']}"
15
+ CLEAN.include ["ext/hpricot_scan/#{BIN}", "lib/**/#{BIN}", 'ext/hpricot_scan/Makefile',
16
+ '**/.*.sw?', '*.gem', '.config']
17
+ RDOC_OPTS = ['--quiet', '--title', 'The Hpricot Reference', '--main', 'README', '--inline-source']
18
+ PKG_FILES = %w(CHANGELOG COPYING README Rakefile) +
19
+ Dir.glob("{bin,doc,test,lib,extras}/**/*") +
20
+ Dir.glob("ext/**/*.{h,java,c,rb,rl}") +
21
+ %w[ext/hpricot_scan/hpricot_scan.c] # needed because it's generated later
22
+ SPEC =
23
+ Gem::Specification.new do |s|
24
+ s.name = NAME
25
+ s.version = VERS
26
+ s.platform = Gem::Platform::RUBY
27
+ s.has_rdoc = true
28
+ s.rdoc_options += RDOC_OPTS
29
+ s.extra_rdoc_files = ["README", "CHANGELOG", "COPYING"]
30
+ s.summary = "a swift, liberal HTML parser with a fantastic library"
31
+ s.description = s.summary
32
+ s.author = "why the lucky stiff"
33
+ s.email = 'why@ruby-lang.org'
34
+ s.homepage = 'http://code.whytheluckystiff.net/hpricot/'
35
+ s.files = PKG_FILES
36
+ s.require_paths = [ARCHLIB, "lib"]
37
+ s.extensions = FileList["ext/**/extconf.rb"].to_a
38
+ s.bindir = "bin"
39
+ end
40
+
41
+ desc "Does a full compile, test run"
42
+ task :default => [:compile, :test]
43
+
44
+ desc "Packages up Hpricot."
45
+ task :package => [:clean, :ragel]
46
+
47
+ desc "Releases packages for all Hpricot packages and platforms."
48
+ task :release => [:package, :package_win32, :package_jruby]
49
+
50
+ desc "Run all the tests"
51
+ Rake::TestTask.new do |t|
52
+ t.libs << "test" << ARCHLIB
53
+ t.test_files = FileList['test/test_*.rb']
54
+ t.verbose = true
55
+ end
56
+
57
+ Rake::RDocTask.new do |rdoc|
58
+ rdoc.rdoc_dir = 'doc/rdoc'
59
+ rdoc.options += RDOC_OPTS
60
+ rdoc.main = "README"
61
+ rdoc.rdoc_files.add ['README', 'CHANGELOG', 'COPYING', 'lib/**/*.rb']
62
+ end
63
+
64
+ Rake::GemPackageTask.new(SPEC) do |p|
65
+ p.need_tar = true
66
+ p.gem_spec = SPEC
67
+ end
68
+
69
+ extension = "hpricot_scan"
70
+ ext = "ext/hpricot_scan"
71
+ ext_so = "#{ext}/#{extension}.#{Config::CONFIG['DLEXT']}"
72
+ ext_files = FileList[
73
+ "#{ext}/*.c",
74
+ "#{ext}/*.h",
75
+ "#{ext}/*.rl",
76
+ "#{ext}/extconf.rb",
77
+ "#{ext}/Makefile",
78
+ "lib"
79
+ ]
80
+
81
+ task "lib" do
82
+ directory "lib"
83
+ end
84
+
85
+ desc "Compiles the Ruby extension"
86
+ task :compile => [:hpricot_scan] do
87
+ if Dir.glob(File.join(ARCHLIB,"hpricot_scan.*")).length == 0
88
+ STDERR.puts "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
89
+ STDERR.puts "Gem actually failed to build. Your system is"
90
+ STDERR.puts "NOT configured properly to build hpricot."
91
+ STDERR.puts "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
92
+ exit(1)
93
+ end
94
+ end
95
+ task :hpricot_scan => [:ragel]
96
+
97
+ desc "Builds just the #{extension} extension"
98
+ task extension.to_sym => ["#{ext}/Makefile", ext_so ]
99
+
100
+ file "#{ext}/Makefile" => ["#{ext}/extconf.rb"] do
101
+ Dir.chdir(ext) do ruby "extconf.rb" end
102
+ end
103
+
104
+ file ext_so => ext_files do
105
+ Dir.chdir(ext) do
106
+ sh(PLATFORM =~ /win32/ ? 'nmake' : 'make')
107
+ end
108
+ mkdir_p ARCHLIB
109
+ cp ext_so, ARCHLIB
110
+ end
111
+
112
+ desc "returns the ragel version"
113
+ task :ragel_version do
114
+ @ragel_v = `ragel -v`[/(version )(\S*)/,2].to_f
115
+ end
116
+
117
+ desc "Generates the C scanner code with Ragel."
118
+ task :ragel => [:ragel_version] do
119
+ sh %{ragel ext/hpricot_scan/hpricot_scan.rl | #{@ragel_v >= 5.18 ? 'rlgen-cd' : 'rlcodegen'} -G2 -o ext/hpricot_scan/hpricot_scan.c}
120
+ end
121
+
122
+ desc "Generates the Java scanner code with Ragel."
123
+ task :ragel_java => [:ragel_version] do
124
+ sh %{ragel -J ext/hpricot_scan/hpricot_scan.java.rl | #{@ragel_v >= 5.18 ? 'rlgen-java' : 'rlcodegen'} -o ext/hpricot_scan/HpricotScanService.java}
125
+ end
126
+
127
+ ### Win32 Packages ###
128
+
129
+ Win32Spec = SPEC.dup
130
+ Win32Spec.platform = Gem::Platform::WIN32
131
+ Win32Spec.files = PKG_FILES + ["#{ARCHLIB}/hpricot_scan.so"]
132
+ Win32Spec.extensions = []
133
+
134
+ WIN32_PKG_DIR = "#{PKG}-mswin32"
135
+
136
+ desc "Package up the Win32 distribution."
137
+ file WIN32_PKG_DIR => [:package] do
138
+ sh "tar zxf pkg/#{PKG}.tgz"
139
+ mv PKG, WIN32_PKG_DIR
140
+ end
141
+
142
+ desc "Cross-compile the hpricot_scan extension for win32"
143
+ file "hpricot_scan_win32" => [WIN32_PKG_DIR] do
144
+ cp "extras/mingw-rbconfig.rb", "#{WIN32_PKG_DIR}/ext/hpricot_scan/rbconfig.rb"
145
+ sh "cd #{WIN32_PKG_DIR}/ext/hpricot_scan/ && ruby -I. extconf.rb && make"
146
+ mv "#{WIN32_PKG_DIR}/ext/hpricot_scan/hpricot_scan.so", "#{WIN32_PKG_DIR}/#{ARCHLIB}"
147
+ end
148
+
149
+ desc "Build the binary RubyGems package for win32"
150
+ task :package_win32 => ["hpricot_scan_win32"] do
151
+ Dir.chdir("#{WIN32_PKG_DIR}") do
152
+ Gem::Builder.new(Win32Spec).build
153
+ verbose(true) {
154
+ mv Dir["*.gem"].first, "../pkg/#{WIN32_PKG_DIR}.gem"
155
+ }
156
+ end
157
+ end
158
+
159
+ CLEAN.include WIN32_PKG_DIR
160
+
161
+ ### JRuby Packages ###
162
+
163
+ compile_java = proc do
164
+ sh %{javac -source 1.4 -target 1.4 -classpath $JRUBY_HOME/lib/jruby.jar HpricotScanService.java}
165
+ sh %{jar cf hpricot_scan.jar HpricotScanService.class}
166
+ end
167
+
168
+ desc "Compiles the JRuby extension"
169
+ task :hpricot_scan_java => [:ragel_java] do
170
+ Dir.chdir("ext/hpricot_scan", &compile_java)
171
+ end
172
+
173
+ JRubySpec = SPEC.dup
174
+ JRubySpec.platform = 'jruby'
175
+ JRubySpec.files = PKG_FILES + ["#{ARCHLIB}/hpricot_scan.jar"]
176
+ JRubySpec.extensions = []
177
+
178
+ JRUBY_PKG_DIR = "#{PKG}-jruby"
179
+
180
+ desc "Package up the JRuby distribution."
181
+ file JRUBY_PKG_DIR => [:ragel_java, :package] do
182
+ sh "tar zxf pkg/#{PKG}.tgz"
183
+ mv PKG, JRUBY_PKG_DIR
184
+ end
185
+
186
+ desc "Cross-compile the hpricot_scan extension for JRuby"
187
+ file "hpricot_scan_jruby" => [JRUBY_PKG_DIR] do
188
+ Dir.chdir("#{JRUBY_PKG_DIR}/ext/hpricot_scan", &compile_java)
189
+ mv "#{JRUBY_PKG_DIR}/ext/hpricot_scan/hpricot_scan.jar", "#{JRUBY_PKG_DIR}/#{ARCHLIB}"
190
+ end
191
+
192
+ desc "Build the RubyGems package for JRuby"
193
+ task :package_jruby => ["hpricot_scan_jruby"] do
194
+ Dir.chdir("#{JRUBY_PKG_DIR}") do
195
+ Gem::Builder.new(JRubySpec).build
196
+ verbose(true) {
197
+ mv Dir["*.gem"].first, "../pkg/#{JRUBY_PKG_DIR}.gem"
198
+ }
199
+ end
200
+ end
201
+
202
+ CLEAN.include JRUBY_PKG_DIR
203
+
204
+ task :install do
205
+ sh %{rake package}
206
+ sh %{sudo gem install pkg/#{NAME}-#{VERS}}
207
+ end
208
+
209
+ task :uninstall => [:clean] do
210
+ sh %{sudo gem uninstall #{NAME}}
211
+ end