hpricot 0.6-jruby

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43) hide show
  1. data/CHANGELOG +62 -0
  2. data/COPYING +18 -0
  3. data/README +284 -0
  4. data/Rakefile +211 -0
  5. data/ext/hpricot_scan/HpricotScanService.java +1340 -0
  6. data/ext/hpricot_scan/extconf.rb +6 -0
  7. data/ext/hpricot_scan/hpricot_common.rl +76 -0
  8. data/ext/hpricot_scan/hpricot_scan.c +5976 -0
  9. data/ext/hpricot_scan/hpricot_scan.h +79 -0
  10. data/ext/hpricot_scan/hpricot_scan.java.rl +363 -0
  11. data/ext/hpricot_scan/hpricot_scan.rl +273 -0
  12. data/extras/mingw-rbconfig.rb +176 -0
  13. data/lib/hpricot.rb +26 -0
  14. data/lib/hpricot/blankslate.rb +63 -0
  15. data/lib/hpricot/builder.rb +200 -0
  16. data/lib/hpricot/elements.rb +510 -0
  17. data/lib/hpricot/htmlinfo.rb +672 -0
  18. data/lib/hpricot/inspect.rb +107 -0
  19. data/lib/hpricot/modules.rb +37 -0
  20. data/lib/hpricot/parse.rb +297 -0
  21. data/lib/hpricot/tag.rb +228 -0
  22. data/lib/hpricot/tags.rb +164 -0
  23. data/lib/hpricot/traverse.rb +821 -0
  24. data/lib/hpricot/xchar.rb +94 -0
  25. data/lib/i686-linux/hpricot_scan.jar +0 -0
  26. data/test/files/basic.xhtml +17 -0
  27. data/test/files/boingboing.html +2266 -0
  28. data/test/files/cy0.html +3653 -0
  29. data/test/files/immob.html +400 -0
  30. data/test/files/pace_application.html +1320 -0
  31. data/test/files/tenderlove.html +16 -0
  32. data/test/files/uswebgen.html +220 -0
  33. data/test/files/utf8.html +1054 -0
  34. data/test/files/week9.html +1723 -0
  35. data/test/files/why.xml +19 -0
  36. data/test/load_files.rb +7 -0
  37. data/test/test_alter.rb +65 -0
  38. data/test/test_builder.rb +24 -0
  39. data/test/test_parser.rb +379 -0
  40. data/test/test_paths.rb +16 -0
  41. data/test/test_preserved.rb +66 -0
  42. data/test/test_xml.rb +28 -0
  43. metadata +98 -0
@@ -0,0 +1,510 @@
1
+ module Hpricot
2
+ # Once you've matched a list of elements, you will often need to handle them as
3
+ # a group. Or you may want to perform the same action on each of them.
4
+ # Hpricot::Elements is an extension of Ruby's array class, with some methods
5
+ # added for altering elements contained in the array.
6
+ #
7
+ # If you need to create an element array from regular elements:
8
+ #
9
+ # Hpricot::Elements[ele1, ele2, ele3]
10
+ #
11
+ # Assuming that ele1, ele2 and ele3 contain element objects (Hpricot::Elem,
12
+ # Hpricot::Doc, etc.)
13
+ #
14
+ # == Continuing Searches
15
+ #
16
+ # Usually the Hpricot::Elements you're working on comes from a search you've
17
+ # done. Well, you can continue searching the list by using the same <tt>at</tt>
18
+ # and <tt>search</tt> methods you can use on plain elements.
19
+ #
20
+ # elements = doc.search("/div/p")
21
+ # elements = elements.search("/a[@href='http://hoodwink.d/']")
22
+ # elements = elements.at("img")
23
+ #
24
+ # == Altering Elements
25
+ #
26
+ # When you're altering elements in the list, your changes will be reflected in
27
+ # the document you started searching from.
28
+ #
29
+ # doc = Hpricot("That's my <b>spoon</b>, Tyler.")
30
+ # doc.at("b").swap("<i>fork</i>")
31
+ # doc.to_html
32
+ # #=> "That's my <i>fork</i>, Tyler."
33
+ #
34
+ # == Getting More Detailed
35
+ #
36
+ # If you can't find a method here that does what you need, you may need to
37
+ # loop through the elements and find a method in Hpricot::Container::Trav
38
+ # which can do what you need.
39
+ #
40
+ # For example, you may want to search for all the H3 header tags in a document
41
+ # and grab all the tags underneath the header, but not inside the header.
42
+ # A good method for this is <tt>next_sibling</tt>:
43
+ #
44
+ # doc.search("h3").each do |h3|
45
+ # while ele = h3.next_sibling
46
+ # ary << ele # stuff away all the elements under the h3
47
+ # end
48
+ # end
49
+ #
50
+ # Most of the useful element methods are in the mixins Hpricot::Traverse
51
+ # and Hpricot::Container::Trav.
52
+ class Elements < Array
53
+
54
# Runs +search+ with the CSS or XPath expression(s) +expr+ on every member of
# this list and gathers everything found into one Elements list. The
# per-element results are flattened and duplicates are dropped.
#
# Any block given is forwarded to each member's own +search+.
#
# See Hpricot::Container::Trav.search for more.
def search(*expr, &blk)
  per_node = collect do |node|
    node.search(*expr, &blk)
  end
  merged = per_node.flatten.uniq
  Elements[*merged]
end
61
+ alias_method :/, :search
62
+
63
# Returns the first match that +search+ produces for the CSS or XPath
# expression +expr+, or +nil+ when nothing matches.
#
# See Hpricot::Container::Trav.at for more.
def at(expr, &blk)
  matches = search(expr, &blk)
  matches.first
end
70
+ alias_method :%, :at
71
+
72
# Renders every element in this list via its +output+ method and returns the
# pieces concatenated into a single HTML string.
def to_html
  fragments = collect { |node| node.output("") }
  fragments.join
end
77
+ alias_method :to_s, :to_html
78
+
79
# Returns an HTML fragment built of the contents of each element in this list.
#
# If an HTML +string+ is supplied, this method acts like inner_html=: the last
# argument given is assigned to every element in the list and then returned.
def inner_html(*string)
  if string.empty?
    map { |x| x.inner_html }.join
  else
    # Hand the value straight to the writer; the previous code routed it
    # through an uninitialised (always nil) local, which added nothing.
    self.inner_html = string.pop
  end
end
89
+ alias_method :html, :inner_html
90
+ alias_method :innerHTML, :inner_html
91
+
92
# Assigns the HTML +string+ as the inner HTML of every element in this list,
# by calling each element's own <tt>inner_html=</tt>.
def inner_html=(string)
  each do |node|
    node.inner_html = string
  end
end
98
+ alias_method :html=, :inner_html=
99
+ alias_method :innerHTML=, :inner_html=
100
+
101
# Returns a string made by joining the +inner_text+ of each element in this
# list, in order.
def inner_text
  pieces = collect { |node| node.inner_text }
  pieces.join
end
106
+ alias_method :text, :inner_text
107
+
108
# Deletes every element in this list from the +children+ of its parent, so it
# no longer appears in the document. Returns this list.
#
#   doc = Hpricot("<html>Remove this: <b>here</b></html>")
#   doc.search("b").remove
#   doc.to_html
#     => "<html>Remove this: </html>"
#
def remove
  each do |node|
    siblings = node.parent.children
    siblings.delete(node)
  end
end
118
+
119
# Clears the insides of every element in this list by assigning +nil+ as its
# inner HTML. Returns this list.
#
#   doc = Hpricot("<p> We have <i>so much</i> to say.</p>")
#   doc.search("i").empty
#   doc.to_html
#     => "<p> We have <i></i> to say.</p>"
#
def empty
  each do |node|
    node.inner_html = nil
  end
end
129
+
130
# Adds new content after the existing children of each element in this list.
# The HTML +str+ (or the block) is handed to Hpricot.make once per element,
# and the result is placed after that element's current children.
def append(str = nil, &blk)
  each do |node|
    existing = node.children
    addition = Hpricot.make(str, &blk)
    node.html(existing + addition)
  end
end
135
+
136
# Adds new content before the existing children of each element in this list.
# The HTML +str+ (or the block) is handed to Hpricot.make once per element,
# and the result is placed ahead of that element's current children.
def prepend(str = nil, &blk)
  each do |node|
    addition = Hpricot.make(str, &blk)
    node.html(addition + node.children)
  end
end
141
+
142
# Inserts new content immediately ahead of each element in this list.
# The HTML +str+ (or the block) is handed to Hpricot.make once per element,
# and the result is passed to the parent's +insert_before+.
def before(str = nil, &blk)
  each do |node|
    owner = node.parent
    owner.insert_before(Hpricot.make(str, &blk), node)
  end
end
147
+
148
# Inserts new content immediately following each element in this list.
# The HTML +str+ (or the block) is handed to Hpricot.make once per element,
# and the result is passed to the parent's +insert_after+.
def after(str = nil, &blk)
  each do |node|
    owner = node.parent
    owner.insert_after(Hpricot.make(str, &blk), node)
  end
end
153
+
154
# Wraps each element in the list inside the element created by HTML +str+
# (or by the block). The first node of the new markup that has children is
# used as the wrapper; Hpricot then descends through first children to the
# deepest empty spot inside it and places the wrapped element there.
#
#   doc.search("a[@href]").
#       wrap(%{<div class="link"><div class="link_inner"></div></div>})
#
# This code wraps every link on the page inside a +div.link+ and a
# +div.link_inner+ nest.
#
# Raises a RuntimeError ("No wrapping element found.") when the markup holds
# nothing that can contain children. Returns this list.
def wrap(str = nil, &blk)
  each do |x|
    # Build fresh wrapper nodes for every element so none are shared.
    wrapper = Hpricot.make(str, &blk)
    nest = wrapper.detect { |w| w.respond_to? :children }
    # A StandardError subclass, so an ordinary +rescue+ can handle it.
    raise "No wrapping element found." unless nest
    x.parent.replace_child(x, wrapper)
    # Walk down through first children until an empty container is reached.
    nest = nest.children.first until nest.empty?
    nest.html(nest.children + [x])
  end
end
174
+
175
# Gets and sets attributes on all matched elements.
#
# Pass in a +key+ on its own and this method will return the value of that
# attribute on the first element of the list. Returns +nil+ if the attribute
# isn't found or if the list is empty.
#
#   doc.search("a").attr("href")
#     #=> "http://hacketyhack.net/"
#
# Or, pass in a +key+ and +value+. This will set an attribute for all
# matched elements and return the list.
#
#   doc.search("p").attr("class", "basic")
#
# You may also use a Hash to set a series of attributes:
#
#   (doc/"a").attr(:class => "basic", :href => "http://hackety.org/")
#
# Lastly, a block can be used to rewrite an attribute based on the element
# it belongs to. The block will pass in an element. Return from the block
# the new value of the attribute.
#
#   records.attr("href") { |e| e['href'] + "#top" }
#
# This example adds a <tt>#top</tt> anchor to each link.
#
# Note that a +nil+ or +false+ +value+ without a block is treated as a read,
# not as a request to set the attribute.
def attr(key, value = nil, &blk)
  if value || blk
    each do |el|
      el.set_attribute(key, value || blk[el])
    end
    return self
  end
  if key.is_a? Hash
    key.each { |k, v| attr(k, v) }
    return self
  end
  # Reading from an empty list yields nil rather than calling
  # get_attribute on nil.
  first_el = self[0]
  first_el && first_el.get_attribute(key)
end
215
+ alias_method :set, :attr
216
+
217
# Puts +class_name+ into the +class+ attribute of every matched element,
# keeping each class name only once. Members that do not respond to
# +get_attribute+ are skipped. Returns this list.
#
#   (doc/"p").add_class("bacon")
#
# Now all paragraphs will have class="bacon".
def add_class(class_name)
  each do |node|
    if node.respond_to?(:get_attribute)
      names = node.get_attribute('class').to_s.split(" ")
      names << class_name
      node.set_attribute('class', names.uniq.join(" "))
    end
  end
  self
end
230
+
231
# Strips the attribute +name+ from every matched element. Members that do not
# respond to +remove_attribute+ are skipped. Returns this list.
#
#   (doc/"input").remove_attr("disabled")
#
def remove_attr(name)
  each do |node|
    node.remove_attribute(name) if node.respond_to?(:remove_attribute)
  end
  self
end
242
+
243
# Takes the class +name+ out of the +class+ attribute of every matched
# element. Members that do not respond to +get_attribute+ are skipped.
# Returns this list.
#
#   (doc/"span").remove_class("lightgrey")
#
# With no +name+, the whole +class+ attribute is removed instead:
#
#   (doc/"span").remove_class
#
def remove_class(name = nil)
  each do |node|
    next unless node.respond_to?(:get_attribute)
    unless name
      node.remove_attribute("class")
      next
    end
    kept = node.get_attribute('class').to_s.split(" ") - [name]
    node.set_attribute('class', kept.uniq.join(" "))
  end
  self
end
263
+
264
# Patterns that Elements.filter tries, in this order, against the front of a
# selector expression in order to peel off one token at a time.
ATTR_RE = %r!\[ *(?:(@)([\w\(\)-]+)|([\w\(\)-]+\(\))) *([~\!\|\*$\^=]*) *'?"?([^\]'"]*)'?"? *\]!i
BRACK_RE = %r!(\[) *([^\]]*) *\]+!i
FUNC_RE = %r!(:)?([a-zA-Z0-9\*_-]*)\( *[\"']?([^ \)]*?)['\"]? *\)!
CUST_RE = %r!(:)([a-zA-Z0-9\*_-]*)()!
CATCH_RE = %r!([:\.#]*)([a-zA-Z0-9\*_-]+)!

# Narrows +nodes+ by consuming selector tokens from the front of +expr+ until
# the expression is used up or no pattern matches.
#
# Each token is dispatched to a <tt>filter[...]</tt> method defined on
# Traverse, which is called on every node with the token's arguments followed
# by the node's index in the current list. A node is kept when the method's
# truthiness agrees with +truth+ (pass +false+ to invert, as <tt>:not</tt>
# does). The <tt>:even</tt>, <tt>:odd</tt>, <tt>:first</tt> and
# <tt>:last</tt> tokens are handled here by list position.
#
# Returns a two-element array: the remaining nodes and the part of +expr+
# that was not consumed.
def self.filter(nodes, expr, truth = true)
  until expr.empty?
    _, *m = *expr.match(/^(?:#{ATTR_RE}|#{BRACK_RE}|#{FUNC_RE}|#{CUST_RE}|#{CATCH_RE})/)
    break unless _

    expr = $'
    m.compact!
    if m[0] == '@'
      # Fold the operator into the token name ("@" + "=" -> "@=").
      # slice! returns an Array, so join it explicitly: interpolating the
      # Array itself yields its inspect form on Ruby 1.9 and later.
      m[0] = "@#{m.slice!(2,1).join}"
    end

    if m[0] == '[' && m[1] =~ /^\d+$/
      # A bare [N] is a 1-based position; :nth counts from zero.
      m = [":", "nth", m[1].to_i-1]
    end

    if m[0] == ":" && m[1] == "not"
      nodes, = Elements.filter(nodes, m[2], false)
    elsif "#{m[0]}#{m[1]}" =~ /^(:even|:odd)$/
      wanted = (m[1] == "even" ? 0 : 1)
      picked = []
      nodes.each_with_index { |n,i| picked.push(n) if i % 2 == wanted }
      nodes = Elements[*picked]
    elsif "#{m[0]}#{m[1]}" =~ /^(:first|:last)$/
      # An empty list stays empty instead of becoming [nil].
      edge = nodes.send(m[1])
      nodes = edge.nil? ? Elements[] : Elements[edge]
    else
      # Prefer the most specific filter name, then fall back to the bare token.
      meth = "filter[#{m[0]}#{m[1]}]" unless m[0].empty?
      if meth and Traverse.method_defined? meth
        args = m[2..-1]
      else
        meth = "filter[#{m[0]}]"
        if Traverse.method_defined? meth
          args = m[1..-1]
        end
      end
      i = -1
      nodes = Elements[*nodes.find_all do |x|
        i += 1
        x.send(meth, *([*args] + [i])) ? truth : !truth
      end]
    end
  end
  [nodes, expr]
end
312
+
313
# Builds an Elements list of the nodes lying between +ele1+ and +ele2+,
# both included. With +excl+ set, the end of the slice is pulled back by one
# position.
#
# Siblings are sliced straight out of their parent's children. Otherwise the
# ancestor chains are compared, and if one of the two elements turns out to be
# the deepest shared ancestor, its children up to the other element are
# returned. In every other case (including a missing element) the result is
# empty.
def self.expand(ele1, ele2, excl=false)
  return Elements[] unless ele1 && ele2

  shift = excl ? -1 : 0

  if ele1.parent == ele2.parent
    span = ele1.parent.children[ele1.node_position..(ele2.node_position + shift)]
    return Elements[*span]
  end

  # Root-first chain of ancestors ending with the element itself.
  lineage = lambda do |ele|
    chain = [ele]
    node = ele
    while node.respond_to?(:parent) && (node = node.parent)
      chain.unshift(node)
    end
    chain
  end

  shared = lineage.call(ele1).zip(lineage.call(ele2)).select { |p1, p2| p1 == p2 }
  common_parent = shared.flatten.last

  inner =
    if ele1 == common_parent
      ele2
    elsif ele2 == common_parent
      ele1
    end
  return Elements[] unless inner

  Elements[*common_parent.children[0..(inner.node_position + shift)]]
end
346
+
347
# Returns the members of this list that satisfy the selector expression
# +expr+, as computed by Elements.filter.
def filter(expr)
  Elements.filter(self, expr).first
end
351
+
352
# Returns this list without the members selected by +expr+. When +expr+ is a
# node (anything that is a Traverse), that node is subtracted from the list;
# otherwise +expr+ is treated as a selector and run through Elements.filter
# with inverted matching.
def not(expr)
  return self - [expr] if expr.is_a?(Traverse)

  Elements.filter(self, expr, false).first
end
360
+
361
+ private
362
# Copies every instance variable of +l+ onto +node+, overwriting any variable
# of the same name that +node+ already has.
def copy_node(node, l)
  names = l.instance_variables
  names.each { |name| node.instance_variable_set(name, l.instance_variable_get(name)) }
end
367
+
368
+ end
369
+
370
# Selector filter predicates, stored as instance methods named
# <tt>filter[TOKEN]</tt>. Elements.filter looks them up by name and calls them
# on each node with the token's arguments followed by the node's index in the
# list being filtered, so every predicate must accept that trailing index.
module Traverse
  # Registers the block as the predicate for selector token +tok+. String
  # tokens are used verbatim in the method name; any other token (such as a
  # Symbol) is named by its +inspect+ form, so <tt>:lt</tt> becomes
  # <tt>filter[:lt]</tt>.
  def self.filter(tok, &blk)
    define_method("filter[#{tok.is_a?(String) ? tok : tok.inspect}]", &blk)
  end

  # Tag name match, case-insensitive; '*' matches anything.
  filter '' do |name,i|
    name == '*' || (self.respond_to?(:name) && self.name.downcase == name.downcase)
  end

  filter '#' do |id,i|
    self.elem? and get_attribute('id').to_s == id
  end

  filter '.' do |name,i|
    self.elem? and classes.include? name
  end

  filter :lt do |num,i|
    self.position < num.to_i
  end

  filter :gt do |num,i|
    self.position > num.to_i
  end

  nth = proc { |num,i| self.position == num.to_i }
  nth_first = proc { |*a| self.position == 0 }
  nth_last = proc { |*a| self == parent.children_of_type(self.name).last }

  filter :nth, &nth
  filter :eq, &nth
  filter ":nth-of-type", &nth

  filter :first, &nth_first
  filter ":first-of-type", &nth_first

  filter :last, &nth_last
  filter ":last-of-type", &nth_last

  filter :even do |num,i|
    self.position % 2 == 0
  end

  filter :odd do |num,i|
    self.position % 2 == 1
  end

  # Methods built by define_method enforce the block's arity, so predicates
  # that ignore their arguments take |*_| to tolerate the (args..., index)
  # list they are called with.
  filter ':first-child' do |*_|
    self == parent.containers.first
  end

  # Positions are 1-based, matching the 'even'/'odd' branches: the first
  # container child is position 1. Non-positive positions match nothing.
  filter ':nth-child' do |arg,i|
    case arg
    when 'even'; (parent.containers.index(self) + 1) % 2 == 0
    when 'odd'; (parent.containers.index(self) + 1) % 2 == 1
    else
      pos = arg.to_i
      pos > 0 && self == parent.containers[pos - 1]
    end
  end

  filter ":last-child" do |*_|
    self == parent.containers.last
  end

  # NOTE(review): these two count from zero at the end (0 is the last one),
  # unlike :nth-child above -- confirm whether that is intended.
  filter ":nth-last-child" do |arg,i|
    self == parent.containers[-1-arg.to_i]
  end

  filter ":nth-last-of-type" do |arg,i|
    self == parent.children_of_type(self.name)[-1-arg.to_i]
  end

  filter ":only-of-type" do |arg,i|
    parent.children_of_type(self.name).length == 1
  end

  filter ":only-child" do |arg,i|
    parent.containers.length == 1
  end

  filter :parent do |*_|
    containers.length > 0
  end

  filter :empty do |*_|
    containers.length == 0
  end

  filter :root do |*_|
    self.is_a? Hpricot::Doc
  end

  filter 'text' do |*_|
    self.text?
  end

  filter 'comment' do |*_|
    self.comment?
  end

  filter :contains do |arg, ignore|
    html.include? arg
  end

  # What is being tested: the stripped text of the node, or one attribute
  # (nil for non-elements, which makes the comparison filters below fail).
  pred_procs =
    {'text()' => proc { |ele, *_| ele.inner_text.strip },
     '@' => proc { |ele, attr, *_| ele.get_attribute(attr).to_s if ele.elem? }}

  # How the tested value +a+ is compared with the selector's value +b+.
  oper_procs =
    {'=' => proc { |a,b| a == b },
     '!=' => proc { |a,b| a != b },
     '~=' => proc { |a,b| a.split(/\s+/).include?(b) },
     '|=' => proc { |a,b| a =~ /^#{Regexp::quote b}(-|$)/ },
     '^=' => proc { |a,b| a.index(b) == 0 },
     '$=' => proc { |a,b| a =~ /#{Regexp::quote b}$/ },
     '*=' => proc { |a,b| a.index(b) }}

  # One filter per predicate/operator pair, e.g. "filter[@=]" or
  # "filter[text()*=]". The selector's value is the second-to-last argument
  # because the node index always comes last.
  pred_procs.each do |pred_n, pred_f|
    oper_procs.each do |oper_n, oper_f|
      filter "#{pred_n}#{oper_n}" do |*a|
        qual = pred_f[self, *a]
        oper_f[qual, a[-2]] if qual
      end
    end
  end

  filter 'text()' do |val,i|
    !self.inner_text.strip.empty?
  end

  filter '@' do |attr,val,i|
    self.elem? and has_attribute? attr
  end

  filter '[' do |val,i|
    self.elem? and search(val).length > 0
  end

end
510
+ end