hpricot 0.7-x86-mswin32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. data/CHANGELOG +68 -0
  2. data/COPYING +18 -0
  3. data/README +284 -0
  4. data/Rakefile +260 -0
  5. data/ext/fast_xs/FastXsService.java +1018 -0
  6. data/ext/fast_xs/extconf.rb +4 -0
  7. data/ext/fast_xs/fast_xs.c +200 -0
  8. data/ext/hpricot_scan/HpricotScanService.java +1305 -0
  9. data/ext/hpricot_scan/extconf.rb +6 -0
  10. data/ext/hpricot_scan/hpricot_common.rl +76 -0
  11. data/ext/hpricot_scan/hpricot_css.c +3502 -0
  12. data/ext/hpricot_scan/hpricot_css.rl +115 -0
  13. data/ext/hpricot_scan/hpricot_scan.c +6704 -0
  14. data/ext/hpricot_scan/hpricot_scan.h +79 -0
  15. data/ext/hpricot_scan/hpricot_scan.java.rl +373 -0
  16. data/ext/hpricot_scan/hpricot_scan.rl +722 -0
  17. data/ext/hpricot_scan/test.rb +4 -0
  18. data/extras/mingw-rbconfig.rb +176 -0
  19. data/lib/fast_xs.so +0 -0
  20. data/lib/hpricot.rb +26 -0
  21. data/lib/hpricot/blankslate.rb +63 -0
  22. data/lib/hpricot/builder.rb +216 -0
  23. data/lib/hpricot/elements.rb +510 -0
  24. data/lib/hpricot/htmlinfo.rb +691 -0
  25. data/lib/hpricot/inspect.rb +103 -0
  26. data/lib/hpricot/modules.rb +38 -0
  27. data/lib/hpricot/parse.rb +38 -0
  28. data/lib/hpricot/tag.rb +198 -0
  29. data/lib/hpricot/tags.rb +164 -0
  30. data/lib/hpricot/traverse.rb +838 -0
  31. data/lib/hpricot/xchar.rb +94 -0
  32. data/lib/hpricot_scan.so +0 -0
  33. data/test/files/basic.xhtml +17 -0
  34. data/test/files/boingboing.html +2266 -0
  35. data/test/files/cy0.html +3653 -0
  36. data/test/files/immob.html +400 -0
  37. data/test/files/pace_application.html +1320 -0
  38. data/test/files/tenderlove.html +16 -0
  39. data/test/files/uswebgen.html +220 -0
  40. data/test/files/utf8.html +1054 -0
  41. data/test/files/week9.html +1723 -0
  42. data/test/files/why.xml +19 -0
  43. data/test/load_files.rb +7 -0
  44. data/test/nokogiri-bench.rb +64 -0
  45. data/test/test_alter.rb +77 -0
  46. data/test/test_builder.rb +37 -0
  47. data/test/test_parser.rb +409 -0
  48. data/test/test_paths.rb +25 -0
  49. data/test/test_preserved.rb +70 -0
  50. data/test/test_xml.rb +28 -0
  51. metadata +111 -0
@@ -0,0 +1,510 @@
1
+ module Hpricot
2
# Once you've matched a list of elements, you will often need to handle them as
# a group.  Or you may want to perform the same action on each of them.
# Hpricot::Elements is an extension of Ruby's array class, with some methods
# added for altering elements contained in the array.
#
# If you need to create an element array from regular elements:
#
#   Hpricot::Elements[ele1, ele2, ele3]
#
# Assuming that ele1, ele2 and ele3 contain element objects (Hpricot::Elem,
# Hpricot::Doc, etc.)
#
# == Continuing Searches
#
# Usually the Hpricot::Elements you're working on comes from a search you've
# done.  Well, you can continue searching the list by using the same <tt>at</tt>
# and <tt>search</tt> methods you can use on plain elements.
#
#   elements = doc.search("/div/p")
#   elements = elements.search("/a[@href='http://hoodwink.d/']")
#   elements = elements.at("img")
#
# == Altering Elements
#
# When you're altering elements in the list, your changes will be reflected in
# the document you started searching from.
#
#   doc = Hpricot("That's my <b>spoon</b>, Tyler.")
#   doc.at("b").swap("<i>fork</i>")
#   doc.to_html
#     #=> "That's my <i>fork</i>, Tyler."
#
# == Getting More Detailed
#
# If you can't find a method here that does what you need, you may need to
# loop through the elements and find a method in Hpricot::Container::Trav
# which can do what you need.
#
# For example, you may want to search for all the H3 header tags in a document
# and grab all the tags underneath the header, but not inside the header.
# A good method for this is <tt>next_sibling</tt>:
#
#   ary = []
#   doc.search("h3").each do |h3|
#     ele = h3
#     while ele = ele.next_sibling
#       ary << ele   # stuff away all the elements under the h3
#     end
#   end
#
# Most of the useful element methods are in the mixins Hpricot::Traverse
# and Hpricot::Container::Trav.
class Elements < Array

  # Searches this list for any elements (or children of these elements) matching
  # the CSS or XPath expression +expr+.  Root is assumed to be the element scanned.
  # Results from every member are flattened and de-duplicated.
  #
  # See Hpricot::Container::Trav.search for more.
  def search(*expr, &blk)
    found = map { |x| x.search(*expr, &blk) }
    Elements[*found.flatten.uniq]
  end
  alias_method :/, :search

  # Searches this list for the first element (or child of these elements) matching
  # the CSS or XPath expression +expr+.  Root is assumed to be the element scanned.
  # Returns +nil+ when nothing matches.
  #
  # See Hpricot::Container::Trav.at for more.
  def at(expr, &blk)
    search(expr, &blk).first
  end
  alias_method :%, :at

  # Convert this group of elements into a complete HTML fragment, returned as a
  # string.
  def to_html
    map { |x| x.output("") }.join
  end
  alias_method :to_s, :to_html

  # Returns an HTML fragment built of the contents of each element in this list.
  #
  # If a HTML +string+ is supplied, this method acts like inner_html= and
  # returns the value that was assigned.
  def inner_html(*string)
    return map { |x| x.inner_html }.join if string.empty?

    # Only the last argument is used, matching the historical behaviour.
    self.inner_html = string.pop
  end
  alias_method :html, :inner_html
  alias_method :innerHTML, :inner_html

  # Replaces the contents of each element in this list.  Supply an HTML +string+,
  # which is loaded into Hpricot objects and inserted into every element in this
  # list.
  def inner_html=(string)
    each { |x| x.inner_html = string }
  end
  alias_method :html=, :inner_html=
  alias_method :innerHTML=, :inner_html=

  # Returns a string containing the text contents of each element in this list.
  # All HTML tags are removed.
  def inner_text
    map { |x| x.inner_text }.join
  end
  alias_method :text, :inner_text

  # Remove all elements in this list from the document which contains them.
  #
  #   doc = Hpricot("<html>Remove this: <b>here</b></html>")
  #   doc.search("b").remove
  #   doc.to_html
  #     => "<html>Remove this: </html>"
  #
  def remove
    each { |x| x.parent.children.delete(x) }
  end

  # Empty the elements in this list, by removing their insides.
  #
  #   doc = Hpricot("<p> We have <i>so much</i> to say.</p>")
  #   doc.search("i").empty
  #   doc.to_html
  #     => "<p> We have <i></i> to say.</p>"
  #
  def empty
    each { |x| x.inner_html = nil }
  end

  # Add to the end of the contents inside each element in this list.
  # Pass in an HTML +str+, which is turned into Hpricot elements.
  def append(str = nil, &blk)
    each { |x| x.html(x.children + x.make(str, &blk)) }
  end

  # Add to the start of the contents inside each element in this list.
  # Pass in an HTML +str+, which is turned into Hpricot elements.
  def prepend(str = nil, &blk)
    each { |x| x.html(x.make(str, &blk) + x.children) }
  end

  # Add some HTML just previous to each element in this list.
  # Pass in an HTML +str+, which is turned into Hpricot elements.
  def before(str = nil, &blk)
    each { |x| x.parent.insert_before x.make(str, &blk), x }
  end

  # Just after each element in this list, add some HTML.
  # Pass in an HTML +str+, which is turned into Hpricot elements.
  def after(str = nil, &blk)
    each { |x| x.parent.insert_after x.make(str, &blk), x }
  end

  # Wraps each element in the list inside the element created by HTML +str+.
  # If more than one element is found in the string, Hpricot locates the
  # deepest spot inside the first element.
  #
  #   doc.search("a[@href]").
  #       wrap(%{<div class="link"><div class="link_inner"></div></div>})
  #
  # This code wraps every link on the page inside a +div.link+ and a +div.link_inner+ nest.
  #
  # Raises a RuntimeError when +str+ yields nothing that can hold children.
  def wrap(str = nil, &blk)
    each do |x|
      wrapper = x.make(str, &blk)
      nest = wrapper.detect { |w| w.respond_to? :children }
      raise "No wrapping element found." unless nest

      x.parent.replace_child(x, wrapper)
      # Walk down first children until an empty node is reached.
      nest = nest.children.first until nest.empty?
      nest.html([x])
    end
  end

  # Gets and sets attributes on all matched elements.
  #
  # Pass in a +key+ on its own and this method will return the string value
  # assigned to that attribute for the first element.  Or +nil+ if the
  # attribute isn't found (or if this list is empty).
  #
  #   doc.search("a").attr("href")
  #     #=> "http://hacketyhack.net/"
  #
  # Or, pass in a +key+ and +value+.  This will set an attribute for all
  # matched elements.
  #
  #   doc.search("p").attr("class", "basic")
  #
  # You may also use a Hash to set a series of attributes:
  #
  #   (doc/"a").attr(:class => "basic", :href => "http://hackety.org/")
  #
  # Lastly, a block can be used to rewrite an attribute based on the element
  # it belongs to.  The block will pass in an element.  Return from the block
  # the new value of the attribute.
  #
  #   records.attr("href") { |e| e['href'] + "#top" }
  #
  # This example adds a <tt>#top</tt> anchor to each link.
  #
  # Every setter form returns +self+ so calls can be chained.
  def attr(key, value = nil, &blk)
    if value or blk
      each do |el|
        el.set_attribute(key, value || blk[el])
      end
      return self
    end

    if key.is_a? Hash
      key.each { |k, v| self.attr(k, v) }
      return self
    end

    # An empty list has no first element to read from.
    empty? ? nil : self[0].get_attribute(key)
  end
  alias_method :set, :attr

  # Adds the class to all matched elements.
  #
  #   (doc/"p").add_class("bacon")
  #
  # Now all paragraphs will have class="bacon".
  # Members that cannot carry attributes are skipped.  Returns +self+.
  def add_class(class_name)
    each do |el|
      next unless el.respond_to? :get_attribute
      classes = el.get_attribute('class').to_s.split(" ")
      el.set_attribute('class', classes.push(class_name).uniq.join(" "))
    end
    self
  end

  # Remove an attribute from each of the matched elements.
  #
  #   (doc/"input").remove_attr("disabled")
  #
  # Members that cannot carry attributes are skipped.  Returns +self+.
  def remove_attr(name)
    each do |el|
      next unless el.respond_to? :remove_attribute
      el.remove_attribute(name)
    end
    self
  end

  # Removes a class from all matched elements.
  #
  #   (doc/"span").remove_class("lightgrey")
  #
  # Or, to remove all classes:
  #
  #   (doc/"span").remove_class
  #
  # Returns +self+.
  def remove_class(name = nil)
    each do |el|
      next unless el.respond_to? :get_attribute
      if name
        classes = el.get_attribute('class').to_s.split(" ")
        el.set_attribute('class', (classes - [name]).uniq.join(" "))
      else
        el.remove_attribute("class")
      end
    end
    self
  end

  # Token patterns recognised at the head of a filter expression, tried in
  # this order: attribute/function tests, bracketed sub-expressions, function
  # calls, pseudo-selectors, and finally plain names / #ids / .classes.
  ATTR_RE = %r!\[ *(?:(@)([\w\(\)-]+)|([\w\(\)-]+\(\))) *([~\!\|\*$\^=]*) *'?"?([^'"]*)'?"? *\]!i
  BRACK_RE = %r!(\[) *([^\]]*) *\]+!i
  FUNC_RE = %r!(:)?([a-zA-Z0-9\*_-]*)\( *[\"']?([^ \)]*?)['\"]? *\)!
  CUST_RE = %r!(:)([a-zA-Z0-9\*_-]*)()!
  CATCH_RE = %r!([:\.#]*)([a-zA-Z0-9\*_-]+)!

  # The five token patterns combined and anchored; built once rather than on
  # every pass through the filter loop.
  FILTER_RE = /^(?:#{ATTR_RE}|#{BRACK_RE}|#{FUNC_RE}|#{CUST_RE}|#{CATCH_RE})/

  # Repeatedly consumes filter tokens from the front of +expr+ and narrows
  # +nodes+ accordingly.  Stops when +expr+ is exhausted or its head is not a
  # recognised token.
  #
  # With +truth+ set to false the per-node filters are inverted (this is how
  # <tt>:not(...)</tt> is implemented).  Note that the positional selectors
  # <tt>:even</tt>, <tt>:odd</tt>, <tt>:first</tt> and <tt>:last</tt> do not
  # consult +truth+.
  #
  # Per-node tests are dispatched to methods named <tt>filter[...]</tt> on
  # Traverse.  A token with no such method currently fails with a
  # NoMethodError (on +nil+), as it always has.
  #
  # Returns a two-element array: the filtered Elements and the unconsumed
  # remainder of +expr+.
  def self.filter(nodes, expr, truth = true)
    until expr.empty?
      match = expr.match(FILTER_RE)
      break unless match

      expr = match.post_match
      m = match.captures.compact

      # "[@name op value]": fold the operator into the token so that the
      # dispatch below looks for "filter[@<op>]".
      if m[0] == '@'
        m[0] = "@#{m.slice!(2,1).join}"
      end

      # "[3]" is shorthand for the (1-based) nth match.
      if m[0] == '[' && m[1] =~ /^\d+$/
        m = [":", "nth", m[1].to_i-1]
      end

      selector = "#{m[0]}#{m[1]}"
      if m[0] == ":" && m[1] == "not"
        nodes, = Elements.filter(nodes, m[2], false)
      elsif selector =~ /^(:even|:odd)$/
        parity = m[1] == "even" ? 0 : 1
        picked = []
        nodes.each_with_index { |n, i| picked.push(n) if i % 2 == parity }
        nodes = Elements[*picked]
      elsif selector =~ /^(:first|:last)$/
        # compact: an empty list has no first/last member to keep.
        nodes = Elements[*[nodes.send(m[1])].compact]
      else
        # Prefer the most specific filter ("filter[:nth-child]"), then fall
        # back to the one keyed on the prefix alone ("filter[#]").
        meth = "filter[#{selector}]" unless m[0].empty?
        if meth and Traverse.method_defined? meth
          args = m[2..-1]
        else
          meth = "filter[#{m[0]}]"
          if Traverse.method_defined? meth
            args = m[1..-1]
          end
        end
        # The final argument is a running index, bumped before each call.
        args << -1
        nodes = Elements[*nodes.find_all do |x|
          args[-1] += 1
          x.send(meth, *args) ? truth : !truth
        end]
      end
    end
    [nodes, expr]
  end

  # Given two elements, attempt to gather an Elements array of everything between
  # (and including) those two elements.  When +excl+ is true the end element
  # itself is left out.
  #
  # Only two layouts are handled: the elements are siblings, or one of them is
  # the common parent of both.  Anything else yields an empty Elements.
  def self.expand(ele1, ele2, excl=false)
    ary = []

    if ele1 and ele2
      # let's quickly take care of siblings
      if ele1.parent == ele2.parent
        span = index_range(ele1.node_position, ele2.node_position, excl)
        ary = ele1.parent.children[span]
      else
        # find common parent
        ele1_p = ancestry(ele1)
        ele2_p = ancestry(ele2)
        common_parent = ele1_p.zip(ele2_p).select { |p1, p2| p1 == p2 }.flatten.last

        child = nil
        if ele1 == common_parent
          child = ele2
        elsif ele2 == common_parent
          child = ele1
        end

        if child
          ary = common_parent.children[index_range(0, child.node_position, excl)]
        end
      end
    end

    Elements[*ary]
  end

  # Builds the chain [outermost ancestor, ..., parent, node] for +node+.
  def self.ancestry(node)
    chain = [node]
    while node.respond_to?(:parent) and node = node.parent
      chain.unshift node
    end
    chain
  end

  # Index range from +from+ to +to+, leaving +to+ out when +excl+ is true.
  # An exclusive range is used instead of "to - 1" so that to == 0 yields an
  # empty slice rather than wrapping around to index -1.
  def self.index_range(from, to, excl)
    excl ? (from...to) : (from..to)
  end

  private_class_method :ancestry, :index_range

  # Narrows this list to the members matching the filter expression +expr+.
  def filter(expr)
    nodes, = Elements.filter(self, expr)
    nodes
  end

  # Returns the members of this list that do NOT match +expr+.  +expr+ may be
  # a filter expression or a single node (anything that is a Traverse), in
  # which case that node is simply removed from the result.
  def not(expr)
    return self - [expr] if expr.is_a? Traverse

    nodes, = Elements.filter(self, expr, false)
    nodes
  end

  private

  # Copies every instance variable of +l+ onto +node+.
  def copy_node(node, l)
    l.instance_variables.each do |iv|
      node.instance_variable_set(iv, l.instance_variable_get(iv))
    end
  end

end
369
+
370
# Per-node filter predicates used by Elements.filter.  Each predicate is
# registered as an instance method named <tt>filter[<token>]</tt> and is
# invoked with the selector's arguments followed by a running index.
module Traverse
  # Registers +blk+ as the instance method <tt>filter[tok]</tt>.  String
  # tokens are used verbatim; anything else (e.g. a Symbol) is inserted via
  # +inspect+, so <tt>:lt</tt> becomes <tt>filter[:lt]</tt>.
  #
  # Methods created by define_method check argument counts strictly, so a
  # block must accept every argument it will be dispatched with.
  def self.filter(tok, &blk)
    label = tok.is_a?(String) ? tok : tok.inspect
    define_method("filter[#{label}]", &blk)
  end

  # Bare tag name, or "*" for anything.
  filter '' do |name,i|
    name == '*' || (self.respond_to?(:name) && self.name.downcase == name.downcase)
  end

  # "#id"
  filter '#' do |id,i|
    self.elem? and get_attribute('id').to_s == id
  end

  # ".class"
  filter '.' do |name,i|
    self.elem? and classes.include? name
  end

  filter :lt do |num,i|
    self.position < num.to_i
  end

  filter :gt do |num,i|
    self.position > num.to_i
  end

  # Shared bodies for the positional selectors registered just below.
  nth = proc { |num,i| self.position == num.to_i }
  nth_first = proc { |*a| self.position == 0 }
  nth_last = proc { |*a| self == parent.children_of_type(self.name).last }

  filter :nth, &nth
  filter :eq, &nth
  filter ":nth-of-type", &nth

  filter :first, &nth_first
  filter ":first-of-type", &nth_first

  filter :last, &nth_last
  filter ":last-of-type", &nth_last

  filter :even do |num,i|
    self.position % 2 == 0
  end

  filter :odd do |num,i|
    self.position % 2 == 1
  end

  # Accepts any argument list: the dispatcher passes the (empty) selector
  # argument as well as the index.
  filter ':first-child' do |*a|
    self == parent.containers.first
  end

  # +arg+ is 'even', 'odd' or a 1-based position among the parent's containers.
  filter ':nth-child' do |arg,i|
    case arg
    when 'even'; (parent.containers.index(self) + 1) % 2 == 0
    when 'odd';  (parent.containers.index(self) + 1) % 2 == 1
    else         self == (parent.containers[arg.to_i - 1])
    end
  end

  # Accepts any argument list, for the same reason as ':first-child'.
  filter ":last-child" do |*a|
    self == parent.containers.last
  end

  # +arg+ counts back from the end, starting at 0 for the last container.
  filter ":nth-last-child" do |arg,i|
    self == parent.containers[-1-arg.to_i]
  end

  filter ":nth-last-of-type" do |arg,i|
    self == parent.children_of_type(self.name)[-1-arg.to_i]
  end

  filter ":only-of-type" do |arg,i|
    parent.children_of_type(self.name).length == 1
  end

  filter ":only-child" do |arg,i|
    parent.containers.length == 1
  end

  # True when this node has at least one container child.
  filter :parent do |*a|
    containers.length > 0
  end

  # True when this node has no container children.
  filter :empty do |*a|
    containers.length == 0
  end

  filter :root do |*a|
    self.is_a? Hpricot::Doc
  end

  filter 'text' do |*a|
    self.text?
  end

  filter 'comment' do |*a|
    self.comment?
  end

  filter :contains do |arg, ignore|
    html.include? arg
  end

  # Left-hand sides of "[lhs op value]" tests: each proc extracts the value
  # to compare, or returns nil when the node has nothing to compare.
  pred_procs =
    {'text()' => proc { |ele, *_| ele.inner_text.strip },
     '@'      => proc { |ele, attr, *_| ele.get_attribute(attr).to_s if ele.elem? }}

  # Comparison operators; +a+ is the extracted value, +b+ the selector value.
  oper_procs =
    {'='  => proc { |a,b| a == b },
     '!=' => proc { |a,b| a != b },
     '~=' => proc { |a,b| a.split(/\s+/).include?(b) },
     '|=' => proc { |a,b| a =~ /^#{Regexp::quote b}(-|$)/ },
     '^=' => proc { |a,b| a.index(b) == 0 },
     '$=' => proc { |a,b| a =~ /#{Regexp::quote b}$/ },
     '*=' => proc { |a,b| a.index(b) }}

  # Register one filter per (lhs, operator) pair, e.g. "filter[@=]" and
  # "filter[text()*=]".
  pred_procs.each do |pred_n, pred_f|
    oper_procs.each do |oper_n, oper_f|
      filter "#{pred_n}#{oper_n}" do |*a|
        qual = pred_f[self, *a]
        # a[-1] is the running index, so a[-2] is the value to compare with.
        oper_f[qual, a[-2]] if qual
      end
    end
  end

  # "[text()]": the node has a text child containing non-whitespace.
  filter 'text()' do |val,i|
    self.children.grep(Hpricot::Text).detect { |x| x.content =~ /\S/ } if self.children
  end

  # "[@attr]": the attribute is present.
  filter '@' do |attr,val,i|
    self.elem? and has_attribute? attr
  end

  # "[expr]": a nested search from this node finds something.
  filter '[' do |val,i|
    self.elem? and search(val).length > 0
  end

end
510
+ end