assaf-scrapi 1.2.1

# ScrAPI toolkit for Ruby
#
# Copyright (c) 2006 Assaf Arkin, under Creative Commons Attribution and/or MIT License
# Developed for http://co.mments.com
# Code and documentation: http://labnotes.org


require "rubygems"
require "time"
require "test/unit"
require File.join(File.dirname(__FILE__), "mock_net_http")
require File.join(File.dirname(__FILE__), "../lib", "scrapi")

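# Note on the test double: mock_net_http (required above) stubs Net::HTTP for
# these tests. As used in this file, Net::HTTP.on_get registers a block that
# receives (address, path, headers) and returns [response, body], and
# Net::HTTP.reset_on_get restores the default behavior. (Summary inferred from
# the calls below, not from the mock's own documentation.)
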
class ScraperTest < Test::Unit::TestCase

  DIVS123 = <<-EOF
    <div id="1"></div>
    <div id="2"></div>
    <div id="3"></div>
  EOF

  DIVS1_23 = <<-EOF
    <div id="1">
      <div id="2"></div>
      <div id="3"></div>
    </div>
  EOF


  def setup
    Net::HTTP.reset_on_get
  end

  def teardown
    Net::HTTP.reset_on_get
  end


  #
  # Tests selector methods.
  #
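  # For orientation, a sketch of the selector DSL exercised below (inferred
  # from these tests rather than from the library's documentation):
  # `selector :name, ...` accepts a CSS selector string, a string with "?"
  # placeholders plus substitution values, or an HTML::Selector object, and
  # defines two methods on the scraper:
  #
  #   name(element)        # => array of matches, in document order
  #   first_name(element)  # => the first match only
  #
  # An optional block receives the matched elements, and its return value
  # becomes the result of the selector method.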

  def test_define_selectors
    scraper = new_scraper(DIVS123) do
      selector :test, "div"
    end
    assert_equal 3, scraper.test(scraper.document).size
    3.times do |i|
      assert_equal String(i + 1), scraper.test(scraper.document)[i].attributes["id"]
    end
  end


  def test_selector_blocks
    scraper = new_scraper(DIVS123) do
      selector :test, "div" do |elements|
        # Drop the last match. Rely on the block's value; a bare `return`
        # here would raise LocalJumpError when the selector is invoked later.
        elements[0..-2]
      end
    end
    assert_equal 2, scraper.test(scraper.document).size
  end


  def test_array_selectors
    scraper = new_scraper(DIVS123) do
      selector :test, "#?", "2"
    end
    assert_equal 1, scraper.test(scraper.document).size
    assert_equal "2", scraper.test(scraper.document)[0].attributes["id"]
  end


  def test_object_selectors
    scraper = new_scraper(DIVS123) do
      selector :test, HTML::Selector.new("div")
    end
    assert_equal 3, scraper.test(scraper.document).size
  end


  def test_selector_returns_array
    scraper = new_scraper(DIVS123) do
      selector :test0, "#4"
      selector :test1, "#1"
      selector :test3, "div"
    end
    assert_equal 0, scraper.test0(scraper.document).size # No elements (empty)
    assert_equal 1, scraper.test1(scraper.document).size # One element (array)
    assert_equal 3, scraper.test3(scraper.document).size # Array of elements
  end


  def test_select_in_document_order
    scraper = new_scraper(DIVS123) do
      selector :test, "#2,#1"
    end
    assert_equal 2, scraper.test(scraper.document).size
    assert_equal "1", scraper.test(scraper.document)[0].attributes["id"]
    assert_equal "2", scraper.test(scraper.document)[1].attributes["id"]
  end


  def test_selecting_first_element
    scraper = new_scraper(DIVS123) do
      selector :test, "div"
    end
    assert_equal 3, scraper.test(scraper.document).size
    assert scraper.first_test(scraper.document)
    assert_equal "1", scraper.first_test(scraper.document).attributes["id"]

    scraper = new_scraper(DIVS123) do
      selector :test, "div" do |element|
        element[0].attributes["id"]
      end
    end
    assert scraper.first_test(scraper.document)
    assert_equal "1", scraper.first_test(scraper.document)
  end


  #
  # Tests process methods.
  #
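  # For orientation, a sketch of the processing rules exercised below
  # (inferred from these tests): `process selector, options_or_extractor,
  # &block` runs the block, or applies the extractor, to each matched element
  # in document order. Inside a rule, `skip element` keeps later rules from
  # seeing that element, `skip` with no argument stops descent into the
  # current element's children, and `stop` ends scraping altogether.
  # `process_first` handles only the first match, and the `:skip=>true`
  # option skips matched elements automatically.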


  def test_processing_rule
    scraper = new_scraper(DIVS123) do
      process "div" do |element|
        @count = (@count || 0) + 1
      end
      attr :count
    end
    scraper.scrape
    assert_equal 3, scraper.count
  end


  def test_processing_rule_with_array
    scraper = new_scraper(DIVS123) do
      process "#?", "1" do |element|
        @count = (@count || 0) + 1
      end
      attr :count
    end
    scraper.scrape
    assert_equal 1, scraper.count
  end


  def test_processing_rule_with_selector
    scraper = new_scraper(DIVS123) do
      process HTML::Selector.new("div") do |element|
        @count = (@count || 0) + 1
      end
      attr :count
    end
    scraper.scrape
    assert_equal 3, scraper.count
  end


  def test_extracting_in_code
    scraper = new_scraper(DIVS123) do
      process "div" do |element|
        @concat = (@concat || "") << element.attributes["id"]
      end
      attr :concat
    end
    scraper.scrape
    assert_equal "123", scraper.concat
  end


  def test_processing_in_document_order
    scraper = new_scraper(DIVS123) do
      process "#2,#1" do |element|
        @concat = (@concat || "") << element.attributes["id"]
      end
      attr :concat
    end
    scraper.scrape
    assert_equal "12", scraper.concat
  end


  def test_process_once_if_skipped
    scraper = new_scraper(DIVS123) do
      def prepare(document)
        @found = []
      end
      process("#1") { |element| @found[0] = true }
      process("#1") { |element| @found[1] = true; skip element }
      process("#1") { |element| @found[2] = true }
      process("#2", :skip=>true) { |element| @found[3] = true }
      process("#2") { |element| @found[4] = true }
      attr_reader :found
    end
    scraper.scrape
    assert_equal [true, true, nil, true], scraper.found
  end


  def test_skip_children
    scraper = new_scraper(DIVS1_23) do
      process "div" do |element|
        @concat = (@concat || "") << (element.attributes["id"] || "")
        skip id2(element)
      end
      selector :id2, "#2"
      attr :concat
    end
    scraper.scrape
    assert_equal "13", scraper.concat
  end


  def test_skip_descendants
    # Root, child of root, grandchild of root.
    scraper = new_scraper(DIVS1_23) do
      process "div" do |element|
        @concat = (@concat || "") << (element.attributes["id"] || "")
      end
      attr :concat
    end
    scraper.scrape
    assert_equal "123", scraper.concat

    # Stop at root.
    scraper = new_scraper(DIVS1_23) do
      process "div" do |element|
        @concat = (@concat || "") << (element.attributes["id"] || "")
        skip
      end
      attr :concat
    end
    scraper.scrape
    assert_equal "1", scraper.concat

    # Child of root, and child of root's child.
    scraper = new_scraper(DIVS1_23) do
      process "div>div" do |element|
        @concat = (@concat || "") << (element.attributes["id"] || "")
      end
      attr :concat
    end
    scraper.scrape
    assert_equal "23", scraper.concat

    # Stop at child of root.
    scraper = new_scraper(DIVS1_23) do
      process "div>div" do |element|
        @concat = (@concat || "") << (element.attributes["id"] || "")
        skip element.next_element
      end
      attr :concat
    end
    scraper.scrape
    assert_equal "2", scraper.concat

    # Child of root, then child of child of root.
    scraper = new_scraper(DIVS1_23) do
      process "div div" do |element|
        @concat = (@concat || "") << (element.attributes["id"] || "")
      end
      attr :concat
    end
    scraper.scrape
    assert_equal "23", scraper.concat

    # Child of root.
    scraper = new_scraper(DIVS1_23) do
      process "div div" do |element|
        @concat = (@concat || "") << (element.attributes["id"] || "")
        skip element.next_element
      end
      attr :concat
    end
    scraper.scrape
    assert_equal "2", scraper.concat
  end


  def test_skip_from_extractor
    html = %Q{<div id="1">this</div>}
    scraper = new_scraper(html) do
      process "#1", :this1=>:text
      process "#1", :this2=>:text
    end
    scraper.scrape
    assert_equal "this", scraper.this1
    assert_equal "this", scraper.this2

    scraper = new_scraper(html) do
      process "#1", :this1=>:text, :skip=>false
      process "#1", :this2=>:text
    end
    scraper.scrape
    assert_equal "this", scraper.this1
    assert_equal "this", scraper.this2

    scraper = new_scraper(html) do
      process "#1", :this1=>:text, :skip=>true do
        false
      end
      process "#1", :this2=>:text
    end
    scraper.scrape
    assert_equal "this", scraper.this1
    assert_equal nil, scraper.this2
  end


  def test_stop
    scraper = new_scraper(DIVS123) do
      process "div" do |element|
        @concat = (@concat || "") << (element.attributes["id"] || "")
        stop
      end
      attr :concat
    end
    scraper.scrape
    assert_equal "1", scraper.concat
  end


  def test_process_first
    scraper = new_scraper(DIVS123) do
      process "div" do |element|
        @all = (@all || 0) + 1
      end
      process_first "div" do |element|
        @first = (@first || 0) + 1
      end
      attr_accessor :all, :first
    end
    scraper.scrape
    assert_equal 3, scraper.all
    assert_equal 1, scraper.first
  end


  def test_accessors
    time = Time.new.rfc2822
    Net::HTTP.on_get do |address, path, headers|
      if path == "/redirect"
        # Second request: serve the page itself.
        response = Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK")
        response["Last-Modified"] = time
        response["ETag"] = "etag"
        [response, <<-EOF
          <html>
            <head>
              <meta http-equiv="content-type" content="text/html; charset=other-encoding">
            </head>
            <body>
              <div id="x"/>
            </body>
          </html>
        EOF
        ]
      else
        # First request (to /source): redirect to /redirect.
        response = Net::HTTPMovedPermanently.new(Net::HTTP.version_1_2, 301, "Moved")
        response["Location"] = "http://localhost/redirect"
        [response, ""]
      end
    end
    scraper = new_scraper(URI.parse("http://localhost/source"))
    scraper.scrape
    assert_equal "http://localhost/source", scraper.page_info.original_url.to_s
    assert_equal "http://localhost/redirect", scraper.page_info.url.to_s
    assert_equal time, scraper.page_info.last_modified
    assert_equal "etag", scraper.page_info.etag
    assert_equal "other-encoding", scraper.page_info.encoding
  end


  def test_scraping_end_to_end
    Net::HTTP.on_get do |address, path, headers|
      [Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK"), <<-EOF
        <html>
          <body>
            <div id="1"/>
            <div id="2"/>
          </body>
        </html>
      EOF
      ]
    end
    scraper = new_scraper(URI.parse("http://localhost/")) do
      process "div" do |element|
        @concat = (@concat || "") << (element.attributes["id"] || "")
      end
      attr :concat
    end
    scraper.scrape
    assert_equal "12", scraper.concat
  end


  #
  # Tests extractor methods.
  #
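  # For orientation, the extractor forms exercised below (inferred from these
  # tests). An extractor maps each matched element onto accessors:
  #
  #   :name=>"@attr"        # value of an attribute
  #   :name=>:text          # text content of the element
  #   :name=>:element       # the element itself
  #   :name=>"tag"          # the element only if it is a <tag>; "tag@attr" for its attribute
  #   "name[]"=>...         # collect values into an array
  #   :name=>{...}          # hash built from nested extractors
  #   :name=>[alt1, alt2]   # first alternative that yields a value
  #   :name=>ScraperClass   # scrape the element with another scraper class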


  def test_extractors
    html = %Q{<div id="1"></div>}
    scraper = new_scraper(html) do
      process "div", extractor(:div_id=>"@id")
      attr :div_id
    end
    scraper.scrape
    assert_equal "1", scraper.div_id

    scraper = new_scraper(html) do
      process "div", :div_id=>"@id"
      attr :div_id
    end
    scraper.scrape
    assert_equal "1", scraper.div_id
  end


  def test_text_and_element_extractors
    html = %Q{<div>some text</div>}
    # Extract the node itself.
    scraper = new_scraper(html) do
      process "div", extractor(:value=>:element)
      attr :value
    end
    scraper.scrape
    assert_equal "div", scraper.value.name

    # Extract the text value of the node.
    scraper = new_scraper(html) do
      process "div", extractor(:value=>:text)
      attr :value
    end
    scraper.scrape
    assert_equal "some text", scraper.value
  end


  def test_extractors_objects
    html = <<-EOF
      <h1 class="header"></h1>
      <h2 class="header"></h2>
    EOF
    # Extract both elements based on class, return the second one.
    scraper = new_scraper(html) do
      process ".header", extractor(:header=>:element)
      attr :header
    end
    scraper.scrape
    assert_equal "h2", scraper.header.name

    # Extracting a specific element skips the second match.
    html = <<-EOF
      <h1 class="header"></h1>
      <h2 class="header"></h2>
    EOF
    scraper = new_scraper(html) do
      process ".header", extractor(:header=>"h1")
      attr :header
    end
    scraper.scrape
    assert_equal "h1", scraper.header.name
  end


  def test_attribute_extractors
    # The second <abbr> has no title attribute, so it doesn't overwrite the
    # value extracted from the first.
    html = <<-EOF
      <abbr title="foo">bar</abbr>
      <abbr>foo</abbr>
    EOF
    scraper = new_scraper(html) do
      process "abbr", extractor(:title=>"@title")
      attr :title
    end
    scraper.scrape
    assert_equal "foo", scraper.title

    # Extracting a specific element skips the second match.
    html = <<-EOF
      <h1 class="header" id="1"></h1>
      <h2 class="header" id="2"></h2>
    EOF
    scraper = new_scraper(html) do
      process ".header", extractor(:header=>"h1@id")
      attr :header
    end
    scraper.scrape
    assert_equal "1", scraper.header
  end


  def test_class_extractors
    headers = Class.new(Scraper::Base)
    headers.instance_eval do
      root_element nil
      process "h1,h2", :h1=>"h1", :h2=>"h2"
      attr :h1
      attr :h2
    end
    html = <<-EOF
      <div>
        <h1>first</h1>
        <h2>second</h2>
      </div>
    EOF
    scraper = new_scraper(html) do
      process "div", extractor(:headers=>headers)
      attr :headers
    end
    scraper.scrape
    assert scraper.headers
    assert_equal "h1", scraper.headers.h1.name
    assert_equal "h2", scraper.headers.h2.name
  end


  def test_array_extractors
    html = <<-EOF
      <div>
        <h1>first</h1>
        <h1>second</h1>
      </div>
    EOF
    scraper = new_scraper(html) do
      process "h1", extractor("headers[]"=>:text)
      attr :headers
    end
    scraper.scrape
    assert scraper.headers.is_a?(Array)
    assert_equal 2, scraper.headers.size
    assert_equal "first", scraper.headers[0]
    assert_equal "second", scraper.headers[1]
  end


  def test_hash_extractors
    html = <<-EOF
      <div>
        <h1 id="1" class="header">first</h1>
      </div>
    EOF
    scraper = new_scraper(html) do
      process "h1", extractor("header"=>{:id=>"@id", :class=>"@class", :text=>:text})
      attr :header
    end
    scraper.scrape
    assert scraper.header.is_a?(Hash)
    assert_equal 3, scraper.header.size
    assert_equal "1", scraper.header[:id]
    assert_equal "header", scraper.header[:class]
    assert_equal "first", scraper.header[:text]
  end


  def test_multi_value_extractors
    html = <<-EOF
      <div>
        <h1 id="1" class="header">first</h1>
      </div>
    EOF
    scraper = new_scraper(html) do
      process "h1", [:text, :kls]=>Scraper.define {
        process "*", :text=>:text, :kls=>"@class"
      }
    end
    result = scraper.scrape
    assert_equal "first", result.text
    assert_equal "header", result.kls
  end


  def test_conditional_extractors
    # Look for id attribute (second header only), if not found look for
    # class attribute (first two headers), otherwise just get text
    # (third header).
    html = <<-EOF
      <div>
        <h1 class="foo">first</h1>
        <h1 class="foo" id="bar">second</h1>
        <h1>third</h1>
      </div>
    EOF
    scraper = new_scraper(html) do
      process "h1", extractor("headers[]"=>["@id", "@class", :text])
      attr :headers
    end
    scraper.scrape
    assert scraper.headers.is_a?(Array)
    assert_equal 3, scraper.headers.size
    assert_equal "foo", scraper.headers[0]
    assert_equal "bar", scraper.headers[1]
    assert_equal "third", scraper.headers[2]
  end


  DIVS_ST_ND = <<-EOF
    <div id="1">first</div>
    <div id="2">second</div>
  EOF

  def test_accessors_from_extractor
    scraper = new_scraper(DIVS_ST_ND) do
      process_first "div", :div_id=>"@id", :div_text=>:text
      result :div_id
    end
    value = scraper.scrape
    assert_equal "1", value

    scraper = new_scraper(DIVS_ST_ND) do
      process_first "div", :div_id=>"@id", :div_text=>:text
      result :div_id, :div_text
    end
    value = scraper.scrape
    assert_equal "1", value.div_id
    assert_equal "first", value.div_text

    scraper = new_scraper(DIVS_ST_ND) do
      process_first "div", :div_id=>"@id", :div_text=>:text
    end
    value = scraper.scrape
    assert_equal "1", value.div_id
    assert_equal "first", value.div_text

    scraper = new_scraper(DIVS_ST_ND) do
      attr_accessor :div_class
      process_first "div", :div_id=>"@id", :div_text=>:text
      result :div_id, :div_class
    end
    value = scraper.scrape
    assert_equal "1", value.div_id
    assert_raise(NoMethodError) { value.div_text }

    scraper = new_scraper(DIVS_ST_ND) do
      process "div", "div_ids[]"=>"@id"
      result :div_ids
    end
    value = scraper.scrape
    assert_equal "1", value[0]
    assert_equal "2", value[1]
  end


  def test_array_accessors
    scraper = new_scraper(DIVS_ST_ND) do
      array :div_id, :div_text
      process "div", :div_id=>"@id", :div_text=>:text
      result :div_id, :div_text
    end
    value = scraper.scrape
    assert_equal 2, value.div_id.size
    assert_equal 2, value.div_text.size
    assert_equal "1", value.div_id[0]
    assert_equal "2", value.div_id[1]
    assert_equal "first", value.div_text[0]
    assert_equal "second", value.div_text[1]
  end


  #
  # Root element tests.
  #
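  # For orientation (inferred from test_changing_root_element below):
  # `root_element "head"` starts processing at the named element, so rules
  # only see that subtree, while `root_element nil` processes the entire
  # document. The setting is inherited by subclasses, which may override it.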

  HTML_EMPTY = <<-EOF
    <html>
      <head>
      </head>
      <body>
      </body>
    </html>
  EOF

  def test_scrape_body_by_default
    scraper = Class.new(Scraper::Base).new(HTML_EMPTY)
    scraper.class.instance_eval do
      process "head" do |element| @head = element end
      attr :head
      process "body" do |element| @body = element end
      attr :body
    end
    scraper.scrape
    assert scraper.head
    assert scraper.body
  end


  def test_changing_root_element
    only_header = new_scraper(HTML_EMPTY) do
      root_element "head"
      process "head" do |element| @head = element end
      attr :head
      process "body" do |element| @body = element end
      attr :body
    end
    only_body = Class.new(only_header.class).new(HTML_EMPTY)
    only_body.class.root_element "body"
    both_parts = Class.new(only_body.class).new(HTML_EMPTY)
    both_parts.class.root_element nil
    # We set this scraper to begin with the head element,
    # so we can see the head element, but not the body.
    only_header.scrape
    assert only_header.head
    assert only_header.body.nil?
    # Now switch to a scraper that processes the body element,
    # skipping the header.
    only_body.scrape
    assert only_body.head.nil?
    assert only_body.body
    # Now switch to a scraper that doesn't specify a root element,
    # and it will process both header and body.
    both_parts.scrape
    assert both_parts.head
    assert both_parts.body
  end


  # Test prepare/result.
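  # For orientation (inferred from the tests below): a scraper may define
  # `prepare(document)`, which runs before any rules and may even replace
  # @document, and `result`, whose return value becomes the value of #scrape.
  # The class-level `result :accessor, ...` form returns a single accessor's
  # value directly, or an object exposing the named accessors.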

  def test_prepare_and_result
    # prepare seeds the counter at one; result returns its final value.
    scraper = new_scraper(DIVS123) do
      process("div") { |element| @count += 1 }
      define_method(:prepare) { @count = 1 }
      define_method(:result) { @count }
    end
    result = scraper.scrape
    assert_equal 4, result
  end


  def test_changing_document_from_prepare
    # prepare replaces the document with the second div, so only that div
    # gets processed.
    scraper = new_scraper(DIVS123) do
      selector :divs, "div"
      define_method :prepare do |document|
        @document = divs(document)[1]
      end
      array :ids
      process "div", :ids=>"@id"
      result :ids
    end
    result = scraper.scrape
    assert_equal 1, result.size
    assert_equal "2", result[0]
  end


  def test_anonymous_scrapers
    scraper = Scraper.define do
      array :ids
      process "div", :ids=>"@id"
      result :ids
    end
    result = scraper.scrape(DIVS123)
    assert_equal "1", result[0]
    assert_equal "2", result[1]
    assert_equal "3", result[2]
  end


  def test_named_rules
    scraper = Scraper.define do
      array :ids1, :ids2
      process :main, "div", :ids1=>"@id"
      process :main, "div", :ids2=>"@id"
      result :ids1, :ids2
    end
    result = scraper.scrape(DIVS123)
    assert_equal nil, result.ids1
    assert_equal 3, result.ids2.size
    assert_equal "1", result.ids2[0]
    assert_equal "2", result.ids2[1]
    assert_equal "3", result.ids2[2]
  end


  protected

  def new_scraper(what, &block)
    cls = Class.new(Scraper::Base)
    cls.root_element nil
    cls.parser :html_parser
    cls.class_eval(&block) if block
    cls.new(what)
  end

end


# Repeats the same set of tests, but using Tidy instead of HTMLParser.
class ScraperUsingTidyTest < ScraperTest

  protected

  def new_scraper(what, &block)
    cls = Class.new(Scraper::Base)
    cls.root_element nil
    cls.parser :tidy
    cls.class_eval(&block) if block
    cls.new(what)
  end

end