assaf-scrapi 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,804 @@
1
+ # ScrAPI toolkit for Ruby
2
+ #
3
+ # Copyright (c) 2006 Assaf Arkin, under Creative Commons Attribution and/or MIT License
4
+ # Developed for http://co.mments.com
5
+ # Code and documentation: http://labnotes.org
6
+
7
+
8
+ require "rubygems"
9
+ require "time"
10
+ require "test/unit"
11
+ require File.join(File.dirname(__FILE__), "mock_net_http")
12
+ require File.join(File.dirname(__FILE__), "../lib", "scrapi")
13
+
14
+
15
# Exercises the Scraper::Base DSL (selectors, processing rules, extractors,
# result accessors, root element selection and prepare/result hooks) against
# small HTML fixtures.
#
# Every scraper is built through #new_scraper, so a subclass can override
# that one method to rerun the whole suite with a different parser.
class ScraperTest < Test::Unit::TestCase

  # Three sibling divs with ids 1, 2 and 3.
  DIVS123 = <<-EOF
    <div id="1"></div>
    <div id="2"></div>
    <div id="3"></div>
  EOF

  # Div 1 containing divs 2 and 3.
  DIVS1_23 = <<-EOF
    <div id="1">
      <div id="2"></div>
      <div id="3"></div>
    </div>
  EOF


  # Clears any Net::HTTP.on_get handler left over from a previous test.
  def setup
    Net::HTTP.reset_on_get
  end

  # Clears the Net::HTTP.on_get handler installed by the test, if any.
  def teardown
    Net::HTTP.reset_on_get
  end


  #
  # Tests selector methods.
  #

  # A selector declared with a CSS string returns all matches, in order.
  def test_define_selectors
    scraper = new_scraper(DIVS123) do
      selector :test, "div"
    end
    assert_equal 3, scraper.test(scraper.document).size
    3.times do |i|
      assert_equal String(i + 1), scraper.test(scraper.document)[i].attributes["id"]
    end
  end


  # A selector block receives the selected elements and its value becomes
  # the selector's result.
  def test_selector_blocks
    scraper = new_scraper(DIVS123) do
      selector :test, "div" do |elements|
        # NOTE(review): the explicit `return` appears intended to exercise
        # returning from inside a selector block -- confirm before removing.
        return elements[0..-2]
      end
    end
    assert_equal 2, scraper.test(scraper.document).size
  end


  # A selector expression with a "?" placeholder followed by a substitution value.
  def test_array_selectors
    scraper = new_scraper(DIVS123) do
      selector :test, "#?", "2"
    end
    assert_equal 1, scraper.test(scraper.document).size
    assert_equal "2", scraper.test(scraper.document)[0].attributes["id"]
  end


  # A selector may be given as an HTML::Selector object instead of a string.
  def test_object_selectors
    scraper = new_scraper(DIVS123) do
      selector :test, HTML::Selector.new("div")
    end
    assert_equal 3, scraper.test(scraper.document).size
  end


  # The selector method always answers a collection, whatever the match count.
  def test_selector_returns_array
    scraper = new_scraper(DIVS123) do
      selector :test0, "#4"
      selector :test1, "#1"
      selector :test3, "div"
    end
    assert_equal 0, scraper.test0(scraper.document).size # No elements (empty)
    assert_equal 1, scraper.test1(scraper.document).size # One element (array)
    assert_equal 3, scraper.test3(scraper.document).size # Array of elements
  end


  # Matches come back in document order, not in the order of the selector list.
  def test_select_in_document_order
    scraper = new_scraper(DIVS123) do
      selector :test, "#2,#1"
    end
    assert_equal 2, scraper.test(scraper.document).size
    assert_equal "1", scraper.test(scraper.document)[0].attributes["id"]
    assert_equal "2", scraper.test(scraper.document)[1].attributes["id"]
  end


  # Declaring selector :test also provides first_test, with and without a block.
  def test_selecting_first_element
    scraper = new_scraper(DIVS123) do
      selector :test, "div"
    end
    assert_equal 3, scraper.test(scraper.document).size
    assert scraper.first_test(scraper.document)
    assert_equal "1", scraper.first_test(scraper.document).attributes["id"]

    scraper = new_scraper(DIVS123) do
      selector :test, "div" do |element|
        element[0].attributes["id"]
      end
    end
    assert scraper.first_test(scraper.document)
    assert_equal "1", scraper.first_test(scraper.document)
  end


  #
  # Tests process methods.
  #

  # A processing rule runs once for each matching element.
  def test_processing_rule
    scraper = new_scraper(DIVS123) do
      process "div" do |element|
        @count = (@count || 0) + 1
      end
      attr :count
    end
    scraper.scrape
    assert_equal 3, scraper.count
  end


  # A processing rule accepts a selector with a substitution value.
  def test_processing_rule_with_array
    scraper = new_scraper(DIVS123) do
      process "#?", "1" do |element|
        @count = (@count || 0) + 1
      end
      attr :count
    end
    scraper.scrape
    assert_equal 1, scraper.count
  end


  # A processing rule accepts an HTML::Selector object.
  def test_processing_rule_with_selector
    scraper = new_scraper(DIVS123) do
      process HTML::Selector.new("div") do |element|
        @count = (@count || 0) + 1
      end
      attr :count
    end
    scraper.scrape
    assert_equal 3, scraper.count
  end


  # The rule's block can read the element and accumulate state on the scraper.
  def test_extracting_in_code
    scraper = new_scraper(DIVS123) do
      process "div" do |element|
        @concat = (@concat || "") << element.attributes["id"]
      end
      attr :concat
    end
    scraper.scrape
    assert_equal "123", scraper.concat
  end


  # Elements are processed in document order, not selector-list order.
  def test_processing_in_document_order
    scraper = new_scraper(DIVS123) do
      process "#2,#1" do |element|
        @concat = (@concat || "") << element.attributes["id"]
      end
      attr :concat
    end
    scraper.scrape
    assert_equal "12", scraper.concat
  end


  # Expects rules declared after a skip of the same element not to run:
  # index 2 stays nil (the rule before it calls skip), and index 4 is never
  # set (the rule before it is declared with :skip=>true), so the array
  # ends at index 3.
  def test_process_once_if_skipped
    scraper = new_scraper(DIVS123) do
      def prepare(document)
        @found = []
      end
      process("#1") { |element| @found[0] = true }
      process("#1") { |element| @found[1] = true ; skip element }
      process("#1") { |element| @found[2] = true }
      process("#2", :skip=>true){ |element| @found[3] = true }
      process("#2") { |element| @found[4] = true }
      attr_reader :found
    end
    scraper.scrape
    assert_equal [true, true, nil, true], scraper.found
  end


  # Skipping the elements returned by another selector (#2) leaves 1 and 3.
  def test_skip_children
    scraper = new_scraper(DIVS1_23) do
      process "div" do |element|
        @concat = (@concat || "") << (element.attributes["id"] || "")
        skip id2(element)
      end
      selector :id2, "#2"
      attr :concat
    end
    scraper.scrape
    assert_equal "13", scraper.concat
  end


  # Walks the nested fixture with several selectors, with and without skip,
  # and checks which ids end up being processed.
  def test_skip_descendants
    # Root, child of root, grandchild of root.
    scraper = new_scraper(DIVS1_23) do
      process "div" do |element|
        @concat = (@concat || "") << (element.attributes["id"] || "")
      end
      attr :concat
    end
    scraper.scrape
    assert_equal "123", scraper.concat

    # Stop at root.
    scraper = new_scraper(DIVS1_23) do
      process "div" do |element|
        @concat = (@concat || "") << (element.attributes["id"] || "")
        skip
      end
      attr :concat
    end
    scraper.scrape
    assert_equal "1", scraper.concat

    # NOTE(review): second scrape of the same scraper with no assertion;
    # kept as in the original -- confirm whether it is intentional.
    scraper.scrape
    # Child of root, and child of root's child
    scraper = new_scraper(DIVS1_23) do
      process "div>div" do |element|
        @concat = (@concat || "") << (element.attributes["id"] || "")
      end
      attr :concat
    end
    scraper.scrape
    assert_equal "23", scraper.concat

    # Stop at child of root.
    scraper = new_scraper(DIVS1_23) do
      process "div>div" do |element|
        @concat = (@concat || "") << (element.attributes["id"] || "")
        skip element.next_element
      end
      attr :concat
    end
    scraper.scrape
    assert_equal "2", scraper.concat

    # Child of root, the child of child of root.
    scraper = new_scraper(DIVS1_23) do
      process "div div" do |element|
        @concat = (@concat || "") << (element.attributes["id"] || "")
      end
      attr :concat
    end
    scraper.scrape
    assert_equal "23", scraper.concat

    # Child of root.
    scraper = new_scraper(DIVS1_23) do
      process "div div" do |element|
        @concat = (@concat || "") << (element.attributes["id"] || "")
        skip element.next_element
      end
      attr :concat
    end
    scraper.scrape
    assert_equal "2", scraper.concat
  end


  # Checks how the :skip option on an extractor rule affects a second rule
  # that matches the same element.
  def test_skip_from_extractor
    html = %Q{<div id="1">this</div>"}
    scraper = new_scraper(html) do
      process "#1", :this1=>:text
      process "#1", :this2=>:text
    end
    scraper.scrape
    assert_equal "this", scraper.this1
    assert_equal "this", scraper.this2

    scraper = new_scraper(html) do
      process "#1", :this1=>:text, :skip=>false
      process "#1", :this2=>:text
    end
    scraper.scrape
    assert_equal "this", scraper.this1
    assert_equal "this", scraper.this2

    scraper = new_scraper(html) do
      process "#1", :this1=>:text, :skip=>true do
        false
      end
      process "#1", :this2=>:text
    end
    scraper.scrape
    assert_equal "this", scraper.this1
    assert_nil scraper.this2
  end


  # Calling stop from a rule ends processing after the first element.
  def test_stop
    scraper = new_scraper(DIVS123) do
      process "div" do |element|
        @concat = (@concat || "") << (element.attributes["id"] || "")
        stop
      end
      attr :concat
    end
    scraper.scrape
    assert_equal "1", scraper.concat
  end


  # process_first runs for the first match only; process runs for all of them.
  def test_process_first
    scraper = new_scraper(DIVS123) do
      process "div" do |element|
        @all = (@all || 0) + 1
      end
      process_first "div" do |element|
        @first = (@first || 0) + 1
      end
      attr_accessor :all, :first
    end
    scraper.scrape
    assert_equal 3, scraper.all
    assert_equal 1, scraper.first
  end


  # Scrapes a URL through the mocked Net::HTTP: any path other than
  # /redirect answers with a redirect to /redirect, which serves the page.
  # page_info must then expose both URLs, the response headers and the
  # encoding declared in the page's meta tag.
  def test_accessors
    time = Time.new.rfc2822
    Net::HTTP.on_get do |address, path, headers|
      if path == "/redirect"
        response = Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK")
        response["Last-Modified"] = time
        response["ETag"] = "etag"
        [response, <<-EOF
          <html>
          <head>
          <meta http-equiv="content-type" value="text/html; charset=other-encoding">
          </head>
          <body>
          <div id="x"/>
          </body>
          </html>
        EOF
        ]
      else
        response = Net::HTTPMovedPermanently.new(Net::HTTP.version_1_2, 300, "Moved")
        response["Location"] = "http://localhost/redirect"
        [response, ""]
      end
    end
    scraper = new_scraper(URI.parse("http://localhost/source"))
    scraper.scrape
    assert_equal "http://localhost/source", scraper.page_info.original_url.to_s
    assert_equal "http://localhost/redirect", scraper.page_info.url.to_s
    assert_equal time, scraper.page_info.last_modified
    assert_equal "etag", scraper.page_info.etag
    assert_equal "other-encoding", scraper.page_info.encoding
  end


  # Fetches a page through the mocked Net::HTTP and processes its divs.
  def test_scraping_end_to_end
    Net::HTTP.on_get do |address, path, headers|
      [Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK"), <<-EOF
        <html>
        <body>
        <div id="1"/>
        <div id="2"/>
        </body>
        </html>
      EOF
      ]
    end
    scraper = new_scraper(URI.parse("http://localhost/")) do
      process "div" do |element|
        @concat = (@concat || "") << (element.attributes["id"] || "")
      end
      attr :concat
    end
    scraper.scrape
    assert_equal "12", scraper.concat
  end


  #
  # Tests extractor methods.
  #

  # An extractor can be passed explicitly via extractor(...) or as a hash.
  def test_extractors
    html = %Q{<div id="1"></div>}
    scraper = new_scraper(html) do
      process "div", extractor(:div_id=>"@id")
      attr :div_id
    end
    scraper.scrape
    assert_equal "1", scraper.div_id
    scraper = new_scraper(html) do
      process "div", :div_id=>"@id"
      attr :div_id
    end
    scraper.scrape
    assert_equal "1", scraper.div_id
  end


  # :element extracts the node itself, :text extracts its text value.
  def test_text_and_element_extractors
    html = %Q{<div>some text</div>}
    # Extract the node itself.
    scraper = new_scraper(html) do
      process "div", extractor(:value=>:element)
      attr :value
    end
    scraper.scrape
    assert_equal "div", scraper.value.name
    # Extract the text value of the node.
    scraper = new_scraper(html) do
      process "div", extractor(:value=>:text)
      attr :value
    end
    scraper.scrape
    assert_equal "some text", scraper.value
  end


  # Extracting elements: by :element (last match wins) and by tag name.
  def test_extractors_objects
    html = <<-EOF
      <h1 class="header"></h1>
      <h2 class="header"></h2>
    EOF
    # Extract both elements based on class, return the second one.
    scraper = new_scraper(html) do
      process ".header", extractor(:header=>:element)
      attr :header
    end
    scraper.scrape
    assert_equal "h2", scraper.header.name
    # Extracting a specific element skips the second match.
    html = <<-EOF
      <h1 class="header"></h1>
      <h2 class="header"></h2>
    EOF
    scraper = new_scraper(html) do
      process ".header", extractor(:header=>"h1")
      attr :header
    end
    scraper.scrape
    assert_equal "h1", scraper.header.name
  end


  # Extracting attribute values with "@attr" and "element@attr".
  def test_attribute_extractors
    # Extracting the attribute skips the second match.
    html = <<-EOF
      <abbr title="foo">bar</div>
      <abbr>foo</abbr>
    EOF
    scraper = new_scraper(html) do
      process "abbr", extractor(:title=>"@title")
      attr :title
    end
    scraper.scrape
    assert_equal "foo", scraper.title
    # Extracting a specific element skips the second match.
    html = <<-EOF
      <h1 class="header" id="1"></h1>
      <h2 class="header" id="2"></h2>
    EOF
    scraper = new_scraper(html) do
      process ".header", extractor(:header=>"h1@id")
      attr :header
    end
    scraper.scrape
    assert_equal "1", scraper.header
  end


  # A scraper class can itself be used as the extractor for an element.
  def test_class_extractors
    headers = Class.new(Scraper::Base)
    headers.instance_eval do
      root_element nil
      process "h1,h2", :h1=>"h1", :h2=>"h2"
      attr :h1
      attr :h2
    end
    html = <<-EOF
      <div>
        <h1>first</h1>
        <h2>second</h2>
      </div>
    EOF
    scraper = new_scraper(html) do
      process "div", extractor(:headers=>headers)
      attr :headers
    end
    scraper.scrape
    assert scraper.headers
    assert_equal "h1", scraper.headers.h1.name
    assert_equal "h2", scraper.headers.h2.name
  end


  # A target named "name[]" collects one value per match into an array.
  def test_array_extractors
    html = <<-EOF
      <div>
        <h1>first</h1>
        <h1>second</h1>
      </div>
    EOF
    scraper = new_scraper(html) do
      process "h1", extractor("headers[]"=>:text)
      attr :headers
    end
    scraper.scrape
    assert scraper.headers.is_a?(Array)
    assert_equal 2, scraper.headers.size
    assert_equal "first", scraper.headers[0]
    assert_equal "second", scraper.headers[1]
  end


  # A hash source extracts several values from one element into a Hash.
  def test_hash_extractors
    html = <<-EOF
      <div>
        <h1 id="1" class="header">first</h1>
      </div>
    EOF
    scraper = new_scraper(html) do
      process "h1", extractor("header"=>{:id=>"@id", :class=>"@class", :text=>:text})
      attr :header
    end
    scraper.scrape
    assert scraper.header.is_a?(Hash)
    assert_equal 3, scraper.header.size
    assert_equal "1", scraper.header[:id]
    assert_equal "header", scraper.header[:class]
    assert_equal "first", scraper.header[:text]
  end


  # An array of targets is filled from the result of a nested scraper.
  def test_multi_value_extractors
    html = <<-EOF
      <div>
        <h1 id="1" class="header">first</h1>
      </div>
    EOF
    scraper = new_scraper(html) do
      process "h1", [:text, :kls]=>Scraper.define {
        process "*", :text=>:text, :kls=>"@class"
      }
    end
    result = scraper.scrape
    # assert_equal, not assert: with plain assert the second argument is only
    # the failure message, so the check could never fail.
    assert_equal "first", result.text
    assert_equal "header", result.kls
  end


  # An array source tries each extractor in turn and uses the first that
  # yields a value.
  def test_conditional_extractors
    # Look for id attribute (second header only),
    # if not found look for class attribute (first
    # two headers), otherwise just get text (third
    # header).
    html = <<-EOF
      <div>
        <h1 class="foo">first</h1>
        <h1 class="foo" id="bar">second</h1>
        <h1>third</h1>
      </div>
    EOF
    scraper = new_scraper(html) do
      process "h1", extractor("headers[]"=>["@id", "@class", :text])
      attr :headers
    end
    scraper.scrape
    assert scraper.headers.is_a?(Array)
    assert_equal 3, scraper.headers.size
    assert_equal "foo", scraper.headers[0]
    assert_equal "bar", scraper.headers[1]
    assert_equal "third", scraper.headers[2]
  end


  # Two sibling divs with ids and text content.
  DIVS_ST_ND = <<-EOF
    <div id="1">first</div>
    <div id="2">second</div>
  EOF

  # Checks what scrape returns for different `result` declarations: a single
  # value, an object with several accessors, the default when no result is
  # declared, and an array target.
  def test_accessors_from_extractor
    scraper = new_scraper(DIVS_ST_ND) do
      process_first "div", :div_id=>"@id", :div_text=>:text
      result :div_id
    end
    value = scraper.scrape
    assert_equal "1", value

    scraper = new_scraper(DIVS_ST_ND) do
      process_first "div", :div_id=>"@id", :div_text=>:text
      result :div_id, :div_text
    end
    value = scraper.scrape
    assert_equal "1", value.div_id
    assert_equal "first", value.div_text

    scraper = new_scraper(DIVS_ST_ND) do
      process_first "div", :div_id=>"@id", :div_text=>:text
    end
    value = scraper.scrape
    assert_equal "1", value.div_id
    assert_equal "first", value.div_text

    scraper = new_scraper(DIVS_ST_ND) do
      attr_accessor :div_class
      process_first "div", :div_id=>"@id", :div_text=>:text
      result :div_id, :div_class
    end
    value = scraper.scrape
    assert_equal "1", value.div_id
    # div_text is extracted but not listed in result, so it is not exposed.
    assert_raise(NoMethodError) { value.div_text }

    scraper = new_scraper(DIVS_ST_ND) do
      process "div", "div_ids[]"=>"@id"
      result :div_ids
    end
    value = scraper.scrape
    assert_equal "1", value[0]
    assert_equal "2", value[1]
  end


  # Attributes declared with `array` collect one value per processed element.
  def test_array_accessors
    scraper = new_scraper(DIVS_ST_ND) do
      array :div_id, :div_text
      process "div", :div_id=>"@id", :div_text=>:text
      result :div_id, :div_text
    end
    value = scraper.scrape
    assert_equal 2, value.div_id.size
    assert_equal 2, value.div_text.size
    assert_equal "1", value.div_id[0]
    assert_equal "2", value.div_id[1]
    assert_equal "first", value.div_text[0]
    assert_equal "second", value.div_text[1]
  end


  #
  # Root element tests.
  #

  # Minimal document with an empty head and an empty body.
  HTML_EMPTY = <<-EOF
    <html>
      <head>
      </head>
      <body>
      </body>
    </html>
  EOF

  # Builds the scraper straight from Scraper::Base (no root_element or parser
  # override) and expects both the head and the body rule to fire.
  def test_scrape_body_by_default
    scraper = Class.new(Scraper::Base).new(HTML_EMPTY)
    scraper.class.instance_eval do
      process "head" do |element| @head = element end
      attr :head
      process "body" do |element| @body = element end
      attr :body
    end
    scraper.scrape
    assert scraper.head
    assert scraper.body
  end


  # root_element limits which part of the document the rules see, and can be
  # changed again in subclasses.
  def test_changing_root_element
    only_header = new_scraper(HTML_EMPTY) do
      root_element "head"
      process "head" do |element| @head = element end
      attr :head
      process "body" do |element| @body = element end
      attr :body
    end
    only_body = Class.new(only_header.class).new(HTML_EMPTY)
    only_body.class.root_element "body"
    both_parts = Class.new(only_body.class).new(HTML_EMPTY)
    both_parts.class.root_element nil
    # We set this scraper to begin with the head element,
    # so we can see the head element, but not the body.
    only_header.scrape
    assert only_header.head
    assert_nil only_header.body
    # Now switch to a scraper that processes the body element,
    # skipping the header.
    only_body.scrape
    assert_nil only_body.head
    assert only_body.body
    # Now switch to a scraper that doesn't specify a root element,
    # and it will process both header and body.
    both_parts.scrape
    assert both_parts.head
    assert both_parts.body
  end


  # Test prepare/result.

  # prepare seeds the counter at 1, each of the three divs adds one, and
  # result hands the counter back as the value of scrape.
  def test_prepare_and_result
    scraper = new_scraper(DIVS123) do
      process("div") { |element| @count += 1 }
      define_method(:prepare) { @count = 1 }
      define_method(:result) { @count }
    end
    result = scraper.scrape
    assert_equal 4, result
  end


  # prepare replaces the document with the second div, so only that div is
  # processed.
  def test_changing_document_from_prepare
    scraper = new_scraper(DIVS123) do
      selector :divs, "div"
      define_method :prepare do |document|
        @document = divs(document)[1]
      end
      array :ids
      process "div", :ids=>"@id"
      result :ids
    end
    result = scraper.scrape
    assert_equal 1, result.size
    assert_equal "2", result[0]
  end


  # Scraper.define builds a scraper class whose scrape takes the content.
  def test_anonymous_scrapers
    scraper = Scraper.define do
      array :ids
      process "div", :ids=>"@id"
      result :ids
    end
    result = scraper.scrape(DIVS123)
    assert_equal "1", result[0]
    assert_equal "2", result[1]
    assert_equal "3", result[2]
  end


  # Two rules share the name :main; only the second one is expected to
  # populate its target (ids1 stays nil).
  def test_named_rules
    scraper = Scraper.define do
      array :ids1, :ids2
      process :main, "div", :ids1=>"@id"
      process :main, "div", :ids2=>"@id"
      result :ids1, :ids2
    end
    result = scraper.scrape(DIVS123)
    assert_nil result.ids1
    assert_equal 3, result.ids2.size
    assert_equal "1", result.ids2[0]
    assert_equal "2", result.ids2[1]
    assert_equal "3", result.ids2[2]
  end


protected

  # Builds an anonymous Scraper::Base subclass with no root element and the
  # :html_parser parser, evaluates the optional block in the class (to declare
  # rules, selectors and accessors), and returns an instance for +what+.
  #
  # what:: the content or URI handed to the scraper's constructor.
  def new_scraper(what, &block)
    cls = Class.new(Scraper::Base)
    cls.root_element nil
    cls.parser :html_parser
    cls.class_eval(&block) if block
    cls.new(what)
  end

end
789
+
790
+
791
# Runs every test inherited from ScraperTest a second time, with scrapers
# configured to parse through Tidy rather than HTMLParser.
class ScraperUsingTidyTest < ScraperTest

protected

  # Builds an anonymous Scraper::Base subclass with no root element and the
  # :tidy parser, evaluates the optional block in that class, and returns
  # an instance constructed from +what+.
  def new_scraper(what, &block)
    scraper_class = Class.new(Scraper::Base)
    scraper_class.root_element(nil)
    scraper_class.parser(:tidy)
    scraper_class.class_eval(&block) if block
    scraper_class.new(what)
  end

end