scrapi 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,798 @@
1
+ # ScrAPI toolkit for Ruby
2
+ #
3
+ # Copyright (c) 2006 Assaf Arkin, under Creative Commons Attribution and/or MIT License
4
+ # Developed for http://co.mments.com
5
+ # Code and documentation: http://labnotes.org
6
+
7
+
8
+ require "rubygems"
9
+ require "time"
10
+ require "test/unit"
11
+ require File.join(File.dirname(__FILE__), "mock_net_http")
12
+ require File.join(File.dirname(__FILE__), "../lib", "scrapi")
13
+
14
+
15
# Unit tests for Scraper::Base: selector definitions, processing rules,
# skip/stop control, extractors, result accessors, root element selection
# and the prepare/result hooks.
#
# Scrapers under test are built through #new_scraper so that a subclass can
# re-run the whole suite with a different parser by overriding that helper.
class ScraperTest < Test::Unit::TestCase

  # Resets the Net::HTTP GET hook before each test so a handler installed
  # with Net::HTTP.on_get by one test does not carry over to the next.
  # (reset_on_get is presumably provided by the mock_net_http helper.)
  def setup
    Net::HTTP.reset_on_get
  end

  # Resets the Net::HTTP GET hook again once the test has finished.
  def teardown
    Net::HTTP.reset_on_get
  end


  #
  # Tests selector methods.
  #

  # A selector defined from a CSS string returns every matching element,
  # here the three divs, each with the expected id.
  def test_define_selectors
    html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
    scraper = new_scraper(html) do
      selector :test, "div"
    end
    assert_equal 3, scraper.test(scraper.document).size
    3.times do |i|
      assert_equal String(i + 1), scraper.test(scraper.document)[i].attributes["id"]
    end
  end


  # A block given to +selector+ receives the selected elements; the value
  # of the block is what the selector method returns.
  def test_selector_blocks
    html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
    scraper = new_scraper(html) do
      selector :test, "div" do |elements|
        # Deliberately no explicit +return+: inside a block that would be a
        # proc return (leaving the test method or raising LocalJumpError)
        # rather than a way to hand back the block's value.
        elements[0..-2]
      end
    end
    assert_equal 2, scraper.test(scraper.document).size
  end


  # A selector may be given as an expression with "?" plus substitution
  # values; here "#?" with "2" selects only the div with id 2.
  def test_array_selectors
    html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
    scraper = new_scraper(html) do
      selector :test, "#?", "2"
    end
    assert_equal 1, scraper.test(scraper.document).size
    assert_equal "2", scraper.test(scraper.document)[0].attributes["id"]
  end


  # A selector may be given as a ready-made HTML::Selector object.
  def test_object_selectors
    html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
    scraper = new_scraper(html) do
      selector :test, HTML::Selector.new("div")
    end
    assert_equal 3, scraper.test(scraper.document).size
  end


  # Selector methods always answer a collection, whether zero, one or many
  # elements match.
  def test_selector_returns_array
    html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
    scraper = new_scraper(html) do
      selector :test0, "#4"
      selector :test1, "#1"
      selector :test3, "div"
    end
    assert_equal 0, scraper.test0(scraper.document).size # No elements (empty)
    assert_equal 1, scraper.test1(scraper.document).size # One element (array)
    assert_equal 3, scraper.test3(scraper.document).size # Array of elements
  end


  # Matches come back in document order, not in the order the alternatives
  # appear in the selector expression ("#2,#1" still yields 1 then 2).
  def test_select_in_document_order
    html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
    scraper = new_scraper(html) do
      selector :test, "#2,#1"
    end
    assert_equal 2, scraper.test(scraper.document).size
    assert_equal "1", scraper.test(scraper.document)[0].attributes["id"]
    assert_equal "2", scraper.test(scraper.document)[1].attributes["id"]
  end


  # Defining selector :test also provides first_test, which answers just
  # the first match (or the block's value for it when a block was given).
  def test_selecting_first_element
    html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
    scraper = new_scraper(html) do
      selector :test, "div"
    end
    assert_equal 3, scraper.test(scraper.document).size
    assert scraper.first_test(scraper.document)
    assert_equal "1", scraper.first_test(scraper.document).attributes["id"]

    scraper = new_scraper(html) do
      selector :test, "div" do |element|
        element[0].attributes["id"]
      end
    end
    assert scraper.first_test(scraper.document)
    assert_equal "1", scraper.first_test(scraper.document)
  end


  #
  # Tests process methods.
  #

  # A processing rule's block runs once for every element the selector
  # matches.
  def test_processing_rule
    html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
    scraper = new_scraper(html) do
      process "div" do |element|
        @count = (@count || 0) + 1
      end
      attr :count
    end
    scraper.scrape
    assert_equal 3, scraper.count
  end


  # Processing rules accept a selector expression with substitution values.
  def test_processing_rule_with_array
    html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
    scraper = new_scraper(html) do
      process "#?", "1" do |element|
        @count = (@count || 0) + 1
      end
      attr :count
    end
    scraper.scrape
    assert_equal 1, scraper.count
  end


  # Processing rules accept an HTML::Selector object.
  def test_processing_rule_with_selector
    html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
    scraper = new_scraper(html) do
      process HTML::Selector.new("div") do |element|
        @count = (@count || 0) + 1
      end
      attr :count
    end
    scraper.scrape
    assert_equal 3, scraper.count
  end


  # The rule's block can pull data out of the element itself; the ids are
  # concatenated in the order the elements are visited.
  def test_extracting_in_code
    html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
    scraper = new_scraper(html) do
      process "div" do |element|
        @concat = (@concat || "") << element.attributes["id"]
      end
      attr :concat
    end
    scraper.scrape
    assert_equal "123", scraper.concat
  end


  # Elements are processed in document order even when the selector lists
  # the alternatives the other way round.
  def test_processing_in_document_order
    html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
    scraper = new_scraper(html) do
      process "#2,#1" do |element|
        @concat = (@concat || "") << element.attributes["id"]
      end
      attr :concat
    end
    scraper.scrape
    assert_equal "12", scraper.concat
  end


  # When the first rule's block returns false the second rule still sees
  # the element; when it returns true the second rule never runs for it.
  def test_skip_if_extractor_returns_true
    html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
    scraper = new_scraper(html) do
      process "#1" do |element|
        @first = true
        false
      end
      process "#1" do |element|
        @second = true
      end
      attr :first
      attr :second
    end
    scraper.scrape
    assert_equal true, scraper.first
    assert_equal true, scraper.second
    scraper = new_scraper(html) do
      process "#1" do |element|
        @first = true
        true
      end
      process "#1" do |element|
        @second = true
      end
      attr :first
      attr :second
    end
    scraper.scrape
    assert_equal true, scraper.first
    assert_nil scraper.second
  end


  # Calling +skip+ on the element keeps later rules away from it even
  # though the block itself returns false.
  def test_process_once_if_skipped
    html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
    scraper = new_scraper(html) do
      process "#1" do |element|
        @first = true
        skip element
        false
      end
      process "#1" do |element|
        @second = true
      end
      attr :first
      attr :second
    end
    scraper.scrape
    assert_equal true, scraper.first
    assert_nil scraper.second
  end


  # While processing the outer div, the elements found by the id2 selector
  # are skipped, so only ids 1 and 3 end up in the concatenation.
  def test_skip_children
    html = %Q{<div><div id="1"></div><div id="2"></div><div id="3"></div></div>}
    scraper = new_scraper(html) do
      process "div" do |element|
        @concat = (@concat || "") << (element.attributes["id"] || "")
        to_skip = id2(element)
        skip to_skip if to_skip
        false
      end
      selector :id2, "#2"
      attr :concat
    end
    scraper.scrape
    assert_equal "13", scraper.concat
  end


  # For nested divs, a block returning false lets processing continue into
  # descendants, while a block returning true stops at the matched element.
  # Checked for the "div", "div>div" and "div div" selectors.
  def test_skip_descendants
    # NOTE(review): the markup below is malformed ("</div</div>"); unclear
    # whether that is intentional, so it is left untouched.
    html = %Q{<div id="1"><div id="2"><div id="3"></div></div</div>}
    scraper = new_scraper(html) do
      process "div" do |element|
        @concat = (@concat || "") << (element.attributes["id"] || "")
        false
      end
      attr :concat
    end
    scraper.scrape
    # Root, child of root, grandchild of root.
    assert_equal "123", scraper.concat
    scraper = new_scraper(html) do
      process "div" do |element|
        @concat = (@concat || "") << (element.attributes["id"] || "")
        true
      end
      attr :concat
    end
    scraper.scrape
    # Stop at root.
    assert_equal "1", scraper.concat

    scraper = new_scraper(html) do
      process "div>div" do |element|
        @concat = (@concat || "") << (element.attributes["id"] || "")
        false
      end
      attr :concat
    end
    scraper.scrape
    # Child of root, and child of root's child
    assert_equal "23", scraper.concat
    scraper = new_scraper(html) do
      process "div>div" do |element|
        @concat = (@concat || "") << (element.attributes["id"] || "")
        true
      end
      attr :concat
    end
    scraper.scrape
    # Stop at child of root.
    assert_equal "2", scraper.concat

    scraper = new_scraper(html) do
      process "div div" do |element|
        @concat = (@concat || "") << (element.attributes["id"] || "")
        false
      end
      attr :concat
    end
    scraper.scrape
    # Child of root, the child of child of root.
    assert_equal "23", scraper.concat
    scraper = new_scraper(html) do
      process "div div" do |element|
        @concat = (@concat || "") << (element.attributes["id"] || "")
        true
      end
      attr :concat
    end
    scraper.scrape
    # Child of root.
    assert_equal "2", scraper.concat
  end


  # Rules written with an extractor hash: by default the first rule that
  # extracts from an element keeps the second rule from seeing it. Also
  # exercises the :skip option, with and without a block.
  def test_skip_from_extractor
    # NOTE(review): the fixture ends with a stray double quote after the
    # closing tag; left as found.
    html = %Q{<div id="1">this</div>"}
    scraper = new_scraper(html) do
      process "#1", :this1=>:text
      process "#1", :this2=>:text
    end
    scraper.scrape
    assert_equal "this", scraper.this1
    assert_nil scraper.this2

    scraper = new_scraper(html) do
      process "#1", :this1=>:text, :skip=>false
      process "#1", :this2=>:text
    end
    scraper.scrape
    #assert_equal "this", scraper.this1
    #assert_equal "this", scraper.this2

    scraper = new_scraper(html) do
      process "#1", :this1=>:text, :skip=>true do
        false
      end
      process "#1", :this2=>:text
    end
    scraper.scrape
    assert_equal "this", scraper.this1
    assert_nil scraper.this2
  end


  # Calling +stop+ from a rule ends scraping: only the first div is seen.
  def test_stop
    html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
    scraper = new_scraper(html) do
      process "div" do |element|
        @concat = (@concat || "") << (element.attributes["id"] || "")
        stop
      end
      attr :concat
    end
    scraper.scrape
    assert_equal "1", scraper.concat
  end


  # +process+ fires for every match, +process_first+ only for the first.
  def test_process_first
    html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
    scraper = new_scraper(html) do
      process "div" do |element|
        @all = (@all || 0) + 1
        false
      end
      process_first "div" do |element|
        @first = (@first || 0) + 1
        false
      end
      attr :all
      attr :first
    end
    scraper.scrape
    assert_equal 3, scraper.all
    assert_equal 1, scraper.first
  end


  # Scraping a URL that redirects: page_info must report the original URL,
  # the final URL, and the Last-Modified, ETag and charset taken from the
  # final response.
  def test_accessors
    time = Time.new.rfc2822
    Net::HTTP.on_get do |address, path, headers|
      if path == "/redirect"
        response = Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK")
        response["Last-Modified"] = time
        response["ETag"] = "etag"
        [response, %Q{
<html>
<head>
<meta http-equiv="content-type" value="text/html; charset=other-encoding">
</head>
<body><div id="x"/></body>
</html>
}]
      else
        # NOTE(review): status 300 is paired with HTTPMovedPermanently
        # (normally 301); presumably the reader dispatches on the response
        # class rather than the numeric code - confirm before changing.
        response = Net::HTTPMovedPermanently.new(Net::HTTP.version_1_2, 300, "Moved")
        response["Location"] = "http://localhost/redirect"
        [response, ""]
      end
    end
    scraper = new_scraper(URI.parse("http://localhost/source"))
    scraper.scrape
    assert_equal "http://localhost/source", scraper.page_info.original_url.to_s
    assert_equal "http://localhost/redirect", scraper.page_info.url.to_s
    assert_equal time, scraper.page_info.last_modified
    assert_equal "etag", scraper.page_info.etag
    assert_equal "other-encoding", scraper.page_info.encoding
  end


  # Fetches a page through the mocked Net::HTTP and runs a processing rule
  # over the returned markup.
  def test_scraping_end_to_end
    Net::HTTP.on_get do |address, path, headers|
      [Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK"), %Q{
<html>
<body><div id="1"/><div id="2"/></body>
</html>
}]
    end
    scraper = new_scraper(URI.parse("http://localhost/")) do
      process "div" do |element|
        @concat = (@concat || "") << (element.attributes["id"] || "")
      end
      attr :concat
    end
    scraper.scrape
    assert_equal "12", scraper.concat
  end


  #
  # Tests extractor methods.
  #

  # An extractor can be passed explicitly via +extractor+ or implicitly as
  # a hash argument to +process+; both store the attribute value.
  def test_extractors
    html = %Q{<div id="1"></div>}
    scraper = new_scraper(html) do
      process "div", extractor(:div_id=>"@id")
      attr :div_id
    end
    scraper.scrape
    assert_equal "1", scraper.div_id
    scraper = new_scraper(html) do
      process "div", :div_id=>"@id"
      attr :div_id
    end
    scraper.scrape
    assert_equal "1", scraper.div_id
  end


  # The :element source stores the node itself; :text stores its text.
  def test_text_and_element_extractors
    html = %Q{<div>some text</div>}
    # Extract the node itself.
    scraper = new_scraper(html) do
      process "div", extractor(:value=>:element)
      attr :value
    end
    scraper.scrape
    assert_equal "div", scraper.value.name
    # Extract the text value of the node.
    scraper = new_scraper(html) do
      process "div", extractor(:value=>:text)
      attr :value
    end
    scraper.scrape
    assert_equal "some text", scraper.value
  end


  # Extracting :element keeps the last match; extracting by element name
  # ("h1") keeps only the element with that name.
  def test_extractors_objects
    html = %Q{<h1 class="header"></h1><h2 class="header"></h2>}
    # Extract both elements based on class, return the second one.
    scraper = new_scraper(html) do
      process ".header", extractor(:header=>:element)
      attr :header
    end
    scraper.scrape
    assert_equal "h2", scraper.header.name
    # Extracting a specific element skips the second match.
    html = %Q{<h1 class="header"></h1><h2 class="header"></h2>}
    scraper = new_scraper(html) do
      process ".header", extractor(:header=>"h1")
      attr :header
    end
    scraper.scrape
    assert_equal "h1", scraper.header.name
  end


  # "@attr" extracts an attribute value; "name@attr" does so only for
  # elements with that name.
  def test_attribute_extractors
    # Extracting the attribute skips the second match.
    # NOTE(review): the first <abbr> is closed with </div> in this fixture;
    # left as found.
    html = %Q{<abbr title="foo">bar</div><abbr>foo</abbr>}
    scraper = new_scraper(html) do
      process "abbr", extractor(:title=>"@title")
      attr :title
    end
    scraper.scrape
    assert_equal "foo", scraper.title
    # Extracting a specific element skips the second match.
    html = %Q{<h1 class="header" id="1"></h1><h2 class="header" id="2"></h2>}
    scraper = new_scraper(html) do
      process ".header", extractor(:header=>"h1@id")
      attr :header
    end
    scraper.scrape
    assert_equal "1", scraper.header
  end


  # A scraper class can be used as the extraction source: the matched div
  # is handed to it and the resulting object is stored in +headers+.
  def test_class_extractors
    headers = Class.new(Scraper::Base)
    headers.instance_eval do
      root_element nil
      process "h1,h2", :h1=>"h1", :h2=>"h2"
      attr :h1
      attr :h2
    end
    html = %Q{<div><h1>first</h1><h2>second</h2></div>}
    scraper = new_scraper(html) do
      process "div", extractor(:headers=>headers)
      attr :headers
    end
    scraper.scrape
    assert scraper.headers
    assert_equal "h1", scraper.headers.h1.name
    assert_equal "h2", scraper.headers.h2.name
  end


  # A target named "headers[]" collects one value per match into an Array.
  def test_array_extractors
    html = %Q{<div><h1>first</h1><h1>second</h1></div>}
    scraper = new_scraper(html) do
      process "h1", extractor("headers[]"=>:text)
      attr :headers
    end
    scraper.scrape
    assert_kind_of Array, scraper.headers
    assert_equal 2, scraper.headers.size
    assert_equal "first", scraper.headers[0]
    assert_equal "second", scraper.headers[1]
  end


  # A hash given as the source produces a Hash with one entry per key.
  def test_hash_extractors
    html = %Q{<div><h1 id="1" class="header">first</h1></div>}
    scraper = new_scraper(html) do
      process "h1", extractor("header"=>{:id=>"@id", :class=>"@class", :text=>:text})
      attr :header
    end
    scraper.scrape
    assert_kind_of Hash, scraper.header
    assert_equal 3, scraper.header.size
    assert_equal "1", scraper.header[:id]
    assert_equal "header", scraper.header[:class]
    assert_equal "first", scraper.header[:text]
  end


  # An array of targets paired with a nested scraper copies several values
  # out of that scraper's result in one rule.
  def test_multi_value_extractors
    html = %Q{<div><h1 id="1" class="header">first</h1></div>}
    scraper = new_scraper(html) do
      process "h1", [:text, :kls]=>Scraper.define {
        process "*", :text=>:text, :kls=>"@class"
      }
    end
    result = scraper.scrape
    # assert_equal, not assert: with plain assert the string literal is the
    # (always truthy) condition and the extracted value is only the failure
    # message, so nothing would actually be compared.
    assert_equal "first", result.text
    assert_equal "header", result.kls
  end


  # An array of sources is tried in order and the first one that yields a
  # value wins.
  def test_conditional_extractors
    # Look for id attribute (second header only),
    # if not found look for class attribute (first
    # two headers), otherwise just get text (third
    # header).
    html = %Q{<div><h1 class="foo">first</h1><h1 class="foo" id="bar">second</h1><h1>third</h1></div>}
    scraper = new_scraper(html) do
      process "h1", extractor("headers[]"=>["@id", "@class", :text])
      attr :headers
    end
    scraper.scrape
    assert_kind_of Array, scraper.headers
    assert_equal 3, scraper.headers.size
    assert_equal "foo", scraper.headers[0]
    assert_equal "bar", scraper.headers[1]
    assert_equal "third", scraper.headers[2]
  end


  # What +scrape+ returns for the different +result+ declarations: a single
  # value, an object with the listed accessors, an object with every
  # extracted attribute when +result+ is omitted, and an array target.
  def test_accessors_from_extractor
    html = %Q{<div id="1">first</div><div id="2">second</div>}
    scraper = new_scraper(html) do
      process_first "div", :div_id=>"@id", :div_text=>:text
      result :div_id
    end
    value = scraper.scrape
    assert_equal "1", value

    scraper = new_scraper(html) do
      process_first "div", :div_id=>"@id", :div_text=>:text
      result :div_id, :div_text
    end
    value = scraper.scrape
    assert_equal "1", value.div_id
    assert_equal "first", value.div_text

    scraper = new_scraper(html) do
      process_first "div", :div_id=>"@id", :div_text=>:text
    end
    value = scraper.scrape
    assert_equal "1", value.div_id
    assert_equal "first", value.div_text

    scraper = new_scraper(html) do
      attr_accessor :div_class
      process_first "div", :div_id=>"@id", :div_text=>:text
      result :div_id, :div_class
    end
    value = scraper.scrape
    assert_equal "1", value.div_id
    # div_text was extracted but not listed in +result+, so the returned
    # object does not respond to it.
    assert_raise(NoMethodError) { value.div_text }

    scraper = new_scraper(html) do
      process "div", "div_ids[]"=>"@id"
      result :div_ids
    end
    value = scraper.scrape
    assert_equal "1", value[0]
    assert_equal "2", value[1]
  end


  # Attributes declared with +array+ accumulate one value per match even
  # though the extractor targets use plain names.
  def test_array_accessors
    html = %Q{<div id="1">first</div><div id="2">second</div>}
    scraper = new_scraper(html) do
      array :div_id, :div_text
      process "div", :div_id=>"@id", :div_text=>:text
      result :div_id, :div_text
    end
    value = scraper.scrape
    assert_equal 2, value.div_id.size
    assert_equal 2, value.div_text.size
    assert_equal "1", value.div_id[0]
    assert_equal "2", value.div_id[1]
    assert_equal "first", value.div_text[0]
    assert_equal "second", value.div_text[1]
  end


  #
  # Root element tests.
  #

  # Builds the scraper class directly (not through #new_scraper, so no
  # root_element call is made) and asserts that rules for both "head" and
  # "body" fire.
  def test_scrape_body_by_default
    html = %Q{<html><head></head><body></body></html>}
    scraper = Class.new(Scraper::Base).new(html)
    scraper.class.instance_eval do
      process "head" do |element| @head = element end
      attr :head
      process "body" do |element| @body = element end
      attr :body
    end
    scraper.scrape
    assert scraper.head
    assert scraper.body
  end


  # root_element limits which part of the document the rules see, and each
  # subclass in the chain can set its own value.
  def test_changing_root_element
    html = %Q{<html><head></head><body></body></html>}
    only_header = new_scraper(html) do
      root_element "head"
      process "head" do |element| @head = element end
      attr :head
      process "body" do |element| @body = element end
      attr :body
    end
    only_body = Class.new(only_header.class).new(html)
    only_body.class.root_element "body"
    both_parts = Class.new(only_body.class).new(html)
    both_parts.class.root_element nil
    # We set this scraper to begin with the head element,
    # so we can see the head element, but not the body.
    only_header.scrape
    assert only_header.head
    assert_nil only_header.body
    # Now switch to a scraper that processes the body element,
    # skipping the header.
    only_body.scrape
    assert_nil only_body.head
    assert only_body.body
    # Now switch to a scraper that doesn't specify a root element,
    # and it will process both header and body.
    both_parts.scrape
    assert both_parts.head
    assert both_parts.body
  end


  # Test prepare/result.

  # A custom +prepare+ seeds state before the rules run and a custom
  # +result+ decides what +scrape+ returns: 1 from prepare plus one per div.
  def test_prepare_and_result
    html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
    scraper = new_scraper(html) do
      process("div") { |element| @count += 1 }
      define_method(:prepare) { @count = 1 }
      define_method(:result) { @count }
    end
    result = scraper.scrape
    assert_equal 4, result
  end


  # +prepare+ receives the document and may replace @document; here it is
  # narrowed to the second div, so only that div's id is collected.
  def test_changing_document_from_prepare
    html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
    scraper = new_scraper(html) do
      selector :divs, "div"
      define_method :prepare do |document|
        @document = divs(document)[1]
      end
      array :ids
      process "div", :ids=>"@id"
      result :ids
    end
    result = scraper.scrape
    assert_equal 1, result.size
    assert_equal "2", result[0]
  end


  # Scraper.define builds a scraper without naming a class; the returned
  # object's +scrape+ takes the markup directly.
  def test_anonymous_scrapers
    html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
    scraper = Scraper.define do
      array :ids
      process "div", :ids=>"@id"
      result :ids
    end
    result = scraper.scrape(html)
    assert_equal "1", result[0]
    assert_equal "2", result[1]
    assert_equal "3", result[2]
  end


  # Two rules registered under the same name (:main): only the later one
  # takes effect, so ids1 stays nil while ids2 is filled.
  def test_named_rules
    html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
    scraper = Scraper.define do
      array :ids1, :ids2
      process :main, "div", :ids1=>"@id"
      process :main, "div", :ids2=>"@id"
      result :ids1, :ids2
    end
    result = scraper.scrape(html)
    assert_nil result.ids1
    assert_equal 3, result.ids2.size
    assert_equal "1", result.ids2[0]
    assert_equal "2", result.ids2[1]
    assert_equal "3", result.ids2[2]
  end


protected

  # Builds a fresh anonymous Scraper::Base subclass that uses the
  # :html_parser parser and no root element, evaluates the optional block in
  # the context of that class (so tests can declare rules), and returns an
  # instance wrapping +what+ (markup or a URI in the tests above).
  def new_scraper(what, &block)
    cls = Class.new(Scraper::Base)
    cls.root_element nil
    cls.parser :html_parser
    cls.instance_eval(&block) if block
    cls.new(what)
  end

end
783
+
784
+
785
+ # Repeats the same set of tests, but using Tidy instead of HTMLParser.
786
class ScraperUsingTidyTest < ScraperTest

protected

  # Overrides the parent's factory so every inherited test builds its
  # scraper with the :tidy parser. Everything else matches the parent
  # helper: an anonymous Scraper::Base subclass with no root element, the
  # optional block evaluated on that class, and an instance wrapping +what+.
  def new_scraper(what, &block)
    scraper_class = Class.new(Scraper::Base)
    scraper_class.root_element(nil)
    scraper_class.parser(:tidy)
    scraper_class.instance_eval(&block) unless block.nil?
    scraper_class.new(what)
  end

end