scrapi 1.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +22 -0
- data/MIT-LICENSE +20 -0
- data/README +88 -0
- data/Rakefile +67 -0
- data/lib/html/document.rb +64 -0
- data/lib/html/htmlparser.rb +407 -0
- data/lib/html/node.rb +534 -0
- data/lib/html/node_ext.rb +86 -0
- data/lib/html/selector.rb +825 -0
- data/lib/html/tokenizer.rb +105 -0
- data/lib/html/version.rb +11 -0
- data/lib/scraper/base.rb +970 -0
- data/lib/scraper/reader.rb +239 -0
- data/lib/scrapi.rb +8 -0
- data/lib/tidy/libtidy.dll +0 -0
- data/lib/tidy/libtidy.so +0 -0
- data/test/mock_net_http.rb +54 -0
- data/test/node_ext_test.rb +24 -0
- data/test/reader_test.rb +299 -0
- data/test/scraper_test.rb +798 -0
- data/test/selector_test.rb +637 -0
- metadata +81 -0
@@ -0,0 +1,798 @@
|
|
1
|
+
# ScrAPI toolkit for Ruby
|
2
|
+
#
|
3
|
+
# Copyright (c) 2006 Assaf Arkin, under Creative Commons Attribution and/or MIT License
|
4
|
+
# Developed for http://co.mments.com
|
5
|
+
# Code and documentation: http://labnotes.org
|
6
|
+
|
7
|
+
|
8
|
+
require "rubygems"
|
9
|
+
require "time"
|
10
|
+
require "test/unit"
|
11
|
+
require File.join(File.dirname(__FILE__), "mock_net_http")
|
12
|
+
require File.join(File.dirname(__FILE__), "../lib", "scrapi")
|
13
|
+
|
14
|
+
|
15
|
+
class ScraperTest < Test::Unit::TestCase
|
16
|
+
|
17
|
+
# Test::Unit hook that runs before each test. Calls Net::HTTP.reset_on_get
# (presumably supplied by mock_net_http, which this file requires) so every
# test starts without a GET hook left over from a previous one.
def setup
  Net::HTTP.reset_on_get
end
|
20
|
+
|
21
|
+
# Test::Unit hook that runs after each test. Calls Net::HTTP.reset_on_get
# again so a GET hook installed by the test that just ran is not left behind.
def teardown
  Net::HTTP.reset_on_get
end
|
24
|
+
|
25
|
+
|
26
|
+
#
|
27
|
+
# Tests selector methods.
|
28
|
+
#
|
29
|
+
|
30
|
+
# Declares a selector named :test from a CSS string and checks that calling
# it on the document returns all three divs, with ids 1, 2 and 3 in order.
def test_define_selectors
  page = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
  subject = new_scraper(page) { selector :test, "div" }
  assert_equal(3, subject.test(subject.document).size)
  %w(1 2 3).each_with_index do |expected_id, position|
    assert_equal(expected_id, subject.test(subject.document)[position].attributes["id"])
  end
end
|
40
|
+
|
41
|
+
|
42
|
+
# A selector declared with a block receives the matched elements, and the
# selector method is expected to return the block's value. Here the block
# drops the last of three divs, so two elements must come back.
def test_selector_blocks
  html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
  scraper = new_scraper(html) do
    selector :test, "div" do |elements|
      # Deliberately no explicit `return`: this block is a proc created inside
      # the test method, so `return` would try to leave test_selector_blocks
      # itself and the assertion below could be skipped without failing.
      elements[0..-2]
    end
  end
  assert_equal 2, scraper.test(scraper.document).size
end
|
52
|
+
|
53
|
+
|
54
|
+
# A selector may be declared as a CSS string containing a "?" placeholder
# followed by the value to substitute. "#?" with "2" must match exactly the
# div whose id is 2.
def test_array_selectors
  page = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
  subject = new_scraper(page) { selector :test, "#?", "2" }
  assert_equal(1, subject.test(subject.document).size)
  assert_equal("2", subject.test(subject.document)[0].attributes["id"])
end
|
62
|
+
|
63
|
+
|
64
|
+
# A selector may also be declared with a ready-made HTML::Selector instance
# instead of a CSS string; all three divs must be returned.
def test_object_selectors
  page = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
  subject = new_scraper(page) { selector :test, HTML::Selector.new("div") }
  assert_equal(3, subject.test(subject.document).size)
end
|
71
|
+
|
72
|
+
|
73
|
+
# Whatever the number of matches, a selector method answers a collection
# that responds to size: empty for no match, one entry for a single match,
# three entries for three matches.
def test_selector_returns_array
  page = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
  subject = new_scraper(page) do
    selector(:test0, "#4")
    selector(:test1, "#1")
    selector(:test3, "div")
  end
  # Nothing has id 4.
  assert_equal(0, subject.test0(subject.document).size)
  # Exactly one element has id 1.
  assert_equal(1, subject.test1(subject.document).size)
  # Every div matches.
  assert_equal(3, subject.test3(subject.document).size)
end
|
84
|
+
|
85
|
+
|
86
|
+
# The selector group lists #2 before #1, but the matches are expected in the
# order the elements occur in the markup: id 1 first, then id 2.
def test_select_in_document_order
  page = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
  subject = new_scraper(page) { selector :test, "#2,#1" }
  assert_equal(2, subject.test(subject.document).size)
  assert_equal("1", subject.test(subject.document)[0].attributes["id"])
  assert_equal("2", subject.test(subject.document)[1].attributes["id"])
end
|
95
|
+
|
96
|
+
|
97
|
+
# Declaring selector :test is expected to make first_test available as well.
# Without a block first_test answers the first matching element; in the
# second scraper the selector's block maps the matches to an id string and
# first_test is expected to answer that string.
def test_selecting_first_element
  page = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}

  plain = new_scraper(page) { selector :test, "div" }
  assert_equal(3, plain.test(plain.document).size)
  assert(plain.first_test(plain.document))
  assert_equal("1", plain.first_test(plain.document).attributes["id"])

  with_block = new_scraper(page) do
    selector(:test, "div") { |matches| matches[0].attributes["id"] }
  end
  assert(with_block.first_test(with_block.document))
  assert_equal("1", with_block.first_test(with_block.document))
end
|
114
|
+
|
115
|
+
|
116
|
+
#
|
117
|
+
# Tests process methods.
|
118
|
+
#
|
119
|
+
|
120
|
+
# A process rule's block runs once per element matching its selector: three
# divs must leave the counter at three.
def test_processing_rule
  page = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
  subject = new_scraper(page) do
    process("div") do |_element|
      @count ||= 0
      @count += 1
    end
    attr :count
  end
  subject.scrape
  assert_equal(3, subject.count)
end
|
131
|
+
|
132
|
+
|
133
|
+
# A process rule accepts a CSS string with a "?" placeholder plus the value
# to substitute; "#?" with "1" must fire for exactly one element.
def test_processing_rule_with_array
  page = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
  subject = new_scraper(page) do
    process("#?", "1") do |_element|
      @count ||= 0
      @count += 1
    end
    attr :count
  end
  subject.scrape
  assert_equal(1, subject.count)
end
|
144
|
+
|
145
|
+
|
146
|
+
# A process rule accepts an HTML::Selector instance in place of a CSS
# string; the block must fire for each of the three divs.
def test_processing_rule_with_selector
  page = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
  subject = new_scraper(page) do
    process(HTML::Selector.new("div")) do |_element|
      @count ||= 0
      @count += 1
    end
    attr :count
  end
  subject.scrape
  assert_equal(3, subject.count)
end
|
157
|
+
|
158
|
+
|
159
|
+
# The rule's block receives the matched element and can read its attributes:
# appending each div's id must produce "123".
def test_extracting_in_code
  page = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
  subject = new_scraper(page) do
    process("div") do |node|
      @concat ||= ""
      @concat << node.attributes["id"]
    end
    attr :concat
  end
  subject.scrape
  assert_equal("123", subject.concat)
end
|
170
|
+
|
171
|
+
|
172
|
+
# The selector group names #2 before #1, yet the rule is expected to see the
# elements in markup order, giving "12" rather than "21".
def test_processing_in_document_order
  page = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
  subject = new_scraper(page) do
    process("#2,#1") do |node|
      @concat ||= ""
      @concat << node.attributes["id"]
    end
    attr :concat
  end
  subject.scrape
  assert_equal("12", subject.concat)
end
|
183
|
+
|
184
|
+
|
185
|
+
# Two rules target the same element (#1). When the first rule's block
# answers false the second rule must still run; when it answers true the
# second rule must not run, leaving its attribute nil.
def test_skip_if_extractor_returns_true
  page = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}

  pass_through = new_scraper(page) do
    process("#1") do |_element|
      @first = true
      false
    end
    process("#1") { |_element| @second = true }
    attr :first
    attr :second
  end
  pass_through.scrape
  assert_equal(true, pass_through.first)
  assert_equal(true, pass_through.second)

  short_circuit = new_scraper(page) do
    process("#1") do |_element|
      @first = true
      true
    end
    process("#1") { |_element| @second = true }
    attr :first
    attr :second
  end
  short_circuit.scrape
  assert_equal(true, short_circuit.first)
  assert_equal(nil, short_circuit.second)
end
|
216
|
+
|
217
|
+
|
218
|
+
# The first rule calls skip on the element and then answers false. Despite
# the false return value, the second rule for the same element must not
# run, so its attribute stays nil.
def test_process_once_if_skipped
  page = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
  subject = new_scraper(page) do
    process("#1") do |node|
      @first = true
      skip(node)
      false
    end
    process("#1") { |_element| @second = true }
    attr :first
    attr :second
  end
  subject.scrape
  assert_equal(true, subject.first)
  assert_equal(nil, subject.second)
end
|
236
|
+
|
237
|
+
|
238
|
+
# Each time the rule runs it appends the element's id (or "" when absent),
# looks up "#2" below the current element with the id2 selector, and hands
# whatever that returns to skip. The ids collected must be "13", i.e. the
# div with id 2 is never processed.
def test_skip_children
  page = %Q{<div><div id="1"></div><div id="2"></div><div id="3"></div></div>}
  subject = new_scraper(page) do
    process("div") do |node|
      @concat ||= ""
      @concat << (node.attributes["id"] || "")
      hidden = id2(node)
      skip(hidden) if hidden
      false
    end
    selector :id2, "#2"
    attr :concat
  end
  subject.scrape
  assert_equal("13", subject.concat)
end
|
254
|
+
|
255
|
+
|
256
|
+
# Checks how a rule's return value affects processing of nested matches.
# For three selectors ("div", "div>div", "div div") the rule is run once
# answering false (nested matches are still visited) and once answering true
# (processing stops at the first match).
def test_skip_descendants
  # Three divs nested inside one another: #1 contains #2 contains #3.
  # The closing tags are well formed so the test exercises skipping rather
  # than the parser's recovery from broken markup.
  html = %Q{<div id="1"><div id="2"><div id="3"></div></div></div>}

  # Scrapes html with a single rule for +css+ whose block appends the
  # element's id and then answers +skip_rest+; returns the collected ids.
  collect_ids = lambda do |css, skip_rest|
    scraper = new_scraper(html) do
      process css do |element|
        @concat = (@concat || "") << (element.attributes["id"] || "")
        skip_rest
      end
      attr :concat
    end
    scraper.scrape
    scraper.concat
  end

  # Root, child of root, grandchild of root.
  assert_equal "123", collect_ids.call("div", false)
  # Stop at root.
  assert_equal "1", collect_ids.call("div", true)

  # Child of root, and child of root's child.
  assert_equal "23", collect_ids.call("div>div", false)
  # Stop at child of root.
  assert_equal "2", collect_ids.call("div>div", true)

  # Child of root, then child of child of root.
  assert_equal "23", collect_ids.call("div div", false)
  # Child of root.
  assert_equal "2", collect_ids.call("div div", true)
end
|
321
|
+
|
322
|
+
|
323
|
+
# Two extractor rules target the same element. With a plain extractor the
# first rule is expected to claim the element, so the second rule's target
# stays nil; the same holds when :skip=>true is given alongside a block
# that answers false.
def test_skip_from_extractor
  # Fixture without the stray trailing quote character the original carried
  # after the closing tag.
  html = %Q{<div id="1">this</div>}

  scraper = new_scraper(html) do
    process "#1", :this1=>:text
    process "#1", :this2=>:text
  end
  scraper.scrape
  assert_equal "this", scraper.this1
  assert_equal nil, scraper.this2

  # NOTE(review): with :skip=>false both targets were presumably meant to be
  # filled, but the assertions below are disabled, so this scenario only
  # checks that scraping does not raise. Confirm the intended behavior
  # before re-enabling them.
  scraper = new_scraper(html) do
    process "#1", :this1=>:text, :skip=>false
    process "#1", :this2=>:text
  end
  scraper.scrape
  #assert_equal "this", scraper.this1
  #assert_equal "this", scraper.this2

  scraper = new_scraper(html) do
    process "#1", :this1=>:text, :skip=>true do
      false
    end
    process "#1", :this2=>:text
  end
  scraper.scrape
  assert_equal "this", scraper.this1
  assert_equal nil, scraper.this2
end
|
351
|
+
|
352
|
+
|
353
|
+
# Calling stop from inside a rule must end scraping: only the first div's id
# is collected even though three divs match.
def test_stop
  page = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
  subject = new_scraper(page) do
    process("div") do |node|
      @concat ||= ""
      @concat << (node.attributes["id"] || "")
      stop
    end
    attr :concat
  end
  subject.scrape
  assert_equal("1", subject.concat)
end
|
365
|
+
|
366
|
+
|
367
|
+
# process fires for every matching element while process_first fires only
# once: with three divs the counters must end at 3 and 1 respectively.
def test_process_first
  page = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
  subject = new_scraper(page) do
    process("div") do |_element|
      @all ||= 0
      @all += 1
      false
    end
    process_first("div") do |_element|
      @first ||= 0
      @first += 1
      false
    end
    attr :all
    attr :first
  end
  subject.scrape
  assert_equal(3, subject.all)
  assert_equal(1, subject.first)
end
|
385
|
+
|
386
|
+
|
387
|
+
# Scrapes a URL whose mocked response redirects to /redirect, then checks
# that page_info reports the original URL, the final URL, and the
# Last-Modified, ETag and charset taken from the final response.
def test_accessors
  stamp = Time.new.rfc2822
  Net::HTTP.on_get do |address, path, headers|
    if path != "/redirect"
      # Any other path answers with a redirect to /redirect.
      # NOTE(review): the status passed here is 300 although the response
      # class is HTTPMovedPermanently (301) -- confirm whether that matters.
      moved = Net::HTTPMovedPermanently.new(Net::HTTP.version_1_2, 300, "Moved")
      moved["Location"] = "http://localhost/redirect"
      [moved, ""]
    else
      ok = Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK")
      ok["Last-Modified"] = stamp
      ok["ETag"] = "etag"
      [ok, %Q{
<html>
<head>
<meta http-equiv="content-type" value="text/html; charset=other-encoding">
</head>
<body><div id="x"/></body>
</html>
}]
    end
  end
  subject = new_scraper(URI.parse("http://localhost/source"))
  subject.scrape
  info = subject.page_info
  assert_equal("http://localhost/source", info.original_url.to_s)
  info = subject.page_info
  assert_equal("http://localhost/redirect", info.url.to_s)
  info = subject.page_info
  assert_equal(stamp, info.last_modified)
  info = subject.page_info
  assert_equal("etag", info.etag)
  info = subject.page_info
  assert_equal("other-encoding", info.encoding)
end
|
416
|
+
|
417
|
+
|
418
|
+
# Scrapes a URL end to end: the mocked GET answers 200 with a small page,
# and the rule must collect the ids of both divs found in it.
def test_scraping_end_to_end
  Net::HTTP.on_get do |address, path, headers|
    ok = Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK")
    [ok, %Q{
<html>
<body><div id="1"/><div id="2"/></body>
</html>
}]
  end
  subject = new_scraper(URI.parse("http://localhost/")) do
    process("div") do |node|
      @concat ||= ""
      @concat << (node.attributes["id"] || "")
    end
    attr :concat
  end
  subject.scrape
  assert_equal("12", subject.concat)
end
|
435
|
+
|
436
|
+
|
437
|
+
#
|
438
|
+
# Tests extractor methods.
|
439
|
+
#
|
440
|
+
|
441
|
+
# An extractor mapping :div_id to "@id" must store the element's id
# attribute, both when built explicitly with extractor(...) and when the
# mapping hash is handed straight to process.
def test_extractors
  page = %Q{<div id="1"></div>}

  explicit = new_scraper(page) do
    process("div", extractor(:div_id => "@id"))
    attr :div_id
  end
  explicit.scrape
  assert_equal("1", explicit.div_id)

  shorthand = new_scraper(page) do
    process("div", :div_id => "@id")
    attr :div_id
  end
  shorthand.scrape
  assert_equal("1", shorthand.div_id)
end
|
456
|
+
|
457
|
+
|
458
|
+
# The :element source stores the matched node itself, and the :text source
# stores the node's text content.
def test_text_and_element_extractors
  page = %Q{<div>some text</div>}

  # :element -- the stored value is the div node.
  as_node = new_scraper(page) do
    process("div", extractor(:value => :element))
    attr :value
  end
  as_node.scrape
  assert_equal("div", as_node.value.name)

  # :text -- the stored value is the div's text.
  as_text = new_scraper(page) do
    process("div", extractor(:value => :text))
    attr :value
  end
  as_text.scrape
  assert_equal("some text", as_text.value)
end
|
475
|
+
|
476
|
+
|
477
|
+
# Both headers carry class "header". Extracting :element stores each match
# in turn, so the h2 is what remains; extracting with the source "h1" only
# stores the h1.
def test_extractors_objects
  page = %Q{<h1 class="header"></h1><h2 class="header"></h2>}

  # Both elements match by class; the value left at the end is the h2.
  any_header = new_scraper(page) do
    process(".header", extractor(:header => :element))
    attr :header
  end
  any_header.scrape
  assert_equal("h2", any_header.header.name)

  # Naming a specific element as the source leaves the h2 out.
  page = %Q{<h1 class="header"></h1><h2 class="header"></h2>}
  h1_only = new_scraper(page) do
    process(".header", extractor(:header => "h1"))
    attr :header
  end
  h1_only.scrape
  assert_equal("h1", h1_only.header.name)
end
|
495
|
+
|
496
|
+
|
497
|
+
# Attribute sources: "@title" stores the title attribute of the matched
# element, and "h1@id" stores the id attribute only for h1 elements.
def test_attribute_extractors
  # Only the first abbr has a title, so "foo" must be the stored value.
  # NOTE(review): the first abbr is closed with </div> in this fixture,
  # which looks like a typo for </abbr> -- confirm before changing it.
  abbr_page = %Q{<abbr title="foo">bar</div><abbr>foo</abbr>}
  by_attribute = new_scraper(abbr_page) do
    process("abbr", extractor(:title => "@title"))
    attr :title
  end
  by_attribute.scrape
  assert_equal("foo", by_attribute.title)

  # Restricting the source to h1 leaves the h2's id out.
  header_page = %Q{<h1 class="header" id="1"></h1><h2 class="header" id="2"></h2>}
  by_element = new_scraper(header_page) do
    process(".header", extractor(:header => "h1@id"))
    attr :header
  end
  by_element.scrape
  assert_equal("1", by_element.header)
end
|
515
|
+
|
516
|
+
|
517
|
+
# A scraper class can itself be used as an extractor source: the outer rule
# hands the div to the nested scraper, and the nested scraper's h1 and h2
# results are reachable through the outer scraper's headers attribute.
def test_class_extractors
  header_scraper = Class.new(Scraper::Base)
  header_scraper.instance_eval do
    root_element(nil)
    process("h1,h2", :h1 => "h1", :h2 => "h2")
    attr :h1
    attr :h2
  end
  page = %Q{<div><h1>first</h1><h2>second</h2></div>}
  subject = new_scraper(page) do
    process("div", extractor(:headers => header_scraper))
    attr :headers
  end
  subject.scrape
  assert(subject.headers)
  assert_equal("h1", subject.headers.h1.name)
  assert_equal("h2", subject.headers.h2.name)
end
|
535
|
+
|
536
|
+
|
537
|
+
# A target name ending in "[]" collects one value per match into an Array:
# two h1 elements give ["first", "second"].
def test_array_extractors
  page = %Q{<div><h1>first</h1><h1>second</h1></div>}
  subject = new_scraper(page) do
    process("h1", extractor("headers[]" => :text))
    attr :headers
  end
  subject.scrape
  assert(subject.headers.is_a?(Array))
  assert_equal(2, subject.headers.size)
  assert_equal("first", subject.headers[0])
  assert_equal("second", subject.headers[1])
end
|
549
|
+
|
550
|
+
|
551
|
+
# A Hash used as the extractor source yields a Hash result whose keys match
# the source keys and whose values are extracted from the element.
def test_hash_extractors
  page = %Q{<div><h1 id="1" class="header">first</h1></div>}
  subject = new_scraper(page) do
    process("h1", extractor("header" => {:id=>"@id", :class=>"@class", :text=>:text}))
    attr :header
  end
  subject.scrape
  assert(subject.header.is_a?(Hash))
  assert_equal(3, subject.header.size)
  assert_equal("1", subject.header[:id])
  assert_equal("header", subject.header[:class])
  assert_equal("first", subject.header[:text])
end
|
564
|
+
|
565
|
+
|
566
|
+
# An Array of target names mapped to a nested scraper: the nested scraper
# extracts :text and :kls from the h1, and both values are expected on the
# result of the outer scrape.
def test_multi_value_extractors
  html = %Q{<div><h1 id="1" class="header">first</h1></div>}
  scraper = new_scraper(html) do
    process "h1", [:text, :kls]=>Scraper.define {
      process "*", :text=>:text, :kls=>"@class"
    }
  end
  result = scraper.scrape
  # assert_equal, not assert: assert(expected, actual) would treat the
  # second argument as the failure message and pass for any truthy first
  # argument, so the extracted values would never be checked.
  assert_equal "first", result.text
  assert_equal "header", result.kls
end
|
577
|
+
|
578
|
+
|
579
|
+
# An Array used as the extractor source lists alternatives tried in order:
# the id attribute, then the class attribute, then the text. The first
# header has only a class ("foo"), the second has an id ("bar"), and the
# third has neither, so its text ("third") is used.
def test_conditional_extractors
  page = %Q{<div><h1 class="foo">first</h1><h1 class="foo" id="bar">second</h1><h1>third</h1></div>}
  subject = new_scraper(page) do
    process("h1", extractor("headers[]" => ["@id", "@class", :text]))
    attr :headers
  end
  subject.scrape
  assert(subject.headers.is_a?(Array))
  assert_equal(3, subject.headers.size)
  assert_equal("foo", subject.headers[0])
  assert_equal("bar", subject.headers[1])
  assert_equal("third", subject.headers[2])
end
|
596
|
+
|
597
|
+
|
598
|
+
# Covers what scrape returns depending on the result declaration that
# accompanies the extractor rules.
def test_accessors_from_extractor
  page = %Q{<div id="1">first</div><div id="2">second</div>}

  # One result attribute: scrape answers that attribute's value directly.
  single = new_scraper(page) do
    process_first "div", :div_id => "@id", :div_text => :text
    result :div_id
  end
  outcome = single.scrape
  assert_equal("1", outcome)

  # Two result attributes: scrape answers an object exposing both.
  pair = new_scraper(page) do
    process_first "div", :div_id => "@id", :div_text => :text
    result :div_id, :div_text
  end
  outcome = pair.scrape
  assert_equal("1", outcome.div_id)
  assert_equal("first", outcome.div_text)

  # No result declaration: both extracted attributes are still exposed.
  implicit = new_scraper(page) do
    process_first "div", :div_id => "@id", :div_text => :text
  end
  outcome = implicit.scrape
  assert_equal("1", outcome.div_id)
  assert_equal("first", outcome.div_text)

  # The result names div_id and div_class only, so the returned object has
  # no div_text method even though div_text was extracted.
  restricted = new_scraper(page) do
    attr_accessor :div_class
    process_first "div", :div_id => "@id", :div_text => :text
    result :div_id, :div_class
  end
  outcome = restricted.scrape
  assert_equal("1", outcome.div_id)
  assert_raise(NoMethodError) { outcome.div_text }

  # A single array-valued result attribute: scrape answers the array.
  listing = new_scraper(page) do
    process "div", "div_ids[]" => "@id"
    result :div_ids
  end
  outcome = listing.scrape
  assert_equal("1", outcome[0])
  assert_equal("2", outcome[1])
end
|
639
|
+
|
640
|
+
|
641
|
+
# Attributes declared with array accumulate one value per matching element
# instead of being overwritten: two divs give two ids and two texts.
def test_array_accessors
  page = %Q{<div id="1">first</div><div id="2">second</div>}
  subject = new_scraper(page) do
    array :div_id, :div_text
    process "div", :div_id => "@id", :div_text => :text
    result :div_id, :div_text
  end
  outcome = subject.scrape
  assert_equal(2, outcome.div_id.size)
  assert_equal(2, outcome.div_text.size)
  assert_equal("1", outcome.div_id[0])
  assert_equal("2", outcome.div_id[1])
  assert_equal("first", outcome.div_text[0])
  assert_equal("second", outcome.div_text[1])
end
|
656
|
+
|
657
|
+
|
658
|
+
#
|
659
|
+
# Root element tests.
|
660
|
+
#
|
661
|
+
|
662
|
+
# Uses a bare Scraper::Base subclass (no root_element or parser configured,
# unlike new_scraper) and adds head and body rules to it afterwards; both
# rules are expected to have stored an element after scraping.
def test_scrape_body_by_default
  page = %Q{<html><head></head><body></body></html>}
  subject = Class.new(Scraper::Base).new(page)
  subject.class.instance_eval do
    process("head") { |node| @head = node }
    attr :head
    process("body") { |node| @body = node }
    attr :body
  end
  subject.scrape
  assert(subject.head)
  assert(subject.body)
end
|
675
|
+
|
676
|
+
|
677
|
+
# Builds a chain of three scraper classes, each subclassing the previous one
# and overriding root_element ("head", then "body", then nil), and checks
# which of the inherited head/body rules fire for each.
def test_changing_root_element
  page = %Q{<html><head></head><body></body></html>}
  head_only = new_scraper(page) do
    root_element "head"
    process("head") { |node| @head = node }
    attr :head
    process("body") { |node| @body = node }
    attr :body
  end
  body_only = Class.new(head_only.class).new(page)
  body_only.class.root_element("body")
  everything = Class.new(body_only.class).new(page)
  everything.class.root_element(nil)

  # Rooted at head: the head rule fires, the body rule does not.
  head_only.scrape
  assert(head_only.head)
  assert(head_only.body.nil?)

  # Rooted at body: the body rule fires, the head rule does not.
  body_only.scrape
  assert(body_only.head.nil?)
  assert(body_only.body)

  # No root element: both rules fire.
  everything.scrape
  assert(everything.head)
  assert(everything.body)
end
|
706
|
+
|
707
|
+
|
708
|
+
# Test prepare/result.
|
709
|
+
|
710
|
+
# A scraper may define its own prepare and result methods. Here prepare
# seeds the counter with 1, the rule adds 1 for each of the three divs, and
# result answers the counter, so scrape must return 4.
def test_prepare_and_result
  page = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
  subject = new_scraper(page) do
    process("div") do |_element|
      @count += 1
    end
    define_method(:prepare) do
      @count = 1
    end
    define_method(:result) do
      @count
    end
  end
  outcome = subject.scrape
  assert_equal(4, outcome)
end
|
721
|
+
|
722
|
+
|
723
|
+
# prepare receives the document and may replace @document. Here it narrows
# the document to the second div, so the rule that follows must collect
# that div's id only.
def test_changing_document_from_prepare
  page = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
  subject = new_scraper(page) do
    selector :divs, "div"
    define_method(:prepare) do |document|
      @document = divs(document)[1]
    end
    array :ids
    process "div", :ids => "@id"
    result :ids
  end
  outcome = subject.scrape
  assert_equal(1, outcome.size)
  assert_equal("2", outcome[0])
end
|
739
|
+
|
740
|
+
|
741
|
+
# Scraper.define builds a scraper from a block without naming a class;
# calling scrape on what it returns, with the HTML as argument, must yield
# the ids of all three divs.
def test_anonymous_scrapers
  page = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
  anonymous = Scraper.define do
    array :ids
    process "div", :ids => "@id"
    result :ids
  end
  outcome = anonymous.scrape(page)
  assert_equal("1", outcome[0])
  assert_equal("2", outcome[1])
  assert_equal("3", outcome[2])
end
|
753
|
+
|
754
|
+
|
755
|
+
# Two rules are declared under the same name, :main. Only the one declared
# last is expected to take effect: ids1 stays nil while ids2 collects the
# ids of all three divs.
def test_named_rules
  page = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
  named = Scraper.define do
    array :ids1, :ids2
    process :main, "div", :ids1 => "@id"
    process :main, "div", :ids2 => "@id"
    result :ids1, :ids2
  end
  outcome = named.scrape(page)
  assert_equal(nil, outcome.ids1)
  assert_equal(3, outcome.ids2.size)
  assert_equal("1", outcome.ids2[0])
  assert_equal("2", outcome.ids2[1])
  assert_equal("3", outcome.ids2[2])
end
|
770
|
+
|
771
|
+
|
772
|
+
protected
|
773
|
+
|
774
|
+
# Builds an anonymous Scraper::Base subclass with no root element and the
# :html_parser parser, evaluates the optional block in the context of that
# class (this is how tests declare rules, selectors and attributes), and
# returns an instance created for +what+ (the tests pass an HTML string or
# a URI).
def new_scraper(what, &block)
  scraper_class = Class.new(Scraper::Base)
  scraper_class.root_element(nil)
  scraper_class.parser(:html_parser)
  scraper_class.instance_eval(&block) if block
  scraper_class.new(what)
end
|
781
|
+
|
782
|
+
end
|
783
|
+
|
784
|
+
|
785
|
+
# Repeats the same set of tests, but using Tidy instead of HTMLParser.
|
786
|
+
class ScraperUsingTidyTest < ScraperTest

  protected

  # Overrides the factory used by every inherited test so the anonymous
  # scraper class is configured with the :tidy parser; the root element
  # setting, the optional class-level block and the return value are
  # handled as in the parent's factory.
  def new_scraper(what, &block)
    scraper_class = Class.new(Scraper::Base)
    scraper_class.root_element(nil)
    scraper_class.parser(:tidy)
    scraper_class.instance_eval(&block) if block
    scraper_class.new(what)
  end

end
|