spidr 0.6.0 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.editorconfig +11 -0
- data/.github/workflows/ruby.yml +26 -0
- data/.gitignore +4 -5
- data/ChangeLog.md +17 -0
- data/Gemfile +8 -5
- data/LICENSE.txt +1 -1
- data/README.md +137 -78
- data/Rakefile +1 -0
- data/gemspec.yml +8 -1
- data/lib/spidr/agent/actions.rb +1 -1
- data/lib/spidr/agent/events.rb +1 -1
- data/lib/spidr/agent/filters.rb +55 -56
- data/lib/spidr/agent/sanitizers.rb +6 -9
- data/lib/spidr/agent.rb +230 -120
- data/lib/spidr/auth_store.rb +10 -6
- data/lib/spidr/page/content_types.rb +51 -0
- data/lib/spidr/page/html.rb +17 -19
- data/lib/spidr/page/status_codes.rb +12 -10
- data/lib/spidr/proxy.rb +6 -14
- data/lib/spidr/rules.rb +5 -8
- data/lib/spidr/session_cache.rb +23 -21
- data/lib/spidr/settings/proxy.rb +19 -5
- data/lib/spidr/spidr.rb +16 -6
- data/lib/spidr/version.rb +1 -1
- data/spec/agent_spec.rb +357 -10
- data/spec/example_page.rb +2 -0
- data/spec/page/content_types_spec.rb +22 -0
- data/spec/page/html_spec.rb +255 -51
- data/spec/page/status_codes_spec.rb +4 -4
- data/spec/proxy_spec.rb +2 -2
- data/spec/settings/proxy_examples.rb +31 -11
- data/spec/spec_helper.rb +3 -0
- metadata +19 -19
- data/.travis.yml +0 -14
data/spec/page/html_spec.rb
CHANGED
@@ -297,7 +297,7 @@ describe Page do
|
|
297
297
|
context "when the page contains iframes" do
|
298
298
|
let(:iframe1) { '/iframe1' }
|
299
299
|
let(:iframe2) { '/iframe2' }
|
300
|
-
let(:body) { %{<html><body><iframe src="#{iframe1}"
|
300
|
+
let(:body) { %{<html><body><iframe src="#{iframe1}"></iframe><iframe src="#{iframe2}"></iframe></body></html>} }
|
301
301
|
|
302
302
|
it "should yield each iframe/@src value" do
|
303
303
|
expect { |b|
|
@@ -332,32 +332,100 @@ describe Page do
|
|
332
332
|
end
|
333
333
|
|
334
334
|
describe "#links" do
|
335
|
-
context "when the page contains
|
335
|
+
context "when the page contains an 'a' link" do
|
336
336
|
let(:link) { '/link' }
|
337
|
+
let(:body) do
|
338
|
+
<<-HTML
|
339
|
+
<html>
|
340
|
+
<body>
|
341
|
+
<a href="#{link}">link</a>
|
342
|
+
</body>
|
343
|
+
</html>
|
344
|
+
HTML
|
345
|
+
end
|
346
|
+
|
347
|
+
it "should return an Array of links" do
|
348
|
+
expect(subject.links).to be == [
|
349
|
+
link
|
350
|
+
]
|
351
|
+
end
|
352
|
+
end
|
353
|
+
|
354
|
+
context "when the page contains a 'frame'" do
|
337
355
|
let(:frame) { '/frame' }
|
356
|
+
let(:body) do
|
357
|
+
<<-HTML
|
358
|
+
<html>
|
359
|
+
<frameset>
|
360
|
+
<frame src="#{frame}"></frame>
|
361
|
+
</frameset>
|
362
|
+
</html>
|
363
|
+
HTML
|
364
|
+
end
|
365
|
+
|
366
|
+
it "should return an Array of links" do
|
367
|
+
expect(subject.links).to be == [
|
368
|
+
frame
|
369
|
+
]
|
370
|
+
end
|
371
|
+
end
|
372
|
+
|
373
|
+
context "when the page contains a 'iframe'" do
|
338
374
|
let(:iframe) { '/iframe' }
|
375
|
+
let(:body) do
|
376
|
+
<<-HTML
|
377
|
+
<html>
|
378
|
+
<body>
|
379
|
+
<iframe src="#{iframe}"></iframe>
|
380
|
+
</body>
|
381
|
+
</html>
|
382
|
+
HTML
|
383
|
+
end
|
384
|
+
|
385
|
+
it "should return an Array of links" do
|
386
|
+
expect(subject.links).to be == [
|
387
|
+
iframe
|
388
|
+
]
|
389
|
+
end
|
390
|
+
end
|
391
|
+
|
392
|
+
context "when the page contains a 'link' element" do
|
339
393
|
let(:stylesheet) { '/stylesheet.css' }
|
394
|
+
let(:body) do
|
395
|
+
<<-HTML
|
396
|
+
<html>
|
397
|
+
<head>
|
398
|
+
<link type="stylesheet" href="#{stylesheet}" />
|
399
|
+
</head>
|
400
|
+
<body>
|
401
|
+
</body>
|
402
|
+
</html>
|
403
|
+
HTML
|
404
|
+
end
|
405
|
+
|
406
|
+
it "should return an Array of links" do
|
407
|
+
expect(subject.links).to be == [
|
408
|
+
stylesheet
|
409
|
+
]
|
410
|
+
end
|
411
|
+
end
|
412
|
+
|
413
|
+
context "when the page contains a 'script' element" do
|
340
414
|
let(:javascript) { '/script.js' }
|
341
415
|
let(:body) do
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
%{</body>} +
|
352
|
-
%{</html>}
|
416
|
+
<<-HTML
|
417
|
+
<html>
|
418
|
+
<head>
|
419
|
+
<script src="#{javascript}" />
|
420
|
+
</head>
|
421
|
+
<body>
|
422
|
+
</body>
|
423
|
+
</html>
|
424
|
+
HTML
|
353
425
|
end
|
354
426
|
|
355
427
|
it "should return an Array of links" do
|
356
428
|
expect(subject.links).to be == [
|
357
|
-
link,
|
358
|
-
frame,
|
359
|
-
iframe,
|
360
|
-
stylesheet,
|
361
429
|
javascript
|
362
430
|
]
|
363
431
|
end
|
@@ -369,32 +437,100 @@ describe Page do
|
|
369
437
|
end
|
370
438
|
|
371
439
|
describe "#each_url" do
|
372
|
-
context "when the page contains
|
440
|
+
context "when the page contains an 'a' link" do
|
373
441
|
let(:link) { '/link' }
|
442
|
+
let(:body) do
|
443
|
+
<<-HTML
|
444
|
+
<html>
|
445
|
+
<body>
|
446
|
+
<a href="#{link}">link</a>
|
447
|
+
</body>
|
448
|
+
</html>
|
449
|
+
HTML
|
450
|
+
end
|
451
|
+
|
452
|
+
it "should yield successive absolute URIs" do
|
453
|
+
expect { |b| subject.each_url(&b) }.to yield_successive_args(
|
454
|
+
URI("http://#{host}#{link}")
|
455
|
+
)
|
456
|
+
end
|
457
|
+
end
|
458
|
+
|
459
|
+
context "when the page contains a 'frame'" do
|
374
460
|
let(:frame) { '/frame' }
|
461
|
+
let(:body) do
|
462
|
+
<<-HTML
|
463
|
+
<html>
|
464
|
+
<frameset>
|
465
|
+
<frame src="#{frame}"></frame>
|
466
|
+
</frameset>
|
467
|
+
</html>
|
468
|
+
HTML
|
469
|
+
end
|
470
|
+
|
471
|
+
it "should yield successive absolute URIs" do
|
472
|
+
expect { |b| subject.each_url(&b) }.to yield_successive_args(
|
473
|
+
URI("http://#{host}#{frame}")
|
474
|
+
)
|
475
|
+
end
|
476
|
+
end
|
477
|
+
|
478
|
+
context "when the page contains a 'iframe'" do
|
375
479
|
let(:iframe) { '/iframe' }
|
480
|
+
let(:body) do
|
481
|
+
<<-HTML
|
482
|
+
<html>
|
483
|
+
<body>
|
484
|
+
<iframe src="#{iframe}"></iframe>
|
485
|
+
</body>
|
486
|
+
</html>
|
487
|
+
HTML
|
488
|
+
end
|
489
|
+
|
490
|
+
it "should yield successive absolute URIs" do
|
491
|
+
expect { |b| subject.each_url(&b) }.to yield_successive_args(
|
492
|
+
URI("http://#{host}#{iframe}")
|
493
|
+
)
|
494
|
+
end
|
495
|
+
end
|
496
|
+
|
497
|
+
context "when the page contains a 'link' element" do
|
376
498
|
let(:stylesheet) { '/stylesheet.css' }
|
377
|
-
let(:javascript) { '/script.js' }
|
378
499
|
let(:body) do
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
500
|
+
<<-HTML
|
501
|
+
<html>
|
502
|
+
<head>
|
503
|
+
<link type="stylesheet" href="#{stylesheet}" />
|
504
|
+
</head>
|
505
|
+
<body>
|
506
|
+
</body>
|
507
|
+
</html>
|
508
|
+
HTML
|
509
|
+
end
|
510
|
+
|
511
|
+
it "should yield successive absolute URIs" do
|
512
|
+
expect { |b| subject.each_url(&b) }.to yield_successive_args(
|
513
|
+
URI("http://#{host}#{stylesheet}")
|
514
|
+
)
|
390
515
|
end
|
516
|
+
end
|
391
517
|
|
392
|
-
|
518
|
+
context "when the page contains a 'script' element" do
|
519
|
+
let(:javascript) { '/script.js' }
|
520
|
+
let(:body) do
|
521
|
+
<<-HTML
|
522
|
+
<html>
|
523
|
+
<head>
|
524
|
+
<script src="#{javascript}" />
|
525
|
+
</head>
|
526
|
+
<body>
|
527
|
+
</body>
|
528
|
+
</html>
|
529
|
+
HTML
|
530
|
+
end
|
531
|
+
|
532
|
+
it "should yield successive absolute URIs" do
|
393
533
|
expect { |b| subject.each_url(&b) }.to yield_successive_args(
|
394
|
-
URI("http://#{host}#{link}"),
|
395
|
-
URI("http://#{host}#{frame}"),
|
396
|
-
URI("http://#{host}#{iframe}"),
|
397
|
-
URI("http://#{host}#{stylesheet}"),
|
398
534
|
URI("http://#{host}#{javascript}")
|
399
535
|
)
|
400
536
|
end
|
@@ -410,32 +546,100 @@ describe Page do
|
|
410
546
|
end
|
411
547
|
|
412
548
|
describe "#urls" do
|
413
|
-
context "when the page contains
|
549
|
+
context "when the page contains an 'a' link" do
|
414
550
|
let(:link) { '/link' }
|
551
|
+
let(:body) do
|
552
|
+
<<-HTML
|
553
|
+
<html>
|
554
|
+
<body>
|
555
|
+
<a href="#{link}">link</a>
|
556
|
+
</body>
|
557
|
+
</html>
|
558
|
+
HTML
|
559
|
+
end
|
560
|
+
|
561
|
+
it "should return an Array of absolute URIs" do
|
562
|
+
expect(subject.urls).to be == [
|
563
|
+
URI("http://#{host}#{link}")
|
564
|
+
]
|
565
|
+
end
|
566
|
+
end
|
567
|
+
|
568
|
+
context "when the page contains a 'frame'" do
|
415
569
|
let(:frame) { '/frame' }
|
570
|
+
let(:body) do
|
571
|
+
<<-HTML
|
572
|
+
<html>
|
573
|
+
<frameset>
|
574
|
+
<frame src="#{frame}"></frame>
|
575
|
+
</frameset>
|
576
|
+
</html>
|
577
|
+
HTML
|
578
|
+
end
|
579
|
+
|
580
|
+
it "should return an Array of absolute URIs" do
|
581
|
+
expect(subject.urls).to be == [
|
582
|
+
URI("http://#{host}#{frame}")
|
583
|
+
]
|
584
|
+
end
|
585
|
+
end
|
586
|
+
|
587
|
+
context "when the page contains a 'iframe'" do
|
416
588
|
let(:iframe) { '/iframe' }
|
589
|
+
let(:body) do
|
590
|
+
<<-HTML
|
591
|
+
<html>
|
592
|
+
<body>
|
593
|
+
<iframe src="#{iframe}"></iframe>
|
594
|
+
</body>
|
595
|
+
</html>
|
596
|
+
HTML
|
597
|
+
end
|
598
|
+
|
599
|
+
it "should return an Array of absolute URIs" do
|
600
|
+
expect(subject.urls).to be == [
|
601
|
+
URI("http://#{host}#{iframe}")
|
602
|
+
]
|
603
|
+
end
|
604
|
+
end
|
605
|
+
|
606
|
+
context "when the page contains a 'link' element" do
|
417
607
|
let(:stylesheet) { '/stylesheet.css' }
|
608
|
+
let(:body) do
|
609
|
+
<<-HTML
|
610
|
+
<html>
|
611
|
+
<head>
|
612
|
+
<link type="stylesheet" href="#{stylesheet}" />
|
613
|
+
</head>
|
614
|
+
<body>
|
615
|
+
</body>
|
616
|
+
</html>
|
617
|
+
HTML
|
618
|
+
end
|
619
|
+
|
620
|
+
it "should return an Array of absolute URIs" do
|
621
|
+
expect(subject.urls).to be == [
|
622
|
+
URI("http://#{host}#{stylesheet}")
|
623
|
+
]
|
624
|
+
end
|
625
|
+
end
|
626
|
+
|
627
|
+
context "when the page contains a 'script' element" do
|
418
628
|
let(:javascript) { '/script.js' }
|
419
629
|
let(:body) do
|
420
|
-
|
421
|
-
|
422
|
-
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
|
429
|
-
%{</body>} +
|
430
|
-
%{</html>}
|
630
|
+
<<-HTML
|
631
|
+
<html>
|
632
|
+
<head>
|
633
|
+
<script src="#{javascript}" />
|
634
|
+
</head>
|
635
|
+
<body>
|
636
|
+
</body>
|
637
|
+
</html>
|
638
|
+
HTML
|
431
639
|
end
|
432
640
|
|
433
641
|
it "should return an Array of absolute URIs" do
|
434
642
|
expect(subject.urls).to be == [
|
435
|
-
URI("http://#{host}#{link}"),
|
436
|
-
URI("http://#{host}#{frame}"),
|
437
|
-
URI("http://#{host}#{iframe}"),
|
438
|
-
URI("http://#{host}#{stylesheet}"),
|
439
643
|
URI("http://#{host}#{javascript}")
|
440
644
|
]
|
441
645
|
end
|
@@ -26,10 +26,6 @@ describe Page do
|
|
26
26
|
include_examples "status code method", :is_ok?, {200 => true, 500 => false}
|
27
27
|
end
|
28
28
|
|
29
|
-
describe "#timedout?" do
|
30
|
-
include_examples "status code method", :timedout?, {308 => true, 200 => false}
|
31
|
-
end
|
32
|
-
|
33
29
|
describe "#bad_request?" do
|
34
30
|
include_examples "status code method", :bad_request?, {400 => true, 200 => false}
|
35
31
|
end
|
@@ -46,6 +42,10 @@ describe Page do
|
|
46
42
|
include_examples "status code method", :is_missing?, {404 => true, 200 => false}
|
47
43
|
end
|
48
44
|
|
45
|
+
describe "#is_timedout?" do
|
46
|
+
include_examples "status code method", :is_timedout?, {408 => true, 200 => false}
|
47
|
+
end
|
48
|
+
|
49
49
|
describe "#had_internal_server_error?" do
|
50
50
|
include_examples "status code method", :had_internal_server_error?, {500 => true, 200 => false}
|
51
51
|
end
|
data/spec/proxy_spec.rb
CHANGED
@@ -26,13 +26,13 @@ describe Spidr::Proxy do
|
|
26
26
|
it { expect(subject.enabled?).to be true }
|
27
27
|
end
|
28
28
|
|
29
|
-
context "when
|
29
|
+
context "when host is not set" do
|
30
30
|
it { expect(subject.enabled?).to be false }
|
31
31
|
end
|
32
32
|
end
|
33
33
|
|
34
34
|
describe "#disabled?" do
|
35
|
-
context "when
|
35
|
+
context "when host is not set" do
|
36
36
|
it { expect(subject.disabled?).to be true }
|
37
37
|
end
|
38
38
|
|
@@ -16,7 +16,7 @@ shared_examples "includes Spidr::Settings::Proxy" do
|
|
16
16
|
end
|
17
17
|
|
18
18
|
it "should retain the default value" do
|
19
|
-
expect(subject.proxy.object_id).to be
|
19
|
+
expect(subject.proxy.object_id).to be(subject.proxy.object_id)
|
20
20
|
end
|
21
21
|
end
|
22
22
|
|
@@ -26,7 +26,7 @@ shared_examples "includes Spidr::Settings::Proxy" do
|
|
26
26
|
end
|
27
27
|
|
28
28
|
it "should return the set @proxy" do
|
29
|
-
expect(subject.proxy).to be
|
29
|
+
expect(subject.proxy).to be(proxy)
|
30
30
|
end
|
31
31
|
end
|
32
32
|
end
|
@@ -35,12 +35,10 @@ shared_examples "includes Spidr::Settings::Proxy" do
|
|
35
35
|
context "when given a Proxy object" do
|
36
36
|
let(:proxy) { Proxy.new(host: proxy_host, port: proxy_port) }
|
37
37
|
|
38
|
-
before
|
39
|
-
subject.proxy = proxy
|
40
|
-
end
|
38
|
+
before { subject.proxy = proxy }
|
41
39
|
|
42
40
|
it "should save it" do
|
43
|
-
expect(subject.proxy).to be
|
41
|
+
expect(subject.proxy).to be(proxy)
|
44
42
|
end
|
45
43
|
end
|
46
44
|
|
@@ -51,15 +49,37 @@ shared_examples "includes Spidr::Settings::Proxy" do
|
|
51
49
|
|
52
50
|
it "should create a new Proxy object" do
|
53
51
|
expect(subject.proxy).to be_kind_of(Proxy)
|
54
|
-
expect(subject.proxy[:host]).to be
|
55
|
-
expect(subject.proxy[:port]).to be
|
52
|
+
expect(subject.proxy[:host]).to be(proxy_host)
|
53
|
+
expect(subject.proxy[:port]).to be(proxy_port)
|
56
54
|
end
|
57
55
|
end
|
58
56
|
|
59
|
-
context "when given
|
60
|
-
|
61
|
-
|
57
|
+
context "when given a URI::HTTP" do
|
58
|
+
let(:uri) { URI::HTTP.build(host: proxy_host, port: proxy_port) }
|
59
|
+
|
60
|
+
before { subject.proxy = uri }
|
61
|
+
|
62
|
+
it "should create a new Proxy object based on the URI" do
|
63
|
+
expect(subject.proxy).to be_kind_of(Proxy)
|
64
|
+
expect(subject.proxy[:host]).to eq(proxy_host)
|
65
|
+
expect(subject.proxy[:port]).to eq(proxy_port)
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
context "when given a String" do
|
70
|
+
let(:url) { "http://#{proxy_host}:#{proxy_port}" }
|
71
|
+
|
72
|
+
before { subject.proxy = url }
|
73
|
+
|
74
|
+
it "should parse the String as a URI and create a new Proxy object" do
|
75
|
+
expect(subject.proxy).to be_kind_of(Proxy)
|
76
|
+
expect(subject.proxy[:host]).to eq(proxy_host)
|
77
|
+
expect(subject.proxy[:port]).to eq(proxy_port)
|
62
78
|
end
|
79
|
+
end
|
80
|
+
|
81
|
+
context "when given nil" do
|
82
|
+
before { subject.proxy = nil }
|
63
83
|
|
64
84
|
it "should leave an empty proxy" do
|
65
85
|
expect(subject.proxy).to be_kind_of(Proxy)
|
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,43 +1,43 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spidr
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.0
|
4
|
+
version: 0.7.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Postmodern
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2023-01-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - ~>
|
17
|
+
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: '1.3'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - ~>
|
24
|
+
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '1.3'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: bundler
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- - ~>
|
31
|
+
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
33
|
+
version: '2.0'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- - ~>
|
38
|
+
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '
|
40
|
+
version: '2.0'
|
41
41
|
description: Spidr is a versatile Ruby web spidering library that can spider a site,
|
42
42
|
multiple domains, certain links or infinitely. Spidr is designed to be fast and
|
43
43
|
easy to use.
|
@@ -49,10 +49,11 @@ extra_rdoc_files:
|
|
49
49
|
- LICENSE.txt
|
50
50
|
- README.md
|
51
51
|
files:
|
52
|
-
- .
|
53
|
-
- .
|
54
|
-
- .
|
55
|
-
- .
|
52
|
+
- ".editorconfig"
|
53
|
+
- ".github/workflows/ruby.yml"
|
54
|
+
- ".gitignore"
|
55
|
+
- ".rspec"
|
56
|
+
- ".yardopts"
|
56
57
|
- ChangeLog.md
|
57
58
|
- Gemfile
|
58
59
|
- LICENSE.txt
|
@@ -112,24 +113,23 @@ homepage: https://github.com/postmodern/spidr#readme
|
|
112
113
|
licenses:
|
113
114
|
- MIT
|
114
115
|
metadata: {}
|
115
|
-
post_install_message:
|
116
|
+
post_install_message:
|
116
117
|
rdoc_options: []
|
117
118
|
require_paths:
|
118
119
|
- lib
|
119
120
|
required_ruby_version: !ruby/object:Gem::Requirement
|
120
121
|
requirements:
|
121
|
-
- -
|
122
|
+
- - ">="
|
122
123
|
- !ruby/object:Gem::Version
|
123
124
|
version: 2.0.0
|
124
125
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
125
126
|
requirements:
|
126
|
-
- -
|
127
|
+
- - ">="
|
127
128
|
- !ruby/object:Gem::Version
|
128
129
|
version: '0'
|
129
130
|
requirements: []
|
130
|
-
|
131
|
-
|
132
|
-
signing_key:
|
131
|
+
rubygems_version: 3.3.26
|
132
|
+
signing_key:
|
133
133
|
specification_version: 4
|
134
134
|
summary: A versatile Ruby web spidering library
|
135
135
|
test_files: []
|