spidr 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -297,7 +297,7 @@ describe Page do
297
297
  context "when the page contains iframes" do
298
298
  let(:iframe1) { '/iframe1' }
299
299
  let(:iframe2) { '/iframe2' }
300
- let(:body) { %{<html><body><iframe src="#{iframe1}" /><iframe src="#{iframe2}" /></body></html>} }
300
+ let(:body) { %{<html><body><iframe src="#{iframe1}"></iframe><iframe src="#{iframe2}"></iframe></body></html>} }
301
301
 
302
302
  it "should yield each iframe/@src value" do
303
303
  expect { |b|
@@ -332,32 +332,100 @@ describe Page do
332
332
  end
333
333
 
334
334
  describe "#links" do
335
- context "when the page contains links" do
335
+ context "when the page contains an 'a' link" do
336
336
  let(:link) { '/link' }
337
+ let(:body) do
338
+ <<-HTML
339
+ <html>
340
+ <body>
341
+ <a href="#{link}">link</a>
342
+ </body>
343
+ </html>
344
+ HTML
345
+ end
346
+
347
+ it "should return an Array of links" do
348
+ expect(subject.links).to be == [
349
+ link
350
+ ]
351
+ end
352
+ end
353
+
354
+ context "when the page contains a 'frame'" do
337
355
  let(:frame) { '/frame' }
356
+ let(:body) do
357
+ <<-HTML
358
+ <html>
359
+ <frameset>
360
+ <frame src="#{frame}"></frame>
361
+ </frameset>
362
+ </html>
363
+ HTML
364
+ end
365
+
366
+ it "should return an Array of links" do
367
+ expect(subject.links).to be == [
368
+ frame
369
+ ]
370
+ end
371
+ end
372
+
373
+ context "when the page contains a 'iframe'" do
338
374
  let(:iframe) { '/iframe' }
375
+ let(:body) do
376
+ <<-HTML
377
+ <html>
378
+ <body>
379
+ <iframe src="#{iframe}"></iframe>
380
+ </body>
381
+ </html>
382
+ HTML
383
+ end
384
+
385
+ it "should return an Array of links" do
386
+ expect(subject.links).to be == [
387
+ iframe
388
+ ]
389
+ end
390
+ end
391
+
392
+ context "when the page contains a 'link' element" do
339
393
  let(:stylesheet) { '/stylesheet.css' }
394
+ let(:body) do
395
+ <<-HTML
396
+ <html>
397
+ <head>
398
+ <link type="stylesheet" href="#{stylesheet}" />
399
+ </head>
400
+ <body>
401
+ </body>
402
+ </html>
403
+ HTML
404
+ end
405
+
406
+ it "should return an Array of links" do
407
+ expect(subject.links).to be == [
408
+ stylesheet
409
+ ]
410
+ end
411
+ end
412
+
413
+ context "when the page contains a 'script' element" do
340
414
  let(:javascript) { '/script.js' }
341
415
  let(:body) do
342
- %{<html>} +
343
- %{<head>} +
344
- %{<link rel="stylesheet" type="text/css" href="#{stylesheet}" />} +
345
- %{<script type="text/javascript" src="#{javascript}"></script>} +
346
- %{</head>} +
347
- %{<body>} +
348
- %{<a href="#{link}">link</a>} +
349
- %{<frameset><frame src="#{frame}" /></frameset>} +
350
- %{<iframe src="#{iframe}" />} +
351
- %{</body>} +
352
- %{</html>}
416
+ <<-HTML
417
+ <html>
418
+ <head>
419
+ <script src="#{javascript}" />
420
+ </head>
421
+ <body>
422
+ </body>
423
+ </html>
424
+ HTML
353
425
  end
354
426
 
355
427
  it "should return an Array of links" do
356
428
  expect(subject.links).to be == [
357
- link,
358
- frame,
359
- iframe,
360
- stylesheet,
361
429
  javascript
362
430
  ]
363
431
  end
@@ -369,32 +437,100 @@ describe Page do
369
437
  end
370
438
 
371
439
  describe "#each_url" do
372
- context "when the page contains links" do
440
+ context "when the page contains an 'a' link" do
373
441
  let(:link) { '/link' }
442
+ let(:body) do
443
+ <<-HTML
444
+ <html>
445
+ <body>
446
+ <a href="#{link}">link</a>
447
+ </body>
448
+ </html>
449
+ HTML
450
+ end
451
+
452
+ it "should yield successive absolute URIs" do
453
+ expect { |b| subject.each_url(&b) }.to yield_successive_args(
454
+ URI("http://#{host}#{link}")
455
+ )
456
+ end
457
+ end
458
+
459
+ context "when the page contains a 'frame'" do
374
460
  let(:frame) { '/frame' }
461
+ let(:body) do
462
+ <<-HTML
463
+ <html>
464
+ <frameset>
465
+ <frame src="#{frame}"></frame>
466
+ </frameset>
467
+ </html>
468
+ HTML
469
+ end
470
+
471
+ it "should yield successive absolute URIs" do
472
+ expect { |b| subject.each_url(&b) }.to yield_successive_args(
473
+ URI("http://#{host}#{frame}")
474
+ )
475
+ end
476
+ end
477
+
478
+ context "when the page contains a 'iframe'" do
375
479
  let(:iframe) { '/iframe' }
480
+ let(:body) do
481
+ <<-HTML
482
+ <html>
483
+ <body>
484
+ <iframe src="#{iframe}"></iframe>
485
+ </body>
486
+ </html>
487
+ HTML
488
+ end
489
+
490
+ it "should yield successive absolute URIs" do
491
+ expect { |b| subject.each_url(&b) }.to yield_successive_args(
492
+ URI("http://#{host}#{iframe}")
493
+ )
494
+ end
495
+ end
496
+
497
+ context "when the page contains a 'link' element" do
376
498
  let(:stylesheet) { '/stylesheet.css' }
377
- let(:javascript) { '/script.js' }
378
499
  let(:body) do
379
- %{<html>} +
380
- %{<head>} +
381
- %{<link rel="stylesheet" type="text/css" href="#{stylesheet}" />} +
382
- %{<script type="text/javascript" src="#{javascript}"></script>} +
383
- %{</head>} +
384
- %{<body>} +
385
- %{<a href="#{link}">link</a>} +
386
- %{<frameset><frame src="#{frame}" /></frameset>} +
387
- %{<iframe src="#{iframe}" />} +
388
- %{</body>} +
389
- %{</html>}
500
+ <<-HTML
501
+ <html>
502
+ <head>
503
+ <link type="stylesheet" href="#{stylesheet}" />
504
+ </head>
505
+ <body>
506
+ </body>
507
+ </html>
508
+ HTML
509
+ end
510
+
511
+ it "should yield successive absolute URIs" do
512
+ expect { |b| subject.each_url(&b) }.to yield_successive_args(
513
+ URI("http://#{host}#{stylesheet}")
514
+ )
390
515
  end
516
+ end
391
517
 
392
- it "should return an Array of absolute URIs" do
518
+ context "when the page contains a 'script' element" do
519
+ let(:javascript) { '/script.js' }
520
+ let(:body) do
521
+ <<-HTML
522
+ <html>
523
+ <head>
524
+ <script src="#{javascript}" />
525
+ </head>
526
+ <body>
527
+ </body>
528
+ </html>
529
+ HTML
530
+ end
531
+
532
+ it "should yield successive absolute URIs" do
393
533
  expect { |b| subject.each_url(&b) }.to yield_successive_args(
394
- URI("http://#{host}#{link}"),
395
- URI("http://#{host}#{frame}"),
396
- URI("http://#{host}#{iframe}"),
397
- URI("http://#{host}#{stylesheet}"),
398
534
  URI("http://#{host}#{javascript}")
399
535
  )
400
536
  end
@@ -410,32 +546,100 @@ describe Page do
410
546
  end
411
547
 
412
548
  describe "#urls" do
413
- context "when the page contains links" do
549
+ context "when the page contains an 'a' link" do
414
550
  let(:link) { '/link' }
551
+ let(:body) do
552
+ <<-HTML
553
+ <html>
554
+ <body>
555
+ <a href="#{link}">link</a>
556
+ </body>
557
+ </html>
558
+ HTML
559
+ end
560
+
561
+ it "should return an Array of absolute URIs" do
562
+ expect(subject.urls).to be == [
563
+ URI("http://#{host}#{link}")
564
+ ]
565
+ end
566
+ end
567
+
568
+ context "when the page contains a 'frame'" do
415
569
  let(:frame) { '/frame' }
570
+ let(:body) do
571
+ <<-HTML
572
+ <html>
573
+ <frameset>
574
+ <frame src="#{frame}"></frame>
575
+ </frameset>
576
+ </html>
577
+ HTML
578
+ end
579
+
580
+ it "should return an Array of absolute URIs" do
581
+ expect(subject.urls).to be == [
582
+ URI("http://#{host}#{frame}")
583
+ ]
584
+ end
585
+ end
586
+
587
+ context "when the page contains a 'iframe'" do
416
588
  let(:iframe) { '/iframe' }
589
+ let(:body) do
590
+ <<-HTML
591
+ <html>
592
+ <body>
593
+ <iframe src="#{iframe}"></iframe>
594
+ </body>
595
+ </html>
596
+ HTML
597
+ end
598
+
599
+ it "should return an Array of absolute URIs" do
600
+ expect(subject.urls).to be == [
601
+ URI("http://#{host}#{iframe}")
602
+ ]
603
+ end
604
+ end
605
+
606
+ context "when the page contains a 'link' element" do
417
607
  let(:stylesheet) { '/stylesheet.css' }
608
+ let(:body) do
609
+ <<-HTML
610
+ <html>
611
+ <head>
612
+ <link type="stylesheet" href="#{stylesheet}" />
613
+ </head>
614
+ <body>
615
+ </body>
616
+ </html>
617
+ HTML
618
+ end
619
+
620
+ it "should return an Array of absolute URIs" do
621
+ expect(subject.urls).to be == [
622
+ URI("http://#{host}#{stylesheet}")
623
+ ]
624
+ end
625
+ end
626
+
627
+ context "when the page contains a 'script' element" do
418
628
  let(:javascript) { '/script.js' }
419
629
  let(:body) do
420
- %{<html>} +
421
- %{<head>} +
422
- %{<link rel="stylesheet" type="text/css" href="#{stylesheet}" />} +
423
- %{<script type="text/javascript" src="#{javascript}"></script>} +
424
- %{</head>} +
425
- %{<body>} +
426
- %{<a href="#{link}">link</a>} +
427
- %{<frameset><frame src="#{frame}" /></frameset>} +
428
- %{<iframe src="#{iframe}" />} +
429
- %{</body>} +
430
- %{</html>}
630
+ <<-HTML
631
+ <html>
632
+ <head>
633
+ <script src="#{javascript}" />
634
+ </head>
635
+ <body>
636
+ </body>
637
+ </html>
638
+ HTML
431
639
  end
432
640
 
433
641
  it "should return an Array of absolute URIs" do
434
642
  expect(subject.urls).to be == [
435
- URI("http://#{host}#{link}"),
436
- URI("http://#{host}#{frame}"),
437
- URI("http://#{host}#{iframe}"),
438
- URI("http://#{host}#{stylesheet}"),
439
643
  URI("http://#{host}#{javascript}")
440
644
  ]
441
645
  end
@@ -26,10 +26,6 @@ describe Page do
26
26
  include_examples "status code method", :is_ok?, {200 => true, 500 => false}
27
27
  end
28
28
 
29
- describe "#timedout?" do
30
- include_examples "status code method", :timedout?, {308 => true, 200 => false}
31
- end
32
-
33
29
  describe "#bad_request?" do
34
30
  include_examples "status code method", :bad_request?, {400 => true, 200 => false}
35
31
  end
@@ -46,6 +42,10 @@ describe Page do
46
42
  include_examples "status code method", :is_missing?, {404 => true, 200 => false}
47
43
  end
48
44
 
45
+ describe "#is_timedout?" do
46
+ include_examples "status code method", :is_timedout?, {408 => true, 200 => false}
47
+ end
48
+
49
49
  describe "#had_internal_server_error?" do
50
50
  include_examples "status code method", :had_internal_server_error?, {500 => true, 200 => false}
51
51
  end
data/spec/proxy_spec.rb CHANGED
@@ -26,13 +26,13 @@ describe Spidr::Proxy do
26
26
  it { expect(subject.enabled?).to be true }
27
27
  end
28
28
 
29
- context "when hist is not set" do
29
+ context "when host is not set" do
30
30
  it { expect(subject.enabled?).to be false }
31
31
  end
32
32
  end
33
33
 
34
34
  describe "#disabled?" do
35
- context "when hist is not set" do
35
+ context "when host is not set" do
36
36
  it { expect(subject.disabled?).to be true }
37
37
  end
38
38
 
@@ -16,7 +16,7 @@ shared_examples "includes Spidr::Settings::Proxy" do
16
16
  end
17
17
 
18
18
  it "should retain the default value" do
19
- expect(subject.proxy.object_id).to be subject.proxy.object_id
19
+ expect(subject.proxy.object_id).to be(subject.proxy.object_id)
20
20
  end
21
21
  end
22
22
 
@@ -26,7 +26,7 @@ shared_examples "includes Spidr::Settings::Proxy" do
26
26
  end
27
27
 
28
28
  it "should return the set @proxy" do
29
- expect(subject.proxy).to be proxy
29
+ expect(subject.proxy).to be(proxy)
30
30
  end
31
31
  end
32
32
  end
@@ -35,12 +35,10 @@ shared_examples "includes Spidr::Settings::Proxy" do
35
35
  context "when given a Proxy object" do
36
36
  let(:proxy) { Proxy.new(host: proxy_host, port: proxy_port) }
37
37
 
38
- before do
39
- subject.proxy = proxy
40
- end
38
+ before { subject.proxy = proxy }
41
39
 
42
40
  it "should save it" do
43
- expect(subject.proxy).to be proxy
41
+ expect(subject.proxy).to be(proxy)
44
42
  end
45
43
  end
46
44
 
@@ -51,15 +49,37 @@ shared_examples "includes Spidr::Settings::Proxy" do
51
49
 
52
50
  it "should create a new Proxy object" do
53
51
  expect(subject.proxy).to be_kind_of(Proxy)
54
- expect(subject.proxy[:host]).to be proxy_host
55
- expect(subject.proxy[:port]).to be proxy_port
52
+ expect(subject.proxy[:host]).to be(proxy_host)
53
+ expect(subject.proxy[:port]).to be(proxy_port)
56
54
  end
57
55
  end
58
56
 
59
- context "when given nil" do
60
- before do
61
- subject.proxy = nil
57
+ context "when given a URI::HTTP" do
58
+ let(:uri) { URI::HTTP.build(host: proxy_host, port: proxy_port) }
59
+
60
+ before { subject.proxy = uri }
61
+
62
+ it "should create a new Proxy object based on the URI" do
63
+ expect(subject.proxy).to be_kind_of(Proxy)
64
+ expect(subject.proxy[:host]).to eq(proxy_host)
65
+ expect(subject.proxy[:port]).to eq(proxy_port)
66
+ end
67
+ end
68
+
69
+ context "when given a String" do
70
+ let(:url) { "http://#{proxy_host}:#{proxy_port}" }
71
+
72
+ before { subject.proxy = url }
73
+
74
+ it "should parse the String as a URI and create a new Proxy object" do
75
+ expect(subject.proxy).to be_kind_of(Proxy)
76
+ expect(subject.proxy[:host]).to eq(proxy_host)
77
+ expect(subject.proxy[:port]).to eq(proxy_port)
62
78
  end
79
+ end
80
+
81
+ context "when given nil" do
82
+ before { subject.proxy = nil }
63
83
 
64
84
  it "should leave an empty proxy" do
65
85
  expect(subject.proxy).to be_kind_of(Proxy)
data/spec/spec_helper.rb CHANGED
@@ -1,4 +1,7 @@
1
1
  require 'rspec'
2
+ require 'simplecov'
2
3
  require 'spidr/version'
3
4
 
4
5
  include Spidr
6
+
7
+ SimpleCov.start
metadata CHANGED
@@ -1,43 +1,43 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.0
4
+ version: 0.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Postmodern
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-08-05 00:00:00.000000000 Z
11
+ date: 2023-01-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ~>
17
+ - - "~>"
18
18
  - !ruby/object:Gem::Version
19
19
  version: '1.3'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - ~>
24
+ - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '1.3'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: bundler
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - ~>
31
+ - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: '1.0'
33
+ version: '2.0'
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - ~>
38
+ - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: '1.0'
40
+ version: '2.0'
41
41
  description: Spidr is a versatile Ruby web spidering library that can spider a site,
42
42
  multiple domains, certain links or infinitely. Spidr is designed to be fast and
43
43
  easy to use.
@@ -49,10 +49,11 @@ extra_rdoc_files:
49
49
  - LICENSE.txt
50
50
  - README.md
51
51
  files:
52
- - .gitignore
53
- - .rspec
54
- - .travis.yml
55
- - .yardopts
52
+ - ".editorconfig"
53
+ - ".github/workflows/ruby.yml"
54
+ - ".gitignore"
55
+ - ".rspec"
56
+ - ".yardopts"
56
57
  - ChangeLog.md
57
58
  - Gemfile
58
59
  - LICENSE.txt
@@ -112,24 +113,23 @@ homepage: https://github.com/postmodern/spidr#readme
112
113
  licenses:
113
114
  - MIT
114
115
  metadata: {}
115
- post_install_message:
116
+ post_install_message:
116
117
  rdoc_options: []
117
118
  require_paths:
118
119
  - lib
119
120
  required_ruby_version: !ruby/object:Gem::Requirement
120
121
  requirements:
121
- - - '>='
122
+ - - ">="
122
123
  - !ruby/object:Gem::Version
123
124
  version: 2.0.0
124
125
  required_rubygems_version: !ruby/object:Gem::Requirement
125
126
  requirements:
126
- - - '>='
127
+ - - ">="
127
128
  - !ruby/object:Gem::Version
128
129
  version: '0'
129
130
  requirements: []
130
- rubyforge_project:
131
- rubygems_version: 2.0.14.1
132
- signing_key:
131
+ rubygems_version: 3.3.26
132
+ signing_key:
133
133
  specification_version: 4
134
134
  summary: A versatile Ruby web spidering library
135
135
  test_files: []
data/.travis.yml DELETED
@@ -1,14 +0,0 @@
1
- ---
2
- language: ruby
3
- rvm:
4
- - 2.0.0
5
- - 2.1.9
6
- - 2.2.4
7
- - 2.3.1
8
- - jruby
9
- - rbx
10
- matrix:
11
- allow_failures:
12
- - rvm: jruby
13
- - rvm: rbx
14
- script: rake spec