spidr 0.6.1 → 0.7.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (47) hide show
  1. checksums.yaml +4 -4
  2. data/.editorconfig +11 -0
  3. data/.github/workflows/ruby.yml +26 -0
  4. data/.gitignore +4 -5
  5. data/ChangeLog.md +19 -1
  6. data/Gemfile +7 -4
  7. data/LICENSE.txt +1 -1
  8. data/README.md +136 -79
  9. data/Rakefile +1 -0
  10. data/gemspec.yml +7 -0
  11. data/lib/spidr/agent/actions.rb +3 -1
  12. data/lib/spidr/agent/events.rb +3 -1
  13. data/lib/spidr/agent/filters.rb +57 -56
  14. data/lib/spidr/agent/robots.rb +2 -0
  15. data/lib/spidr/agent/sanitizers.rb +7 -8
  16. data/lib/spidr/agent.rb +232 -108
  17. data/lib/spidr/auth_credential.rb +2 -0
  18. data/lib/spidr/auth_store.rb +9 -7
  19. data/lib/spidr/cookie_jar.rb +7 -5
  20. data/lib/spidr/extensions/uri.rb +3 -1
  21. data/lib/spidr/extensions.rb +3 -1
  22. data/lib/spidr/page/content_types.rb +53 -0
  23. data/lib/spidr/page/cookies.rb +2 -0
  24. data/lib/spidr/page/html.rb +21 -20
  25. data/lib/spidr/page/status_codes.rb +15 -11
  26. data/lib/spidr/page.rb +3 -1
  27. data/lib/spidr/proxy.rb +8 -14
  28. data/lib/spidr/rules.rb +7 -8
  29. data/lib/spidr/session_cache.rb +26 -22
  30. data/lib/spidr/settings/proxy.rb +22 -6
  31. data/lib/spidr/settings/timeouts.rb +2 -0
  32. data/lib/spidr/settings/user_agent.rb +2 -0
  33. data/lib/spidr/settings.rb +5 -3
  34. data/lib/spidr/spidr.rb +22 -11
  35. data/lib/spidr/version.rb +3 -1
  36. data/lib/spidr.rb +5 -3
  37. data/spec/agent_spec.rb +356 -7
  38. data/spec/example_page.rb +2 -0
  39. data/spec/page/content_types_spec.rb +22 -0
  40. data/spec/page/html_spec.rb +255 -51
  41. data/spec/page/status_codes_spec.rb +4 -4
  42. data/spec/proxy_spec.rb +2 -2
  43. data/spec/settings/proxy_examples.rb +31 -11
  44. data/spec/spec_helper.rb +3 -0
  45. data/spidr.gemspec +1 -4
  46. metadata +8 -7
  47. data/.travis.yml +0 -16
@@ -297,7 +297,7 @@ describe Page do
297
297
  context "when the page contains iframes" do
298
298
  let(:iframe1) { '/iframe1' }
299
299
  let(:iframe2) { '/iframe2' }
300
- let(:body) { %{<html><body><iframe src="#{iframe1}" /><iframe src="#{iframe2}" /></body></html>} }
300
+ let(:body) { %{<html><body><iframe src="#{iframe1}"></iframe><iframe src="#{iframe2}"></iframe></body></html>} }
301
301
 
302
302
  it "should yield each iframe/@src value" do
303
303
  expect { |b|
@@ -332,32 +332,100 @@ describe Page do
332
332
  end
333
333
 
334
334
  describe "#links" do
335
- context "when the page contains links" do
335
+ context "when the page contains an 'a' link" do
336
336
  let(:link) { '/link' }
337
+ let(:body) do
338
+ <<-HTML
339
+ <html>
340
+ <body>
341
+ <a href="#{link}">link</a>
342
+ </body>
343
+ </html>
344
+ HTML
345
+ end
346
+
347
+ it "should return an Array of links" do
348
+ expect(subject.links).to be == [
349
+ link
350
+ ]
351
+ end
352
+ end
353
+
354
+ context "when the page contains a 'frame'" do
337
355
  let(:frame) { '/frame' }
356
+ let(:body) do
357
+ <<-HTML
358
+ <html>
359
+ <frameset>
360
+ <frame src="#{frame}"></frame>
361
+ </frameset>
362
+ </html>
363
+ HTML
364
+ end
365
+
366
+ it "should return an Array of links" do
367
+ expect(subject.links).to be == [
368
+ frame
369
+ ]
370
+ end
371
+ end
372
+
373
+ context "when the page contains a 'iframe'" do
338
374
  let(:iframe) { '/iframe' }
375
+ let(:body) do
376
+ <<-HTML
377
+ <html>
378
+ <body>
379
+ <iframe src="#{iframe}"></iframe>
380
+ </body>
381
+ </html>
382
+ HTML
383
+ end
384
+
385
+ it "should return an Array of links" do
386
+ expect(subject.links).to be == [
387
+ iframe
388
+ ]
389
+ end
390
+ end
391
+
392
+ context "when the page contains a 'link' element" do
339
393
  let(:stylesheet) { '/stylesheet.css' }
394
+ let(:body) do
395
+ <<-HTML
396
+ <html>
397
+ <head>
398
+ <link type="stylesheet" href="#{stylesheet}" />
399
+ </head>
400
+ <body>
401
+ </body>
402
+ </html>
403
+ HTML
404
+ end
405
+
406
+ it "should return an Array of links" do
407
+ expect(subject.links).to be == [
408
+ stylesheet
409
+ ]
410
+ end
411
+ end
412
+
413
+ context "when the page contains a 'script' element" do
340
414
  let(:javascript) { '/script.js' }
341
415
  let(:body) do
342
- %{<html>} +
343
- %{<head>} +
344
- %{<link rel="stylesheet" type="text/css" href="#{stylesheet}" />} +
345
- %{<script type="text/javascript" src="#{javascript}"></script>} +
346
- %{</head>} +
347
- %{<body>} +
348
- %{<a href="#{link}">link</a>} +
349
- %{<frameset><frame src="#{frame}" /></frameset>} +
350
- %{<iframe src="#{iframe}" />} +
351
- %{</body>} +
352
- %{</html>}
416
+ <<-HTML
417
+ <html>
418
+ <head>
419
+ <script src="#{javascript}" />
420
+ </head>
421
+ <body>
422
+ </body>
423
+ </html>
424
+ HTML
353
425
  end
354
426
 
355
427
  it "should return an Array of links" do
356
428
  expect(subject.links).to be == [
357
- link,
358
- frame,
359
- iframe,
360
- stylesheet,
361
429
  javascript
362
430
  ]
363
431
  end
@@ -369,32 +437,100 @@ describe Page do
369
437
  end
370
438
 
371
439
  describe "#each_url" do
372
- context "when the page contains links" do
440
+ context "when the page contains an 'a' link" do
373
441
  let(:link) { '/link' }
442
+ let(:body) do
443
+ <<-HTML
444
+ <html>
445
+ <body>
446
+ <a href="#{link}">link</a>
447
+ </body>
448
+ </html>
449
+ HTML
450
+ end
451
+
452
+ it "should yield successive absolute URIs" do
453
+ expect { |b| subject.each_url(&b) }.to yield_successive_args(
454
+ URI("http://#{host}#{link}")
455
+ )
456
+ end
457
+ end
458
+
459
+ context "when the page contains a 'frame'" do
374
460
  let(:frame) { '/frame' }
461
+ let(:body) do
462
+ <<-HTML
463
+ <html>
464
+ <frameset>
465
+ <frame src="#{frame}"></frame>
466
+ </frameset>
467
+ </html>
468
+ HTML
469
+ end
470
+
471
+ it "should yield successive absolute URIs" do
472
+ expect { |b| subject.each_url(&b) }.to yield_successive_args(
473
+ URI("http://#{host}#{frame}")
474
+ )
475
+ end
476
+ end
477
+
478
+ context "when the page contains a 'iframe'" do
375
479
  let(:iframe) { '/iframe' }
480
+ let(:body) do
481
+ <<-HTML
482
+ <html>
483
+ <body>
484
+ <iframe src="#{iframe}"></iframe>
485
+ </body>
486
+ </html>
487
+ HTML
488
+ end
489
+
490
+ it "should yield successive absolute URIs" do
491
+ expect { |b| subject.each_url(&b) }.to yield_successive_args(
492
+ URI("http://#{host}#{iframe}")
493
+ )
494
+ end
495
+ end
496
+
497
+ context "when the page contains a 'link' element" do
376
498
  let(:stylesheet) { '/stylesheet.css' }
377
- let(:javascript) { '/script.js' }
378
499
  let(:body) do
379
- %{<html>} +
380
- %{<head>} +
381
- %{<link rel="stylesheet" type="text/css" href="#{stylesheet}" />} +
382
- %{<script type="text/javascript" src="#{javascript}"></script>} +
383
- %{</head>} +
384
- %{<body>} +
385
- %{<a href="#{link}">link</a>} +
386
- %{<frameset><frame src="#{frame}" /></frameset>} +
387
- %{<iframe src="#{iframe}" />} +
388
- %{</body>} +
389
- %{</html>}
500
+ <<-HTML
501
+ <html>
502
+ <head>
503
+ <link type="stylesheet" href="#{stylesheet}" />
504
+ </head>
505
+ <body>
506
+ </body>
507
+ </html>
508
+ HTML
509
+ end
510
+
511
+ it "should yield successive absolute URIs" do
512
+ expect { |b| subject.each_url(&b) }.to yield_successive_args(
513
+ URI("http://#{host}#{stylesheet}")
514
+ )
390
515
  end
516
+ end
391
517
 
392
- it "should return an Array of absolute URIs" do
518
+ context "when the page contains a 'script' element" do
519
+ let(:javascript) { '/script.js' }
520
+ let(:body) do
521
+ <<-HTML
522
+ <html>
523
+ <head>
524
+ <script src="#{javascript}" />
525
+ </head>
526
+ <body>
527
+ </body>
528
+ </html>
529
+ HTML
530
+ end
531
+
532
+ it "should yield successive absolute URIs" do
393
533
  expect { |b| subject.each_url(&b) }.to yield_successive_args(
394
- URI("http://#{host}#{link}"),
395
- URI("http://#{host}#{frame}"),
396
- URI("http://#{host}#{iframe}"),
397
- URI("http://#{host}#{stylesheet}"),
398
534
  URI("http://#{host}#{javascript}")
399
535
  )
400
536
  end
@@ -410,32 +546,100 @@ describe Page do
410
546
  end
411
547
 
412
548
  describe "#urls" do
413
- context "when the page contains links" do
549
+ context "when the page contains an 'a' link" do
414
550
  let(:link) { '/link' }
551
+ let(:body) do
552
+ <<-HTML
553
+ <html>
554
+ <body>
555
+ <a href="#{link}">link</a>
556
+ </body>
557
+ </html>
558
+ HTML
559
+ end
560
+
561
+ it "should return an Array of absolute URIs" do
562
+ expect(subject.urls).to be == [
563
+ URI("http://#{host}#{link}")
564
+ ]
565
+ end
566
+ end
567
+
568
+ context "when the page contains a 'frame'" do
415
569
  let(:frame) { '/frame' }
570
+ let(:body) do
571
+ <<-HTML
572
+ <html>
573
+ <frameset>
574
+ <frame src="#{frame}"></frame>
575
+ </frameset>
576
+ </html>
577
+ HTML
578
+ end
579
+
580
+ it "should return an Array of absolute URIs" do
581
+ expect(subject.urls).to be == [
582
+ URI("http://#{host}#{frame}")
583
+ ]
584
+ end
585
+ end
586
+
587
+ context "when the page contains a 'iframe'" do
416
588
  let(:iframe) { '/iframe' }
589
+ let(:body) do
590
+ <<-HTML
591
+ <html>
592
+ <body>
593
+ <iframe src="#{iframe}"></iframe>
594
+ </body>
595
+ </html>
596
+ HTML
597
+ end
598
+
599
+ it "should return an Array of absolute URIs" do
600
+ expect(subject.urls).to be == [
601
+ URI("http://#{host}#{iframe}")
602
+ ]
603
+ end
604
+ end
605
+
606
+ context "when the page contains a 'link' element" do
417
607
  let(:stylesheet) { '/stylesheet.css' }
608
+ let(:body) do
609
+ <<-HTML
610
+ <html>
611
+ <head>
612
+ <link type="stylesheet" href="#{stylesheet}" />
613
+ </head>
614
+ <body>
615
+ </body>
616
+ </html>
617
+ HTML
618
+ end
619
+
620
+ it "should return an Array of absolute URIs" do
621
+ expect(subject.urls).to be == [
622
+ URI("http://#{host}#{stylesheet}")
623
+ ]
624
+ end
625
+ end
626
+
627
+ context "when the page contains a 'script' element" do
418
628
  let(:javascript) { '/script.js' }
419
629
  let(:body) do
420
- %{<html>} +
421
- %{<head>} +
422
- %{<link rel="stylesheet" type="text/css" href="#{stylesheet}" />} +
423
- %{<script type="text/javascript" src="#{javascript}"></script>} +
424
- %{</head>} +
425
- %{<body>} +
426
- %{<a href="#{link}">link</a>} +
427
- %{<frameset><frame src="#{frame}" /></frameset>} +
428
- %{<iframe src="#{iframe}" />} +
429
- %{</body>} +
430
- %{</html>}
630
+ <<-HTML
631
+ <html>
632
+ <head>
633
+ <script src="#{javascript}" />
634
+ </head>
635
+ <body>
636
+ </body>
637
+ </html>
638
+ HTML
431
639
  end
432
640
 
433
641
  it "should return an Array of absolute URIs" do
434
642
  expect(subject.urls).to be == [
435
- URI("http://#{host}#{link}"),
436
- URI("http://#{host}#{frame}"),
437
- URI("http://#{host}#{iframe}"),
438
- URI("http://#{host}#{stylesheet}"),
439
643
  URI("http://#{host}#{javascript}")
440
644
  ]
441
645
  end
@@ -26,10 +26,6 @@ describe Page do
26
26
  include_examples "status code method", :is_ok?, {200 => true, 500 => false}
27
27
  end
28
28
 
29
- describe "#timedout?" do
30
- include_examples "status code method", :timedout?, {308 => true, 200 => false}
31
- end
32
-
33
29
  describe "#bad_request?" do
34
30
  include_examples "status code method", :bad_request?, {400 => true, 200 => false}
35
31
  end
@@ -46,6 +42,10 @@ describe Page do
46
42
  include_examples "status code method", :is_missing?, {404 => true, 200 => false}
47
43
  end
48
44
 
45
+ describe "#is_timedout?" do
46
+ include_examples "status code method", :is_timedout?, {408 => true, 200 => false}
47
+ end
48
+
49
49
  describe "#had_internal_server_error?" do
50
50
  include_examples "status code method", :had_internal_server_error?, {500 => true, 200 => false}
51
51
  end
data/spec/proxy_spec.rb CHANGED
@@ -26,13 +26,13 @@ describe Spidr::Proxy do
26
26
  it { expect(subject.enabled?).to be true }
27
27
  end
28
28
 
29
- context "when hist is not set" do
29
+ context "when host is not set" do
30
30
  it { expect(subject.enabled?).to be false }
31
31
  end
32
32
  end
33
33
 
34
34
  describe "#disabled?" do
35
- context "when hist is not set" do
35
+ context "when host is not set" do
36
36
  it { expect(subject.disabled?).to be true }
37
37
  end
38
38
 
@@ -16,7 +16,7 @@ shared_examples "includes Spidr::Settings::Proxy" do
16
16
  end
17
17
 
18
18
  it "should retain the default value" do
19
- expect(subject.proxy.object_id).to be subject.proxy.object_id
19
+ expect(subject.proxy.object_id).to be(subject.proxy.object_id)
20
20
  end
21
21
  end
22
22
 
@@ -26,7 +26,7 @@ shared_examples "includes Spidr::Settings::Proxy" do
26
26
  end
27
27
 
28
28
  it "should return the set @proxy" do
29
- expect(subject.proxy).to be proxy
29
+ expect(subject.proxy).to be(proxy)
30
30
  end
31
31
  end
32
32
  end
@@ -35,12 +35,10 @@ shared_examples "includes Spidr::Settings::Proxy" do
35
35
  context "when given a Proxy object" do
36
36
  let(:proxy) { Proxy.new(host: proxy_host, port: proxy_port) }
37
37
 
38
- before do
39
- subject.proxy = proxy
40
- end
38
+ before { subject.proxy = proxy }
41
39
 
42
40
  it "should save it" do
43
- expect(subject.proxy).to be proxy
41
+ expect(subject.proxy).to be(proxy)
44
42
  end
45
43
  end
46
44
 
@@ -51,15 +49,37 @@ shared_examples "includes Spidr::Settings::Proxy" do
51
49
 
52
50
  it "should create a new Proxy object" do
53
51
  expect(subject.proxy).to be_kind_of(Proxy)
54
- expect(subject.proxy[:host]).to be proxy_host
55
- expect(subject.proxy[:port]).to be proxy_port
52
+ expect(subject.proxy[:host]).to be(proxy_host)
53
+ expect(subject.proxy[:port]).to be(proxy_port)
56
54
  end
57
55
  end
58
56
 
59
- context "when given nil" do
60
- before do
61
- subject.proxy = nil
57
+ context "when given a URI::HTTP" do
58
+ let(:uri) { URI::HTTP.build(host: proxy_host, port: proxy_port) }
59
+
60
+ before { subject.proxy = uri }
61
+
62
+ it "should create a new Proxy object based on the URI" do
63
+ expect(subject.proxy).to be_kind_of(Proxy)
64
+ expect(subject.proxy[:host]).to eq(proxy_host)
65
+ expect(subject.proxy[:port]).to eq(proxy_port)
66
+ end
67
+ end
68
+
69
+ context "when given a String" do
70
+ let(:url) { "http://#{proxy_host}:#{proxy_port}" }
71
+
72
+ before { subject.proxy = url }
73
+
74
+ it "should parse the String as a URI and create a new Proxy object" do
75
+ expect(subject.proxy).to be_kind_of(Proxy)
76
+ expect(subject.proxy[:host]).to eq(proxy_host)
77
+ expect(subject.proxy[:port]).to eq(proxy_port)
62
78
  end
79
+ end
80
+
81
+ context "when given nil" do
82
+ before { subject.proxy = nil }
63
83
 
64
84
  it "should leave an empty proxy" do
65
85
  expect(subject.proxy).to be_kind_of(Proxy)
data/spec/spec_helper.rb CHANGED
@@ -1,4 +1,7 @@
1
1
  require 'rspec'
2
+ require 'simplecov'
2
3
  require 'spidr/version'
3
4
 
4
5
  include Spidr
6
+
7
+ SimpleCov.start
data/spidr.gemspec CHANGED
@@ -7,10 +7,7 @@ Gem::Specification.new do |gem|
7
7
 
8
8
  gem.name = gemspec.fetch('name')
9
9
  gem.version = gemspec.fetch('version') do
10
- lib_dir = File.join(File.dirname(__FILE__),'lib')
11
- $LOAD_PATH << lib_dir unless $LOAD_PATH.include?(lib_dir)
12
-
13
- require 'spidr/version'
10
+ require_relative 'lib/spidr/version'
14
11
  Spidr::VERSION
15
12
  end
16
13
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.1
4
+ version: 0.7.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Postmodern
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-10-25 00:00:00.000000000 Z
11
+ date: 2024-01-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -49,9 +49,10 @@ extra_rdoc_files:
49
49
  - LICENSE.txt
50
50
  - README.md
51
51
  files:
52
+ - ".editorconfig"
53
+ - ".github/workflows/ruby.yml"
52
54
  - ".gitignore"
53
55
  - ".rspec"
54
- - ".travis.yml"
55
56
  - ".yardopts"
56
57
  - ChangeLog.md
57
58
  - Gemfile
@@ -112,7 +113,7 @@ homepage: https://github.com/postmodern/spidr#readme
112
113
  licenses:
113
114
  - MIT
114
115
  metadata: {}
115
- post_install_message:
116
+ post_install_message:
116
117
  rdoc_options: []
117
118
  require_paths:
118
119
  - lib
@@ -127,8 +128,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
127
128
  - !ruby/object:Gem::Version
128
129
  version: '0'
129
130
  requirements: []
130
- rubygems_version: 3.0.3
131
- signing_key:
131
+ rubygems_version: 3.4.10
132
+ signing_key:
132
133
  specification_version: 4
133
134
  summary: A versatile Ruby web spidering library
134
135
  test_files: []
data/.travis.yml DELETED
@@ -1,16 +0,0 @@
1
- ---
2
- before_install:
3
- - gem update --system
4
- - gem install bundler -v "~> 2.0"
5
- language: ruby
6
- sudo: false
7
- cache:
8
- - bundler
9
- rvm:
10
- - 2.5
11
- - 2.6
12
- - jruby
13
- matrix:
14
- allow_failures:
15
- - rvm: jruby
16
- script: bundle exec rake spec