RubyGems - spidr - Versions diffs - 0.5.0 → 0.6.0 - Mend

spidr 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47) hide show

checksums.yaml +4 -4
data/.travis.yml +14 -0
data/ChangeLog.md +20 -2
data/Gemfile +2 -2
data/README.md +4 -2
data/Rakefile +1 -0
data/gemspec.yml +1 -1
data/lib/spidr/agent.rb +145 -85
data/lib/spidr/agent/filters.rb +1 -9
data/lib/spidr/agent/robots.rb +36 -0
data/lib/spidr/page.rb +76 -28
data/lib/spidr/page/{headers.rb → content_types.rb} +2 -147
data/lib/spidr/page/cookies.rb +60 -0
data/lib/spidr/page/{links.rb → html.rb} +47 -23
data/lib/spidr/page/status_codes.rb +112 -0
data/lib/spidr/proxy.rb +56 -0
data/lib/spidr/session_cache.rb +60 -24
data/lib/spidr/settings.rb +3 -0
data/lib/spidr/settings/proxy.rb +61 -0
data/lib/spidr/settings/timeouts.rb +33 -0
data/lib/spidr/settings/user_agent.rb +14 -0
data/lib/spidr/spidr.rb +15 -79
data/lib/spidr/version.rb +1 -1
data/spec/agent/actions_spec.rb +158 -32
data/spec/agent/filters_spec.rb +46 -29
data/spec/agent/sanitizers_spec.rb +25 -31
data/spec/agent_spec.rb +772 -50
data/spec/example_app.rb +27 -0
data/spec/example_page.rb +33 -0
data/spec/page/content_types_spec.rb +150 -0
data/spec/page/cookies_spec.rb +58 -0
data/spec/page/html_spec.rb +524 -0
data/spec/page/status_codes_spec.rb +87 -0
data/spec/page_spec.rb +114 -78
data/spec/proxy_spec.rb +45 -0
data/spec/session_cache.rb +103 -2
data/spec/settings/proxy_examples.rb +82 -0
data/spec/settings/timeouts_examples.rb +93 -0
data/spec/settings/user_agent_examples.rb +25 -0
data/spec/spidr_spec.rb +6 -29
data/spidr.gemspec +38 -109
metadata +35 -31
data/lib/spidr/page/body.rb +0 -98
data/spec/helpers/history.rb +0 -34
data/spec/helpers/page.rb +0 -8
data/spec/helpers/wsoc.rb +0 -83
data/spec/page_examples.rb +0 -21

data/spec/page/status_codes_spec.rb ADDED

@@ -0,0 +1,87 @@
+require 'spec_helper'
+require 'example_page'
+require 'spidr/page'
+describe Page do
+  include_context "example Page"
+  describe "#code" do
+    it "should return the Integer version of the response status code" do
+      expect(subject.code).to be code
+    end
+  end
+  shared_examples "status code method" do |method,status_codes|
+    status_codes.each do |code,expected|
+      context "when status code is #{code}" do
+        let(:code) { code }
+        it { expect(subject.send(method)).to be expected }
+      end
+    end
+  end
+  describe "#is_ok?" do
+    include_examples "status code method", :is_ok?, {200 => true, 500 => false}
+  end
+  describe "#timedout?" do
+    include_examples "status code method", :timedout?, {308 => true, 200 => false}
+  end
+  describe "#bad_request?" do
+    include_examples "status code method", :bad_request?, {400 => true, 200 => false}
+  end
+  describe "#is_unauthorized?" do
+    include_examples "status code method", :is_unauthorized?, {401 => true, 200 => false}
+  end
+  describe "#is_forbidden?" do
+    include_examples "status code method", :is_forbidden?, {403 => true, 200 => false}
+  end
+  describe "#is_missing?" do
+    include_examples "status code method", :is_missing?, {404 => true, 200 => false}
+  end
+  describe "#had_internal_server_error?" do
+    include_examples "status code method", :had_internal_server_error?, {500 => true, 200 => false}
+  end
+  describe "#is_redirect?" do
+    include_examples "status code method", :is_redirect?, {
+      300 => true,
+      301 => true,
+      302 => true,
+      303 => true,
+      304 => false,
+      305 => false,
+      306 => false,
+      307 => true
+    }
+    context "when code is 200" do
+      context "and there is a meta refresh redirect" do
+        let(:body) do
+          %{<html><head><meta http-equiv="refresh" content="0; url=/other" /></head><body>redirecting...</body></html>}
+        end
+        it { expect(subject.is_redirect?).to be true }
+      end
+      context "and there is no meta refresh redirect" do
+        let(:body) { %{<html><body>foo</body></html>} }
+        it { expect(subject.is_redirect?).to be false }
+      end
+    end
+    context "when that status code is not 30x or 200" do
+      let(:code) { 404 }
+      it { expect(subject.is_redirect?).to be false }
+    end
+  end
+end

data/spec/page_spec.rb CHANGED

@@ -1,128 +1,164 @@
-require 'spidr/page'
 require 'spec_helper'
-require 'page_examples'
-require 'helpers/page'
+require 'example_page'
+require 'spidr/page'
 describe Page do
-  describe "html" do
-    before(:all) do
-      @page = get_page('http://spidr.rubyforge.org/course/start.html')
-    end
+  include_context "example Page"
-    it_should_behave_like "Page"
+  describe "#initialize" do
+    let(:headers) { {'X-Foo' => 'bar'} }
-    it "should be OK" do
-      expect(@page).to be_ok
+    it "should set #url" do
+      expect(subject.url).to be url
     end
-    it "should have a content-type" do
-      expect(@page.content_type).to include('text/html')
+    it "should set #headers" do
+      expect(subject.headers).to be == {
+        'content-type' => [content_type],
+        'x-foo'        => ['bar']
+      }
     end
+  end
-    it "should be a html page" do
-      expect(@page).to be_html
-    end
+  describe "method_missing" do
+    let(:headers) { {'X-Foo' => 'bar'} }
-    it "should have provide a document" do
-      expect(@page.doc.class).to eq(Nokogiri::HTML::Document)
+    it "should provide transparent access to headers" do
+      expect(subject.x_foo).to be == 'bar'
     end
-    it "should allow searching the document" do
-      expect(@page.doc.search('//p').length).to eq(2)
-      expect(@page.doc.at('//p[2]').inner_text).to eq('Ready! Set! Go!')
+    context "when the requested header does not exist" do
+      it do
+        expect { subject.x_bar }.to raise_error(NoMethodError)
+      end
     end
-    it "should have a title" do
-      expect(@page.title).to eq('Spidr :: Web-Spider Obstacle Course :: Start')
+    context "when method arguments are also given" do
+      it do
+        expect { subject.x_foo(1) }.to raise_error(NoMethodError)
+      end
     end
-    it "should have links" do
-      expect(@page.links).not_to be_empty
+    context "when a block is also given" do
+      it do
+        expect { subject.x_foo { } }.to raise_error(NoMethodError)
+      end
     end
   end
-  describe "txt" do
-    before(:all) do
-      @page = get_page('https://www.ruby-lang.org/en/about/license.txt')
-    end
+  describe "#body" do
+    context "when there is a body" do
+      let(:body) { %{<html><head><title>example</title></head><body><p>hello</p></body></html>} }
-    it_should_behave_like "Page"
-    it "should be OK" do
-      expect(@page).to be_ok
+      it "should return the body text" do
+        expect(subject.body).to be body
+      end
     end
-    it "should have a content-type" do
-      expect(@page.content_type).to include('text/plain')
+    context "when there is no body" do
+      it "should return an empty String" do
+        expect(subject.body).to be == ''
+      end
     end
+  end
-    it "should be a txt page" do
-      expect(@page).to be_txt
-    end
+  describe "#doc" do
+    context "when the Content-Type is text/html" do
+      let(:body) { %{<html><head><title>example</title></head><body><p>hello</p></body></html>} }
-    it "should not have provide a document" do
-      expect(@page.doc).to be_nil
+      it "should parse the body as HTML" do
+        expect(subject.doc).to be_kind_of(Nokogiri::HTML::Document)
+        expect(subject.doc.at('//p').inner_text).to be == 'hello'
+      end
     end
-    it "should not allow searching the document" do
-      expect(@page.search('//p')).to be_empty
-      expect(@page.at('//p')).to be_nil
-    end
+    context "when the document is application/rss+xml" do
+      let(:content_type) { 'application/rss+xml' }
+      let(:body) do
+        %{<?xml version="1.0" encoding="UTF-8" ?><rss version="2.0"></rss>}
+      end
-    it "should not have links" do
-      expect(@page.links).to be_empty
+      it "should parse the body as XML" do
+        expect(subject.doc).to be_kind_of(Nokogiri::XML::Document)
+      end
     end
-    it "should not have a title" do
-      expect(@page.title).to be_nil
-    end
-  end
+    context "when the document is application/atom+xml" do
+      let(:content_type) { 'application/atom+xml' }
+      let(:body) do
+        %{<?xml version="1.0" encoding="UTF-8" ?><feed xmlns="http://www.w3.org/2005/Atom"></feed>}
+      end
-  describe "redirects" do
-    before(:all) do
-      @page = get_page('http://spidr.rubyforge.org/course/start.html')
+      it "should parse the body as XML" do
+        expect(subject.doc).to be_kind_of(Nokogiri::XML::Document)
+      end
     end
-    before do
-      allow(@page).to receive(:body).and_return('<meta HTTP-EQUIV="REFRESH" content="0; url=http://spidr.rubyforge.org/redirected">')
+    context "when the document is text/xml" do
+      let(:content_type) { 'text/xml' }
+      let(:body) do
+        %{<?xml version="1.0" encoding="UTF-8" ?><foo />}
+      end
+      it "should parse the body as XML" do
+        expect(subject.doc).to be_kind_of(Nokogiri::XML::Document)
+      end
     end
-    it "should provide access to page-level redirects" do
-      expect(@page.redirects_to).to eq(['http://spidr.rubyforge.org/redirected'])
-    end
+    context "when the document is text/xsl" do
+      let(:content_type) { 'text/xsl' }
+      let(:body) do
+        %{<?xml version="1.0" encoding="UTF-8" ?><xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"></xsl:stylesheet>}
+      end
-    it "should include meta refresh redirects in the list of links" do
-      expect(@page.links).to include('http://spidr.rubyforge.org/redirected')
+      it "should parse the body as XML" do
+        expect(subject.doc).to be_kind_of(Nokogiri::XML::Document)
+      end
     end
-  end
-  describe "cookies" do
-    before(:all) do
-      @page = get_page('http://twitter.com/login')
+    context "when there is no body" do
+      it "should return an empty String" do
+        expect(subject.doc).to be nil
+      end
     end
+  end
-    it "should provide access to the raw Cookie" do
-      cookie = @page.cookie
+  describe "#search" do
+    context "when there is a document" do
+      let(:body) { %{<html><head><title>example</title></head><body><p>hello</p></body></html>} }
-      expect(cookie).not_to be_nil
-      expect(cookie).not_to be_empty
+      it "should search the document" do
+        expect(subject.search('//p').inner_text).to be == 'hello'
+      end
     end
-    it "should provide access to the Cookies" do
-      cookies = @page.cookies
-      expect(cookies).not_to be_empty
+    context "when there is no document" do
+      it "should return an empty Array" do
+        expect(subject.search('//p')).to be == []
+      end
     end
+  end
+  describe "#at" do
+    context "when there is a document" do
+      let(:body) { %{<html><head><title>example</title></head><body><p>hello</p></body></html>} }
-    it "should provide access to the key->value pairs within the Cookie" do
-      params = @page.cookie_params
-      expect(params).not_to be_empty
+      it "should search the document for the first matching node" do
+        expect(subject.at('//p').inner_text).to be == 'hello'
+      end
+    end
-      params.each do |key,value|
-        expect(key).not_to be_empty
+    context "when there is no document" do
+      it "should return nil" do
+        expect(subject.at('//p')).to be nil
       end
     end
   end
+  describe "#to_s" do
+    it "should return the body" do
+      expect(subject.to_s).to be body
+    end
+  end
 end

data/spec/proxy_spec.rb ADDED

@@ -0,0 +1,45 @@
+require 'spec_helper'
+require 'spidr/proxy'
+describe Spidr::Proxy do
+  let(:proxy_host) { 'proxy.example.com' }
+  let(:proxy_port) { 9999 }
+  let(:proxy_user) { 'bob' }
+  let(:proxy_password) { 'secret' }
+  describe "DEFAULT_PORT" do
+    subject { described_class::DEFAULT_PORT }
+    it { expect(subject).to be 8080 }
+  end
+  describe "#initialize" do
+    it "should default port to 8080" do
+      expect(subject.port).to be 8080
+    end
+  end
+  describe "#enabled?" do
+    context "when host is set" do
+      subject { described_class.new(host: proxy_host) }
+      it { expect(subject.enabled?).to be true }
+    end
+    context "when hist is not set" do
+      it { expect(subject.enabled?).to be false }
+    end
+  end
+  describe "#disabled?" do
+    context "when hist is not set" do
+      it { expect(subject.disabled?).to be true }
+    end
+    context "when host is set" do
+      subject { described_class.new(host: proxy_host) }
+      it { expect(subject.disabled?).to be false }
+    end
+  end
+end

data/spec/session_cache.rb CHANGED

@@ -1,9 +1,110 @@
 require 'spidr/session_cache'
 require 'spec_helper'
+require 'settings/proxy_examples'
+require 'settings/timeouts_examples'
 describe SessionCache do
-  describe "empty" do
+  describe "#initialize" do
+    let(:proxy_host) { 'proxy.example.com' }
+    let(:proxy_port) { 9999 }
+    let(:open_timeout)       { 1 }
+    let(:ssl_timeout)        { 2 }
+    let(:read_timeout)       { 3 }
+    let(:continue_timeout)   { 4 }
+    let(:keep_alive_timeout) { 5 }
+    subject do
+      described_class.new(
+        proxy: {host: proxy_host, port: proxy_port},
+        open_timeout:       open_timeout,
+        ssl_timeout:        ssl_timeout,
+        read_timeout:       read_timeout,
+        continue_timeout:   continue_timeout,
+        keep_alive_timeout: keep_alive_timeout,
+      )
+    end
+    it "should set proxy" do
+      expect(subject.proxy[:host]).to be == proxy_host
+      expect(subject.proxy[:port]).to be == proxy_port
+    end
+    it "should set open_timeout" do
+      expect(subject.open_timeout).to be open_timeout
+    end
+    it "should set ssl_timeout" do
+      expect(subject.ssl_timeout).to be ssl_timeout
+    end
+    it "should set read_timeout" do
+      expect(subject.read_timeout).to be read_timeout
+    end
+    it "should set continue_timeout" do
+      expect(subject.continue_timeout).to be continue_timeout
+    end
+    it "should set keep_alive_timeout" do
+      expect(subject.keep_alive_timeout).to be keep_alive_timeout
+    end
+    context "with no arguments" do
+      before(:all) do
+        Spidr.proxy = {host: 'proxy.example.com', port: 9999}
+        Spidr.open_timeout       = 1
+        Spidr.ssl_timeout        = 2
+        Spidr.read_timeout       = 3
+        Spidr.continue_timeout   = 4
+        Spidr.keep_alive_timeout = 5
+      end
+      subject { described_class.new }
+      it "should use the global proxy settings" do
+        expect(subject.proxy).to be Spidr.proxy
+      end
+      it "should use the global open_timeout" do
+        expect(subject.open_timeout).to be == Spidr.open_timeout
+      end
+      it "should use the global ssl_timeout" do
+        expect(subject.ssl_timeout).to be == Spidr.ssl_timeout
+      end
+      it "should use the global read_timeout" do
+        expect(subject.read_timeout).to be == Spidr.read_timeout
+      end
+      it "should use the global continue_timeout" do
+        expect(subject.continue_timeout).to be == Spidr.continue_timeout
+      end
+      it "should use the global keep_alive_timeout" do
+        expect(subject.keep_alive_timeout).to be == Spidr.keep_alive_timeout
+      end
+      before(:all) do
+        Spidr.proxy = nil
+        Spidr.open_timeout       = nil
+        Spidr.ssl_timeout        = nil
+        Spidr.read_timeout       = nil
+        Spidr.continue_timeout   = nil
+        Spidr.keep_alive_timeout = nil
+      end
+    end
+  end
+  it_should_behave_like "includes Spidr::Settings::Proxy" # shared example group; presumably defined in settings/proxy_examples — confirm
+  it_should_behave_like "includes Spidr::Settings::Timeouts" # shared example group; presumably defined in settings/timeouts_examples — confirm
+  context "when empty" do
     before(:all) do
       @sessions = SessionCache.new
     end
@@ -21,7 +122,7 @@ describe SessionCache do
     end
   end
-  describe "not-empty" do
+  context "when not-empty" do
     before(:all) do
       @url = URI('http://example.com/')