grell 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,46 @@
+ module Grell
+   class PageCollection
+     attr_reader :collection
+
+     def initialize
+       @collection = []
+     end
+
+     def create_page(url, parent_id)
+       page_id = next_id
+       page = Page.new(url, page_id, parent_id)
+       add(page)
+       page
+     end
+
+     def visited_pages
+       @collection.select { |page| page.visited? }
+     end
+
+     def discovered_pages
+       @collection - visited_pages
+     end
+
+     def next_page
+       discovered_pages.sort_by { |page| page.parent_id }.first
+     end
+
+     private
+
+     def next_id
+       @collection.size
+     end
+
+     def add(page)
+       # Deduplication is by full URL, compared case-insensitively. Pages differing only in
+       # their query parameters stay distinct: in some cases, such as behind proxies, the query selects a different page.
+       new_url = @collection.none? do |collection_page|
+         collection_page.url.downcase == page.url.downcase
+       end
+       if new_url
+         @collection.push page
+       end
+     end
+
+   end
+ end
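A minimal usage sketch of PageCollection (hypothetical URLs; it assumes grell is loaded and, as the specs below do, that pages can be created without navigating): create_page deduplicates case-insensitively on the full URL, and next_page picks the discovered page with the lowest parent_id.

    collection = Grell::PageCollection.new
    first  = collection.create_page('http://example.com/a', 2)
    second = collection.create_page('http://example.com/b', 0)
    collection.create_page('HTTP://EXAMPLE.COM/B', 0)  # case-insensitive duplicate, not stored

    collection.collection.size  # => 2
    collection.next_page        # => second: lowest parent_id among discovered pages

Note that a rejected duplicate is still assigned an id (next_id is simply the collection size at creation time); the 'second page has id 1' spec below depends on this.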
@@ -0,0 +1,37 @@
+ module Grell
+   # This class depends heavily on Capybara but contains no logic.
+   class RawPage
+     include Capybara::DSL
+
+     def navigate(url)
+       visit(url)
+     end
+
+     def headers
+       page.response_headers
+     end
+
+     def status
+       page.status_code
+     end
+
+     def body
+       page.body
+     end
+
+     def all_anchors
+       # Elements other than "a" can also act as links: JavaScript is often used to make
+       # non-anchor elements clickable, so we collect anything with an href or data-href.
+       all('[href]', visible: false).to_a + all('[data-href]', visible: false).to_a
+     end
+
+     def host
+       page.current_host
+     end
+
+     def has_selector?(selector)
+       page.has_selector?(selector)
+     end
+   end
+ end
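Because RawPage is a thin wrapper with no logic of its own, it can be driven directly once a Capybara driver is registered. A hedged sketch (Poltergeist setup shown because this gem version targets PhantomJS; the URL is a placeholder):

    require 'grell'
    require 'capybara/poltergeist'

    Capybara.register_driver(:poltergeist) do |app|
      Capybara::Poltergeist::Driver.new(app, js_errors: false)
    end
    Capybara.default_driver = :poltergeist

    raw = Grell::RawPage.new
    raw.navigate('http://example.com/')
    raw.status       # => 200
    raw.all_anchors  # => every node carrying href or data-href, visible or not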
@@ -0,0 +1,15 @@
+ module Grell
+   class Reader
+     def self.wait_for(action, max_waiting, sleeping_time)
+       time_start = Time.now
+       action.call
+       return if yield
+       while Time.now < time_start + max_waiting
+         action.call
+         break if yield
+         sleep(sleeping_time)
+       end
+     end
+   end
+ end
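Reader.wait_for is a small polling helper: it runs the action once, returns immediately if the block is already true, and otherwise keeps re-running the action every sleeping_time seconds until the block turns true or max_waiting seconds elapse. A sketch with illustrative values (raw_page and url are assumed from the RawPage example above):

    # Retry navigation for up to 10 seconds, polling every 0.5 seconds,
    # until the page reports an HTTP status.
    Grell::Reader.wait_for(-> { raw_page.navigate(url) }, 10, 0.5) do
      !raw_page.status.nil?
    end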
@@ -0,0 +1,3 @@
+ module Grell
+   VERSION = "1.3.0"
+ end
@@ -0,0 +1,108 @@
+ RSpec.describe Grell::Crawler do
+   let(:page_id) { rand(10).floor + 10 }
+   let(:parent_page_id) { rand(10).floor }
+   let(:page) { Grell::Page.new(url, page_id, parent_page_id) }
+   let(:host) { "http://www.example.com" }
+   let(:url) { "http://www.example.com/test" }
+   let(:crawler) { Grell::Crawler.new(external_driver: true) }
+   let(:body) { 'body' }
+
+   before do
+     proxy.stub(url).and_return(body: body, code: 200)
+   end
+
+   describe 'initialize' do
+     it 'can provide your own logger' do
+       Grell::Crawler.new(external_driver: true, logger: 33)
+       expect(Grell.logger).to eq(33)
+     end
+
+     it 'provides a stdout logger if nothing provided' do
+       crawler
+       expect(Grell.logger).to be_instance_of(Logger)
+     end
+   end
+
+   context '#crawl' do
+     it 'yields the result if a block is given' do
+       result = []
+       block = Proc.new { |n| result.push(n) }
+       crawler.crawl(page, block)
+       expect(result.size).to eq(1)
+       expect(result.first.url).to eq(url)
+       expect(result.first.visited?).to eq(true)
+     end
+
+     it 'logs interesting information' do
+       crawler
+       expect(Grell.logger).to receive(:info).with(/Visiting #{url}, visited_links: 0, discovered 0/)
+       crawler.crawl(page, nil)
+     end
+   end
+
+   context '#start_crawling' do
+     let(:body) do
+       <<-EOS
+       <html><head></head><body>
+       <a href="/musmis.html">trusmis</a>
+       Hello world!
+       </body></html>
+       EOS
+     end
+     let(:url_visited) { "http://www.example.com/musmis.html" }
+
+     before do
+       proxy.stub(url_visited).and_return(body: 'body', code: 200)
+     end
+
+     it 'calls the block we used to start_crawling' do
+       result = []
+       block = Proc.new { |n| result.push(n) }
+       crawler.start_crawling(url, &block)
+       expect(result.size).to eq(2)
+       expect(result[0].url).to eq(url)
+       expect(result[1].url).to eq(url_visited)
+     end
+   end
+
+   context 'the url has no links' do
+     let(:body) do
+       "<html><head></head><body>
+       Hello world!
+       </body></html>"
+     end
+
+     before do
+       crawler.start_crawling(url)
+     end
+
+     it 'visits all the pages' do
+       expect(crawler.collection.visited_pages.size).to eq(1)
+     end
+
+     it 'has no more pages to discover' do
+       expect(crawler.collection.discovered_pages.size).to eq(0)
+     end
+   end
+
+   context 'the url has several links' do
+     let(:body) do
+       "<html><head></head><body>
+       <a href=\"/trusmis.html\">trusmis</a>
+       <a href=\"/help.html\">help</a>
+       Hello world!
+       </body></html>"
+     end
+
+     before do
+       proxy.stub('http://www.example.com/trusmis.html').and_return(body: 'body', code: 200)
+       proxy.stub('http://www.example.com/help.html').and_return(body: 'body', code: 200)
+     end
+
+     it 'visits all the pages' do
+       crawler.start_crawling(url)
+       expect(crawler.collection.visited_pages.size).to eq(3)
+     end
+
+     it 'has no more pages to discover' do
+       crawler.start_crawling(url)
+       expect(crawler.collection.discovered_pages.size).to eq(0)
+     end
+   end
+ end
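Outside the specs, the crawl loop exercised above is normally driven through start_crawling with a block; a minimal sketch (the target URL is a placeholder, and the gem's default driver setup is assumed):

    crawler = Grell::Crawler.new
    crawler.start_crawling('http://example.com') do |page|
      puts "#{page.status} #{page.url} (#{page.links.size} links)"
    end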
@@ -0,0 +1,149 @@
+ RSpec.describe Grell::PageCollection do
+   let(:collection) { Grell::PageCollection.new }
+   let(:url) { 'http://www.github.com/SomeUser/dragonlance?search=false' }
+   let(:url2) { 'http://www.github.com/OtherUser/forgotten?search=false' }
+
+   context 'empty collection' do
+     it 'has no visited pages' do
+       expect(collection.visited_pages).to be_empty
+     end
+
+     it 'has no discovered pages' do
+       expect(collection.discovered_pages).to be_empty
+     end
+
+     it 'next page is nil' do
+       expect(collection.next_page).to be_nil
+     end
+   end
+
+   context 'one unvisited page' do
+     let(:page) { collection.create_page(url, 0) }
+
+     before do
+       allow(page).to receive(:visited?).and_return(false)
+     end
+
+     it 'has no visited pages' do
+       expect(collection.visited_pages).to be_empty
+     end
+
+     it 'has one discovered page' do
+       expect(collection.discovered_pages).to eq([page])
+     end
+
+     it 'next page is the unvisited page' do
+       expect(collection.next_page).to eq(page)
+     end
+   end
+
+   context 'one visited page' do
+     let(:page) { collection.create_page(url, 0) }
+
+     before do
+       allow(page).to receive(:visited?).and_return(true)
+     end
+
+     it 'has one visited page' do
+       expect(collection.visited_pages).to eq([page])
+     end
+
+     it 'has no discovered pages' do
+       expect(collection.discovered_pages).to be_empty
+     end
+
+     it 'next page is nil' do
+       expect(collection.next_page).to be_nil
+     end
+   end
+
+   context 'one visited and one unvisited page with the same url' do
+     let(:page) { collection.create_page(url, 0) }
+     let(:unvisited) { collection.create_page(url.upcase, 0) }
+
+     before do
+       allow(page).to receive(:visited?).and_return(true)
+       allow(unvisited).to receive(:visited?).and_return(false)
+     end
+
+     it 'first page has id 0' do
+       expect(page.id).to eq(0)
+     end
+
+     it 'second page has id 1' do
+       expect(unvisited.id).to eq(1)
+     end
+
+     it 'has one visited page' do
+       expect(collection.visited_pages).to eq([page])
+     end
+
+     it 'has no discovered pages' do
+       expect(collection.discovered_pages).to be_empty
+     end
+
+     it 'next page is nil' do
+       expect(collection.next_page).to be_nil
+     end
+   end
+
+   context 'one visited and one unvisited page with different URLs' do
+     let(:page) { collection.create_page(url, 0) }
+     let(:unvisited) { collection.create_page(url2, 0) }
+
+     before do
+       allow(page).to receive(:visited?).and_return(true)
+       allow(unvisited).to receive(:visited?).and_return(false)
+     end
+
+     it 'has one visited page' do
+       expect(collection.visited_pages).to eq([page])
+     end
+
+     it 'has one discovered page' do
+       expect(collection.discovered_pages).to eq([unvisited])
+     end
+
+     it 'next page is the unvisited page' do
+       expect(collection.next_page).to eq(unvisited)
+     end
+   end
+
+   context 'one visited and one unvisited page with URLs differing only in the query string' do
+     let(:page) { collection.create_page(url, 0) }
+     let(:url3) { 'http://www.github.com/SomeUser/dragonlance?search=true' }
+     let(:unvisited) { collection.create_page(url3, 0) }
+
+     before do
+       allow(page).to receive(:visited?).and_return(true)
+       allow(unvisited).to receive(:visited?).and_return(false)
+     end
+
+     it 'has one visited page' do
+       expect(collection.visited_pages).to eq([page])
+     end
+
+     it 'has one discovered page' do
+       expect(collection.discovered_pages).to eq([unvisited])
+     end
+
+     it 'next page is the unvisited page' do
+       expect(collection.next_page).to eq(unvisited)
+     end
+   end
+
+   context 'one visited and one unvisited page with different parents' do
+     let(:page) { collection.create_page(url, 2) }
+     let(:page2) { collection.create_page(url2, 0) }
+
+     before do
+       allow(page).to receive(:visited?).and_return(true)
+       allow(page2).to receive(:visited?).and_return(false)
+     end
+
+     it 'returns the page which has an earlier parent' do
+       expect(collection.next_page).to eq(page2)
+     end
+   end
+ end
@@ -0,0 +1,284 @@
+ RSpec.describe Grell::Page do
+   let(:page_id) { rand(10).floor + 10 }
+   let(:parent_page_id) { rand(10).floor }
+   let(:page) { Grell::Page.new(url, page_id, parent_page_id) }
+   let(:host) { "http://www.example.com" }
+   let(:url) { "http://www.example.com/test" }
+   let(:returned_headers) { { 'Other-Header' => 'yes', 'Content-Type' => 'text/html' } }
+   let(:now) { Time.now }
+
+   before do
+     allow(Time).to receive(:now).and_return(now)
+   end
+
+   it "gives access to the url" do
+     expect(page.url).to eq(url)
+   end
+
+   it "gives access to the page id" do
+     expect(page.id).to eq(page_id)
+   end
+
+   it "gives access to the parent page id" do
+     expect(page.parent_id).to eq(parent_page_id)
+   end
+
+   it 'newly created page does not have a status yet' do
+     expect(page.status).to eq(nil)
+   end
+
+   shared_examples_for 'a grell page' do
+     it 'returns the correct status' do
+       expect(page.status).to eq(status)
+     end
+
+     it 'has the correct body' do
+       expect(page.body).to eq(body)
+     end
+
+     it 'has the correct headers' do
+       expect(page.headers).to include(expected_headers)
+     end
+
+     it 'has the correct links' do
+       expect(page.links.sort).to eq(links.sort)
+     end
+
+     it '#visited? returns the correct value' do
+       expect(page.visited?).to eq(visited)
+     end
+
+     it 'has the correct timestamp' do
+       expect(page.timestamp).to eq(now)
+     end
+   end
+
+   shared_examples_for 'an errored grell page' do
+     it 'returns an empty 404 page after navigating' do
+       expect(page.status).to eq(404)
+       expect(page.links).to eq([])
+       expect(page.headers).to eq(headers)
+       expect(page.body).to eq('')
+       expect(page.has_selector?('html')).to eq(false)
+       expect(page).to be_visited
+       expect(page.timestamp).to eq(now)
+       # expect_any_instance_of(Logger).to receive(:warn) #.with(/The page with the URL #{url} was not available"/)
+     end
+   end
+
+   [Capybara::Poltergeist::JavascriptError, Capybara::Poltergeist::BrowserError, URI::InvalidURIError,
+    Capybara::Poltergeist::TimeoutError, Capybara::Poltergeist::StatusFailError].each do |error_type|
+     context "#{error_type}" do
+       let(:headers) do
+         {
+           grellStatus: 'Error',
+           errorClass: "#{error_type}",
+           errorMessage: error_message
+         }
+       end
+       let(:error_message) { 'Trusmis broke it again' }
+       let(:now) { Time.now }
+
+       before do
+         allow_any_instance_of(Grell::RawPage).to receive(:navigate).and_raise(error_type, 'error')
+         allow_any_instance_of(error_type).to receive(:message).and_return(error_message)
+         page.navigate
+       end
+
+       it_behaves_like 'an errored grell page'
+     end
+   end
+
+   context 'we have not yet navigated to the page' do
+     let(:visited) { false }
+     let(:status) { nil }
+     let(:body) { '' }
+     let(:links) { [] }
+     let(:expected_headers) { {} }
+     let(:now) { nil }
+
+     before do
+       proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup)
+     end
+
+     it_behaves_like 'a grell page'
+   end
+
+   context 'navigating to the URL we get a 404' do
+     let(:visited) { true }
+     let(:status) { 404 }
+     let(:body) { '<html><head></head><body>nothing cool</body></html>' }
+     let(:links) { [] }
+     let(:expected_headers) { returned_headers }
+
+     before do
+       proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup)
+       page.navigate
+     end
+
+     it_behaves_like 'a grell page'
+   end
+
+   context 'navigating to the URL we get a page with no links' do
+     let(:visited) { true }
+     let(:status) { 200 }
+     let(:body) { '<html><head></head><body>nothing cool</body></html>' }
+     let(:links) { [] }
+     let(:expected_headers) { returned_headers }
+
+     before do
+       proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup)
+       page.navigate
+     end
+
+     it_behaves_like 'a grell page'
+   end
+
+   context 'navigating to the URL we get a page with links in anchor elements' do
+     let(:visited) { true }
+     let(:status) { 200 }
+     let(:body) do
+       "<html><head></head><body>
+       Hello world!
+       <a href=\"/trusmis.html\">trusmis</a>
+       <a href=\"/help.html\">help</a>
+       <a href=\"http://www.outsidewebsite.com/help.html\">help</a>
+       </body></html>"
+     end
+     let(:links) { ["http://www.example.com/trusmis.html", "http://www.example.com/help.html"] }
+     let(:expected_headers) { returned_headers }
+
+     before do
+       proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup)
+       page.navigate
+     end
+
+     it_behaves_like 'a grell page'
+
+     it 'does not return links to external websites' do
+       expect(page.links).to_not include('http://www.outsidewebsite.com/help.html')
+     end
+   end
+
+   context 'navigating to the URL we get a page with absolute links' do
+     let(:visited) { true }
+     let(:status) { 200 }
+     let(:body) do
+       "<html><head></head><body>
+       Hello world!
+       <a href=\"/trusmis.html\">trusmis</a>
+       <a href=\"http://www.example.com/help.html\">help</a>
+       <a href=\"http://www.outsidewebsite.com/help.html\">help</a>
+       </body></html>"
+     end
+     let(:links) { ["http://www.example.com/trusmis.html", "http://www.example.com/help.html"] }
+     let(:expected_headers) { returned_headers }
+
+     before do
+       proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup)
+       page.navigate
+     end
+
+     it_behaves_like 'a grell page'
+
+     it 'does not return links to external websites' do
+       expect(page.links).to_not include('http://www.outsidewebsite.com/help.html')
+     end
+   end
+
+   context 'navigating to the URL we get a page with links in a mix of elements' do
+     let(:visited) { true }
+     let(:status) { 200 }
+     let(:body) do
+       "<html><head></head><body>
+       Hello world!
+       <a href=\"/trusmis.html\">trusmis</a>
+       <table>
+         <tbody>
+           <tr href=\"/help_me.html\"><td>help</td></tr>
+           <tr data-href=\"/help.html\"><td>help</td></tr>
+         </tbody>
+       </table>
+       <div data-href=\"http://www.example.com/more_help.html\">help</div>
+       <div data-href=\"http://www.outsidewebsite.com/help.html\">help</div>
+       </body></html>"
+     end
+     let(:links) do
+       ["http://www.example.com/trusmis.html", "http://www.example.com/help.html",
+        'http://www.example.com/more_help.html', 'http://www.example.com/help_me.html']
+     end
+     let(:expected_headers) { returned_headers }
+
+     before do
+       proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup)
+       page.navigate
+     end
+
+     it_behaves_like 'a grell page'
+
+     it 'does not return links to external websites' do
+       expect(page.links).to_not include('http://www.outsidewebsite.com/help.html')
+     end
+   end
+
+   context 'navigating to the URL we get a page with links inside the head section' do
+     let(:visited) { true }
+     let(:status) { 200 }
+     let(:css) { '/application.css' }
+     let(:favicon) { '/favicon.ico' }
+     let(:body) do
+       "<html><head>
+       <title>mimi</title>
+       <link href=\"#{css}\" rel=\"stylesheet\">
+       <link href=\"#{favicon}\" rel=\"shortcut icon\" type=\"image/vnd.microsoft.icon\">
+       </head>
+       <body>
+       Hello world!
+       <a href=\"/trusmis.html\">trusmis</a>
+       </body></html>"
+     end
+     let(:links) { ["http://www.example.com/trusmis.html"] }
+     let(:expected_headers) { returned_headers }
+
+     before do
+       proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup)
+       # We need to stub these or PhantomJS will get stuck trying to retrieve the resources
+       proxy.stub(host + css).and_return(body: '', code: status)
+       proxy.stub(host + favicon).and_return(body: '', code: status)
+       page.navigate
+     end
+
+     it_behaves_like 'a grell page'
+
+     it 'does not return links to resources in the head section' do
+       expect(page.links).to_not include('http://www.example.com/application.css')
+     end
+   end
+
+   context 'status is never set' do # this may happen when nothing at all comes back from the site
+     let(:visited) { true }
+     let(:status) { nil }
+     let(:body) { '' }
+     let(:links) { [] }
+     let(:expected_headers) { {} }
+
+     before do
+       stub_const('Grell::Page::WAIT_TIME', 0)
+       allow_any_instance_of(Grell::RawPage).to receive(:status).and_return(nil)
+       allow_any_instance_of(Grell::RawPage).to receive(:headers).and_return({})
+       allow_any_instance_of(Grell::RawPage).to receive(:body).and_return('')
+       proxy.stub(url).and_return(body: body, code: nil, headers: {})
+       page.navigate
+     end
+
+     it_behaves_like 'a grell page'
+   end
+ end