RubyGems - crabbs - Versions diffs - 0.0.2 → 0.1.0 - Mend

crabbs 0.0.2 → 0.1.0

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 9fc5a904a6d962205d1a075c95ef79a98ccba4e7
-  data.tar.gz: e30f07bf9305c2e4a6aad06de1c335d2f2976d0d
+  metadata.gz: c21a29135af8fd53a59e659fd1ccbff6f1642f6d
+  data.tar.gz: 9c144e1bc4381a328f01b22b36d7936e348713bf
 SHA512:
-  metadata.gz: 82c05e3734be530a5f8e9059fdd69959598f76ad66cbd3ce2cb9af7120e3d24cc9839eb79a7168417273404e7dd20a74a1cad01d05563f5ce4743f59e6efaaba
-  data.tar.gz: db18939b31a5cb899cdc8ca5872ba355e21ef3f597106c9996debc44697fa08ab8c7d1834bbbdff0c3373939c034334e5897a514ab0a45a2b9c91c8410d584fa
+  metadata.gz: 6573282576cb66e87d654a4d24f8361c6b53afe6361fb7656e26fa3df941577519f952a65a6f3ff7c6583229a1382b75e026c77bec14355ca51280b9592d4dcc
+  data.tar.gz: 3a8e8eb68e217614c406ce547e6a204463636d2457615f8432235c2f86fbba0c32bc2b5765f7a97dd71bcda71f9ab6d5d2101eb758ac6c965f9233ce5a58deeb

data/lib/crabbs.rb CHANGED Viewed

@@ -8,9 +8,9 @@ module Crabbs
   class << self
     attr_reader :crawler
-    def start(url)
-      @crawler = Crabbs::Crawler.new
-      @crawler.crawl url
+    def start(options)
+      @crawler = Crabbs::Crawler.new options
+      @crawler.crawl options[:url]
       @crawler.site_map
     end
   end

data/lib/crabbs/cli.rb CHANGED Viewed

@@ -7,8 +7,8 @@ module Crabbs
     def start
       begin
         opts = parse_options
-        result = Crabbs.start(opts[:url])
-        STDOUT.puts result.to_json
+        result = Crabbs.start({ url: opts[:url], verbose: opts[:verbose] })
+        STDOUT.puts "\nResult:\n#{result.to_json}"
       rescue Slop::MissingOptionError => e
         STDOUT.puts e.message
       end
@@ -21,6 +21,7 @@ module Crabbs
         banner 'Usage: crabbs [options]'
         on 'u', 'url=', 'URL to start crawling', required: true
+        on 'v', 'verbose', 'Shows URLs being crawled', default: false
       end
     end
   end

data/lib/crabbs/crawler.rb CHANGED Viewed

@@ -4,18 +4,19 @@ module Crabbs
   class Crawler
     attr_reader :visited, :site_map
-    def initialize()
+    def initialize(options={})
       @visited = []
       @site_map = {}
+      @options = options
     end
     def crawl(uri_string)
-      recurse uri_string, @site_map
+      recursively_crawl uri_string, @site_map
     end
     private
-    def recurse(uri_string, hash)
+    def recursively_crawl(uri_string, hash)
       hash[uri_string] = Hash.new
       return if (@visited.include? uri_string)
@@ -24,19 +25,19 @@ module Crabbs
       @visited << uri_string
       links.each do |link|
-        recurse(link, hash[uri_string])
+        recursively_crawl(link, hash[uri_string])
       end
     end
     def extract_links(uri_string)
-      begin
-        uri = URI.parse(uri_string)
-        page = Crabbs::Page.new(Net::HTTP.get(uri), uri.to_s)
-        page.links
-      rescue URI::InvalidURIError
-        []
-      end
+      uri = URI.parse(uri_string)
+      STDOUT.puts "Visiting: #{uri_string}" if @options[:verbose]
+      STDOUT.putc '.' unless @options[:verbose]
+      page = Crabbs::Page.new(Net::HTTP.get(uri), uri.to_s)
+      page.links
+    rescue URI::InvalidURIError
+      []
     end
   end
 end

data/lib/crabbs/link.rb ADDED Viewed

@@ -0,0 +1,40 @@
+module Crabbs
+  class Link
+    def initialize(href)
+      @href = href
+      @uri = URI.parse(href)
+    end
+    def same_host_as?(url)
+      @uri.host == URI.parse(url).host or @uri.host.nil?
+    end
+    def has_valid_fragment?
+      @uri.fragment.nil? or not @uri.fragment.empty?
+    end
+    def has_html_extension?
+      link = @href
+      if not @uri.host.nil?
+        link = @href.sub(@uri.host, '')
+      end
+      extension = File.extname(link)
+      extension.empty? or extension == '.html'
+    end
+    def join(url)
+      new_uri = @uri
+      if @uri.host.nil?
+        new_uri = URI.parse url
+        new_uri = URI.join(new_uri.to_s, @uri.path) unless @uri.path.nil?
+        new_uri = URI.join(new_uri.to_s, "?#{@uri.query}") unless @uri.query.nil?
+        new_uri = URI.join(new_uri.to_s, "##{@uri.fragment}") unless @uri.fragment.nil?
+      end
+      new_uri.to_s
+    end
+  end
+end

data/lib/crabbs/page.rb CHANGED Viewed

@@ -1,4 +1,5 @@
 require 'nokogiri'
+require 'crabbs/link'
 module Crabbs
   class Page
@@ -8,38 +9,14 @@ module Crabbs
     end
     def links
-      uri_list = @html.css('a[href]').map { |a| URI.parse(a['href']) }
-      valid_uris = uri_list
-        .select { |uri| uri.host == URI.parse(@url).host or uri.host.nil? }
-        .select { |uri| uri.fragment.nil? or not uri.fragment.empty? }
-      links = create_full_uri_links(valid_uris)
-      links = links.select do |link|
-        host = URI.parse(link).host
-        extension = File.extname(link.sub(host, ''))
-        extension.empty? or extension == '.html'
-      end
-      links.uniq
-    end
-    private
-    def create_full_uri_links(uri_list)
-      uri_list.map do |uri|
-        new_uri = uri
-        if uri.host.nil?
-          new_uri = URI.parse @url
-          new_uri = URI.join(new_uri.to_s, uri.path) unless uri.path.nil?
-          new_uri = URI.join(new_uri.to_s, "?#{uri.query}") unless uri.query.nil?
-          new_uri = URI.join(new_uri.to_s, "##{uri.fragment}") unless uri.fragment.nil?
-        end
-        new_uri.to_s
-      end
+      links = @html.css('a[href]').map { |a| Crabbs::Link.new a['href'] }
+      links
+        .select { |link| link.same_host_as? @url }
+        .select(&:has_valid_fragment?)
+        .select(&:has_html_extension?)
+        .map { |link| link.join @url }
+        .uniq
     end
   end
 end

data/lib/crabbs/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Crabbs
-  VERSION = "0.0.2"
+  VERSION = "0.1.0"
 end

data/spec/crabbs/cli_spec.rb CHANGED Viewed

@@ -1,4 +1,5 @@
 require 'crabbs/cli'
+require 'webmock/rspec'
 describe Crabbs::CLI do
   subject { Crabbs::CLI.new }
@@ -25,8 +26,25 @@ describe Crabbs::CLI do
         subject.start
-        expect(Crabbs).to have_received(:start).with('https://example.com')
-        expect(STDOUT).to have_received(:puts).with('result'.to_json)
+        expect(Crabbs).to have_received(:start).with({ url: 'https://example.com', verbose: false })
+        expect(STDOUT).to have_received(:puts).with("\nResult:\n#{'result'.to_json}")
+      end
+    end
+    context 'integration with crabbs' do
+      before do
+         stub_request(:get, "http://example.com/").to_return(:body => "")
+      end
+      it 'does not break contract' do
+        ARGV.replace ['--url=http://example.com']
+        allow(STDOUT).to receive(:puts)
+        subject.start
+        output = "\nResult:\n#{{ 'http://example.com' => {} }.to_json}"
+        expect(STDOUT).to have_received(:puts).with(output)
       end
     end
   end

data/spec/crabbs/crabbs_spec.rb CHANGED Viewed

@@ -11,7 +11,7 @@ describe Crabbs do
     end
     it 'starts crawling' do
-      subject.start('http://example.com').should == { 'http://example.com' => {} }
+      subject.start({ url: 'http://example.com' }).should == { 'http://example.com' => {} }
     end
   end
 end

data/spec/crabbs/crawler_spec.rb CHANGED Viewed

@@ -4,7 +4,48 @@ require 'crabbs/crawler'
 describe Crabbs::Crawler do
   describe '#crawl' do
-    subject { Crabbs::Crawler.new }
+    let(:options) { Hash.new }
+    subject { Crabbs::Crawler.new options }
+    context 'when verbose' do
+      let(:options) { { verbose: true } }
+      before do
+        @uri_string = 'http://example.com/'
+        stub_request(:get, @uri_string).to_return(body: %Q{<a href="/path"></a><a href="/local"></a><a href="http://fb.com/"></a>})
+        stub_request(:get, "http://example.com/path").to_return(body: "")
+        stub_request(:get, "http://example.com/local").to_return(body: "")
+      end
+      it 'logs the url been visited' do
+        allow(STDOUT).to receive(:puts)
+        subject.crawl @uri_string
+        expect(STDOUT).to have_received(:puts).with('Visiting: http://example.com/')
+        expect(STDOUT).to have_received(:puts).with('Visiting: http://example.com/path')
+        expect(STDOUT).to have_received(:puts).with('Visiting: http://example.com/local')
+      end
+    end
+    context 'when not verbose' do
+      let(:options) { { verbose: false } }
+      before do
+        @uri_string = 'http://example.com/'
+        stub_request(:get, @uri_string).to_return(body: %Q{<a href="/path"></a><a href="/local"></a><a href="http://fb.com/"></a>})
+        stub_request(:get, "http://example.com/path").to_return(body: "")
+        stub_request(:get, "http://example.com/local").to_return(body: "")
+      end
+      it 'logs the url been visited' do
+        allow(STDOUT).to receive(:putc)
+        subject.crawl @uri_string
+        expect(STDOUT).to have_received(:putc).with('.').exactly(3).times
+      end
+    end
     context 'an invalid URI' do
       it 'stores single entry site map' do

data/spec/crabbs/link_spec.rb ADDED Viewed

@@ -0,0 +1,151 @@
+require 'crabbs/link'
+describe Crabbs::Link do
+  subject { Crabbs::Link.new href }
+  describe '#same_host_as?' do
+    context 'when it is the same host' do
+      let(:host) { 'http://example.com/path' }
+      let(:href) { 'http://example.com/path' }
+      it 'is the same host' do
+        subject.same_host_as?(host).should be_true
+      end
+    end
+    context 'when href have no host' do
+      let(:host) { 'http://example.com/path' }
+      let(:href) { '/path' }
+      it 'assumes its the same host' do
+        subject.same_host_as?(host).should be_true
+      end
+    end
+    context 'when host is a subdomain' do
+      let(:host) { 'http://example.com/path' }
+      let(:href) { 'http://subdomain.example.com/path' }
+      it 'is not the same host' do
+        subject.same_host_as?(host).should be_false
+      end
+    end
+    context 'when host is completely different' do
+      let(:host) { 'http://example.com/path' }
+      let(:href) { 'http://facebook.com/path' }
+      it 'is not the same host' do
+        subject.same_host_as?(host).should be_false
+      end
+    end
+  end
+  describe '#has_valid_fragment?' do
+    context 'when the fragment is empty' do
+      let(:href) { 'http://example.com/path#' }
+      it 'is not valid' do
+        subject.has_valid_fragment?.should be_false
+      end
+    end
+    context 'when there is no fragment' do
+      let(:href) { 'http://example.com/path' }
+      it 'is valid' do
+        subject.has_valid_fragment?.should be_true
+      end
+    end
+    context 'when there is a fragment' do
+      let(:href) { 'http://example.com/path#fragment' }
+      it 'is valid' do
+        subject.has_valid_fragment?.should be_true
+      end
+    end
+  end
+  describe '#has_html_extension?' do
+    context 'when the extension is not html' do
+      let(:href) { 'http://example.com/path.zip' }
+      it 'has no html extension' do
+        subject.has_html_extension?.should be_false
+      end
+    end
+    context 'when the extension html' do
+      let(:href) { 'http://example.com/path.html' }
+      it 'has html extension' do
+        subject.has_html_extension?.should be_true
+      end
+    end
+    context 'when there is no extension' do
+      let(:href) { 'http://example.com/path' }
+      it 'assumes to be a html link' do
+        subject.has_html_extension?.should be_true
+      end
+    end
+  end
+  describe '#join' do
+    context 'when the href is a root path' do
+      let(:href) { '/path' }
+      let(:url) { 'http://example.com/' }
+      it 'joins with the url' do
+        subject.join(url).should == 'http://example.com/path'
+      end
+    end
+    context 'when the href is a direct path' do
+      let(:href) { 'path' }
+      let(:url) { 'http://example.com/test/' }
+      it 'appends to the url previous path' do
+        subject.join(url).should == 'http://example.com/test/path'
+      end
+    end
+    context 'when the href is a fragment (hash)' do
+      let(:href) { '#fragment' }
+      let(:url) { 'http://example.com/test/' }
+      it 'appends to the url previous path' do
+        subject.join(url).should == 'http://example.com/test/#fragment'
+      end
+    end
+    context 'when the href is a query parameter' do
+      let(:href) { '?parameter' }
+      let(:url) { 'http://example.com/test/' }
+      it 'appends to the url previous path' do
+        subject.join(url).should == 'http://example.com/test/?parameter'
+      end
+    end
+    context 'when the href is both (fragment + query)' do
+      let(:href) { '#fragment?parameter' }
+      let(:url) { 'http://example.com/test/' }
+      it 'appends to the url previous path' do
+        subject.join(url).should == 'http://example.com/test/#fragment?parameter'
+      end
+    end
+    context 'when the href is a full uri' do
+      let(:href) { 'http://example.com/test/#fragment?parameter' }
+      let(:url) { 'http://example.com/test/' }
+      it 'leaves it untouched' do
+        subject.join(url).should == 'http://example.com/test/#fragment?parameter'
+      end
+    end
+  end
+end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: crabbs
 version: !ruby/object:Gem::Version
-  version: 0.0.2
+  version: 0.1.0
 platform: ruby
 authors:
 - Bruno Trecenti
@@ -157,11 +157,13 @@ files:
 - lib/crabbs.rb
 - lib/crabbs/cli.rb
 - lib/crabbs/crawler.rb
+- lib/crabbs/link.rb
 - lib/crabbs/page.rb
 - lib/crabbs/version.rb
 - spec/crabbs/cli_spec.rb
 - spec/crabbs/crabbs_spec.rb
 - spec/crabbs/crawler_spec.rb
+- spec/crabbs/link_spec.rb
 - spec/crabbs/page_spec.rb
 homepage: http://github.com/Trecenti/crabbs
 licenses:
@@ -191,4 +193,5 @@ test_files:
 - spec/crabbs/cli_spec.rb
 - spec/crabbs/crabbs_spec.rb
 - spec/crabbs/crawler_spec.rb
+- spec/crabbs/link_spec.rb
 - spec/crabbs/page_spec.rb