simple_crawler 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml.gz ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: ebddf965a3f0021ce3187684afa08db022665878
+   data.tar.gz: afc8d73392ac3918cd11b1a7fd3e1b61340299e9
+ SHA512:
+   metadata.gz: 76da6a650493ea030c63371c529759d8e01a87c7f72131250b5a99dff418320dac908eb5e70a36d50629315f232a0b357a4a48f98d5453ad333a15454028e343
+   data.tar.gz: 060c78ef1d9452efaa042daf3b90208f5d92980de68b136667c468edd493ed022ac063334b87f25c3bb7c437ca95cdaf92bca3dbfde139ae2aae6e152d74ae23
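The checksums above cover the two members of the published `.gem` archive, which is a plain tar file. As a minimal editorial sketch (not part of the package), assuming `metadata.gz` and `data.tar.gz` have already been extracted from `simple_crawler-0.0.1.gem` into the current directory, the digests can be re-derived with Ruby's standard `digest` library:

```ruby
# Editorial sketch: recompute the digests recorded in checksums.yaml.gz.
# Assumes metadata.gz and data.tar.gz have been extracted from the .gem
# archive (e.g. with `tar -xf simple_crawler-0.0.1.gem`).
require 'digest'

%w(metadata.gz data.tar.gz).each do |member|
  puts "#{member}:"
  puts "  SHA1:   #{Digest::SHA1.file(member).hexdigest}"
  puts "  SHA512: #{Digest::SHA512.file(member).hexdigest}"
end
```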
data/.gitignore ADDED
@@ -0,0 +1,20 @@
+ *.gem
+ *.rbc
+ .bundle
+ .config
+ .DS_Store
+ .idea/
+ .yardoc
+ Gemfile.lock
+ InstalledFiles
+ _yardoc
+ coverage
+ doc/
+ lib/bundler/man
+ pkg
+ rdoc
+ spec/reports
+ test/tmp
+ test/version_tmp
+ tmp
+
data/Gemfile ADDED
@@ -0,0 +1,8 @@
+ source 'https://rubygems.org'
+
+ # Specify your gem's dependencies in crawler.gemspec
+ gemspec
+ gem 'awesome_print', '~>1.2.0'
+ gem 'em-synchrony', '~>1.0.3'
+ gem 'em-http-request', '~>1.1.2'
+ gem 'nokogiri', '~>1.6.1'
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
+ Copyright (c) 2014 Anupom Syam
+
+ MIT License
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,49 @@
+ # Crawler
+
+ Crawler is a simple web crawler written in Ruby. Given a URL, it crawls the domain and recursively finds all links
+ associated with it. It also keeps track of the static content related to each of these links.
+
+ It uses EventMachine and Fibers (through em-synchrony) to issue concurrent non-blocking requests.
+ Crawler stores the site map using a variation of the adjacency-list data structure. It can also
+ pretty-print the map once a URL has been crawled.
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+     gem 'crawler'
+
+ And then execute:
+
+     $ bundle
+
+ Or install it yourself as:
+
+     $ gem install crawler
+
+ ## Usage
+
+ ### Using Crawler as a library
+ ```ruby
+ crawler = Crawler.new('http://google.com')
+ # Start crawling the URL
+ crawler.crawl
+ # Generated site map object
+ map = crawler.map
+ # Pretty-print the site map
+ crawler.print
+ ```
+
+ ### Using Crawler as a binary
+ ```sh
+ # Crawl the domain and print the sitemap
+ crawler http://google.com
+ ```
+
+ ## Contributing
+
+ 1. Fork it ( http://github.com/anupom/crawler/fork )
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
+ 4. Push to the branch (`git push origin my-new-feature`)
+ 5. Create a new Pull Request
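The adjacency-list sitemap the README describes corresponds to the `@map` hash and the `Node = Struct.new(:neighbors, :statics)` definition in `lib/crawler.rb` further down. A short editorial sketch (not part of the package) of walking that structure:

```ruby
# Editorial sketch: Crawler#map is a Hash keyed by URL; each value is a
# Node struct with `neighbors` (linked pages on the same domain) and
# `statics` (scripts, stylesheets, images found on that page).
require 'crawler'

crawler = Crawler.new('http://example.com')
crawler.crawl

crawler.map.each do |url, node|
  puts url
  node.neighbors.each { |link| puts "  -> #{link}" }
  node.statics.each   { |asset| puts "  [asset] #{asset}" }
end
```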
data/Rakefile ADDED
@@ -0,0 +1 @@
+ require 'bundler/gem_tasks'
data/bin/crawler ADDED
@@ -0,0 +1,12 @@
+ #!/usr/bin/env ruby
+
+ require 'crawler'
+
+ if ARGV.empty?
+   puts 'Please provide a url to crawl'
+   exit 1
+ end
+
+ crawler = Crawler.new(ARGV[0])
+ crawler.crawl
+ crawler.print
data/crawler.gemspec ADDED
@@ -0,0 +1,24 @@
+ # coding: utf-8
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+
+ Gem::Specification.new do |spec|
+   spec.name          = 'simple_crawler'
+   spec.version       = '0.0.1'
+   spec.authors       = ['anupom']
+   spec.email         = ['anupom.syam@gmail.com']
+   spec.summary       = %q{Create sitemap from a given url}
+   spec.description   = %q{Simple web crawler to crawl a domain and generate sitemap}
+   spec.homepage      = 'https://github.com/anupom/crawler'
+   spec.license       = 'MIT'
+
+   spec.files         = `git ls-files`.split($/)
+   spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+   spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+   spec.require_paths = ['lib']
+
+   spec.add_development_dependency 'bundler', '~> 1.5'
+   spec.add_development_dependency 'rake', '~> 10.1'
+   spec.add_development_dependency 'simplecov', '~> 0.8'
+   spec.add_development_dependency 'webmock', '~> 1.17'
+ end
data/lib/crawler.rb ADDED
@@ -0,0 +1,148 @@
+ require 'awesome_print'
+ require 'em-synchrony'
+ require 'em-synchrony/em-http'
+ require 'em-synchrony/fiber_iterator'
+ require 'nokogiri'
+ require 'set'
+
+ class Crawler
+   CONCURRENCY = 5
+   HTTP_OK = 200
+   MAX_REDIRECTS = 3
+   MAX_RETRIES = 3
+   VALID_SCHEMES = %w(http https)
+
+   Node = Struct.new(:neighbors, :statics)
+
+   attr_reader :map
+
+   def initialize(root_url)
+     @map = {}
+     @urls_to_crawl = [root_url]
+     @root_hostname = URI.parse(root_url).hostname
+     @retries = Hash.new { |h, k| h[k] = 0 }
+   end
+
+   def crawl
+     if @urls_to_crawl.empty?
+       EventMachine.stop
+       return
+     end
+
+     EM.synchrony do
+       # Iterate over a copy while we change the main array
+       urls = @urls_to_crawl.dup
+       @urls_to_crawl = crawl_urls(urls)
+       crawl
+     end
+   end
+
+   def print
+     ap @map, { index: false }
+   end
+
+   protected
+
+   def crawl_urls(urls)
+     next_urls = []
+
+     EM::Synchrony::FiberIterator.new(urls, CONCURRENCY).each do |url|
+       next if @map.key?(url)
+
+       http = http_request(url)
+
+       next if http.nil?
+
+       page = Nokogiri::HTML(http.response)
+       neighbors = get_neighbors(page, url)
+       next_urls += neighbors
+
+       statics = get_statics(page)
+
+       @map[url] = Node.new(neighbors, statics)
+     end
+
+     next_urls
+   end
+
+   def http_request(url)
+     http = EventMachine::HttpRequest.new(url)
+            .get redirects: MAX_REDIRECTS
+
+     if http.response_header.status != HTTP_OK
+       queue_for_retry(url)
+       return nil
+     end
+     http
+   rescue Addressable::URI::InvalidURIError
+     nil
+   end
+
+   def queue_for_retry(url)
+     return if @retries[url] == MAX_RETRIES
+     @retries[url] += 1
+     @urls_to_crawl.push(url)
+   end
+
+   def get_neighbors(page, parent_url)
+     neighbors = Set.new
+     links = page.css('a')
+
+     links.each do |link|
+       href = link['href']
+
+       uri = uri_from_href(href)
+
+       next unless valid_uri?(uri)
+
+       uri = URI.join(parent_url, uri) if relative_uri?(uri)
+
+       # Page fragments are ignored for site map
+       uri.fragment = nil
+
+       next if uri.to_s == parent_url
+
+       neighbors.add(uri.to_s)
+     end
+
+     neighbors.to_a
+   end
+
+   def get_statics(page)
+     statics = Set.new
+
+     scripts = page.css('script')
+     scripts.each do |script|
+       statics.add(script['src']) unless script['src'].nil?
+     end
+
+     stylesheets = page.css('link[rel="stylesheet"]')
+     stylesheets.each do |stylesheet|
+       statics.add(stylesheet['href']) unless stylesheet['href'].nil?
+     end
+
+     images = page.css('img')
+     images.each do |image|
+       statics.add(image['src']) unless image['src'].nil?
+     end
+
+     statics.to_a
+   end
+
+   def uri_from_href(href)
+     URI.parse(href)
+   rescue URI::InvalidURIError
+     nil
+   end
+
+   def valid_uri?(uri)
+     return false if uri.nil?
+     return false unless uri.scheme.nil? || VALID_SCHEMES.include?(uri.scheme)
+     return false unless uri.hostname.nil? || uri.hostname == @root_hostname
+     true
+   end
+
+   def relative_uri?(uri)
+     uri.scheme.nil?
+   end
+ end
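To make the link-filtering rules in `valid_uri?` and `relative_uri?` above concrete, here is an editorial sketch that mirrors those checks on a few representative hrefs. The URLs are hypothetical, and the logic is copied from the methods above rather than called through the class (they are protected):

```ruby
# Editorial sketch: mirror the checks in Crawler#valid_uri? and
# Crawler#relative_uri? for a crawl rooted at http://example.com.
require 'uri'

root_hostname = URI.parse('http://example.com').hostname
valid_schemes = %w(http https)

[
  '/about',                      # relative href: kept, later joined with the parent URL
  'http://example.com/contact',  # same host, http scheme: kept
  'https://example.com/login',   # same host, https scheme: kept
  'http://other.example.org/',   # different host: skipped
  'mailto:mail@example.com'      # non-http(s) scheme: skipped
].each do |href|
  uri = URI.parse(href)
  kept = (uri.scheme.nil? || valid_schemes.include?(uri.scheme)) &&
         (uri.hostname.nil? || uri.hostname == root_hostname)
  puts format('%-30s %s', href, kept ? 'kept' : 'skipped')
end
```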
data/test/crawler_test.rb ADDED
@@ -0,0 +1,248 @@
+ require 'crawler'
+ require 'stringio'
+ require 'test/unit'
+ require 'webmock/test_unit'
+
+ class CrawlerTest < Test::Unit::TestCase
+   # Called before every test method runs. Can be used
+   # to set up fixture information.
+   def setup
+     # Do nothing
+   end
+
+   # Called after every test method runs. Can be used to tear
+   # down fixture information.
+
+   def teardown
+     # Do nothing
+   end
+
+   def test_with_nil_url
+     assert_raise URI::InvalidURIError do
+       crawler = Crawler.new(nil)
+       crawler.crawl
+     end
+   end
+
+   def test_with_invalid_url
+     crawler = Crawler.new('https:/xyz.invalidurl.com/')
+     crawler.crawl
+     map = crawler.map
+     assert_equal({}, map)
+   end
+
+   def test_with_valid_url
+     url1 = 'http://www.example.com'
+     url2 = 'http://www.example.com/test.html'
+
+     stub_response_link(url1, url2)
+     stub_response_empty(url2)
+
+     crawler = Crawler.new(url1)
+     crawler.crawl
+     map = crawler.map
+     assert(map.key?(url1))
+     assert_equal(map[url1].neighbors, [url2])
+   end
+
+   def test_with_relative_links
+     url1 = 'http://www.example.com/test/test.html'
+     url2 = 'http://www.example.com/test/test2.html'
+     url3 = 'http://www.example.com/test3.html'
+
+     stub_response_link(url1, 'test2.html')
+     stub_response_link(url2, '/test3.html')
+     stub_response_empty(url3)
+
+     crawler = Crawler.new(url1)
+     crawler.crawl
+     map = crawler.map
+     assert(map.key?(url1))
+     assert_equal(map[url1].neighbors, [url2])
+     assert(map.key?(url2))
+     assert_equal(map[url2].neighbors, [url3])
+     assert(map.key?(url3))
+     assert_equal(map[url3].neighbors, [])
+   end
+
+   def test_with_unavailable_links
+     url1 = 'http://www.example.com'
+     url2 = 'http://www.example.com/test.html'
+     url3 = 'http://www.example.com/unavailable'
+
+     stub_response_link(url1, url2)
+     stub_response_link(url2, url3)
+     stub_response_not_found(url3)
+
+     crawler = Crawler.new(url1)
+     crawler.crawl
+     map = crawler.map
+     assert(!map.key?(url3))
+   end
+
+   def test_with_external_links
+     url1 = 'http://www.example.com/test/test.html'
+     url2 = 'http://www.google.com/test/test2.html'
+
+     stub_response_link(url1, url2)
+
+     crawler = Crawler.new(url1)
+     crawler.crawl
+     map = crawler.map
+     assert(map.key?(url1))
+     assert_equal(map[url1].neighbors, [])
+   end
+
+   def test_with_invalid_links
+     url1 = 'http://www.example.com/test/test.html'
+     url2 = ':// test/test2.html'
+
+     stub_response_link(url1, url2)
+
+     crawler = Crawler.new(url1)
+     crawler.crawl
+     map = crawler.map
+     assert(map.key?(url1))
+     assert_equal(map[url1].neighbors, [])
+   end
+
+   def test_with_multiple_links
+     url1 = 'http://www.example.com/test/test.html'
+     url2 = 'http://www.example.com/test/test2.html'
+     url3 = 'http://www.example.com/test3.html'
+
+     stub_response_two_links(url1, url2, url3)
+     stub_response_empty(url2)
+     stub_response_empty(url3)
+
+     crawler = Crawler.new(url1)
+     crawler.crawl
+     map = crawler.map
+     assert(map.key?(url1))
+     assert_equal(map[url1].neighbors, [url2, url3])
+     assert(map.key?(url2))
+     assert_equal(map[url2].neighbors, [])
+     assert(map.key?(url3))
+     assert_equal(map[url3].neighbors, [])
+   end
+
+   def test_with_same_links
+     url1 = 'http://www.example.com/test/test.html'
+     url2 = 'http://www.example.com/test/test2.html'
+     url3 = url1
+
+     stub_response_two_links(url1, url2, url3)
+     stub_response_empty(url2)
+
+     crawler = Crawler.new(url1)
+     crawler.crawl
+     map = crawler.map
+     assert(map.key?(url1))
+     assert_equal(map[url1].neighbors, [url2])
+     assert(map.key?(url2))
+     assert_equal(map[url2].neighbors, [])
+   end
+
+   def test_with_same_links_with_different_fragments
+     url1 = 'http://www.example.com/test/test.html'
+     url2 = 'http://www.example.com/test/test2.html'
+     url3 = url1 + '#fragment'
+
+     stub_response_two_links(url1, url2, url3)
+     stub_response_empty(url2)
+
+     crawler = Crawler.new(url1)
+     crawler.crawl
+     map = crawler.map
+     assert(map.key?(url1))
+     assert_equal(map[url1].neighbors, [url2])
+     assert(map.key?(url2))
+     assert_equal(map[url2].neighbors, [])
+   end
+
+   def test_with_non_http_link
+     url1 = 'http://www.example.com/test/test.html'
+     url2 = 'http://www.example.com/test/test2.html'
+     url3 = 'mailto:mail@example.com'
+
+     stub_response_two_links(url1, url2, url3)
+     stub_response_empty(url2)
+
+     crawler = Crawler.new(url1)
+     crawler.crawl
+     map = crawler.map
+     assert(map.key?(url1))
+     assert_equal(map[url1].neighbors, [url2])
+     assert(map.key?(url2))
+     assert_equal(map[url2].neighbors, [])
+   end
+
+   def test_with_statics
+     url1 = 'http://www.example.com'
+
+     stub_request(:get, url1).to_return(
+       body: '<link rel="stylesheet" href="test.css" />'\
+             '<script src="test.js"></script>'\
+             '<img src="test.png" />'
+     )
+
+     crawler = Crawler.new(url1)
+     crawler.crawl
+     map = crawler.map
+     assert(map.key?(url1))
+     assert_equal(map[url1].statics, %w(test.js test.css test.png))
+   end
+
+   def test_print
+     url1 = 'http://www.example.com'
+     url2 = 'http://www.example.com/test.html'
+
+     stub_response_link(url1, url2)
+     stub_response_empty(url2)
+
+     crawler = Crawler.new(url1)
+     crawler.crawl
+
+     printed = capture_stdout do
+       crawler.print
+     end
+
+     assert_match(/#{url1}/, printed)
+     assert_match(/#{url2}/, printed)
+   end
+
+   private
+
+   def stub_response_link(url, link)
+     stub_request(:get, url).to_return(
+       body: %(<body><a href="#{link}">test</a></body>")
+     )
+   end
+
+   def stub_response_two_links(url, link1, link2)
+     stub_request(:get, url).to_return(
+       body: %(<body><a href="#{link1}">t1</a><a href="#{link2}">t2</a></body>)
+     )
+   end
+
+   def stub_response_empty(url)
+     stub_request(:get, url).to_return(
+       body: '<body>stub body</body>'
+     )
+   end
+
+   def stub_response_not_found(url)
+     stub_request(:get, url).to_return(
+       status: 404
+     )
+   end
+
+   def capture_stdout(&blk)
+     old = $stdout
+     $stdout = fake = StringIO.new
+     blk.call
+     fake.string
+   ensure
+     $stdout = old
+   end
+ end
metadata ADDED
@@ -0,0 +1,111 @@
+ --- !ruby/object:Gem::Specification
+ name: simple_crawler
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - anupom
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2014-02-18 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: bundler
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.5'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.5'
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '10.1'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '10.1'
+ - !ruby/object:Gem::Dependency
+   name: simplecov
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.8'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.8'
+ - !ruby/object:Gem::Dependency
+   name: webmock
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.17'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.17'
+ description: Simple web crawler to crawl a domain and generate sitemap
+ email:
+ - anupom.syam@gmail.com
+ executables:
+ - crawler
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - ".gitignore"
+ - Gemfile
+ - LICENSE.txt
+ - README.md
+ - Rakefile
+ - bin/crawler
+ - crawler.gemspec
+ - lib/crawler.rb
+ - test/crawler_test.rb
+ homepage: https://github.com/anupom/crawler
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.2.2
+ signing_key:
+ specification_version: 4
+ summary: Create sitemap from a given url
+ test_files:
+ - test/crawler_test.rb
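
As a closing editorial note, the metadata reproduced above can be read back from the published artifact with RubyGems itself. A minimal sketch, assuming the gem has been downloaded locally as `simple_crawler-0.0.1.gem` (hypothetical path):

```ruby
# Editorial sketch: inspect the gemspec metadata shown above directly
# from a locally downloaded .gem file.
require 'rubygems/package'

spec = Gem::Package.new('simple_crawler-0.0.1.gem').spec
puts spec.name     # => "simple_crawler"
puts spec.version  # => 0.0.1
puts spec.summary  # => "Create sitemap from a given url"
puts spec.files    # mirrors the `files:` list in the metadata above
```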