mech_warrior 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.ruby-version +1 -0
- data/Gemfile +2 -0
- data/Gemfile.lock +61 -0
- data/README.md +58 -0
- data/Rakefile +6 -0
- data/bin/spider +12 -0
- data/lib/mech_warrior.rb +60 -0
- data/lib/mech_warrior/crawler.rb +75 -0
- data/lib/mech_warrior/mech_cell.rb +22 -0
- data/lib/mech_warrior/version.rb +3 -0
- data/mech_warrior.gemspec +32 -0
- data/spec/fakeweb_helper.rb +65 -0
- data/spec/mech_warrior_spec.rb +89 -0
- data/spec/spec_helper.rb +6 -0
- metadata +147 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 63e3f44169fe19d92e5c237ce58460e35522bc1d
+  data.tar.gz: 95bc44122c6b15c317c512866a1f1c00a769260f
+SHA512:
+  metadata.gz: eb826d2289cbed61494ef15285ee2ae0e2fad84cc40a6434895f0eedd3430706fe6539e912af94faf98a49307b288146c19be23b8febc7dce15d91d5bba7e473
+  data.tar.gz: 849c53b351db05f04375bf73e10e6c76f284b62d0664cab8697926792377f1468119c5bd0fb7afbbed14c930b4464109c1342758a8341f4586bb4e0f59a3c40f
data/.ruby-version
ADDED
@@ -0,0 +1 @@
+2.1.1
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,61 @@
+PATH
+  remote: .
+  specs:
+    mech_warrior (0.0.1)
+      celluloid (~> 0)
+      mechanize (~> 2.7)
+      xml-sitemap (~> 1.3)
+
+GEM
+  remote: https://rubygems.org/
+  specs:
+    builder (3.2.2)
+    celluloid (0.15.2)
+      timers (~> 1.1.0)
+    diff-lcs (1.2.5)
+    domain_name (0.5.18)
+      unf (>= 0.0.5, < 1.0.0)
+    fakeweb (1.3.0)
+    http-cookie (1.0.2)
+      domain_name (~> 0.5)
+    mechanize (2.7.3)
+      domain_name (~> 0.5, >= 0.5.1)
+      http-cookie (~> 1.0)
+      mime-types (~> 2.0)
+      net-http-digest_auth (~> 1.1, >= 1.1.1)
+      net-http-persistent (~> 2.5, >= 2.5.2)
+      nokogiri (~> 1.4)
+      ntlm-http (~> 0.1, >= 0.1.1)
+      webrobots (>= 0.0.9, < 0.2)
+    mime-types (2.2)
+    mini_portile (0.5.3)
+    net-http-digest_auth (1.4)
+    net-http-persistent (2.9.4)
+    nokogiri (1.6.1)
+      mini_portile (~> 0.5.0)
+    ntlm-http (0.1.1)
+    rake (0.9.6)
+    rspec (2.14.1)
+      rspec-core (~> 2.14.0)
+      rspec-expectations (~> 2.14.0)
+      rspec-mocks (~> 2.14.0)
+    rspec-core (2.14.8)
+    rspec-expectations (2.14.5)
+      diff-lcs (>= 1.1.3, < 2.0)
+    rspec-mocks (2.14.6)
+    timers (1.1.0)
+    unf (0.1.4)
+      unf_ext
+    unf_ext (0.0.6)
+    webrobots (0.1.1)
+    xml-sitemap (1.3.3)
+      builder (>= 2.0)
+
+PLATFORMS
+  ruby
+
+DEPENDENCIES
+  fakeweb (~> 1.3)
+  mech_warrior!
+  rake (~> 0)
+  rspec (~> 2.14)
data/README.md
ADDED
@@ -0,0 +1,58 @@
+MechWarrior
+=========
+
+MechWarrior is a Mechanize- and Celluloid-powered site crawler that generates a
+JSON file of all pages, the links on each page, and the assets those pages rely on,
+and can optionally generate an XML sitemap compliant with the Sitemaps 0.9
+protocol.
+
+
+Version
+----
+
+0.0.1
+
+Tech
+-----------
+
+MechWarrior relies on several excellent RubyGems:
+
+* [Mechanize] - a Ruby library that makes automated web interaction easy.
+* [Celluloid] - an actor-based concurrent object framework for Ruby.
+* [XML-Sitemap] - provides easy XML sitemap generation for Ruby/Rails/Merb/Sinatra applications.
+
+
+Installation
+--------------
+
+```sh
+gem install mech_warrior-0.0.1.gem
+```
+
+Crawling a site
+---------------
+
+```sh
+bin/spider
+```
+then enter a host name, followed by any additional options you wish to pass in
+to override the default options in `lib/mech_warrior.rb`.
+
+
+Todo
+----
+Some of the functionality, including XML sitemaps, is untested.
+Support for multiple hosts in a single spider is currently incomplete,
+despite the `allowed_domains` array, unless all but the default host have
+only absolute links to follow.
+
+License
+----
+
+MIT
+
+[mechanize]:https://github.com/sparklemotion/mechanize
+[celluloid]:http://celluloid.io/
+[xml-sitemap]:https://github.com/sosedoff/xml-sitemap
+
+
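For orientation (editor's note, not part of the gem): a hypothetical `bin/spider` session based on the defaults in `lib/mech_warrior.rb`. The host and option values are made-up examples; the options line is parsed as the inside of a Ruby hash literal by the script shown in the next section.

```sh
$ bin/spider
Host:
www.example.com

Other options:
pool_size: 10, generate_sitemap: true
```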
data/Rakefile
ADDED
data/bin/spider
ADDED
@@ -0,0 +1,12 @@
+#!/usr/bin/env ruby
+
+require_relative "../lib/mech_warrior"
+
+puts "Host:"
+host = gets.chomp
+puts
+puts "Other options:"
+opts = gets.chomp
+opts_hash = eval("{" + opts + "}")
+puts opts_hash.to_json
+MechWarrior.crawl(opts_hash.merge(default_host: host))
data/lib/mech_warrior.rb
ADDED
@@ -0,0 +1,60 @@
+require 'mechanize'
+require 'xml-sitemap'
+require 'logger'
+require 'celluloid/autostart'
+require_relative 'mech_warrior/mech_cell'
+require_relative 'mech_warrior/crawler'
+
+module MechWarrior
+  SITEMAP_MAX_LINKS = 50000
+  DEFAULTS = {
+    allowed_domains: [],
+    default_protocol: 'http://',
+    default_host: 'www.example.com',
+    # this is less 'default_host' at the moment than 'only', though links to other domains will work as long
+    # as all links on other domains' pages are absolute. To support multiple domains while supporting
+    # relative links, some new state would have to be introduced to track 'current_host'
+    max_depth_divisor: 256, # this results in max depth of 4096 on my machine, seems deep enough
+    pool_size: 20,
+    logger_class: Logger,
+    log_file_name: "mech_warrior_errors.txt"
+  }
+
+  def self.crawl(opts={})
+    crawl_results = Crawler.new(opts)
+    crawl_results.agent_pool.future.terminate
+    unless opts[:skip_asset_json]
+      File.open("#{crawl_results.default_host}_crawl_#{Time.now.gmtime}", 'w') do |file|
+        file.write(JSON.pretty_generate(crawl_results.pages))
+      end
+    end
+
+    if sitemap_opts = opts[:generate_sitemap]
+      generate_sitemap(crawl_results.default_host,
+                       crawl_results.pages,
+                       sitemap_opts.respond_to?(:keys) ? sitemap_opts : {}
+      )
+    end
+
+    crawl_results
+  end
+
+
+  #generate_sitemap is untested and NOT production ready, but is functional
+  #and probably a better output format if asset/link data is not needed
+  def self.generate_sitemap(default_host, pages, opts, sitemap_file_num=1)
+    page_keys = pages.keys
+    current_page_keys = page_keys.slice(0...SITEMAP_MAX_LINKS)
+
+    site_map = XmlSitemap::Map.new(default_host) do |map|
+      current_page_keys.each do |page|
+        map.add URI(page).path, opts if URI(page).path.length > 0
+      end
+    end
+    site_map.render_to("./site_map_#{default_host}_#{sitemap_file_num}")
+
+    if page_keys.count > SITEMAP_MAX_LINKS
+      generate_sitemap(default_host, page_keys.slice(SITEMAP_MAX_LINKS..-1), opts, sitemap_file_num + 1)
+    end
+  end
+end
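Editor's note: `MechWarrior.crawl` above can also be driven directly from Ruby instead of through `bin/spider`. A minimal usage sketch follows; the host, pool size, and option choices are illustrative assumptions, not values shipped with the gem beyond what `DEFAULTS` shows.

```ruby
# Hypothetical usage sketch of MechWarrior.crawl as defined above.
require_relative 'lib/mech_warrior' # assumes the gem root as working directory

results = MechWarrior.crawl(
  default_host: 'www.example.com', # site to spider (example value)
  pool_size: 10,                   # number of concurrent MechCell agents
  skip_asset_json: true,           # skip writing the "<host>_crawl_<time>" JSON file
  generate_sitemap: true           # true (not a hash) means default sitemap options
)
puts results.pages.keys            # every URL the crawler indexed
```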
data/lib/mech_warrior/crawler.rb
ADDED
@@ -0,0 +1,75 @@
+module MechWarrior
+  class Crawler
+    attr_reader :agent_pool, :pages, :opts, :default_host, :default_protocol, :logger, :output_file
+
+    def initialize(override_opts={})
+      @opts = DEFAULTS.merge(override_opts)
+      @default_host = opts[:default_host]
+      @default_protocol = opts[:default_protocol]
+      opts[:allowed_domains] << default_host
+      @output_file = opts[:output_file] || File.open(opts[:log_file_name], 'a')
+      @logger = opts[:logger_class].new(output_file)
+      @agent_pool = MechCell.pool(size: opts[:pool_size], args: [logger])
+      @pages = {}
+      start_url = opts[:start_url] || "#{default_protocol}#{default_host}/"
+      pages[normalize_url(start_url)] = {}
+      index_url(start_url) unless opts[:no_index]
+      self
+    ensure
+      output_file.close if output_file.respond_to?(:close)
+    end
+
+    def index_url(href)
+      schemed_url = normalize_url(href)
+      future = page_future(schemed_url)
+      process_page(future, schemed_url)
+    end
+
+    private
+
+    def process_page(page_future, url, depth=0)
+      return if depth > RubyVM::DEFAULT_PARAMS[:thread_vm_stack_size]/opts[:max_depth_divisor]
+      page = page_future.value
+      if page && page.respond_to?(:links)
+        pages[url] = {}
+        pages[url][:links] = page.respond_to?(:links) ? page.links.map(&:href) : []
+        pages[url][:assets] = {
+          images: page.image_urls,
+          scripts: page.search('script'),
+          asset_links: page.search('link'), #css, icons
+          iframes: page.iframes
+        }
+        urls = links_to_follow(page).map {|link| normalize_url(link.href)}
+        futures = urls.map {|url| page_future(url)}
+        pairs = futures.zip(urls)
+        pairs.each {|future, url| process_page(future, url, depth +1)}
+      end
+    rescue URI::InvalidURIError => e
+      logger << "InvalidURIError processing links on page at URL: #{url} -- #{e}\n"
+    end
+
+    def page_future(url)
+      agent_pool.future.get(url)
+    end
+
+    def get_page(url)
+      agent_pool.get(url)
+    end
+
+    def normalize_url(href)
+      URI(href).scheme ? href : "#{default_protocol}#{default_host}#{href}"
+    end
+
+    def follow_link?(link) #follow only pages not indexed and relative links or whitelisted link hosts
+      if link.href && URI(link.href)
+        pages[normalize_url(link.href)].nil? && (link.uri.host.nil? || opts[:allowed_domains].include?(link.uri.host))
+      end
+    rescue URI::InvalidURIError => e
+      logger << "InvalidURIError on link with href: #{link.href} -- #{e}\n"
+    end
+
+    def links_to_follow(page)
+      page.links.select { |link| follow_link?(link) }
+    end
+  end
+end
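Editor's note on the recursion guard in `process_page` above: on a stock MRI build `RubyVM::DEFAULT_PARAMS[:thread_vm_stack_size]` is 1048576, so with the default `max_depth_divisor` of 256 the crawl depth is capped at 4096, which is what the comment in `DEFAULTS` refers to. A minimal check, assuming that usual default:

```ruby
# Assumes MRI's usual 1 MiB default for :thread_vm_stack_size; the value can
# differ per build, which is why the DEFAULTS comment says "on my machine".
stack_size = RubyVM::DEFAULT_PARAMS[:thread_vm_stack_size] # typically 1048576
puts stack_size / 256                                       # => 4096 with the default divisor
```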
data/lib/mech_warrior/mech_cell.rb
ADDED
@@ -0,0 +1,22 @@
+module MechWarrior
+  class MechCell
+    include Celluloid
+    attr_reader :agent, :logger
+    MECH_ERRORS = [
+      SocketError,
+      Mechanize::ResponseCodeError,
+      Mechanize::ResponseReadError,
+      Mechanize::UnsupportedSchemeError
+    ]
+    def initialize(logger)
+      @agent = Mechanize.new
+      @logger = logger
+    end
+
+    def get(url)
+      agent.get(url)
+    rescue *MECH_ERRORS => e
+      logger << "Caught Exception getting URL: #{url} -- #{e}\n"
+    end
+  end
+end
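Editor's note: a minimal sketch (not code from the gem) of driving a Celluloid pool of `MechCell` actors directly, mirroring what `Crawler#initialize` and `#page_future` do; the pool size and URLs are placeholders.

```ruby
# Hypothetical sketch; assumes the gem root as working directory.
require 'logger'
require_relative 'lib/mech_warrior'

logger = Logger.new($stderr)
pool   = MechWarrior::MechCell.pool(size: 4, args: [logger]) # as in Crawler#initialize

urls    = ['http://www.example.com/', 'http://www.example.com/about']
futures = urls.map { |url| pool.future.get(url) } # fetches run concurrently on the pool
pages   = futures.map(&:value)                    # block for results; nil where MECH_ERRORS were logged
```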
data/mech_warrior.gemspec
ADDED
@@ -0,0 +1,32 @@
+lib = File.expand_path('../lib/', __FILE__)
+$:.unshift lib unless $:.include?(lib)
+require "mech_warrior/version"
+
+Gem::Specification.new do |s|
+  s.name = "mech_warrior"
+  s.version = MechWarrior::VERSION
+  s.platform = Gem::Platform::RUBY
+  s.authors = ["Brian Glusman"]
+  s.email = ["brian@glusman.me"]
+  s.summary = "Crawler and asset list/sitemap generator"
+  s.licenses = ["MIT", "BSD"]
+  s.extensions = ["Rakefile"]
+
+  s.description = <<-DESC
+    Spider a web host with many mechanize agents concurrently, and generate an asset JSON
+    and/or an XML sitemap of the result
+  DESC
+
+
+  s.files = `git ls-files`.split("\n")
+  s.test_files = `git ls-files -- {spec}/*`.split("\n")
+  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
+  s.require_paths = ["lib"]
+
+  s.add_runtime_dependency "mechanize", '~> 2.7'
+  s.add_runtime_dependency "xml-sitemap", '~> 1.3'
+  s.add_runtime_dependency "celluloid", '~> 0'
+  s.add_development_dependency "rake", '~> 0'
+  s.add_development_dependency "rspec", '~> 2.14'
+  s.add_development_dependency "fakeweb", '~> 1.3'
+end
data/spec/fakeweb_helper.rb
ADDED
@@ -0,0 +1,65 @@
+FakeWeb.allow_net_connect = false
+module MechWarrior
+  require_relative '../lib/mech_warrior'
+  DEFAULT_HOST = DEFAULTS[:default_host]
+  SPEC_DOMAIN = "http://#{DEFAULT_HOST}/"
+
+  class FakePage
+    attr_accessor :links
+    attr_accessor :hrefs
+    attr_accessor :body
+
+    def initialize(name = '', options = {})
+      @name = name
+      @links = [options[:links]].flatten if options.has_key?(:links)
+      @hrefs = [options[:hrefs]].flatten if options.has_key?(:hrefs)
+      @redirect = options[:redirect] if options.has_key?(:redirect)
+      @base = options[:base] if options.has_key?(:base)
+      @content_type = options[:content_type] || "text/html"
+      @body = options[:body]
+
+      create_body unless @body
+      add_to_fakeweb
+    end
+
+    def url
+      SPEC_DOMAIN + @name
+    end
+
+    private
+
+    def create_body
+      if @base
+        @body = "<html><head><base href=\"#{@base}\"></head><body>"
+      else
+        @body = "<html><body>"
+      end
+      @links.each{|l| @body += "<a href=\"#{SPEC_DOMAIN}#{l}\"></a>"} if @links
+      @hrefs.each{|h| @body += "<a href=\"#{h}\"></a>"} if @hrefs
+      @body += "</body></html>"
+    end
+
+    def add_to_fakeweb
+      options = {:body => @body, :content_type => @content_type, :status => [200, "OK"]}
+
+      if @redirect
+        options[:status] = [301, "Permanently Moved"]
+
+        # only prepend SPEC_DOMAIN if a relative url (without an http scheme) was specified
+        redirect_url = (@redirect =~ /http/) ? @redirect : SPEC_DOMAIN + @redirect
+        options[:location] = redirect_url
+
+        # register the page this one redirects to
+        FakeWeb.register_uri(:get, redirect_url, {:body => '',
+                                                  :content_type => @content_type,
+                                                  :status => [200, "OK"]})
+      end
+
+
+      FakeWeb.register_uri(:get, SPEC_DOMAIN + @name, options)
+    end
+  end
+end
+
+#default root
+MechWarrior::FakePage.new
data/spec/mech_warrior_spec.rb
ADDED
@@ -0,0 +1,89 @@
+require 'spec_helper'
+
+module MechWarrior
+  describe Crawler do
+
+    before(:each) do
+      FakeWeb.clean_registry
+    end
+
+    describe "crawl" do
+
+      context "crawl all the html pages in a domain by following <a> href's" do
+        let(:pages) do
+          pages = []
+          pages << FakePage.new('0', links: ['1', '2'])
+          pages << FakePage.new('1', links: ['3'])
+          pages << FakePage.new('2')
+          pages << FakePage.new('3')
+          pages
+        end
+
+        subject { Crawler.new(default_host: MechWarrior::DEFAULTS[:default_host],
+                              start_url: pages[0].url,
+                              logger_class: String,
+                              output_file: "")
+        }
+
+        it {should have(4).pages }
+        its(:logger) {should be_empty }
+      end
+
+      context "should not follow links that leave the original domain" do
+        let(:pages) do
+          pages = []
+          pages << FakePage.new('0', links: ['1'], :hrefs => 'http://www.other.com/')
+          pages << FakePage.new('1')
+          pages
+        end
+
+        subject { Crawler.new(default_host: MechWarrior::DEFAULTS[:default_host],
+                              start_url: pages[0].url,
+                              logger_class: String,
+                              output_file: "")
+        }
+        it { should have(2).pages }
+        its("pages.keys") { should_not include('http://www.other.com/') }
+        its(:logger) {should be_empty }
+      end
+
+      context "should not index non-html links" do
+        let(:pages) do
+          pages = []
+          pages << FakePage.new('0', links: ['1', '2'])
+          pages << FakePage.new('1', content_type: 'application/pdf')
+          pages << FakePage.new('2', content_type: 'text/csv')
+          pages
+        end
+
+        subject { Crawler.new(default_host: MechWarrior::DEFAULTS[:default_host],
+                              start_url: pages[0].url,
+                              logger_class: String,
+                              output_file: "")
+        }
+        it { should have(1).pages }
+        its(:logger) {should be_empty }
+      end
+
+      context "should ignore invalid URLs" do
+        let(:pages) do
+          pages = []
+          pages << FakePage.new('0', links: ['1', '2'])
+          pages << FakePage.new('1', links: ['not a valid url'])
+          pages << FakePage.new('2')
+          pages << FakePage.new('not_a_valid_url')
+          pages
+        end
+
+        subject { Crawler.new(default_host: MechWarrior::DEFAULTS[:default_host],
+                              start_url: pages[0].url,
+                              logger_class: String,
+                              output_file: "")
+        }
+        it { should have(3).pages }
+        its(:logger) {should_not be_empty }
+      end
+
+    end
+  end
+end
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,147 @@
+--- !ruby/object:Gem::Specification
+name: mech_warrior
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+platform: ruby
+authors:
+- Brian Glusman
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2014-05-17 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: mechanize
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.7'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.7'
+- !ruby/object:Gem::Dependency
+  name: xml-sitemap
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.3'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.3'
+- !ruby/object:Gem::Dependency
+  name: celluloid
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.14'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.14'
+- !ruby/object:Gem::Dependency
+  name: fakeweb
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.3'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.3'
+description: |2
+  Spider a web host with many mechanize agents concurrently, and generate an asset JSON
+  and/or an XML sitemap of the result
+email:
+- brian@glusman.me
+executables:
+- spider
+extensions:
+- Rakefile
+extra_rdoc_files: []
+files:
+- ".ruby-version"
+- Gemfile
+- Gemfile.lock
+- README.md
+- Rakefile
+- bin/spider
+- lib/mech_warrior.rb
+- lib/mech_warrior/crawler.rb
+- lib/mech_warrior/mech_cell.rb
+- lib/mech_warrior/version.rb
+- mech_warrior.gemspec
+- spec/fakeweb_helper.rb
+- spec/mech_warrior_spec.rb
+- spec/spec_helper.rb
+homepage:
+licenses:
+- MIT
+- BSD
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.2.2
+signing_key:
+specification_version: 4
+summary: Crawler and asset list/sitemap generator
+test_files: []