speed_spider 0.0.1
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +65 -0
- data/Rakefile +1 -0
- data/bin/speed_spider +6 -0
- data/lib/speed_spider/anemone_hack.rb +19 -0
- data/lib/speed_spider/cli.rb +123 -0
- data/lib/speed_spider/crawler.rb +82 -0
- data/lib/speed_spider/version.rb +3 -0
- data/lib/speed_spider.rb +17 -0
- data/speed_spider.gemspec +24 -0
- metadata +100 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: abdb6ebdea8dbe7f00e7c9e487641a45d0d47b49
+  data.tar.gz: 3b06cf74adb37f274516b16215ae71627a36ec69
+SHA512:
+  metadata.gz: fb6e3f517125ab47b511abaf4a1e09d3ecfeab330557c7fe60f165f3350de3434c0d57f3534c1134cefb97a340869afc2abafe8ec5a687e061340bbeccf36cc4
+  data.tar.gz: 5bc260f898cf66898073fd547696b237a6ad89f45fd2650e5518983bcefa440bc8e4d5cf31da1ad4110ac3c0614e96aecf05389b19dde4f5e2e0c902bae48613
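RubyGems uses these digests to detect tampering: each recorded SHA1/SHA512 value is compared against the actual digest of the corresponding archive inside the .gem package. A minimal sketch of that comparison, assuming the archives have been extracted next to checksums.yaml:

    require 'digest'
    require 'yaml'

    # Illustrative sketch: compare each recorded SHA512 against the actual
    # digest of the corresponding file extracted from the .gem archive.
    sums = YAML.load_file('checksums.yaml')
    sums['SHA512'].each do |file, recorded|
      actual = Digest::SHA512.file(file).hexdigest
      puts "#{file}: #{actual == recorded ? 'ok' : 'MISMATCH'}"
    end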
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
+Copyright (c) 2013 Ryan Wang
+
+MIT License
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,65 @@
+# SpeedSpider
+
+A simple and speedy web spider for downloading pages.
+
+SpeedSpider is based on the Ruby spider framework [Anemone][1]. It is easy to use and very fast, since it uses multiple threads for page fetching.
+
+## What kinds of files will be downloaded
+
+### Links in HTML pages
+
+* link, xpath: `//a[@href]`
+* stylesheet, xpath: `//link[@href]`
+* javascript, xpath: `//script[@src]`
+* iframe file, xpath: `//iframe[@src]`
+* image file, xpath: `//img[@src]`
+
+### URLs in stylesheet files
+
+* urls matching the pattern `url\((.*?)\)`
+
+## Installation
+
+Install it with RubyGems:
+
+    gem install speed_spider
+
+## Usage
+
+    Usage: speed_spider [options] start_url
+
+    options:
+      -S, --silent                  silent output
+      -D, --dir String              directory for downloaded files to save to, "download" by default
+      -b, --base_url String         any url that does not start with base_url will not be saved
+      -t, --threads Integer         threads to run for fetching pages, 4 by default
+      -u, --user_agent String       value for the USER_AGENT request header
+      -d, --delay Integer           delay between requests in seconds
+      -o, --obey_robots_text        obey the robots exclusion protocol
+      -l, --depth_limit Integer     limit the depth of the crawl
+      -r, --redirect_limit Integer  number of times HTTP redirects will be followed
+      -a, --accept_cookies          accept cookies from the server and send them back?
+      -s, --skip_query_strings      skip any link with a query string? e.g. http://foo.com/?u=user
+      -H, --proxy_host String       proxy server hostname
+      -P, --proxy_port Integer      proxy server port number
+      -T, --read_timeout Integer    HTTP read timeout in seconds
+      -V, --version                 Show version
+
+## Example
+
+    speed_spider http://twitter.github.io/bootstrap/
+
+It will download all files within the same domain as `twitter.github.io` and save them to `download/twitter.github.io/`.
+
+    speed_spider -b http://ruby-doc.org/core-2.0/ http://ruby-doc.org/core-2.0/
+
+It will only download URLs that start with `http://ruby-doc.org/core-2.0/`. Note that asset files such as images, CSS, JS and fonts do not obey the `base_url` rule.
+
+## Contributing
+
+1. Fork it
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Add some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create new Pull Request
+
+[1]: http://anemone.rubyforge.org/
data/Rakefile
ADDED
@@ -0,0 +1 @@
+require "bundler/gem_tasks"
data/bin/speed_spider
ADDED
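The 6-line executable body is not shown in this listing. Given that lib/speed_spider.rb (below) defines SpeedSpider.crawl, which parses ARGV and starts the crawl, a plausible minimal launcher would look like this — a hypothetical sketch, not the actual file:

    #!/usr/bin/env ruby
    # Hypothetical launcher sketch; the real 6-line bin/speed_spider is
    # elided from this diff. SpeedSpider.crawl handles option parsing.
    require 'speed_spider'

    SpeedSpider.crawl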
data/lib/speed_spider/anemone_hack.rb
ADDED
@@ -0,0 +1,19 @@
+module Anemone
+  class Core
+    def assets?(link)
+      %w(js css jpg jpeg png bmp gif svg ttf woff eot).any? do |e|
+        /#{e}/i =~ File.extname(link.path).split('.').pop
+      end
+    end
+
+    #
+    # Returns +true+ if *link* should not be visited because its URL
+    # matches a skip_link pattern, or because it is not an asset and
+    # does not start with the configured base_url.
+    #
+    def skip_link_with_hack?(link)
+      skip_link_without_hack?(link) or (!assets?(link) and !link.to_s.start_with? @opts[:base_url])
+    end
+
+    alias_method :skip_link_without_hack?, :skip_link?
+    alias_method :skip_link?, :skip_link_with_hack?
+  end
+end
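The two alias_method calls implement the classic alias-method-chain idiom: keep the original method reachable under a `_without_` name, then point the public name at the wrapper so every existing caller picks up the new behavior. A self-contained sketch of the same pattern, with made-up names:

    # Illustrative class; not part of the gem.
    class Greeter
      def greet(name)
        "hello, #{name}"
      end

      # Wrapper that calls through to the original via the _without_ alias.
      def greet_with_shout(name)
        greet_without_shout(name).upcase
      end

      # Keep a handle to the original, then point the public name at the wrapper.
      alias_method :greet_without_shout, :greet
      alias_method :greet, :greet_with_shout
    end

    puts Greeter.new.greet('anemone')   # => "HELLO, ANEMONE"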
data/lib/speed_spider/cli.rb
ADDED
@@ -0,0 +1,123 @@
+require 'speed_spider/crawler'
+require 'optparse'
+require 'ostruct'
+
+module SpeedSpider
+  class Cli
+    attr_reader :options, :option_parser
+
+    def initialize
+      @options = {
+        # only urls starting with base_url will be saved locally
+        :base_url => '',
+        # directory for downloaded files to save to
+        :dir => 'download',
+        # run 4 Tentacle threads to fetch pages
+        :threads => 4,
+        # verbose output
+        :verbose => true,
+        # don't throw away the page response body after scanning it for links
+        :discard_page_bodies => false,
+        # identify self as SpeedSpider/VERSION
+        :user_agent => "SpeedSpider/#{SpeedSpider::VERSION}",
+        # no delay between requests
+        :delay => 0,
+        # don't obey the robots exclusion protocol
+        :obey_robots_txt => false,
+        # by default, don't limit the depth of the crawl
+        :depth_limit => false,
+        # number of times HTTP redirects will be followed
+        :redirect_limit => 5,
+        # storage engine defaults to Hash in +process_options+ if none specified
+        :storage => nil,
+        # Hash of cookie name => value to send with HTTP requests
+        :cookies => nil,
+        # accept cookies from the server and send them back?
+        :accept_cookies => false,
+        # skip any link with a query string? e.g. http://foo.com/?u=user
+        :skip_query_strings => false,
+        # proxy server hostname
+        :proxy_host => nil,
+        # proxy server port number
+        :proxy_port => false,
+        # HTTP read timeout in seconds
+        :read_timeout => nil
+      }
+    end
+
+    def parse!
+      @option_parser = OptionParser.new do |opts|
+        opts.banner = "Usage: speed_spider [options] start_url"
+        opts.separator ""
+        opts.separator "options:"
+
+        opts.on('-S', '--silent', 'silent output') do
+          @options[:verbose] = false
+        end
+
+        opts.on('-D', '--dir String', 'directory for downloaded files to save to, "download" by default') do |value|
+          @options[:dir] = value
+        end
+
+        opts.on('-b', '--base_url String', 'any url that does not start with base_url will not be saved') do |value|
+          value += '/' unless value.end_with? '/'
+          @options[:base_url] = value
+        end
+
+        opts.on('-t', '--threads Integer', Integer, 'threads to run for fetching pages, 4 by default') do |value|
+          @options[:threads] = value
+        end
+
+        opts.on('-u', '--user_agent String', 'value for the USER_AGENT request header') do |value|
+          @options[:user_agent] = value
+        end
+
+        opts.on('-d', '--delay Integer', Integer, 'delay between requests in seconds') do |value|
+          @options[:delay] = value
+        end
+
+        opts.on('-o', '--obey_robots_text', 'obey the robots exclusion protocol') do
+          @options[:obey_robots_txt] = true
+        end
+
+        opts.on('-l', '--depth_limit Integer', Integer, 'limit the depth of the crawl') do |value|
+          @options[:depth_limit] = value
+        end
+
+        opts.on('-r', '--redirect_limit Integer', Integer, 'number of times HTTP redirects will be followed') do |value|
+          @options[:redirect_limit] = value
+        end
+
+        opts.on('-a', '--accept_cookies', 'accept cookies from the server and send them back?') do
+          @options[:accept_cookies] = true
+        end
+
+        opts.on('-s', '--skip_query_strings', 'skip any link with a query string? e.g. http://foo.com/?u=user') do
+          @options[:skip_query_strings] = true
+        end
+
+        opts.on('-H', '--proxy_host String', 'proxy server hostname') do |value|
+          @options[:proxy_host] = value
+        end
+
+        opts.on('-P', '--proxy_port Integer', Integer, 'proxy server port number') do |value|
+          @options[:proxy_port] = value
+        end
+
+        opts.on('-T', '--read_timeout Integer', Integer, 'HTTP read timeout in seconds') do |value|
+          @options[:read_timeout] = value
+        end
+
+        # print the version.
+        opts.on_tail("-V", "--version", "Show version") do
+          puts SpeedSpider::VERSION
+          exit
+        end
+      end
+
+      @option_parser.parse!
+
+      self
+    end
+  end
+end
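A quick sanity check of the parser above — illustrative values, assuming the gem and its anemone dependency are installed. OptionParser#parse! consumes ARGV destructively, leaving the positional start_url behind:

    require 'speed_spider'

    # Simulate: speed_spider -t 8 -b http://example.com/docs/ http://example.com/docs/
    ARGV.replace %w[-t 8 -b http://example.com/docs/ http://example.com/docs/]

    cli = SpeedSpider::Cli.new.parse!
    cli.options[:threads]    # => 8
    cli.options[:base_url]   # => "http://example.com/docs/"
    ARGV                     # => ["http://example.com/docs/"]  (start_url is left behind)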
data/lib/speed_spider/crawler.rb
ADDED
@@ -0,0 +1,82 @@
+require 'anemone'
+require 'speed_spider/anemone_hack'
+require 'fileutils'
+require 'uri'
+
+module SpeedSpider
+  class Crawler
+    def initialize(start_url, options)
+      @start_url = start_url
+      @base_url = options[:base_url]
+      @options = options
+    end
+
+    # return urls from css file contents
+    def get_urls_from_css(data, pos = 0)
+      if m = data.match(/url\((.*?)\)/i, pos)
+        [ m[1] ] + get_urls_from_css(data, m.end(1) + 1)
+      else
+        []
+      end
+    end
+
+    def focus_crawl
+      lambda { |page|
+        links = []
+        if page.doc
+          # include javascript, img and iframe files as target links
+          page.doc.search('//script[@src]', '//img[@src]', '//iframe[@src]').each do |s|
+            u = s['src']
+            next if u.nil? or u.empty?
+            abs = page.to_absolute u rescue next
+            links << abs if page.in_domain? abs
+          end
+
+          # include css files as target links
+          page.doc.search('//link[@href]').each do |s|
+            u = s['href']
+            next if u.nil? or u.empty?
+            abs = page.to_absolute u rescue next
+            links << abs if page.in_domain? abs
+          end
+        elsif page.url.to_s.end_with? '.css'
+          get_urls_from_css(page.body).each do |s|
+            u = s.gsub('"', '').gsub("'", '')
+            next if u.nil? or u.empty?
+            abs = page.to_absolute u rescue next
+            links << abs if page.in_domain? abs
+          end
+        end
+
+        page.links + links.uniq
+      }
+    end
+
+    def after_crawl
+      lambda { |pages|
+        pages.each do |url, page|
+          path = page.url.path
+          path += 'index.html' if path.end_with? '/' or path.empty?
+
+          path = "#{@options[:dir]}/#{page.url.host}#{path}"
+          dir = File.dirname path
+
+          FileUtils.mkdir_p dir unless dir.empty?
+          File.open path, 'w' do |f|
+            f.write page.body
+          end
+
+          puts "save file #{path}" if @options[:verbose]
+        end
+      }
+    end
+
+    def crawl
+      Anemone.crawl @start_url, @options do |spider|
+        spider.focus_crawl &focus_crawl
+        spider.after_crawl &after_crawl
+      end
+    end
+  end
+end
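The recursive url() scan above can be exercised on its own; a small illustrative call, with a made-up stylesheet string. Note the captures keep their surrounding quotes — focus_crawl strips them before resolving each URL:

    require 'speed_spider'

    crawler = SpeedSpider::Crawler.new('http://example.com/', :base_url => '')

    css = %q{body { background: url("img/bg.png") } @font-face { src: url('fonts/icons.woff') }}
    crawler.get_urls_from_css(css)
    # => ["\"img/bg.png\"", "'fonts/icons.woff'"]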
data/lib/speed_spider/version.rb
ADDED
data/lib/speed_spider.rb
ADDED
@@ -0,0 +1,17 @@
+require 'speed_spider/version'
+require 'speed_spider/cli'
+require 'speed_spider/crawler'
+
+module SpeedSpider
+  def self.crawl
+    cli = Cli.new.parse!
+
+    start_url = ARGV[0]
+    (puts cli.option_parser.help; exit 1) if start_url.nil?
+
+    crawler = Crawler.new start_url, cli.options
+    crawler.crawl
+  end
+
+end
data/speed_spider.gemspec
ADDED
@@ -0,0 +1,24 @@
+# coding: utf-8
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'speed_spider/version'
+
+Gem::Specification.new do |spec|
+  spec.name          = "speed_spider"
+  spec.version       = SpeedSpider::VERSION
+  spec.authors       = ["Ryan Wang"]
+  spec.email         = ["wongyouth@gmail.com"]
+  spec.description   = %q{A simple web spider tool that crawls pages to your local disk, starting from a given URL}
+  spec.summary       = %q{A simple web spider tool for downloading pages from a base URL, including CSS, JS, HTML and iframe source files}
+  spec.homepage      = ""
+  spec.license       = "MIT"
+
+  spec.files         = `git ls-files`.split($/)
+  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+  spec.require_paths = ["lib"]
+
+  spec.add_dependency "anemone", "~> 0.7.2"
+  spec.add_development_dependency "bundler", "~> 1.3"
+  spec.add_development_dependency "rake"
+end
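The file list above comes straight from `git ls-files`, and the executables line then filters it with a block-form grep, which maps each match through the block. A quick illustration with a made-up subset of files:

    files = ["bin/speed_spider", "lib/speed_spider.rb", "README.md"]  # illustrative subset
    files.grep(%r{^bin/}) { |f| File.basename(f) }
    # => ["speed_spider"]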
metadata
ADDED
@@ -0,0 +1,100 @@
+--- !ruby/object:Gem::Specification
+name: speed_spider
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+platform: ruby
+authors:
+- Ryan Wang
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2013-06-01 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: anemone
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.7.2
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.7.2
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.3'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.3'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+description: A simple web spider tool that crawls pages to your local disk, starting
+  from a given URL
+email:
+- wongyouth@gmail.com
+executables:
+- speed_spider
+extensions: []
+extra_rdoc_files: []
+files:
+- .gitignore
+- Gemfile
+- LICENSE.txt
+- README.md
+- Rakefile
+- bin/speed_spider
+- lib/speed_spider.rb
+- lib/speed_spider/anemone_hack.rb
+- lib/speed_spider/cli.rb
+- lib/speed_spider/crawler.rb
+- lib/speed_spider/version.rb
+- speed_spider.gemspec
+homepage: ''
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.0.3
+signing_key:
+specification_version: 4
+summary: A simple web spider tool for downloading pages from a base URL, including
+  CSS, JS, HTML and iframe source files
+test_files: []