grabber 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/grab +14 -0
- data/lib/grabber.rb +7 -0
- data/lib/grabber/page.rb +66 -0
- data/lib/grabber/site.rb +33 -0
- data/lib/grabber/util.rb +32 -0
- metadata +68 -0
data/bin/grab
ADDED
@@ -0,0 +1,14 @@
+#!/usr/bin/env ruby
+require 'grabber'
+
+unless ARGV[0] && ARGV[1]
+  puts "Usage: grab web_site_address download_directory"
+  exit
+end
+
+unless Dir.exists?(ARGV[1])
+  puts "Directory doesn't exist: #{ARGV[1]}"
+  exit
+end
+
+Grabber::Site.new(ARGV[0], ARGV[1]).crawl
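A note on the executable: Dir.exists? was already soft-deprecated when this gem shipped and was removed outright in Ruby 3.2, so the script only runs unmodified on older Rubies. A minimal sketch of the same entry point on a current Ruby, with placeholder arguments ("example.com" and "./dump" are not from the gem):

require 'grabber'

# Dir.exist? replaces the deprecated Dir.exists? used above;
# the URL and directory here are placeholder values.
abort "Directory doesn't exist: ./dump" unless Dir.exist?("./dump")
Grabber::Site.new("example.com", "./dump").crawl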
data/lib/grabber.rb
ADDED
data/lib/grabber/page.rb
ADDED
@@ -0,0 +1,66 @@
+module Grabber
+  class Page
+    include Util
+    attr_reader :links
+
+    def initialize(url)
+      @url = url
+      @assets = []
+      @links = []
+    end
+
+    def crawl
+      puts "Grabbing: #{uri.to_s}"
+
+      content.search('img').each do |asset|
+        @assets << asset['src']
+      end
+
+      content.search('a').each do |asset|
+        location = asset['href']
+        next if location.nil? || location == '' || location[/^#/]
+
+        @links << location # if on same domain
+      end
+
+      @links.compact!
+      @links.uniq! if @links
+    end
+
+    def uri
+      URI.parse(@url)
+    end
+
+    def content
+      Nokogiri::HTML(uri.read)
+    end
+
+    def basename
+      if uri.path.nil? || uri.path == ''
+        "index.html"
+      else
+        uri.path.split('/').last + ".html"
+      end
+    end
+
+    def download(directory)
+      local_path = File.expand_path(File.join(directory, basename))
+      File.open(local_path, "wb") do |file|
+        file.write open(uri).read
+      end
+    end
+
+    def download_assets(directory)
+      @assets.each do |asset|
+        local_path = File.expand_path(File.join(directory, File.basename(asset)))
+        File.open(local_path, "wb") do |file|
+          begin
+            file.write open(format_url(asset)).read
+          rescue OpenURI::HTTPError => e
+            puts "Failed download for #{format_url(asset)}: #{e.message}"
+          end
+        end
+      end
+    end
+  end
+end
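Page#content reads the URL through open-uri's URI#read, while Page#download and Page#download_assets go through Kernel#open; the Kernel#open form of open-uri was deprecated in Ruby 2.7 and removed in 3.0. A sketch of the same fetch-and-parse step with the explicit URI.open that current Rubies require (the URL is a placeholder):

require 'open-uri'
require 'nokogiri'

# Fetch and parse one page as Page#crawl does, but via URI.open;
# "http://example.com" is a placeholder.
doc = Nokogiri::HTML(URI.open("http://example.com"))
doc.search('img').each { |img| puts img['src'] }  # asset URLs, as collected in @assets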
data/lib/grabber/site.rb
ADDED
@@ -0,0 +1,33 @@
+module Grabber
+  class Site
+    include Util
+
+    def initialize(url, path)
+      @url = with_url_protocol(url)
+      @download_path = path
+    end
+
+    def crawl
+      index = 0
+      page_urls = [format_url(@url)]
+
+      while (url = page_urls[index])
+        page = process_page(url)
+        other_urls = page.links.map { |link| format_url(link) }.select do |link|
+          URI.parse(link).host == uri.host
+        end
+        page_urls = page_urls | other_urls.compact
+
+        index += 1
+      end
+    end
+
+    def process_page(url)
+      page = Page.new(url)
+      page.crawl
+      page.download(@download_path)
+      page.download_assets(@download_path)
+      page
+    end
+  end
+end
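Site#crawl is a worklist traversal: page_urls doubles as queue and visited set, index is the read cursor, and Array#| both appends newly discovered same-host links and deduplicates them, so the loop ends once no unvisited page remains. The pattern in isolation, with a hypothetical links_on helper standing in for Page#crawl and Page#links:

require 'uri'

# Worklist crawl over same-host links; links_on(url) is a hypothetical
# stand-in that returns the absolute link URLs found on a page.
def crawl_same_host(start_url)
  visited = [start_url]
  index = 0
  while (url = visited[index])
    same_host = links_on(url).select { |l| URI.parse(l).host == URI.parse(start_url).host }
    visited |= same_host   # Array#| appends and dedupes, like page_urls | other_urls
    index += 1
  end
  visited
end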
data/lib/grabber/util.rb
ADDED
@@ -0,0 +1,32 @@
+module Grabber
+  module Util
+
+    def format_url(url)
+      url = strip_non_url_parts(url)
+      if URI.parse(url).relative?
+        url = URI.join("#{uri.scheme}://#{uri.host}", url).to_s
+      end
+
+      url.chop! while url.end_with?('/')
+      with_url_protocol(url)
+    end
+
+    def with_url_protocol(path)
+      path =~ /^http/ ? path : 'http://' + path
+    end
+
+    def strip_non_url_parts(link)
+      if (index = (link =~ /#/))
+        link.slice!(index..link.size)
+      end
+      if (index = (link =~ /\?/))
+        link.slice!(index..link.size)
+      end
+      link
+    end
+
+    def uri
+      URI.parse(@url)
+    end
+  end
+end
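Util#format_url drops fragments and query strings, then resolves any relative link against the host root rather than the current page's path, so a link like "css/app.css" found on a deep page still resolves to the host-rooted URL. The resolution step is plain URI.join; a small example with placeholder values:

require 'uri'

base = URI.parse("http://example.com/blog/post")  # placeholder page URL
# Resolve a relative link against the host root, as format_url does:
URI.join("#{base.scheme}://#{base.host}", "/images/logo.png").to_s
# => "http://example.com/images/logo.png"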
metadata
ADDED
@@ -0,0 +1,68 @@
+--- !ruby/object:Gem::Specification
+name: grabber
+version: !ruby/object:Gem::Version
+  prerelease:
+  version: 0.0.0
+platform: ruby
+authors:
+- Greg Lazarev
+autorequire:
+bindir: bin
+cert_chain: []
+
+date: 2011-11-21 00:00:00 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  prerelease: false
+  requirement: &id001 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0"
+  type: :runtime
+  version_requirements: *id001
+description: Crawls the site and downloads assets to a specified directory
+email: russianbandit@gmail.com
+executables:
+- grab
+extensions: []
+
+extra_rdoc_files: []
+
+files:
+- lib/grabber.rb
+- lib/grabber/site.rb
+- lib/grabber/page.rb
+- lib/grabber/util.rb
+- bin/grab
+homepage: ""
+licenses: []
+
+post_install_message:
+rdoc_options: []
+
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+requirements: []
+
+rubyforge_project:
+rubygems_version: 1.8.5
+signing_key:
+specification_version: 3
+summary: Web site crawler and grabber
+test_files: []
+
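The YAML above is the serialized Gem::Specification that RubyGems stores inside the .gem file. For readability, a rough .gemspec equivalent of the same fields; this is a reconstruction from the metadata above, not the gem's actual source gemspec:

# Sketch of a .gemspec carrying the fields serialized in the YAML metadata.
Gem::Specification.new do |s|
  s.name        = "grabber"
  s.version     = "0.0.0"
  s.authors     = ["Greg Lazarev"]
  s.email       = "russianbandit@gmail.com"
  s.summary     = "Web site crawler and grabber"
  s.description = "Crawls the site and downloads assets to a specified directory"
  s.homepage    = ""
  s.executables = ["grab"]
  s.files       = %w[lib/grabber.rb lib/grabber/site.rb lib/grabber/page.rb
                     lib/grabber/util.rb bin/grab]
  s.add_runtime_dependency "nokogiri", ">= 0"
end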