grabber 0.0.0

bin/grab ADDED
@@ -0,0 +1,14 @@
+ #!/usr/bin/env ruby
+ require 'grabber'
+
+ unless ARGV[0] && ARGV[1]
+   puts "Usage: grab web_site_address download_directory"
+   exit
+ end
+
+ unless Dir.exist?(ARGV[1])
+   puts "Directory doesn't exist: #{ARGV[1]}"
+   exit
+ end
+
+ Grabber::Site.new(ARGV[0], ARGV[1]).crawl
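The executable validates its two arguments and hands off to Grabber::Site. Assuming the gem is installed so that grab is on the PATH, an invocation follows the Usage string above (the address and directory here are placeholders; the directory must already exist):

  grab example.com ./mirror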
lib/grabber.rb ADDED
@@ -0,0 +1,7 @@
+ require 'rubygems'
+ require 'nokogiri'
+ require 'open-uri'
+
+ require 'grabber/util'
+ require 'grabber/page'
+ require 'grabber/site'
lib/grabber/page.rb ADDED
@@ -0,0 +1,66 @@
+ module Grabber
+   class Page
+     include Util
+     attr_reader :links
+
+     def initialize(url)
+       @url = url
+       @assets = []
+       @links = []
+     end
+
+     def crawl
+       puts "Grabbing: #{uri}"
+
+       content.search('img').each do |asset|
+         @assets << asset['src']
+       end
+
+       content.search('a').each do |asset|
+         location = asset['href']
+         next if location.nil? || location == '' || location[/^#/]
+
+         @links << location # same-domain filtering happens in Site#crawl
+       end
+
+       @links.compact!
+       @links.uniq!
+     end
+
+     def uri
+       URI.parse(@url)
+     end
+
+     def content
+       @content ||= Nokogiri::HTML(uri.read)
+     end
+
+     def basename
+       if uri.path.nil? || uri.path == '' || uri.path == '/'
+         "index.html"
+       else
+         uri.path.split('/').last + ".html"
+       end
+     end
+
+     def download(directory)
+       local_path = File.expand_path(File.join(directory, basename))
+       File.open(local_path, "wb") do |file|
+         file.write uri.read
+       end
+     end
+
+     def download_assets(directory)
+       @assets.each do |asset|
+         local_path = File.expand_path(File.join(directory, File.basename(asset)))
+         File.open(local_path, "wb") do |file|
+           begin
+             file.write URI.open(format_url(asset)).read
+           rescue OpenURI::HTTPError => e
+             puts "Failed download for #{format_url(asset)}: #{e.message}"
+           end
+         end
+       end
+     end
+   end
+ end
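Page does the per-URL work: crawl fetches and parses the document once, collecting img src values into @assets and non-empty, non-fragment a href values into @links; download writes the page body under a name derived from the URL path; download_assets fetches each collected image. A minimal standalone sketch, where the URL and directory are placeholders and the directory is assumed to exist:

  require 'grabber'

  page = Grabber::Page.new('http://example.com/articles')
  page.crawl                        # one HTTP fetch; fills page.links and the asset list
  puts page.links                   # raw href values found on the page
  page.download('./mirror')         # writes ./mirror/articles.html
  page.download_assets('./mirror')  # saves each <img> into the same directory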
lib/grabber/site.rb ADDED
@@ -0,0 +1,33 @@
+ module Grabber
+   class Site
+     include Util
+
+     def initialize(url, path)
+       @url = with_url_protocol(url)
+       @download_path = path
+     end
+
+     def crawl
+       index = 0
+       page_urls = [format_url(@url)]
+
+       while (url = page_urls[index])
+         page = process_page(url)
+         other_urls = page.links.map { |link| format_url(link) }.select do |link|
+           URI.parse(link).host == uri.host
+         end
+         page_urls = page_urls | other_urls.compact
+
+         index += 1
+       end
+     end
+
+     def process_page(url)
+       page = Page.new(url)
+       page.crawl
+       page.download(@download_path)
+       page.download_assets(@download_path)
+       page
+     end
+   end
+ end
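Site#crawl is effectively a breadth-first traversal over an array used as a work queue: each processed page appends its normalized same-host links, and the array union (|) keeps the queue duplicate-free, so each page is fetched once. Programmatic use mirrors the bin/grab executable (the arguments are placeholders):

  require 'grabber'

  Grabber::Site.new('example.com', './mirror').crawl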
lib/grabber/util.rb ADDED
@@ -0,0 +1,32 @@
+ module Grabber
+   module Util
+
+     def format_url(url)
+       url = strip_non_url_parts(url)
+       if URI.parse(url).relative?
+         url = URI.join("#{uri.scheme}://#{uri.host}", url).to_s
+       end
+
+       url = url.sub(%r{/+\z}, '')
+       with_url_protocol(url)
+     end
+
+     def with_url_protocol(path)
+       path =~ %r{\Ahttps?://} ? path : 'http://' + path
+     end
+
+     def strip_non_url_parts(link)
+       if (index = (link =~ /#/))
+         link = link[0...index]
+       end
+       if (index = (link =~ /\?/))
+         link = link[0...index]
+       end
+       link
+     end
+
+     def uri
+       URI.parse(@url)
+     end
+   end
+ end
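Util normalizes link values before they are compared or fetched: fragments and query strings are stripped, absolute paths are resolved against the crawl host, trailing slashes are trimmed, and a missing scheme defaults to http. A quick check of that behavior (UrlContext is a hypothetical harness; Util expects its includer to set @url):

  require 'grabber'

  class UrlContext
    include Grabber::Util

    def initialize(url)
      @url = url  # Util#uri parses this instance variable
    end
  end

  ctx = UrlContext.new('http://example.com')
  puts ctx.format_url('/about?tab=1#team')         # => http://example.com/about
  puts ctx.format_url('http://example.com/blog/')  # => http://example.com/blog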
metadata ADDED
@@ -0,0 +1,68 @@
+ --- !ruby/object:Gem::Specification
+ name: grabber
+ version: !ruby/object:Gem::Version
+   prerelease:
+   version: 0.0.0
+ platform: ruby
+ authors:
+ - Greg Lazarev
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2011-11-21 00:00:00 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   prerelease: false
+   requirement: &id001 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: "0"
+   type: :runtime
+   version_requirements: *id001
+ description: Crawls the site and downloads assets to a specified directory
+ email: russianbandit@gmail.com
+ executables:
+ - grab
+ extensions: []
+
+ extra_rdoc_files: []
+
+ files:
+ - lib/grabber.rb
+ - lib/grabber/site.rb
+ - lib/grabber/page.rb
+ - lib/grabber/util.rb
+ - bin/grab
+ homepage: ""
+ licenses: []
+
+ post_install_message:
+ rdoc_options: []
+
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+ requirements: []
+
+ rubyforge_project:
+ rubygems_version: 1.8.5
+ signing_key:
+ specification_version: 3
+ summary: Web site crawler and grabber
+ test_files: []
+