grabber 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
bin/grab ADDED
@@ -0,0 +1,14 @@
+ #!/usr/bin/env ruby
+ require 'grabber'
+
+ unless ARGV[0] && ARGV[1]
+   puts "Usage: grab web_site_address download_directory"
+   exit
+ end
+
+ unless Dir.exists?(ARGV[1])
+   puts "Directory doesn't exist: #{ARGV[1]}"
+   exit
+ end
+
+ Grabber::Site.new(ARGV[0], ARGV[1]).crawl
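
For reference, the executable's happy path is a single library call: after checking its two arguments and that the download directory exists, bin/grab hands everything to Grabber::Site. The sketch below is the programmatic equivalent; the URL and directory are illustrative and the directory must already exist.

    require 'grabber'

    # Equivalent of running: grab example.com ./dump
    # "example.com" and "./dump" are placeholder arguments.
    Grabber::Site.new("example.com", "./dump").crawl
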
lib/grabber.rb ADDED
@@ -0,0 +1,7 @@
+ require 'rubygems'
+ require 'nokogiri'
+ require 'open-uri'
+
+ require 'grabber/util'
+ require 'grabber/page'
+ require 'grabber/site'
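
lib/grabber.rb only wires up dependencies and the gem's own files: nokogiri does the HTML parsing and open-uri does the fetching. A minimal sketch of that combination on its own, assuming a recent Ruby where open-uri exposes URI.open (the URL is illustrative):

    require 'nokogiri'
    require 'open-uri'

    # Fetch a page and list the href of every anchor on it.
    doc = Nokogiri::HTML(URI.open("http://example.com"))
    doc.search('a').each { |a| puts a['href'] }
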
lib/grabber/page.rb ADDED
@@ -0,0 +1,66 @@
+ module Grabber
+   class Page
+     include Util
+     attr_reader :links
+
+     def initialize(url)
+       @url = url
+       @assets = []
+       @links = []
+     end
+
+     def crawl
+       puts "Grabbing: #{uri.to_s}"
+
+       content.search('img').each do |asset|
+         @assets << asset['src']
+       end
+
+       content.search('a').each do |asset|
+         location = asset['href']
+         next if location.nil? || location == '' || location[/^#/]
+
+         @links << location # if on same domain
+       end
+
+       @links.compact!
+       @links.uniq! if @links
+     end
+
+     def uri
+       URI.parse(@url)
+     end
+
+     def content
+       Nokogiri::HTML(uri.read)
+     end
+
+     def basename
+       if uri.path.nil? || uri.path == ''
+         "index.html"
+       else
+         uri.path.split('/').last + ".html"
+       end
+     end
+
+     def download(directory)
+       local_path = File.expand_path(File.join(directory, basename))
+       File.open(local_path, "wb") do |file|
+         file.write open(uri).read
+       end
+     end
+
+     def download_assets(directory)
+       @assets.each do |asset|
+         local_path = File.expand_path(File.join(directory, File.basename(asset)))
+         File.open(local_path, "wb") do |file|
+           begin
+             file.write open(format_url(asset)).read
+           rescue OpenURI::HTTPError => e
+             puts "Failed download for #{format_url(asset)}: #{e.message}"
+           end
+         end
+       end
+     end
+   end
+ end
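
A single page can also be processed on its own, which is exactly what Site#process_page does per URL. The sketch below uses an illustrative URL and an existing ./dump directory; note that crawl, download and download_assets each re-read the page or asset over HTTP, because content and download fetch on every call.

    require 'grabber'

    page = Grabber::Page.new("http://example.com")  # illustrative URL
    page.crawl                      # collect <img> srcs and <a> hrefs
    puts page.links                 # hrefs discovered on the page
    page.download("./dump")         # writes the page itself (index.html here)
    page.download_assets("./dump")  # writes each collected image
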
lib/grabber/site.rb ADDED
@@ -0,0 +1,33 @@
+ module Grabber
+   class Site
+     include Util
+
+     def initialize(url, path)
+       @url = with_url_protocol(url)
+       @download_path = path
+     end
+
+     def crawl
+       index = 0
+       page_urls = [format_url(@url)]
+
+       while (url = page_urls[index])
+         page = process_page(url)
+         other_urls = page.links.map { |link| format_url(link) }.select do |link|
+           URI.parse(link).host == uri.host
+         end
+         page_urls = page_urls | other_urls.compact
+
+         index += 1
+       end
+     end
+
+     def process_page(url)
+       page = Page.new(url)
+       page.crawl
+       page.download(@download_path)
+       page.download_assets(@download_path)
+       page
+     end
+   end
+ end
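
Site#crawl is a plain work-list traversal: every discovered link is normalized with format_url, links pointing at another host are dropped, and Array#| appends only URLs that are not already queued, so the loop ends once no reachable same-host page is left unvisited. The same pattern, stripped of the gem's classes (links_on is a hypothetical stand-in for Page#crawl plus Page#links):

    require 'uri'

    # Stand-in for fetching a page and extracting its links.
    def links_on(url)
      []  # a real implementation would fetch and parse the page here
    end

    def crawl_same_host(start_url)
      host  = URI.parse(start_url).host
      queue = [start_url]
      index = 0
      while (url = queue[index])
        same_host = links_on(url).select { |link| URI.parse(link).host == host }
        queue |= same_host  # Array#| keeps order and skips URLs already queued
        index += 1
      end
      queue  # every URL that was visited
    end
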
lib/grabber/util.rb ADDED
@@ -0,0 +1,32 @@
+ module Grabber
+   module Util
+
+     def format_url(url)
+       url = strip_non_url_parts(url)
+       if URI.parse(url).relative?
+         url = URI.join("#{uri.scheme}://#{uri.host}", url).to_s
+       end
+
+       url.chop! while url.end_with?('/')
+       with_url_protocol(url)
+     end
+
+     def with_url_protocol(path)
+       path =~ /^http/ ? path : 'http://' + path
+     end
+
+     def strip_non_url_parts(link)
+       if (index = (link =~ /#/))
+         link.slice!(index..link.size)
+       end
+       if (index = (link =~ /\?/))
+         link.slice!(index..link.size)
+       end
+       link
+     end
+
+     def uri
+       URI.parse(@url)
+     end
+   end
+ end
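
Util assumes that whatever includes it has set @url (both Page and Site do), because uri is derived from @url when resolving relative links against the current host. A small sketch of the normalization, using a hypothetical UrlContext wrapper and an illustrative host:

    require 'grabber'

    # Minimal including class so Util#uri has an @url to resolve against.
    class UrlContext
      include Grabber::Util
      def initialize(url)
        @url = url
      end
    end

    ctx = UrlContext.new("http://example.com")
    ctx.format_url("/about#team")                   # => "http://example.com/about"
    ctx.format_url("http://example.com/img/a.png")  # already absolute, unchanged
    ctx.with_url_protocol("example.com")            # => "http://example.com"
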
metadata ADDED
@@ -0,0 +1,68 @@
+ --- !ruby/object:Gem::Specification
+ name: grabber
+ version: !ruby/object:Gem::Version
+   prerelease:
+   version: 0.0.0
+ platform: ruby
+ authors:
+ - Greg Lazarev
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2011-11-21 00:00:00 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   prerelease: false
+   requirement: &id001 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: "0"
+   type: :runtime
+   version_requirements: *id001
+ description: Crawls the site and downloads assets to a specified directory
+ email: russianbandit@gmail.com
+ executables:
+ - grab
+ extensions: []
+
+ extra_rdoc_files: []
+
+ files:
+ - lib/grabber.rb
+ - lib/grabber/site.rb
+ - lib/grabber/page.rb
+ - lib/grabber/util.rb
+ - bin/grab
+ homepage: ""
+ licenses: []
+
+ post_install_message:
+ rdoc_options: []
+
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+ requirements: []
+
+ rubyforge_project:
+ rubygems_version: 1.8.5
+ signing_key:
+ specification_version: 3
+ summary: Web site crawler and grabber
+ test_files: []
+
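
Per the gemspec above, the gem declares nokogiri (>= 0) as its only runtime dependency and installs a single executable, grab. An illustrative Gemfile entry pinning this release:

    # Gemfile (illustrative)
    source "https://rubygems.org"

    gem "grabber", "0.0.0"  # pulls in nokogiri (>= 0) at install time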