grabber 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/grab +14 -0
- data/lib/grabber.rb +7 -0
- data/lib/grabber/page.rb +66 -0
- data/lib/grabber/site.rb +33 -0
- data/lib/grabber/util.rb +32 -0
- metadata +68 -0
data/bin/grab
ADDED
@@ -0,0 +1,14 @@
+#!/usr/bin/env ruby
+require 'grabber'
+
+unless ARGV[0] && ARGV[1]
+  puts "Usage: grab web_site_address download_directory"
+  exit
+end
+
+unless Dir.exists?(ARGV[1])
+  puts "Directory doesn't exist: #{ARGV[1]}"
+  exit
+end
+
+Grabber::Site.new(ARGV[0], ARGV[1]).crawl
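A note on the executable: Dir.exists? was already soft-deprecated when this gem shipped and was removed outright in Ruby 3.2, so the script only runs unmodified on older Rubies. A minimal sketch of the same entry point on a current Ruby, with placeholder arguments ("example.com" and "./dump" are not from the gem):

require 'grabber'

# Dir.exist? replaces the deprecated Dir.exists? used above;
# the URL and directory here are placeholder values.
abort "Directory doesn't exist: ./dump" unless Dir.exist?("./dump")
Grabber::Site.new("example.com", "./dump").crawl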
data/lib/grabber.rb
ADDED
data/lib/grabber/page.rb
ADDED
@@ -0,0 +1,66 @@
+module Grabber
+  class Page
+    include Util
+    attr_reader :links
+
+    def initialize(url)
+      @url = url
+      @assets = []
+      @links = []
+    end
+
+    def crawl
+      puts "Grabbing: #{uri.to_s}"
+
+      content.search('img').each do |asset|
+        @assets << asset['src']
+      end
+
+      content.search('a').each do |asset|
+        location = asset['href']
+        next if location.nil? || location == '' || location[/^#/]
+
+        @links << location # if on same domain
+      end
+
+      @links.compact!
+      @links.uniq! if @links
+    end
+
+    def uri
+      URI.parse(@url)
+    end
+
+    def content
+      Nokogiri::HTML(uri.read)
+    end
+
+    def basename
+      if uri.path.nil? || uri.path == ''
+        "index.html"
+      else
+        uri.path.split('/').last + ".html"
+      end
+    end
+
+    def download(directory)
+      local_path = File.expand_path(File.join(directory, basename))
+      File.open(local_path, "wb") do |file|
+        file.write open(uri).read
+      end
+    end
+
+    def download_assets(directory)
+      @assets.each do |asset|
+        local_path = File.expand_path(File.join(directory, File.basename(asset)))
+        File.open(local_path, "wb") do |file|
+          begin
+            file.write open(format_url(asset)).read
+          rescue OpenURI::HTTPError => e
+            puts "Failed download for #{format_url(asset)}: #{e.message}"
+          end
+        end
+      end
+    end
+  end
+end
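Page#content reads the URL through open-uri's URI#read, while Page#download and Page#download_assets go through Kernel#open; the Kernel#open form of open-uri was deprecated in Ruby 2.7 and removed in 3.0. A sketch of the same fetch-and-parse step with the explicit URI.open that current Rubies require (the URL is a placeholder):

require 'open-uri'
require 'nokogiri'

# Fetch and parse one page as Page#crawl does, but via URI.open;
# "http://example.com" is a placeholder.
doc = Nokogiri::HTML(URI.open("http://example.com"))
doc.search('img').each { |img| puts img['src'] }  # asset URLs, as collected in @assets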
data/lib/grabber/site.rb
ADDED
@@ -0,0 +1,33 @@
+module Grabber
+  class Site
+    include Util
+
+    def initialize(url, path)
+      @url = with_url_protocol(url)
+      @download_path = path
+    end
+
+    def crawl
+      index = 0
+      page_urls = [format_url(@url)]
+
+      while (url = page_urls[index])
+        page = process_page(url)
+        other_urls = page.links.map { |link| format_url(link) }.select do |link|
+          URI.parse(link).host == uri.host
+        end
+        page_urls = page_urls | other_urls.compact
+
+        index += 1
+      end
+    end
+
+    def process_page(url)
+      page = Page.new(url)
+      page.crawl
+      page.download(@download_path)
+      page.download_assets(@download_path)
+      page
+    end
+  end
+end
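Site#crawl is a worklist traversal: page_urls doubles as queue and visited set, index is the read cursor, and Array#| both appends newly discovered same-host links and deduplicates them, so the loop ends once no unvisited page remains. The pattern in isolation, with a hypothetical links_on helper standing in for Page#crawl and Page#links:

require 'uri'

# Worklist crawl over same-host links; links_on(url) is a hypothetical
# stand-in that returns the absolute link URLs found on a page.
def crawl_same_host(start_url)
  visited = [start_url]
  index = 0
  while (url = visited[index])
    same_host = links_on(url).select { |l| URI.parse(l).host == URI.parse(start_url).host }
    visited |= same_host   # Array#| appends and dedupes, like page_urls | other_urls
    index += 1
  end
  visited
end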
data/lib/grabber/util.rb
ADDED
@@ -0,0 +1,32 @@
+module Grabber
+  module Util
+
+    def format_url(url)
+      url = strip_non_url_parts(url)
+      if URI.parse(url).relative?
+        url = URI.join("#{uri.scheme}://#{uri.host}", url).to_s
+      end
+
+      url.chop! while url.end_with?('/')
+      with_url_protocol(url)
+    end
+
+    def with_url_protocol(path)
+      path =~ /^http/ ? path : 'http://' + path
+    end
+
+    def strip_non_url_parts(link)
+      if (index = (link =~ /#/))
+        link.slice!(index..link.size)
+      end
+      if (index = (link =~ /\?/))
+        link.slice!(index..link.size)
+      end
+      link
+    end
+
+    def uri
+      URI.parse(@url)
+    end
+  end
+end
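Util#format_url drops fragments and query strings, then resolves any relative link against the host root rather than the current page's path, so a link like "css/app.css" found on a deep page still resolves to the host-rooted URL. The resolution step is plain URI.join; a small example with placeholder values:

require 'uri'

base = URI.parse("http://example.com/blog/post")  # placeholder page URL
# Resolve a relative link against the host root, as format_url does:
URI.join("#{base.scheme}://#{base.host}", "/images/logo.png").to_s
# => "http://example.com/images/logo.png"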
metadata
ADDED
@@ -0,0 +1,68 @@
+--- !ruby/object:Gem::Specification
+name: grabber
+version: !ruby/object:Gem::Version
+  prerelease:
+  version: 0.0.0
+platform: ruby
+authors:
+- Greg Lazarev
+autorequire:
+bindir: bin
+cert_chain: []
+
+date: 2011-11-21 00:00:00 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  prerelease: false
+  requirement: &id001 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0"
+  type: :runtime
+  version_requirements: *id001
+description: Crawls the site and downloads assets to a specified directory
+email: russianbandit@gmail.com
+executables:
+- grab
+extensions: []
+
+extra_rdoc_files: []
+
+files:
+- lib/grabber.rb
+- lib/grabber/site.rb
+- lib/grabber/page.rb
+- lib/grabber/util.rb
+- bin/grab
+homepage: ""
+licenses: []
+
+post_install_message:
+rdoc_options: []
+
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+requirements: []
+
+rubyforge_project:
+rubygems_version: 1.8.5
+signing_key:
+specification_version: 3
+summary: Web site crawler and grabber
+test_files: []
+
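The YAML above is the serialized Gem::Specification that RubyGems stores inside the .gem file. For readability, a rough .gemspec equivalent of the same fields; this is a reconstruction from the metadata above, not the gem's actual source gemspec:

# Sketch of a .gemspec carrying the fields serialized in the YAML metadata.
Gem::Specification.new do |s|
  s.name        = "grabber"
  s.version     = "0.0.0"
  s.authors     = ["Greg Lazarev"]
  s.email       = "russianbandit@gmail.com"
  s.summary     = "Web site crawler and grabber"
  s.description = "Crawls the site and downloads assets to a specified directory"
  s.homepage    = ""
  s.executables = ["grab"]
  s.files       = %w[lib/grabber.rb lib/grabber/site.rb lib/grabber/page.rb
                     lib/grabber/util.rb bin/grab]
  s.add_runtime_dependency "nokogiri", ">= 0"
end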