downer 0.2.2 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +1 -0
- data/Rakefile +1 -0
- data/downer.gemspec +18 -2
- data/lib/downer.rb +4 -5
- data/lib/downer/application.rb +17 -1
- data/lib/downer/download_item.rb +15 -10
- data/lib/downer/download_manager.rb +53 -18
- data/lib/downer/download_strategy.rb +34 -0
- data/lib/downer/download_worker.rb +11 -1
- data/lib/downer/generic_strategy.rb +31 -0
- data/lib/downer/options.rb +13 -3
- data/lib/downer/strategies/flat_file_strategy.rb +19 -0
- data/lib/downer/strategies/website_strategy.rb +75 -0
- data/spec/downer/application_spec.rb +18 -9
- data/spec/downer/download_manager_spec.rb +27 -9
- data/spec/downer/download_strategy_spec.rb +19 -0
- data/spec/downer/download_worker_spec.rb +11 -3
- data/spec/downer/options_spec.rb +10 -0
- data/spec/downer/strategies/flat_file_stragtegy_spec.rb +26 -0
- data/spec/downer/strategies/website_strategy_spec.rb +58 -0
- data/spec/fixtures/basic_page.html +18 -0
- data/version.yml +3 -3
- metadata +31 -4
data/.gitignore
CHANGED
data/Rakefile
CHANGED
@@ -11,6 +11,7 @@ begin
|
|
11
11
|
gem.homepage = "http://github.com/nate63179/downer"
|
12
12
|
gem.authors = ["Nate Miller"]
|
13
13
|
gem.add_development_dependency "rspec", ">= 1.2.9"
|
14
|
+
gem.add_dependency 'nokogiri'
|
14
15
|
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
15
16
|
end
|
16
17
|
Jeweler::GemcutterTasks.new
|
data/downer.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{downer}
|
8
|
-
s.version = "0.
|
8
|
+
s.version = "0.3.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Nate Miller"]
|
12
|
-
s.date = %q{2010-
|
12
|
+
s.date = %q{2010-08-08}
|
13
13
|
s.default_executable = %q{downer}
|
14
14
|
s.description = %q{Downer is a tool used to download a list of urls from a website thorugh HTTP.}
|
15
15
|
s.email = %q{nate@natemiller.org}
|
@@ -34,13 +34,22 @@ Gem::Specification.new do |s|
|
|
34
34
|
"lib/downer/application.rb",
|
35
35
|
"lib/downer/download_item.rb",
|
36
36
|
"lib/downer/download_manager.rb",
|
37
|
+
"lib/downer/download_strategy.rb",
|
37
38
|
"lib/downer/download_worker.rb",
|
39
|
+
"lib/downer/generic_strategy.rb",
|
38
40
|
"lib/downer/options.rb",
|
41
|
+
"lib/downer/strategies/flat_file_strategy.rb",
|
42
|
+
"lib/downer/strategies/website_strategy.rb",
|
39
43
|
"spec/downer/application_spec.rb",
|
40
44
|
"spec/downer/download_item_spec.rb",
|
41
45
|
"spec/downer/download_manager_spec.rb",
|
46
|
+
"spec/downer/download_strategy_spec.rb",
|
42
47
|
"spec/downer/download_worker_spec.rb",
|
43
48
|
"spec/downer/generator_spec.rb",
|
49
|
+
"spec/downer/options_spec.rb",
|
50
|
+
"spec/downer/strategies/flat_file_stragtegy_spec.rb",
|
51
|
+
"spec/downer/strategies/website_strategy_spec.rb",
|
52
|
+
"spec/fixtures/basic_page.html",
|
44
53
|
"spec/fixtures/some_images.txt",
|
45
54
|
"spec/spec.opts",
|
46
55
|
"spec/spec_helper.rb",
|
@@ -55,8 +64,12 @@ Gem::Specification.new do |s|
|
|
55
64
|
"spec/downer/application_spec.rb",
|
56
65
|
"spec/downer/download_item_spec.rb",
|
57
66
|
"spec/downer/download_manager_spec.rb",
|
67
|
+
"spec/downer/download_strategy_spec.rb",
|
58
68
|
"spec/downer/download_worker_spec.rb",
|
59
69
|
"spec/downer/generator_spec.rb",
|
70
|
+
"spec/downer/options_spec.rb",
|
71
|
+
"spec/downer/strategies/flat_file_stragtegy_spec.rb",
|
72
|
+
"spec/downer/strategies/website_strategy_spec.rb",
|
60
73
|
"spec/spec_helper.rb"
|
61
74
|
]
|
62
75
|
|
@@ -66,11 +79,14 @@ Gem::Specification.new do |s|
|
|
66
79
|
|
67
80
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
68
81
|
s.add_development_dependency(%q<rspec>, [">= 1.2.9"])
|
82
|
+
s.add_runtime_dependency(%q<nokogiri>, [">= 0"])
|
69
83
|
else
|
70
84
|
s.add_dependency(%q<rspec>, [">= 1.2.9"])
|
85
|
+
s.add_dependency(%q<nokogiri>, [">= 0"])
|
71
86
|
end
|
72
87
|
else
|
73
88
|
s.add_dependency(%q<rspec>, [">= 1.2.9"])
|
89
|
+
s.add_dependency(%q<nokogiri>, [">= 0"])
|
74
90
|
end
|
75
91
|
end
|
76
92
|
|
data/lib/downer.rb
CHANGED
@@ -1,8 +1,7 @@
|
|
1
|
+
require 'rubygems'
|
1
2
|
require 'net/http'
|
3
|
+
require 'nokogiri'
|
2
4
|
require 'optparse'
|
5
|
+
require 'open-uri'
|
3
6
|
|
4
|
-
|
5
|
-
require 'downer/download_manager'
|
6
|
-
require 'downer/download_worker'
|
7
|
-
require 'downer/download_item'
|
8
|
-
require 'downer/options'
|
7
|
+
Dir.glob(File.dirname(__FILE__) + '/**/*.rb').each { |file| require file }
|
data/lib/downer/application.rb
CHANGED
@@ -2,6 +2,7 @@ module Downer
|
|
2
2
|
class Application
|
3
3
|
|
4
4
|
attr_accessor :output
|
5
|
+
attr_reader :options
|
5
6
|
|
6
7
|
def initialize(output = nil)
|
7
8
|
@output = (output) ? output : $stdout
|
@@ -10,18 +11,29 @@ module Downer
|
|
10
11
|
|
11
12
|
def run!(*arguments)
|
12
13
|
@options = Downer::Options.new(arguments)
|
14
|
+
download_options = {}
|
13
15
|
|
16
|
+
|
17
|
+
|
18
|
+
# begin analysis of arguments
|
14
19
|
if @options[:invalid_argument]
|
15
20
|
@output.puts @options[:invalid_argument]
|
16
21
|
@options[:show_help] = true
|
17
22
|
end
|
18
23
|
|
24
|
+
if @options[:images_only]
|
25
|
+
print_image_only_message
|
26
|
+
download_options[:images_only] = true
|
27
|
+
end
|
28
|
+
|
29
|
+
# Immediately exit if this will never complete
|
19
30
|
return exit_with_help_banner if @options[:file_manifest].nil?
|
20
31
|
return exit_with_help_banner if @options[:target_directory].nil?
|
21
32
|
return exit_with_help_banner if @options[:show_help]
|
22
33
|
|
34
|
+
|
23
35
|
begin
|
24
|
-
manager = Downer::DownloadManager.new(@options[:file_manifest], @options[:target_directory], @output)
|
36
|
+
manager = Downer::DownloadManager.new(@options[:file_manifest], @options[:target_directory], @output, download_options)
|
25
37
|
manager.start
|
26
38
|
return 0
|
27
39
|
rescue Downer::WriteFailed
|
@@ -31,6 +43,10 @@ module Downer
|
|
31
43
|
end
|
32
44
|
|
33
45
|
private
|
46
|
+
|
47
|
+
def print_image_only_message
|
48
|
+
@output.puts "Images only filter selected...downloading PNG,JPG,GIF, and TIFF files"
|
49
|
+
end
|
34
50
|
|
35
51
|
def exit_with_help_banner
|
36
52
|
@output.puts @options.opts.banner
|
data/lib/downer/download_item.rb
CHANGED
@@ -25,18 +25,23 @@ module Downer
|
|
25
25
|
|
26
26
|
def download
|
27
27
|
@http = Net::HTTP.new(@uri.host, @uri.port)
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
28
|
+
if @uri.respond_to? :request_uri
|
29
|
+
req = Net::HTTP::Get.new(@uri.request_uri)
|
30
|
+
response = @http.request(req)
|
31
|
+
|
32
|
+
if response.code != '200'
|
33
|
+
fd = FailedDownload.new
|
34
|
+
fd.http_code = response.code
|
35
|
+
fd.url = @url
|
36
|
+
raise fd
|
37
|
+
else
|
38
|
+
@content = response.body
|
39
|
+
write_to_file
|
40
|
+
end
|
36
41
|
else
|
37
|
-
|
38
|
-
write_to_file
|
42
|
+
puts "WARNING: Ignored download for #{@uri.inspect}"
|
39
43
|
end
|
44
|
+
|
40
45
|
end
|
41
46
|
|
42
47
|
private
|
@@ -5,34 +5,69 @@ module Downer
|
|
5
5
|
class NoManifestFileGiven < StandardError; end
|
6
6
|
class NoTargetDirectoryGiven < StandardError; end
|
7
7
|
class URLSourceDoesNotExist < StandardError; end
|
8
|
+
|
8
9
|
|
9
10
|
class DownloadManager
|
10
11
|
|
11
|
-
attr_accessor :
|
12
|
-
attr_reader :urls
|
12
|
+
attr_accessor :target_directory, :output, :source_type
|
13
|
+
attr_reader :urls, :downloaded_files, :strategy
|
13
14
|
|
14
|
-
def initialize(url_source, target_directory, output)
|
15
|
-
@
|
16
|
-
|
17
|
-
@target_directory =
|
18
|
-
@
|
19
|
-
raise URLSourceDoesNotExist unless File.exists?(@file_manifest)
|
20
|
-
get_urls
|
15
|
+
def initialize(url_source, target_directory, output, options ={})
|
16
|
+
@url_source = url_source
|
17
|
+
@output = output
|
18
|
+
@target_directory = append_slash_to_path(target_directory)
|
19
|
+
@strategy = StrategyFinder::find_strategy(@url_source, options)
|
21
20
|
end
|
22
21
|
|
23
22
|
def start
|
24
|
-
|
25
|
-
|
26
|
-
@
|
27
|
-
|
28
|
-
|
23
|
+
check_directory
|
24
|
+
|
25
|
+
if @strategy && @strategy.source_valid?
|
26
|
+
urls = @strategy.get_urls
|
27
|
+
@output.puts "Starting download on #{urls.size} files"
|
28
|
+
worker = DownloadWorker.new(urls, @target_directory, @output)
|
29
|
+
@downloaded_files = worker.start
|
30
|
+
print_session_summary(worker)
|
31
|
+
else
|
32
|
+
@output.puts "Could not open url source #{@url_source}"
|
33
|
+
end
|
29
34
|
end
|
30
35
|
|
31
|
-
|
36
|
+
# Tells us what worked and what failed
|
37
|
+
def print_session_summary(worker)
|
38
|
+
@output.puts "Session complete."
|
39
|
+
@output.puts "\n\n"
|
40
|
+
@output.puts "----------------------------------"
|
41
|
+
@output.puts "SUMMARY:"
|
42
|
+
@output.puts "Successful downloads: (#{worker.successful_downloads.size})"
|
43
|
+
@output.puts "Failed downloads: (#{worker.failed_downloads.size}): "
|
44
|
+
worker.failed_downloads.each do |fail|
|
45
|
+
@output.puts " * #{fail}"
|
46
|
+
end
|
47
|
+
end
|
32
48
|
|
33
|
-
|
34
|
-
|
35
|
-
|
49
|
+
def source_type
|
50
|
+
@strategy.source_type
|
51
|
+
end
|
52
|
+
|
53
|
+
# Determine whether the directory specified exists. If it does not, see if its parent is writable. If not, give up
|
54
|
+
# and throw error
|
55
|
+
def check_directory
|
56
|
+
if File.writable?(@target_directory)
|
57
|
+
return true
|
58
|
+
else
|
59
|
+
#FileUtils.mkdir(@target_directory)
|
60
|
+
raise WriteFailed
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
protected
|
65
|
+
|
66
|
+
# Put a slash on the directory name if one was omitted
|
67
|
+
def append_slash_to_path(dir_name)
|
68
|
+
dir_last_char_is_slash = (dir_name[-1,1] == '/')
|
69
|
+
dir_name = dir_name + '/' unless dir_last_char_is_slash
|
70
|
+
dir_name
|
36
71
|
end
|
37
72
|
|
38
73
|
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
module Downer
|
2
|
+
|
3
|
+
class StrategyFinder
|
4
|
+
class << self
|
5
|
+
|
6
|
+
# Determines a strategy for extracting urls from a media type
|
7
|
+
def find_strategy(url_source, options ={})
|
8
|
+
strategy = nil
|
9
|
+
|
10
|
+
if is_local_file?(url_source)
|
11
|
+
strategy = DownloadStrategy::FlatFileStrategy.new(url_source, options)
|
12
|
+
elsif is_remote_source?(url_source)
|
13
|
+
strategy = DownloadStrategy::WebsiteStrategy.new(url_source, options)
|
14
|
+
else
|
15
|
+
raise "Could not find strategy"
|
16
|
+
end
|
17
|
+
strategy
|
18
|
+
end
|
19
|
+
|
20
|
+
# Determine whether the source is located on a local file system
|
21
|
+
def is_local_file?(url_source)
|
22
|
+
File.exist?(url_source)
|
23
|
+
end
|
24
|
+
|
25
|
+
# Determine if this is something that lives online
|
26
|
+
# Determine whether the source lives online.
#
# A source counts as remote only when it *begins* with an ftp://, http:// or
# https:// scheme (case-insensitive). Anchoring matters: an unanchored pattern
# also matches "ftp" or "http" in the middle of a string, so a file name such
# as "my_ftp_list.txt" would be reported as remote.
#
# @param url_source [String, nil] path or URL supplied by the user
# @return [Boolean] true when url_source starts with a supported URL scheme
def is_remote_source?(url_source)
  # to_s keeps a nil source from raising; it simply is not remote.
  (url_source.to_s =~ %r{\A(ftp|https?)://}i) ? true : false
end
|
30
|
+
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
end
|
@@ -1,11 +1,15 @@
|
|
1
1
|
module Downer
|
2
2
|
class DownloadWorker
|
3
3
|
|
4
|
-
attr_reader :items
|
4
|
+
attr_reader :items, :successful_downloads, :failed_downloads
|
5
5
|
|
6
6
|
def initialize(urls, target_directory, output)
|
7
7
|
@urls, @target_directory, @output = urls, target_directory, output
|
8
|
+
@urls.delete_if { |url| url == nil }
|
8
9
|
@items = []
|
10
|
+
@successful_downloads = []
|
11
|
+
@failed_downloads = []
|
12
|
+
|
9
13
|
@urls.each { |url| @items << DownloadItem.new(url, target_directory) }
|
10
14
|
end
|
11
15
|
|
@@ -16,6 +20,7 @@ module Downer
|
|
16
20
|
end
|
17
21
|
|
18
22
|
@items.each { |item| try_download_item(item) }
|
23
|
+
successful_downloads
|
19
24
|
end
|
20
25
|
|
21
26
|
private
|
@@ -23,9 +28,14 @@ module Downer
|
|
23
28
|
def try_download_item(item)
|
24
29
|
begin
|
25
30
|
item.download
|
31
|
+
@successful_downloads << item.url
|
26
32
|
@output.puts "Downloaded #{item.url}"
|
27
33
|
rescue Downer::FailedDownload => e
|
28
34
|
@output.puts "Could not download #{e.url}, received http code #{e.http_code}"
|
35
|
+
@failed_downloads << item.url
|
36
|
+
rescue SocketError => e
|
37
|
+
@output.puts "SocketError encountered on url #{item.url}"
|
38
|
+
@failed_downloads << item.url
|
29
39
|
end
|
30
40
|
end
|
31
41
|
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module Downer
|
2
|
+
|
3
|
+
class SubclassMethodUndefined < StandardError; end
|
4
|
+
|
5
|
+
class GenericStrategy
|
6
|
+
|
7
|
+
def initialize(source, search_options = {})
|
8
|
+
@url_source = source
|
9
|
+
@search_options = search_options
|
10
|
+
end
|
11
|
+
|
12
|
+
def options
|
13
|
+
@search_options
|
14
|
+
end
|
15
|
+
|
16
|
+
def get_urls
|
17
|
+
raise SubclassMethodUndefined
|
18
|
+
end
|
19
|
+
|
20
|
+
def source_valid?
|
21
|
+
raise SubclassMethodUndefined
|
22
|
+
end
|
23
|
+
|
24
|
+
def source_type
|
25
|
+
name = self.class.name.gsub(/Downer::DownloadStrategy::/,'')
|
26
|
+
name.gsub(/Strategy/,'').downcase
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
end
|
31
|
+
|
data/lib/downer/options.rb
CHANGED
@@ -3,10 +3,20 @@ module Downer
|
|
3
3
|
attr_reader :opts, :orig_args
|
4
4
|
|
5
5
|
def initialize(args)
|
6
|
-
|
7
|
-
|
6
|
+
@opts = args.clone
|
7
|
+
self[:is_website] = false
|
8
|
+
self[:images_only] = false
|
9
|
+
|
8
10
|
@opts = OptionParser.new do |o|
|
9
|
-
o.banner = "Usage: downer URL_SOURCE DESTINATION_DIR"
|
11
|
+
o.banner = "Usage: downer -flags URL_SOURCE DESTINATION_DIR"
|
12
|
+
|
13
|
+
o.on('-w', '--web', 'Declare source as a url') do |url|
|
14
|
+
self[:is_website] = true
|
15
|
+
end
|
16
|
+
|
17
|
+
o.on('-i', '--image', 'When combined with w will download JPG,GIF,PNG formats') do
|
18
|
+
self[:images_only] = true
|
19
|
+
end
|
10
20
|
end
|
11
21
|
|
12
22
|
begin
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Downer
|
2
|
+
module DownloadStrategy
|
3
|
+
|
4
|
+
class FlatFileStrategy < GenericStrategy
|
5
|
+
|
6
|
+
# Read the manifest file and return its lines as URLs.
#
# File.readlines opens, reads and closes the file in one call, so no file
# handle is left open after the URLs have been collected.
#
# @return [Array<String>] one entry per line of @url_source, with the
#   trailing line ending removed (blank lines are kept as empty strings)
def get_urls
  File.readlines(@url_source).map { |line| line.chomp }
end
|
12
|
+
|
13
|
+
def source_valid?
|
14
|
+
File.exist?(@url_source)
|
15
|
+
end
|
16
|
+
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
module Downer
|
2
|
+
module DownloadStrategy
|
3
|
+
|
4
|
+
class WebsiteStrategy < GenericStrategy
|
5
|
+
|
6
|
+
# Create the downloading strategy, set any behavior flags in the options hash
|
7
|
+
def initialize(url_source, search_options = {})
|
8
|
+
super(url_source, search_options)
|
9
|
+
uri = URI.parse(url_source)
|
10
|
+
@host_prefix = uri.scheme + "://" + uri.host
|
11
|
+
end
|
12
|
+
|
13
|
+
# Retrieve urls from an HTML page. Behavior is dependent upon options passed
|
14
|
+
# to constructor
|
15
|
+
def get_urls
|
16
|
+
@noko = Nokogiri::HTML(download_page)
|
17
|
+
urls = []
|
18
|
+
|
19
|
+
if @search_options[:images_only]
|
20
|
+
urls = image_urls
|
21
|
+
else
|
22
|
+
urls = urls.concat document_links
|
23
|
+
urls = urls.concat image_urls
|
24
|
+
end
|
25
|
+
urls.uniq
|
26
|
+
end
|
27
|
+
|
28
|
+
# read an html page into memory
|
29
|
+
# Fetch the source page, memoising the result so the page is requested
# only once per strategy instance.
#
# The request goes through URI#open (provided by open-uri) instead of
# Kernel#open. Kernel#open treats a string starting with "|" as a command to
# run, which is unsafe for user-supplied input, and it no longer opens URLs
# on Ruby 3.0 and later.
#
# @return [IO-like] the readable object open-uri returns for @url_source
# @raise [URI::InvalidURIError] when @url_source cannot be parsed as a URI
def download_page
  @downloaded_page ||= URI.parse(@url_source).open
end
|
32
|
+
|
33
|
+
# Return all image urls from document
|
34
|
+
def image_urls
|
35
|
+
urls = []
|
36
|
+
@noko.css('img').each do |img|
|
37
|
+
urls << absolutify_link(img['src'])
|
38
|
+
end
|
39
|
+
urls
|
40
|
+
end
|
41
|
+
|
42
|
+
# Return all links stored within the document
|
43
|
+
def document_links
|
44
|
+
urls = []
|
45
|
+
@noko.css('a').each do |alink|
|
46
|
+
link = alink['href']
|
47
|
+
urls << absolutify_link(link)
|
48
|
+
end
|
49
|
+
urls
|
50
|
+
end
|
51
|
+
|
52
|
+
# Converts non absolute urls to absolute ones
|
53
|
+
# Convert a link found in the page into an absolute URL.
#
# * nil or empty links (tags without an href/src) yield nil.
# * Links that already start with http://, https:// or ftp:// are returned
#   unchanged.
# * Every other link is treated as relative to the site root: a leading "/"
#   is added when missing (this includes "./" and "../" style links) and the
#   result is prefixed with @host_prefix.
#
# @param link [String, nil] raw href/src attribute value
# @return [String, nil] absolute URL, or nil when there is nothing to resolve
def absolutify_link(link)
  return nil if link.nil? || link.empty?

  # Anchor the scheme check so "http" appearing later in a path is not
  # mistaken for an absolute URL.
  return link if link =~ %r{\A(https?|ftp)://}i

  rooted = (link[0, 1] == '/') ? link : '/' + link
  @host_prefix + rooted
end
|
68
|
+
|
69
|
+
# Report whether @url_source is a usable URL.
#
# Always answers with true or false: a source that cannot be parsed, or that
# parses without a scheme or host, is reported as invalid instead of raising
# URI::InvalidURIError out of a predicate method.
#
# @return [Boolean] true when @url_source parses to a URI with scheme and host
def source_valid?
  uri = URI.parse(@url_source.to_s)
  !uri.scheme.nil? && !uri.host.nil?
rescue URI::InvalidURIError
  false
end
|
72
|
+
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
@@ -4,19 +4,28 @@ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
|
4
4
|
module Downer
|
5
5
|
describe Downer do
|
6
6
|
describe "#run!" do
|
7
|
+
before(:each) do
|
8
|
+
@output = double('output').as_null_object
|
9
|
+
@app = Application.new(@output)
|
10
|
+
end
|
11
|
+
|
7
12
|
it "when run without arguments displays a usage hint" do
|
8
|
-
output
|
9
|
-
app
|
10
|
-
output.should_receive(:puts).with('Usage: downer URL_SOURCE DESTINATION_DIR')
|
11
|
-
app.run!
|
13
|
+
@output.should_receive(:puts).with('Usage: downer -flags URL_SOURCE DESTINATION_DIR')
|
14
|
+
@app.run!
|
12
15
|
end
|
13
16
|
|
14
|
-
it "when run with
|
15
|
-
output
|
16
|
-
app
|
17
|
-
|
18
|
-
app.run!(fixture_directory + "/some_images.txt", '/tmp')
|
17
|
+
it "when run with -i argument it will download only images" do
|
18
|
+
@output.should_receive(:puts).with("Images only filter selected...downloading PNG,JPG,GIF, and TIFF files")
|
19
|
+
@app.run!("-i")
|
20
|
+
@app.options[:images_only].should == true
|
19
21
|
end
|
22
|
+
|
23
|
+
# it "when run with a -w DATA_SOURCE argument it should start a web download" do
|
24
|
+
# host = "http://www.urbaninfluence.com"
|
25
|
+
# @output.should_receive(:puts).with("Requesting from host #{host}")
|
26
|
+
# arg_cmd = %w{-wi http://www.urbaninfluence.com /tmp}
|
27
|
+
# @app.run!(arg_cmd)
|
28
|
+
# end
|
20
29
|
end
|
21
30
|
end
|
22
31
|
end
|
@@ -1,6 +1,7 @@
|
|
1
1
|
require File.expand_path(File.dirname(__FILE__) + '../../spec_helper')
|
2
2
|
|
3
3
|
module Downer
|
4
|
+
|
4
5
|
describe DownloadManager do
|
5
6
|
let(:output) { double('output').as_null_object }
|
6
7
|
let(:fixture_file_path) { fixture_directory + '/some_images.txt' }
|
@@ -10,31 +11,48 @@ module Downer
|
|
10
11
|
it "should add a slash as the last character of the target directory if one is not present" do
|
11
12
|
manager.target_directory.should == 'myoutputdir/'
|
12
13
|
end
|
13
|
-
|
14
|
-
it "
|
15
|
-
|
14
|
+
|
15
|
+
it "when passed an option hash containing :images_only, only images will be downloaded for this session" do
|
16
|
+
mgr = DownloadManager.new("http://localhost/basic_page.html", '/tmp', output, {:images_only => true})
|
17
|
+
mgr.strategy.options[:images_only].should == true
|
16
18
|
end
|
17
19
|
end
|
18
20
|
|
19
21
|
describe "#start" do
|
20
22
|
|
21
|
-
it "should raise an error when the file does not exist" do
|
22
|
-
File.should_receive(:exists?).with(fixture_file_path)
|
23
|
-
lambda { manager.start }.should raise_error(Downer::URLSourceDoesNotExist)
|
24
|
-
end
|
25
23
|
it "should raise a WriteFailed exception if the target directory is not writable" do
|
26
24
|
File.should_receive(:writable?).with('myoutputdir/').and_return(false)
|
27
25
|
lambda { manager.start }.should raise_error(Downer::WriteFailed)
|
28
26
|
end
|
29
|
-
|
27
|
+
|
28
|
+
it "should create the specified directory if it does not exist and its parent directory is writable"
|
29
|
+
## do
|
30
|
+
# if File.exist?('/tmp/new-directory')
|
31
|
+
# FileUtils.rm_rf('/tmp/new-directory')
|
32
|
+
# end
|
33
|
+
# create_mgr = DownloadManager.new(fixture_file_path, '/tmp/new-directory', output)
|
34
|
+
# create_mgr.start
|
35
|
+
# File.exist?('/tmp/new-directory').should == true
|
36
|
+
# end
|
30
37
|
|
31
38
|
it "should create a download worker to begin the downloading" do
|
32
39
|
File.should_receive(:writable?).and_return(true)
|
33
|
-
worker = double('worker')
|
40
|
+
worker = double('worker').as_null_object
|
34
41
|
worker.should_receive(:start)
|
35
42
|
DownloadWorker.should_receive(:new).and_return(worker)
|
36
43
|
manager.start
|
37
44
|
end
|
45
|
+
|
46
|
+
it "should resolve to use a flat file strategy when it receives a file data source" do
|
47
|
+
dm = DownloadManager.new(fixture_file_path, 'myoutputdir', output)
|
48
|
+
dm.source_type.should == "flatfile"
|
49
|
+
end
|
50
|
+
|
51
|
+
it "should resolve to use a web strategy when it receives a data source that looks like a url" do
|
52
|
+
dm = DownloadManager.new('http://www.msn.com', 'myoutputdir', output)
|
53
|
+
dm.source_type.should == "website"
|
54
|
+
end
|
55
|
+
|
38
56
|
end
|
39
57
|
|
40
58
|
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '../../spec_helper')
|
2
|
+
|
3
|
+
module Downer
|
4
|
+
describe StrategyFinder do
|
5
|
+
describe "Class method find_strategy" do
|
6
|
+
it "should return flat file strategy when url source is a local file" do
|
7
|
+
strategy = StrategyFinder.find_strategy(fixture_directory + '/some_images.txt')
|
8
|
+
strategy.should respond_to :get_urls
|
9
|
+
strategy.source_type.should == 'flatfile'
|
10
|
+
end
|
11
|
+
|
12
|
+
it "should return a website strategy when url source is a web resource" do
|
13
|
+
strategy = StrategyFinder.find_strategy('http://www.example.com')
|
14
|
+
strategy.should respond_to :get_urls
|
15
|
+
strategy.source_type.should == 'website'
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
@@ -3,7 +3,7 @@ require File.expand_path(File.dirname(__FILE__) + '../../spec_helper')
|
|
3
3
|
module Downer
|
4
4
|
describe DownloadWorker do
|
5
5
|
let (:output) { double('output') }
|
6
|
-
let (:urls) {
|
6
|
+
let (:urls) { DownloadStrategy::FlatFileStrategy.new(fixture_directory + "/some_images.txt").get_urls }
|
7
7
|
|
8
8
|
describe '#start' do
|
9
9
|
it "should write a message to output when no urls exist to be downloaded" do
|
@@ -12,6 +12,12 @@ module Downer
|
|
12
12
|
worker.start
|
13
13
|
end
|
14
14
|
|
15
|
+
it "should ignore arrays with nils" do
|
16
|
+
output.should_receive(:puts).with("No URLs specified, exiting.")
|
17
|
+
worker = DownloadWorker.new([nil,nil], '/tmp', output)
|
18
|
+
worker.start
|
19
|
+
end
|
20
|
+
|
15
21
|
it "should create a download object for each url to be downloaded" do
|
16
22
|
worker = DownloadWorker.new(urls, '/tmp', output)
|
17
23
|
worker.items.size.should == urls.size
|
@@ -21,14 +27,16 @@ module Downer
|
|
21
27
|
bad_url = "http://www.urbaninfluence.com/will_never_succeed"
|
22
28
|
worker = DownloadWorker.new([bad_url], '/tmp', output)
|
23
29
|
output.should_receive(:puts).with("Could not download #{bad_url}, received http code 404")
|
24
|
-
worker.start
|
30
|
+
results = worker.start
|
31
|
+
results.size.should == 0
|
25
32
|
end
|
26
33
|
|
27
34
|
it "should write a message to output feed when a url is successfully downloaded" do
|
28
35
|
good_url = "http://www.urbaninfluence.com/sites/default/files/user_uploads/images/4th.png"
|
29
36
|
worker = DownloadWorker.new([good_url], '/tmp', output)
|
30
37
|
output.should_receive(:puts).with("Downloaded #{good_url}")
|
31
|
-
worker.start
|
38
|
+
results = worker.start
|
39
|
+
results.size.should == 1
|
32
40
|
end
|
33
41
|
end
|
34
42
|
end
|
@@ -0,0 +1,10 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
+
|
3
|
+
module Downer
|
4
|
+
describe Options do
|
5
|
+
it "when passed '-i' the option image_only will be set to true" do
|
6
|
+
op_struct = Options.new(['-i'])
|
7
|
+
op_struct[:images_only].should == true
|
8
|
+
end
|
9
|
+
end
|
10
|
+
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '../../../spec_helper')
|
2
|
+
|
3
|
+
module Downer
|
4
|
+
module DownloadStrategy
|
5
|
+
describe FlatFileStrategy do
|
6
|
+
|
7
|
+
describe "#get_urls" do
|
8
|
+
before(:each) do
|
9
|
+
@flat_file_strategy = FlatFileStrategy.new(fixture_directory + '/some_images.txt')
|
10
|
+
end
|
11
|
+
|
12
|
+
it "should retrieve all urls from a text file" do
|
13
|
+
@flat_file_strategy.get_urls.size.should == 4
|
14
|
+
end
|
15
|
+
|
16
|
+
end
|
17
|
+
|
18
|
+
describe "#source_valid?" do
|
19
|
+
it "should return false when the local file does not exist" do
|
20
|
+
flat_file_strategy = FlatFileStrategy.new('foobar')
|
21
|
+
flat_file_strategy.source_valid?.should == false
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '../../../spec_helper')
|
2
|
+
|
3
|
+
module Downer
|
4
|
+
module DownloadStrategy
|
5
|
+
describe WebsiteStrategy do
|
6
|
+
|
7
|
+
it "should automatically prepend the host for relative urls" do
|
8
|
+
@web_strategy = WebsiteStrategy.new('http://www.example.com')
|
9
|
+
downloaded_page_mock = IO.read(fixture_directory + '/basic_page.html')
|
10
|
+
@web_strategy.should_receive(:download_page).and_return(downloaded_page_mock)
|
11
|
+
@web_strategy.get_urls.should include('http://www.example.com/clickhere.html')
|
12
|
+
end
|
13
|
+
|
14
|
+
describe "default get_urls behavior" do
|
15
|
+
before(:each) do
|
16
|
+
@web_strategy = WebsiteStrategy.new('http://www.example.com')
|
17
|
+
downloaded_page_mock = IO.read(fixture_directory + '/basic_page.html')
|
18
|
+
@web_strategy.should_receive(:download_page).and_return(downloaded_page_mock)
|
19
|
+
end
|
20
|
+
|
21
|
+
it "should retrieve 5 urls fom basic_page.html fixture" do
|
22
|
+
@web_strategy.get_urls.size.should == 5
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
describe "get images only behavior" do
|
27
|
+
before(:each) do
|
28
|
+
@web_strategy = WebsiteStrategy.new('http://www.example.com', {:images_only => true} )
|
29
|
+
downloaded_page_mock = IO.read(fixture_directory + '/basic_page.html')
|
30
|
+
@web_strategy.should_receive(:download_page).and_return(downloaded_page_mock)
|
31
|
+
end
|
32
|
+
|
33
|
+
it "should retrieve 3 urls from basic_page.html fixture with images only mode enabled" do
|
34
|
+
@web_strategy.get_urls.size.should == 3
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
describe "image searching" do
|
39
|
+
it "should skip duplicate urls" do
|
40
|
+
@web_strategy = WebsiteStrategy.new('http://www.example.com', :images_only => true)
|
41
|
+
downloaded_page_mock = <<-PAGE
|
42
|
+
<img src="/sites/image.png" />
|
43
|
+
<img src="/sites/image.png" />
|
44
|
+
PAGE
|
45
|
+
@web_strategy.should_receive(:download_page).and_return(downloaded_page_mock)
|
46
|
+
@web_strategy.get_urls.size.should == 1
|
47
|
+
end
|
48
|
+
|
49
|
+
it "should retrieve 3 images from basic_page.html fixture" do
|
50
|
+
@web_strategy = WebsiteStrategy.new('http://www.example.com', :images_only => true)
|
51
|
+
downloaded_page_mock = IO.read(fixture_directory + '/basic_page.html')
|
52
|
+
@web_strategy.should_receive(:download_page).and_return(downloaded_page_mock)
|
53
|
+
@web_strategy.get_urls.size.should == 3
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
58
|
+
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
<html>
|
2
|
+
<head>
|
3
|
+
<title>Hello, Page</title>
|
4
|
+
</head>
|
5
|
+
|
6
|
+
<body>
|
7
|
+
<p>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>
|
8
|
+
|
9
|
+
<img width="186" height="65" title="Urban Influence Brand Studio" alt="Urban Influence Brand Studio" src="/sites/all/themes/urbaninfluence/images/ui-logo-orange.png">
|
10
|
+
|
11
|
+
<img style="width: 600px; height: 1083px;" src="/sites/default/files/user_uploads/images/Essentials01(1).jpg" alt="Essentials">
|
12
|
+
|
13
|
+
<IMG style="width: 600px; height: 133px; border-width: 0px; border-style: solid;" src="/sites/default/files/user_uploads/images/RatRace.tiff" alt="Workin For Our Money!">
|
14
|
+
|
15
|
+
<a href="http://www.boobs.com">Free boobs!</a>
|
16
|
+
<a href="/clickhere.html">Click me!!!</a>
|
17
|
+
</body>
|
18
|
+
</html>
|
data/version.yml
CHANGED
metadata
CHANGED
@@ -5,9 +5,9 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
-
|
9
|
-
-
|
10
|
-
version: 0.
|
8
|
+
- 3
|
9
|
+
- 0
|
10
|
+
version: 0.3.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Nate Miller
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-
|
18
|
+
date: 2010-08-08 00:00:00 -07:00
|
19
19
|
default_executable: downer
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -34,6 +34,20 @@ dependencies:
|
|
34
34
|
version: 1.2.9
|
35
35
|
type: :development
|
36
36
|
version_requirements: *id001
|
37
|
+
- !ruby/object:Gem::Dependency
|
38
|
+
name: nokogiri
|
39
|
+
prerelease: false
|
40
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ">="
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
hash: 3
|
46
|
+
segments:
|
47
|
+
- 0
|
48
|
+
version: "0"
|
49
|
+
type: :runtime
|
50
|
+
version_requirements: *id002
|
37
51
|
description: Downer is a tool used to download a list of urls from a website thorugh HTTP.
|
38
52
|
email: nate@natemiller.org
|
39
53
|
executables:
|
@@ -59,13 +73,22 @@ files:
|
|
59
73
|
- lib/downer/application.rb
|
60
74
|
- lib/downer/download_item.rb
|
61
75
|
- lib/downer/download_manager.rb
|
76
|
+
- lib/downer/download_strategy.rb
|
62
77
|
- lib/downer/download_worker.rb
|
78
|
+
- lib/downer/generic_strategy.rb
|
63
79
|
- lib/downer/options.rb
|
80
|
+
- lib/downer/strategies/flat_file_strategy.rb
|
81
|
+
- lib/downer/strategies/website_strategy.rb
|
64
82
|
- spec/downer/application_spec.rb
|
65
83
|
- spec/downer/download_item_spec.rb
|
66
84
|
- spec/downer/download_manager_spec.rb
|
85
|
+
- spec/downer/download_strategy_spec.rb
|
67
86
|
- spec/downer/download_worker_spec.rb
|
68
87
|
- spec/downer/generator_spec.rb
|
88
|
+
- spec/downer/options_spec.rb
|
89
|
+
- spec/downer/strategies/flat_file_stragtegy_spec.rb
|
90
|
+
- spec/downer/strategies/website_strategy_spec.rb
|
91
|
+
- spec/fixtures/basic_page.html
|
69
92
|
- spec/fixtures/some_images.txt
|
70
93
|
- spec/spec.opts
|
71
94
|
- spec/spec_helper.rb
|
@@ -108,6 +131,10 @@ test_files:
|
|
108
131
|
- spec/downer/application_spec.rb
|
109
132
|
- spec/downer/download_item_spec.rb
|
110
133
|
- spec/downer/download_manager_spec.rb
|
134
|
+
- spec/downer/download_strategy_spec.rb
|
111
135
|
- spec/downer/download_worker_spec.rb
|
112
136
|
- spec/downer/generator_spec.rb
|
137
|
+
- spec/downer/options_spec.rb
|
138
|
+
- spec/downer/strategies/flat_file_stragtegy_spec.rb
|
139
|
+
- spec/downer/strategies/website_strategy_spec.rb
|
113
140
|
- spec/spec_helper.rb
|