downer 0.2.2 → 0.3.0

data/.gitignore CHANGED
@@ -19,3 +19,4 @@ rdoc
  pkg
 
  ## PROJECT::SPECIFIC
+ *._*
data/Rakefile CHANGED
@@ -11,6 +11,7 @@ begin
  gem.homepage = "http://github.com/nate63179/downer"
  gem.authors = ["Nate Miller"]
  gem.add_development_dependency "rspec", ">= 1.2.9"
+ gem.add_dependency 'nokogiri'
  # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
data/downer.gemspec CHANGED
@@ -5,11 +5,11 @@
 
  Gem::Specification.new do |s|
  s.name = %q{downer}
- s.version = "0.2.2"
+ s.version = "0.3.0"
 
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Nate Miller"]
- s.date = %q{2010-07-16}
+ s.date = %q{2010-08-08}
  s.default_executable = %q{downer}
  s.description = %q{Downer is a tool used to download a list of urls from a website thorugh HTTP.}
  s.email = %q{nate@natemiller.org}
@@ -34,13 +34,22 @@ Gem::Specification.new do |s|
  "lib/downer/application.rb",
  "lib/downer/download_item.rb",
  "lib/downer/download_manager.rb",
+ "lib/downer/download_strategy.rb",
  "lib/downer/download_worker.rb",
+ "lib/downer/generic_strategy.rb",
  "lib/downer/options.rb",
+ "lib/downer/strategies/flat_file_strategy.rb",
+ "lib/downer/strategies/website_strategy.rb",
  "spec/downer/application_spec.rb",
  "spec/downer/download_item_spec.rb",
  "spec/downer/download_manager_spec.rb",
+ "spec/downer/download_strategy_spec.rb",
  "spec/downer/download_worker_spec.rb",
  "spec/downer/generator_spec.rb",
+ "spec/downer/options_spec.rb",
+ "spec/downer/strategies/flat_file_stragtegy_spec.rb",
+ "spec/downer/strategies/website_strategy_spec.rb",
+ "spec/fixtures/basic_page.html",
  "spec/fixtures/some_images.txt",
  "spec/spec.opts",
  "spec/spec_helper.rb",
@@ -55,8 +64,12 @@ Gem::Specification.new do |s|
  "spec/downer/application_spec.rb",
  "spec/downer/download_item_spec.rb",
  "spec/downer/download_manager_spec.rb",
+ "spec/downer/download_strategy_spec.rb",
  "spec/downer/download_worker_spec.rb",
  "spec/downer/generator_spec.rb",
+ "spec/downer/options_spec.rb",
+ "spec/downer/strategies/flat_file_stragtegy_spec.rb",
+ "spec/downer/strategies/website_strategy_spec.rb",
  "spec/spec_helper.rb"
  ]
 
@@ -66,11 +79,14 @@ Gem::Specification.new do |s|
 
  if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
  s.add_development_dependency(%q<rspec>, [">= 1.2.9"])
+ s.add_runtime_dependency(%q<nokogiri>, [">= 0"])
  else
  s.add_dependency(%q<rspec>, [">= 1.2.9"])
+ s.add_dependency(%q<nokogiri>, [">= 0"])
  end
  else
  s.add_dependency(%q<rspec>, [">= 1.2.9"])
+ s.add_dependency(%q<nokogiri>, [">= 0"])
  end
  end
 
data/lib/downer.rb CHANGED
@@ -1,8 +1,7 @@
+ require 'rubygems'
  require 'net/http'
+ require 'nokogiri'
  require 'optparse'
+ require 'open-uri'
 
- require 'downer/application'
- require 'downer/download_manager'
- require 'downer/download_worker'
- require 'downer/download_item'
- require 'downer/options'
+ Dir.glob(File.dirname(__FILE__) + '/**/*.rb').each { |file| require file }
data/lib/downer/application.rb CHANGED
@@ -2,6 +2,7 @@ module Downer
  class Application
 
  attr_accessor :output
+ attr_reader :options
 
  def initialize(output = nil)
  @output = (output) ? output : $stdout
@@ -10,18 +11,29 @@ module Downer
 
  def run!(*arguments)
  @options = Downer::Options.new(arguments)
+ download_options = {}
 
+
+
+ # begin analysis of arguments
  if @options[:invalid_argument]
  @output.puts @options[:invalid_argument]
  @options[:show_help] = true
  end
 
+ if @options[:images_only]
+ print_image_only_message
+ download_options[:images_only] = true
+ end
+
+ # Immediately exit if this will never complete
  return exit_with_help_banner if @options[:file_manifest].nil?
  return exit_with_help_banner if @options[:target_directory].nil?
  return exit_with_help_banner if @options[:show_help]
 
+
  begin
- manager = Downer::DownloadManager.new(@options[:file_manifest], @options[:target_directory], @output)
+ manager = Downer::DownloadManager.new(@options[:file_manifest], @options[:target_directory], @output, download_options)
  manager.start
  return 0
  rescue Downer::WriteFailed
@@ -31,6 +43,10 @@ module Downer
  end
 
  private
+
+ def print_image_only_message
+ @output.puts "Images only filter selected...downloading PNG,JPG,GIF, and TIFF files"
+ end
 
  def exit_with_help_banner
  @output.puts @options.opts.banner
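
Note: the new flags flow from Application#run! into the options hash handed to DownloadManager. A minimal usage sketch, assuming the positional source and target arguments are still parsed as in 0.2.2; the URL and directory are illustrative:

  require 'downer'

  app = Downer::Application.new($stdout)
  # '-w' declares the source is a web page, '-i' restricts the session to image files
  exit_code = app.run!('-w', '-i', 'http://www.example.com', '/tmp/downloads')
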
data/lib/downer/download_item.rb CHANGED
@@ -25,18 +25,23 @@ module Downer
 
  def download
  @http = Net::HTTP.new(@uri.host, @uri.port)
- req = Net::HTTP::Get.new(@uri.request_uri)
- response = @http.request(req)
-
- if response.code != '200'
- fd = FailedDownload.new
- fd.http_code = response.code
- fd.url = @url
- raise fd
+ if @uri.respond_to? :request_uri
+ req = Net::HTTP::Get.new(@uri.request_uri)
+ response = @http.request(req)
+
+ if response.code != '200'
+ fd = FailedDownload.new
+ fd.http_code = response.code
+ fd.url = @url
+ raise fd
+ else
+ @content = response.body
+ write_to_file
+ end
  else
- @content = response.body
- write_to_file
+ puts "WARNING: Ignored download for #{@uri.inspect}"
  end
+
  end
 
  private
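
The new respond_to?(:request_uri) guard skips entries that did not parse into an HTTP(S) URI, since only URI::HTTP exposes #request_uri. A small illustration of the distinction (URLs are illustrative, not from the gem):

  require 'uri'

  URI.parse('http://www.example.com/logo.png').respond_to?(:request_uri)  # => true  (URI::HTTP)
  URI.parse('/sites/logo.png').respond_to?(:request_uri)                  # => false (URI::Generic)
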
data/lib/downer/download_manager.rb CHANGED
@@ -5,34 +5,69 @@ module Downer
  class NoManifestFileGiven < StandardError; end
  class NoTargetDirectoryGiven < StandardError; end
  class URLSourceDoesNotExist < StandardError; end
+
 
  class DownloadManager
 
- attr_accessor :file_manifest, :target_directory, :output
- attr_reader :urls
+ attr_accessor :target_directory, :output, :source_type
+ attr_reader :urls, :downloaded_files, :strategy
 
- def initialize(url_source, target_directory, output)
- @file_manifest, @target_directory, @output = url_source, target_directory, output
- dir_last_char_is_slash = (@target_directory[-1,1] == '/')
- @target_directory = @target_directory + '/' unless dir_last_char_is_slash
- @urls = []
- raise URLSourceDoesNotExist unless File.exists?(@file_manifest)
- get_urls
+ def initialize(url_source, target_directory, output, options ={})
+ @url_source = url_source
+ @output = output
+ @target_directory = append_slash_to_path(target_directory)
+ @strategy = StrategyFinder::find_strategy(@url_source, options)
  end
 
  def start
- raise WriteFailed unless File.writable?(@target_directory)
-
- @output.puts "Starting download on #{@urls.size} files"
- worker = DownloadWorker.new(@urls, @target_directory, @output)
- worker.start
+ check_directory
+
+ if @strategy && @strategy.source_valid?
+ urls = @strategy.get_urls
+ @output.puts "Starting download on #{urls.size} files"
+ worker = DownloadWorker.new(urls, @target_directory, @output)
+ @downloaded_files = worker.start
+ print_session_summary(worker)
+ else
+ @output.puts "Could not open url source #{@url_source}"
+ end
  end
 
- protected
+ # Tells us what worked and what failed
+ def print_session_summary(worker)
+ @output.puts "Session complete."
+ @output.puts "\n\n"
+ @output.puts "----------------------------------"
+ @output.puts "SUMMARY:"
+ @output.puts "Successful downloads: (#{worker.successful_downloads.size})"
+ @output.puts "Failed downloads: (#{worker.failed_downloads.size}): "
+ worker.failed_downloads.each do |fail|
+ @output.puts " * #{fail}"
+ end
+ end
 
- def get_urls
- f = File.open(@file_manifest, 'r')
- f.each_line { |line| @urls << line.chomp }
+ def source_type
+ @strategy.source_type
+ end
+
+ # Determine whether the direcotry specified exists. If it does not, see if its parent is writable. If not, give up
+ # and throw error
+ def check_directory
+ if File.writable?(@target_directory)
+ return true
+ else
+ #FileUtils.mkdir(@target_directory)
+ raise WriteFailed
+ end
+ end
+
+ protected
+
+ # Put a slash on the directory name if one was ommited
+ def append_slash_to_path(dir_name)
+ dir_last_char_is_slash = (dir_name[-1,1] == '/')
+ dir_name = dir_name + '/' unless dir_last_char_is_slash
+ dir_name
  end
 
  end
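
A sketch of the reworked DownloadManager API: the constructor now takes the url source plus an options hash, and the strategy is chosen from the source. The manifest name, host, and target directory below are illustrative:

  # Local manifest: resolves to a FlatFileStrategy because the path exists on disk.
  manager = Downer::DownloadManager.new('urls.txt', '/tmp/downloads', $stdout)
  manager.start

  # Web page source with the images-only option: resolves to a WebsiteStrategy.
  web = Downer::DownloadManager.new('http://www.example.com', '/tmp/downloads', $stdout,
                                    :images_only => true)
  web.start
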
data/lib/downer/download_strategy.rb ADDED
@@ -0,0 +1,34 @@
+ module Downer
+
+ class StrategyFinder
+ class << self
+
+ # Determines a strategy for extracting urls from a media type
+ def find_strategy(url_source, options ={})
+ strategy = nil
+
+ if is_local_file?(url_source)
+ strategy = DownloadStrategy::FlatFileStrategy.new(url_source, options)
+ elsif is_remote_source?(url_source)
+ strategy = DownloadStrategy::WebsiteStrategy.new(url_source, options)
+ else
+ raise "Could not find strategy"
+ end
+ strategy
+ end
+
+ # Determine whether the source is located on a local file system
+ def is_local_file?(url_source)
+ File.exist?(url_source)
+ end
+
+ # Determine if this is something that lives online
+ def is_remote_source?(url_source)
+ #if url_source =~ /(ftp|https?).*$/
+ url_source.match(/(ftp|https?).*$/) ? true : false
+ end
+
+ end
+ end
+
+ end
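
StrategyFinder picks a strategy by inspecting the source: an existing local file gets a FlatFileStrategy, anything matching the ftp/http(s) pattern gets a WebsiteStrategy, and everything else raises. A sketch; the manifest path and URL are illustrative:

  Downer::StrategyFinder.find_strategy('urls.txt')
  # => a DownloadStrategy::FlatFileStrategy when urls.txt exists locally

  Downer::StrategyFinder.find_strategy('http://www.example.com', :images_only => true)
  # => a DownloadStrategy::WebsiteStrategy, with the option hash passed through
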
data/lib/downer/download_worker.rb CHANGED
@@ -1,11 +1,15 @@
  module Downer
  class DownloadWorker
 
- attr_reader :items
+ attr_reader :items, :successful_downloads, :failed_downloads
 
  def initialize(urls, target_directory, output)
  @urls, @target_directory, @output = urls, target_directory, output
+ @urls.delete_if { |url| url == nil }
  @items = []
+ @successful_downloads = []
+ @failed_downloads = []
+
  @urls.each { |url| @items << DownloadItem.new(url, target_directory) }
  end
 
@@ -16,6 +20,7 @@ module Downer
  end
 
  @items.each { |item| try_download_item(item) }
+ successful_downloads
  end
 
  private
@@ -23,9 +28,14 @@ module Downer
  def try_download_item(item)
  begin
  item.download
+ @successful_downloads << item.url
  @output.puts "Downloaded #{item.url}"
  rescue Downer::FailedDownload => e
  @output.puts "Could not download #{e.url}, received http code #{e.http_code}"
+ @failed_downloads << item.url
+ rescue SocketError => e
+ @output.puts "SocketError encountered on url #{item.url}"
+ @failed_downloads << item.url
  end
  end
  end
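
The worker now reports its results instead of only printing them. A sketch of the new bookkeeping; the URLs are illustrative:

  worker = Downer::DownloadWorker.new(['http://www.example.com/a.png', nil], '/tmp/', $stdout)
  downloaded = worker.start      # nil entries are dropped before DownloadItems are built
  worker.successful_downloads    # urls that downloaded and were written to disk
  worker.failed_downloads        # urls that raised FailedDownload or SocketError
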
data/lib/downer/generic_strategy.rb ADDED
@@ -0,0 +1,31 @@
+ module Downer
+
+ class SubclassMethodUndefined < StandardError; end
+
+ class GenericStrategy
+
+ def initialize(source, search_options = {})
+ @url_source = source
+ @search_options = search_options
+ end
+
+ def options
+ @search_options
+ end
+
+ def get_urls
+ raise SubclassMethodUndefined
+ end
+
+ def source_valid?
+ raise SubclassMethodUndefined
+ end
+
+ def source_type
+ name = self.class.name.gsub(/Downer::DownloadStrategy::/,'')
+ name.gsub(/Strategy/,'').downcase
+ end
+ end
+
+ end
+
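
GenericStrategy defines the contract every strategy must satisfy: subclasses implement get_urls and source_valid?, otherwise SubclassMethodUndefined is raised. A hypothetical custom strategy following that contract (not part of the gem):

  module Downer
    module DownloadStrategy
      # Illustrative only: reads URLs from standard input.
      class StdinStrategy < GenericStrategy
        def get_urls
          $stdin.readlines.map { |line| line.chomp }
        end

        def source_valid?
          true
        end
      end
    end
  end

With this in place, GenericStrategy#source_type would derive "stdin" from the class name automatically.
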
data/lib/downer/options.rb CHANGED
@@ -3,10 +3,20 @@ module Downer
  attr_reader :opts, :orig_args
 
  def initialize(args)
-
- @orig_args = args.clone
+ @opts = args.clone
+ self[:is_website] = false
+ self[:images_only] = false
+
  @opts = OptionParser.new do |o|
- o.banner = "Usage: downer URL_SOURCE DESTINATION_DIR"
+ o.banner = "Usage: downer -flags URL_SOURCE DESTINATION_DIR"
+
+ o.on('-w', '--web', 'Declare source as a url') do |url|
+ self[:is_website] = true
+ end
+
+ o.on('-i', '--image', 'When combined with w will download JPG,GIF,PNG formats') do
+ self[:images_only] = true
+ end
  end
 
  begin
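
A sketch of how the new flags surface on the Options struct; the arguments are illustrative:

  opts = Downer::Options.new(['-w', '-i', 'http://www.example.com', '/tmp'])
  opts[:is_website]    # => true, set by -w/--web
  opts[:images_only]   # => true, set by -i/--image
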
data/lib/downer/strategies/flat_file_strategy.rb ADDED
@@ -0,0 +1,19 @@
+ module Downer
+ module DownloadStrategy
+
+ class FlatFileStrategy < GenericStrategy
+
+ def get_urls
+ urls = []
+ f = File.open(@url_source, 'r')
+ f.each_line { |line| urls << line.chomp}
+ urls
+ end
+
+ def source_valid?
+ File.exist?(@url_source)
+ end
+
+ end
+ end
+ end
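
FlatFileStrategy treats the source as a plain-text manifest, one URL per line. A usage sketch; the manifest name is illustrative:

  strategy = Downer::DownloadStrategy::FlatFileStrategy.new('some_images.txt')
  strategy.source_valid?   # => true only when the file exists on disk
  strategy.get_urls        # => one entry per line, with trailing newlines chomped
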
data/lib/downer/strategies/website_strategy.rb ADDED
@@ -0,0 +1,75 @@
+ module Downer
+ module DownloadStrategy
+
+ class WebsiteStrategy < GenericStrategy
+
+ # Create the downloading strategy, set any behavior flags in the options hash
+ def initialize(url_source, search_options = {})
+ super(url_source, search_options)
+ uri = URI.parse(url_source)
+ @host_prefix = uri.scheme + "://" + uri.host
+ end
+
+ # Retrieve urls from an HTML page. Behavior is dependent upon options passed
+ # to constructor
+ def get_urls
+ @noko = Nokogiri::HTML(download_page)
+ urls = []
+
+ if @search_options[:images_only]
+ urls = image_urls
+ else
+ urls = urls.concat document_links
+ urls = urls.concat image_urls
+ end
+ urls.uniq
+ end
+
+ # read an html page into memory
+ def download_page
+ @downloaded_page ||= open(@url_source)
+ end
+
+ # Return all image urls from document
+ def image_urls
+ urls = []
+ @noko.css('img').each do |img|
+ urls << absolutify_link(img['src'])
+ end
+ urls
+ end
+
+ # Return all links stored within the document
+ def document_links
+ urls = []
+ @noko.css('a').each do |alink|
+ link = alink['href']
+ urls << absolutify_link(link)
+ end
+ urls
+ end
+
+ # Converts non absolute urls to absolute ones
+ def absolutify_link(link)
+
+ # Auto prepend any links which refer use releative reference like '../'
+ if link[0,1] == '.'
+ link = '/' + link
+ end
+
+ if link =~ /(https?|ftp).*/
+ url = link
+ elsif link[0,1] != '/'
+ link = "/" + link
+ else
+ url = @host_prefix + link
+ end
+ end
+
+ def source_valid?
+ URI.parse(@url_source)
+ end
+
+ end
+ end
+ end
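
WebsiteStrategy fetches the page with open-uri, parses it with Nokogiri, and collects <a href> and <img src> values (images only when :images_only is set), de-duplicated and made absolute against the source host. A usage sketch; the URL is illustrative and get_urls performs a real HTTP fetch:

  strategy = Downer::DownloadStrategy::WebsiteStrategy.new('http://www.example.com',
                                                           :images_only => true)
  strategy.source_valid?   # truthy when URI.parse accepts the source
  strategy.get_urls        # unique image urls, e.g. "http://www.example.com/sites/logo.png"
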
data/spec/downer/application_spec.rb CHANGED
@@ -4,19 +4,28 @@ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
  module Downer
  describe Downer do
  describe "#run!" do
+ before(:each) do
+ @output = double('output').as_null_object
+ @app = Application.new(@output)
+ end
+
  it "when run without arguments displays a usage hint" do
- output = double('output')
- app = Application.new(output)
- output.should_receive(:puts).with('Usage: downer URL_SOURCE DESTINATION_DIR')
- app.run!
+ @output.should_receive(:puts).with('Usage: downer -flags URL_SOURCE DESTINATION_DIR')
+ @app.run!
  end
 
- it "when run with valid arguments displays the number of files to download" do
- output = double('output').as_null_object
- app = Application.new(output)
- output.should_receive(:puts).with("Starting download on 4 files")
- app.run!(fixture_directory + "/some_images.txt", '/tmp')
+ it "when run with -i argument it will download only images" do
+ @output.should_receive(:puts).with("Images only filter selected...downloading PNG,JPG,GIF, and TIFF files")
+ @app.run!("-i")
+ @app.options[:images_only].should == true
  end
+
+ # it "when run with a -w DATA_SOURCE argument it should start a web download" do
+ # host = "http://www.urbaninfluence.com"
+ # @output.should_receive(:puts).with("Requesting from host #{host}")
+ # arg_cmd = %w{-wi http://www.urbaninfluence.com /tmp}
+ # @app.run!(arg_cmd)
+ # end
  end
  end
  end
data/spec/downer/download_manager_spec.rb CHANGED
@@ -1,6 +1,7 @@
  require File.expand_path(File.dirname(__FILE__) + '../../spec_helper')
 
  module Downer
+
  describe DownloadManager do
  let(:output) { double('output').as_null_object }
  let(:fixture_file_path) { fixture_directory + '/some_images.txt' }
@@ -10,31 +11,48 @@ module Downer
  it "should add a slash as the last character of the target directory if one is not present" do
  manager.target_directory.should == 'myoutputdir/'
  end
-
- it "should retrieve all urls from the url source" do
- manager.urls.should_not be_empty
+
+ it "when passed an option hash containing :images_only, only images will be downloaded for this session" do
+ mgr = DownloadManager.new("http://localhost/basic_page.html", '/tmp', output, {:images_only => true})
+ mgr.strategy.options[:images_only].should == true
  end
  end
 
  describe "#start" do
 
- it "should raise an error when the file does not exist" do
- File.should_receive(:exists?).with(fixture_file_path)
- lambda { manager.start }.should raise_error(Downer::URLSourceDoesNotExist)
- end
  it "should raise a WriteFailed exception if the target directory is not writable" do
  File.should_receive(:writable?).with('myoutputdir/').and_return(false)
  lambda { manager.start }.should raise_error(Downer::WriteFailed)
  end
-
+
+ it "should create the specified directory if it does not exist and its parent directory is writable"
+ ## do
+ # if File.exist?('/tmp/new-directory')
+ # FileUtils.rm_rf('/tmp/new-directory')
+ # end
+ # create_mgr = DownloadManager.new(fixture_file_path, '/tmp/new-directory', output)
+ # create_mgr.start
+ # File.exist?('/tmp/new-directory').should == true
+ # end
 
  it "should create a download worker to begin the downloading" do
  File.should_receive(:writable?).and_return(true)
- worker = double('worker')
+ worker = double('worker').as_null_object
  worker.should_receive(:start)
  DownloadWorker.should_receive(:new).and_return(worker)
  manager.start
  end
+
+ it "should resolve to use a flat file strategy when it receives a file data source" do
+ dm = DownloadManager.new(fixture_file_path, 'myoutputdir', output)
+ dm.source_type.should == "flatfile"
+ end
+
+ it "should resolve to use a web strategy when it receives a data source that looks like a url" do
+ dm = DownloadManager.new('http://www.msn.com', 'myoutputdir', output)
+ dm.source_type.should == "website"
+ end
+
  end
 
  end
data/spec/downer/download_strategy_spec.rb ADDED
@@ -0,0 +1,19 @@
+ require File.expand_path(File.dirname(__FILE__) + '../../spec_helper')
+
+ module Downer
+ describe StrategyFinder do
+ describe "Class method find_strategy" do
+ it "should return flat file strategy when url source is a local file" do
+ strategy = StrategyFinder.find_strategy(fixture_directory + '/some_images.txt')
+ strategy.should respond_to :get_urls
+ strategy.source_type.should == 'flatfile'
+ end
+
+ it "should return a website strategy when url source is a web resource" do
+ strategy = StrategyFinder.find_strategy('http://www.example.com')
+ strategy.should respond_to :get_urls
+ strategy.source_type.should == 'website'
+ end
+ end
+ end
+ end
data/spec/downer/download_worker_spec.rb CHANGED
@@ -3,7 +3,7 @@ require File.expand_path(File.dirname(__FILE__) + '../../spec_helper')
  module Downer
  describe DownloadWorker do
  let (:output) { double('output') }
- let (:urls) { DownloadManager.new(fixture_directory + '/some_images.txt', '/tmp', output).urls }
+ let (:urls) { DownloadStrategy::FlatFileStrategy.new(fixture_directory + "/some_images.txt").get_urls }
 
  describe '#start' do
  it "should write a message to output when no urls exist to be downloaded" do
@@ -12,6 +12,12 @@ module Downer
  worker.start
  end
 
+ it "should ignore arrays with nils" do
+ output.should_receive(:puts).with("No URLs specified, exiting.")
+ worker = DownloadWorker.new([nil,nil], '/tmp', output)
+ worker.start
+ end
+
  it "should create a download object for each url to be downloaded" do
  worker = DownloadWorker.new(urls, '/tmp', output)
  worker.items.size.should == urls.size
@@ -21,14 +27,16 @@ module Downer
  bad_url = "http://www.urbaninfluence.com/will_never_succeed"
  worker = DownloadWorker.new([bad_url], '/tmp', output)
  output.should_receive(:puts).with("Could not download #{bad_url}, received http code 404")
- worker.start
+ results = worker.start
+ results.size.should == 0
  end
 
  it "should write a message to output feed when a url is successfully downloaded" do
  good_url = "http://www.urbaninfluence.com/sites/default/files/user_uploads/images/4th.png"
  worker = DownloadWorker.new([good_url], '/tmp', output)
  output.should_receive(:puts).with("Downloaded #{good_url}")
- worker.start
+ results = worker.start
+ results.size.should == 1
  end
  end
  end
data/spec/downer/options_spec.rb ADDED
@@ -0,0 +1,10 @@
+ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
+
+ module Downer
+ describe Options do
+ it "when passed '-i' the option image_only will be set to true" do
+ op_struct = Options.new(['-i'])
+ op_struct[:images_only].should == true
+ end
+ end
+ end
data/spec/downer/strategies/flat_file_stragtegy_spec.rb ADDED
@@ -0,0 +1,26 @@
+ require File.expand_path(File.dirname(__FILE__) + '../../../spec_helper')
+
+ module Downer
+ module DownloadStrategy
+ describe FlatFileStrategy do
+
+ describe "#get_urls" do
+ before(:each) do
+ @flat_file_strategy = FlatFileStrategy.new(fixture_directory + '/some_images.txt')
+ end
+
+ it "should retrieve all urls from a text file" do
+ @flat_file_strategy.get_urls.size.should == 4
+ end
+
+ end
+
+ describe "#source_valid?" do
+ it "should return false when the local file does not exist" do
+ flat_file_strategy = FlatFileStrategy.new('foobar')
+ flat_file_strategy.source_valid?.should == false
+ end
+ end
+ end
+ end
+ end
data/spec/downer/strategies/website_strategy_spec.rb ADDED
@@ -0,0 +1,58 @@
+ require File.expand_path(File.dirname(__FILE__) + '../../../spec_helper')
+
+ module Downer
+ module DownloadStrategy
+ describe WebsiteStrategy do
+
+ it "should automatically prepend the host for relative urls" do
+ @web_strategy = WebsiteStrategy.new('http://www.example.com')
+ downloaded_page_mock = IO.read(fixture_directory + '/basic_page.html')
+ @web_strategy.should_receive(:download_page).and_return(downloaded_page_mock)
+ @web_strategy.get_urls.should include('http://www.example.com/clickhere.html')
+ end
+
+ describe "default get_urls behavior" do
+ before(:each) do
+ @web_strategy = WebsiteStrategy.new('http://www.example.com')
+ downloaded_page_mock = IO.read(fixture_directory + '/basic_page.html')
+ @web_strategy.should_receive(:download_page).and_return(downloaded_page_mock)
+ end
+
+ it "should retrieve 5 urls fom basic_page.html fixture" do
+ @web_strategy.get_urls.size.should == 5
+ end
+ end
+
+ describe "get images only behavior" do
+ before(:each) do
+ @web_strategy = WebsiteStrategy.new('http://www.example.com', {:images_only => true} )
+ downloaded_page_mock = IO.read(fixture_directory + '/basic_page.html')
+ @web_strategy.should_receive(:download_page).and_return(downloaded_page_mock)
+ end
+
+ it "should retrieve 3 urls from basic_page.html fixture with images only mode enabled" do
+ @web_strategy.get_urls.size.should == 3
+ end
+ end
+
+ describe "image searching" do
+ it "should skip duplicate urls" do
+ @web_strategy = WebsiteStrategy.new('http://www.example.com', :images_only => true)
+ downloaded_page_mock = <<-PAGE
+ <img src="/sites/image.png" />
+ <img src="/sites/image.png" />
+ PAGE
+ @web_strategy.should_receive(:download_page).and_return(downloaded_page_mock)
+ @web_strategy.get_urls.size.should == 1
+ end
+
+ it "should retrieve 3 images from basic_page.html fixture" do
+ @web_strategy = WebsiteStrategy.new('http://www.example.com', :images_only => true)
+ downloaded_page_mock = IO.read(fixture_directory + '/basic_page.html')
+ @web_strategy.should_receive(:download_page).and_return(downloaded_page_mock)
+ @web_strategy.get_urls.size.should == 3
+ end
+ end
+ end
+ end
+ end
data/spec/fixtures/basic_page.html ADDED
@@ -0,0 +1,18 @@
+ <html>
+ <head>
+ <title>Hello, Page</title>
+ </head>
+
+ <body>
+ <p>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>
+
+ <img width="186" height="65" title="Urban Influence Brand Studio" alt="Urban Influence Brand Studio" src="/sites/all/themes/urbaninfluence/images/ui-logo-orange.png">
+
+ <img style="width: 600px; height: 1083px;" src="/sites/default/files/user_uploads/images/Essentials01(1).jpg" alt="Essentials">
+
+ <IMG style="width: 600px; height: 133px; border-width: 0px; border-style: solid;" src="/sites/default/files/user_uploads/images/RatRace.tiff" alt="Workin For Our Money!">
+
+ <a href="http://www.boobs.com">Free boobs!</a>
+ <a href="/clickhere.html">Click me!!!</a>
+ </body>
+ </html>
data/version.yml CHANGED
@@ -1,5 +1,5 @@
  ---
- :minor: 2
- :build:
- :patch: 2
+ :patch: 0
  :major: 0
+ :build:
+ :minor: 3
metadata CHANGED
@@ -5,9 +5,9 @@ version: !ruby/object:Gem::Version
  prerelease: false
  segments:
  - 0
- - 2
- - 2
- version: 0.2.2
+ - 3
+ - 0
+ version: 0.3.0
  platform: ruby
  authors:
  - Nate Miller
@@ -15,7 +15,7 @@ autorequire:
  bindir: bin
  cert_chain: []
 
- date: 2010-07-16 00:00:00 -07:00
+ date: 2010-08-08 00:00:00 -07:00
  default_executable: downer
  dependencies:
  - !ruby/object:Gem::Dependency
@@ -34,6 +34,20 @@ dependencies:
  version: 1.2.9
  type: :development
  version_requirements: *id001
+ - !ruby/object:Gem::Dependency
+ name: nokogiri
+ prerelease: false
+ requirement: &id002 !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ">="
+ - !ruby/object:Gem::Version
+ hash: 3
+ segments:
+ - 0
+ version: "0"
+ type: :runtime
+ version_requirements: *id002
  description: Downer is a tool used to download a list of urls from a website thorugh HTTP.
  email: nate@natemiller.org
  executables:
@@ -59,13 +73,22 @@ files:
  - lib/downer/application.rb
  - lib/downer/download_item.rb
  - lib/downer/download_manager.rb
+ - lib/downer/download_strategy.rb
  - lib/downer/download_worker.rb
+ - lib/downer/generic_strategy.rb
  - lib/downer/options.rb
+ - lib/downer/strategies/flat_file_strategy.rb
+ - lib/downer/strategies/website_strategy.rb
  - spec/downer/application_spec.rb
  - spec/downer/download_item_spec.rb
  - spec/downer/download_manager_spec.rb
+ - spec/downer/download_strategy_spec.rb
  - spec/downer/download_worker_spec.rb
  - spec/downer/generator_spec.rb
+ - spec/downer/options_spec.rb
+ - spec/downer/strategies/flat_file_stragtegy_spec.rb
+ - spec/downer/strategies/website_strategy_spec.rb
+ - spec/fixtures/basic_page.html
  - spec/fixtures/some_images.txt
  - spec/spec.opts
  - spec/spec_helper.rb
@@ -108,6 +131,10 @@ test_files:
  - spec/downer/application_spec.rb
  - spec/downer/download_item_spec.rb
  - spec/downer/download_manager_spec.rb
+ - spec/downer/download_strategy_spec.rb
  - spec/downer/download_worker_spec.rb
  - spec/downer/generator_spec.rb
+ - spec/downer/options_spec.rb
+ - spec/downer/strategies/flat_file_stragtegy_spec.rb
+ - spec/downer/strategies/website_strategy_spec.rb
  - spec/spec_helper.rb