wayback_archiver 0.2.0 → 1.0.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: ccebbb815d374658a9b2e1b2998a40041115d295
- data.tar.gz: 153afa895756670988fa3663f44fd1fd5a2e5e3e
+ metadata.gz: 1f9f979d5fa0d31cfdf61660baa3464bdb1425e5
+ data.tar.gz: 1d5701273bbe4d02b2ba5d88f9e75c9477058a28
  SHA512:
- metadata.gz: db7d655b1ea642618797d7fa3ece8357fa608dcb60322da76be7d793de11cd61df61f796025000ed0099edd96711aa78db607ce0614e9256e5921c252a7d931b
- data.tar.gz: 919d8473c7f97bbd36c9065b95a99304ace7e0556a8dcd0f92eedf00d65961400dd92c63d86e405aa97babd5d254b522384f665ff39ababdc71afed9c13333ad
+ metadata.gz: e69883c975584b3120993371b29a0c0b7a71f3fd4210764b4caa712d4071ec175dc47fa63217950968da62485e927c05fcc977dfb076a317be4754cf4f16ec90
+ data.tar.gz: 51f80b591f40f4bc22b5bf2e2b7d56273c3c495db43d8a9268f11048a410ad355a3ff238088a4abd83c69fd585db5e1e704d34d22dac3d4f4b9b2f73089458ac
data/bin/wayback_archiver CHANGED
@@ -1,14 +1,73 @@
  #!/usr/bin/env ruby

+ require 'optparse'
  require 'wayback_archiver'

- url = ARGV[0]
- from = ARGV[1]
+ # Default values
+ urls = nil
+ strategy = 'auto'
+ log = STDOUT
+ log_level = Logger::INFO
+ concurrency = WaybackArchiver.concurrency

- WaybackArchiver.logger = Logger.new(STDOUT)
+ optparse = OptionParser.new do |parser|
+   parser.banner = 'Usage: wayback_archiver [<url>] [options]'

- if from.nil?
-   WaybackArchiver.archive(url)
- else
-   WaybackArchiver.archive(url, from)
+   parser.on('--auto', 'Auto (default)') do |value|
+     strategy = 'auto'
+   end
+
+   parser.on('--crawl', 'Crawl') do |value|
+     strategy = 'crawl'
+   end
+
+   parser.on('--sitemap', 'Sitemap') do |value|
+     strategy = 'sitemap'
+   end
+
+   parser.on('--urls', '--url', 'URL(s)') do |value|
+     strategy = 'urls'
+   end
+
+   parser.on('--concurrency=5', Integer, 'Concurrency') do |value|
+     concurrency = value
+   end
+
+   parser.on('--log=output.log', String, 'Path to desired log file (if no argument is given it defaults to STDOUT)') do |path|
+     log = path
+   end
+
+   parser.on('--[no-]verbose', 'Verbose logs') do |value|
+     log_level = value ? Logger::DEBUG : Logger::WARN
+   end
+
+   # Registered at the tail so the help text appears last in the options summary.
+   parser.on_tail('-h', '--help', 'Show this message') do
+     puts parser
+     exit
+   end
+
+   parser.on_tail('--version', 'Show version') do
+     puts "WaybackArchiver version #{WaybackArchiver::VERSION}"
+     exit
+   end
+ end
+
+ optparse.parse!
+
+ urls = ARGV.map(&:strip).reject(&:empty?)
+ if urls.empty?
+   puts optparse.help
+   raise ArgumentError, "[<url>] is required"
+ end
+
+ WaybackArchiver.logger = Logger.new(log).tap do |logger|
+   logger.progname = 'WaybackArchiver'
+   logger.level = log_level
+ end
+
+ # If no strategy has explicitly been given, then default to 'auto'
+ strategy ||= 'auto'
+ urls.each do |url|
+   WaybackArchiver.archive(url, strategy: strategy, concurrency: concurrency)
  end
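Taken together, the new executable replaces the old positional ARGV interface with an OptionParser-based CLI that accepts multiple URLs. A usage sketch (hypothetical invocations, using only the flags defined above):

    # Default --auto strategy: look for sitemap(s), fall back to crawling
    wayback_archiver example.com
    # Force crawling, 10 parallel threads, verbose logs written to a file
    wayback_archiver example.com --crawl --concurrency=10 --log=archiver.log --verbose
    # Post an explicit list of URLs
    wayback_archiver https://example.com/about https://example.com/contact --urls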
lib/wayback_archiver.rb CHANGED
@@ -1,60 +1,155 @@
- require 'uri'
- require 'net/http'
-
- require 'concurrent'
-
+ require 'wayback_archiver/thread_pool'
  require 'wayback_archiver/null_logger'
  require 'wayback_archiver/version'
  require 'wayback_archiver/url_collector'
  require 'wayback_archiver/archive'
- require 'wayback_archiver/request'
+ require 'wayback_archiver/sitemapper'

- # WaybackArchiver, send URLs to Wayback Machine. By crawling, sitemap, file or single URL.
+ # WaybackArchiver, send URLs to Wayback Machine. By crawling, sitemap or by passing a list of URLs.
  module WaybackArchiver
    # Link to gem on rubygems.org, part of the sent User-Agent
    INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'.freeze
    # WaybackArchiver User-Agent
    USER_AGENT = "WaybackArchiver/#{WaybackArchiver::VERSION} (+#{INFO_LINK})".freeze

+   # Default concurrency for archiving URLs
+   DEFAULT_CONCURRENCY = 5
+
    # Send URLs to Wayback Machine.
-   # @return [Array] with URLs sent to the Wayback Machine.
-   # @param [String] source for URL(s).
-   # @param [String/Symbol] type of source. Supported types: ['crawl', 'sitemap', 'url', 'file'].
+   # @return [Array<String>] of URLs sent to the Wayback Machine.
+   # @param [String, Array<String>] source for URL(s).
+   # @param [String, Symbol] strategy to use. Supported strategies: crawl, sitemap, url, urls, auto.
    # @example Crawl example.com and send all URLs of the same domain
-   #   WaybackArchiver.archive('example.com') # Default type is :crawl
+   #   WaybackArchiver.archive('example.com') # Default strategy is :auto
+   #   WaybackArchiver.archive('example.com', strategy: :auto)
+   #   WaybackArchiver.archive('example.com', strategy: :auto, concurrency: 10)
+   #   WaybackArchiver.archive('example.com', :auto)
+   # @example Crawl example.com and send all URLs of the same domain
+   #   WaybackArchiver.archive('example.com', strategy: :crawl)
+   #   WaybackArchiver.archive('example.com', strategy: :crawl, concurrency: 10)
    #   WaybackArchiver.archive('example.com', :crawl)
+   # @example Send example.com Sitemap URLs
+   #   WaybackArchiver.archive('example.com', strategy: :sitemap)
+   #   WaybackArchiver.archive('example.com', strategy: :sitemap, concurrency: 10)
+   #   WaybackArchiver.archive('example.com', :sitemap)
    # @example Send only example.com
+   #   WaybackArchiver.archive('example.com', strategy: :url)
+   #   WaybackArchiver.archive('example.com', strategy: :url, concurrency: 10)
    #   WaybackArchiver.archive('example.com', :url)
-   # @example Send URL on each line in specified file
-   #   WaybackArchiver.archive('/path/to/file', :file)
-   def self.archive(source, type = :crawl)
-     case type.to_s
-     when 'file' then Archive.post(UrlCollector.file(source))
-     when 'crawl' then crawl(source)
-     when 'sitemap' then Archive.post(UrlCollector.sitemap(source))
-     when 'url' then Archive.post_url(Request.resolve_url(source))
+   def self.archive(source, legacy_strategy = nil, strategy: :auto, concurrency: WaybackArchiver.concurrency)
+     strategy = legacy_strategy || strategy
+
+     case strategy.to_s
+     when 'crawl' then crawl(source, concurrency: concurrency)
+     when 'auto' then auto(source, concurrency: concurrency)
+     when 'sitemap' then sitemap(source, concurrency: concurrency)
+     when 'urls' then urls(source, concurrency: concurrency)
+     when 'url' then urls(source, concurrency: concurrency)
      else
-       raise ArgumentError, "Unknown type: '#{type}'. Allowed types: sitemap, url, file, crawl"
+       raise ArgumentError, "Unknown strategy: '#{strategy}'. Allowed strategies: sitemap, urls, url, crawl"
      end
    end

+   # Look for Sitemap(s) and if nothing is found fall back to crawling.
+   # Then send found URLs to the Wayback Machine.
+   # @return [Array<String>] of URLs sent to the Wayback Machine.
+   # @param [String] source (must be a valid URL).
+   # @param concurrency [Integer]
+   # @example Auto archive example.com
+   #   WaybackArchiver.auto('example.com') # Default concurrency is 5
+   # @example Auto archive example.com with low concurrency
+   #   WaybackArchiver.auto('example.com', concurrency: 1)
+   # @see http://www.sitemaps.org
+   def self.auto(source, concurrency: WaybackArchiver.concurrency)
+     found_urls = Sitemapper.autodiscover(source)
+     return urls(found_urls, concurrency: concurrency) if found_urls.any?
+
+     crawl(source, concurrency: concurrency)
+   end
+
    # Crawl site for URLs to send to the Wayback Machine.
-   # @return [Array] with URLs sent to the Wayback Machine.
-   # @param [String] source for URL(s).
-   # @param [Integer] concurrency.
+   # @return [Array<String>] of URLs sent to the Wayback Machine.
+   # @param [String] url to start crawling from.
+   # @param concurrency [Integer]
    # @example Crawl example.com and send all URLs of the same domain
    #   WaybackArchiver.crawl('example.com') # Default concurrency is 5
    # @example Crawl example.com and send all URLs of the same domain with low concurrency
    #   WaybackArchiver.crawl('example.com', concurrency: 1)
-   def self.crawl(source, concurrency: Archive::DEFAULT_CONCURRENCY)
-     Archive.crawl(source, concurrency: concurrency)
+   def self.crawl(url, concurrency: WaybackArchiver.concurrency)
+     WaybackArchiver.logger.info "Crawling #{url}"
+     Archive.crawl(url, concurrency: concurrency)
+   end
+
+   # Get URLs from sitemap and send found URLs to the Wayback Machine.
+   # @return [Array<String>] of URLs sent to the Wayback Machine.
+   # @param [String] url to the sitemap.
+   # @param concurrency [Integer]
+   # @example Get example.com sitemap and archive all found URLs
+   #   WaybackArchiver.sitemap('example.com/sitemap.xml') # Default concurrency is 5
+   # @example Get example.com sitemap and archive all found URLs with low concurrency
+   #   WaybackArchiver.sitemap('example.com/sitemap.xml', concurrency: 1)
+   # @see http://www.sitemaps.org
+   def self.sitemap(url, concurrency: WaybackArchiver.concurrency)
+     WaybackArchiver.logger.info "Fetching Sitemap at #{url}"
+     Archive.post(URLCollector.sitemap(url), concurrency: concurrency)
+   end
+
+   # Send URL(s) to the Wayback Machine.
+   # @return [Array<String>] of URLs sent to the Wayback Machine.
+   # @param [Array<String>, String] urls to send.
+   # @param concurrency [Integer]
+   # @example Archive example.com
+   #   WaybackArchiver.urls('example.com')
+   # @example Archive example.com and google.com
+   #   WaybackArchiver.urls(%w(example.com google.com))
+   def self.urls(urls, concurrency: WaybackArchiver.concurrency)
+     Archive.post(Array(urls), concurrency: concurrency)
    end

+   # Set logger
+   # @return [Object] the set logger
+   # @param [Object] logger an object that quacks like a Logger
+   # @example Set a logger that prints to standard out (STDOUT)
+   #   WaybackArchiver.logger = Logger.new(STDOUT)
    def self.logger=(logger)
      @logger = logger
    end

+   # Returns the current logger
+   # @return [Object] the current logger instance
    def self.logger
      @logger ||= NullLogger.new
    end
+
+   # Resets the logger to the default
+   # @return [NullLogger] a new instance of NullLogger
+   def self.default_logger!
+     @logger = NullLogger.new
+   end
+
+   # Sets the user agent
+   # @return [String] the configured user agent
+   # @param [String] user_agent the desired user agent
+   def self.user_agent=(user_agent)
+     @user_agent = user_agent
+   end
+
+   # Returns the configured user agent
+   # @return [String] the configured or the default user agent
+   def self.user_agent
+     @user_agent ||= USER_AGENT
+   end
+
+   # Sets the default concurrency
+   # @return [Integer] the configured default concurrency
+   # @param [Integer] concurrency the desired default concurrency
+   def self.concurrency=(concurrency)
+     @concurrency = concurrency
+   end
+
+   # Returns the default concurrency
+   # @return [Integer] the configured or the default concurrency
+   def self.concurrency
+     @concurrency ||= DEFAULT_CONCURRENCY
+   end
  end
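The module now exposes one method per strategy plus module-level configuration. A minimal usage sketch of the new public API (domains are placeholders):

    require 'wayback_archiver'

    WaybackArchiver.logger = Logger.new(STDOUT) # optional, defaults to NullLogger
    WaybackArchiver.concurrency = 10            # optional, defaults to DEFAULT_CONCURRENCY (5)

    WaybackArchiver.archive('example.com')                    # :auto - sitemap(s), then crawl fallback
    WaybackArchiver.archive('example.com', strategy: :crawl)  # force crawling
    WaybackArchiver.sitemap('example.com/sitemap.xml')        # archive URLs from a sitemap
    WaybackArchiver.urls(%w[example.com/about example.com/contact])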
lib/wayback_archiver/archive.rb CHANGED
@@ -1,46 +1,68 @@
+ require 'concurrent'
+
+ require 'wayback_archiver/thread_pool'
+ require 'wayback_archiver/request'
+
  module WaybackArchiver
    # Post URL(s) to Wayback Machine
    class Archive
      # Wayback Machine base URL.
      WAYBACK_BASE_URL = 'https://web.archive.org/save/'.freeze
-     # Default concurrency for archiving URLs
-     DEFAULT_CONCURRENCY = 5
+
      # Send URLs to Wayback Machine.
-     # @return [Array] with sent URLs.
-     # @param [Array] urls URLs to send.
-     # @param [Hash] options
+     # @return [Array<String>] with sent URLs.
+     # @param [Array<String>] urls to send to the Wayback Machine.
+     # @param concurrency [Integer] number of parallel threads (default: 5)
      # @example Archive urls, asynchronously
      #   Archive.post(['http://example.com'])
      # @example Archive urls, using only 1 thread
      #   Archive.post(['http://example.com'], concurrency: 1)
-     def self.post(urls, concurrency: DEFAULT_CONCURRENCY)
-       WaybackArchiver.logger.info "=== WAYBACK ARCHIVER ==="
+     def self.post(urls, concurrency: WaybackArchiver.concurrency)
+       WaybackArchiver.logger.info "Total URLs to be sent: #{urls.length}"
        WaybackArchiver.logger.info "Requests are sent with up to #{concurrency} parallel threads"
-       WaybackArchiver.logger.info "Total urls to be sent: #{urls.length}"

-       pool = Concurrent::FixedThreadPool.new(concurrency)
+       posted_urls = Concurrent::Array.new
+       pool = ThreadPool.build(concurrency)
        urls.each do |url|
-         pool.post { Archive.post_url(url) }
+         pool.post do
+           posted_url = post_url(url)
+           posted_urls << posted_url if posted_url
+         end
        end

-       WaybackArchiver.logger.info "#{urls.length} URLs sent to Internet archive"
-       urls
+       pool.shutdown
+       pool.wait_for_termination
+
+       WaybackArchiver.logger.info "#{posted_urls.length} URL(s) posted to Wayback Machine"
+       posted_urls
      end

      # Send URLs to Wayback Machine by crawling the site.
-     # @return [Array] with URLs sent to the Wayback Machine.
+     # @return [Array<String>] with URLs sent to the Wayback Machine.
      # @param [String] source for URL to crawl.
-     # @param [Integer] concurrency (default is 5).
+     # @param concurrency [Integer] number of parallel threads (default: 5)
      # @example Crawl example.com and send all URLs of the same domain
      #   WaybackArchiver.crawl('example.com')
      # @example Crawl example.com and send all URLs of the same domain with low concurrency
      #   WaybackArchiver.crawl('example.com', concurrency: 1)
-     def self.crawl(source, concurrency: DEFAULT_CONCURRENCY)
-       pool = Concurrent::FixedThreadPool.new(concurrency) # X threads
+     def self.crawl(source, concurrency: WaybackArchiver.concurrency)
+       WaybackArchiver.logger.info "Requests are sent with up to #{concurrency} parallel threads"
+
+       posted_urls = Concurrent::Array.new
+       pool = ThreadPool.build(concurrency)

-       UrlCollector.crawl(source) do |url|
-         pool.post { Archive.post_url(url) }
+       found_urls = URLCollector.crawl(source) do |url|
+         pool.post do
+           posted_url = post_url(url)
+           posted_urls << posted_url if posted_url
+         end
        end
+       WaybackArchiver.logger.info "Crawling of #{source} finished, found #{found_urls.length} URL(s)"
+       pool.shutdown
+       pool.wait_for_termination
+
+       WaybackArchiver.logger.info "#{posted_urls.length} URL(s) posted to Wayback Machine"
+       posted_urls
      end

      # Send URL to Wayback Machine.
@@ -50,12 +72,12 @@ module WaybackArchiver
      #   Archive.post_url('http://example.com')
      def self.post_url(url)
        request_url = "#{WAYBACK_BASE_URL}#{url}"
-       response = Request.response(request_url)
-       WaybackArchiver.logger.info "[#{response.code}, #{response.message}] #{url}"
+       response = Request.get(request_url, follow_redirects: false)
+       WaybackArchiver.logger.info "Posted [#{response.code}, #{response.message}] #{url}"
        url
-     rescue Exception => e
-       WaybackArchiver.logger.error "Error message: #{e.message}"
-       WaybackArchiver.logger.error "Failed to archive: #{url}"
+     rescue Request::Error => e
+       WaybackArchiver.logger.error "Failed to archive #{url}: #{e.class}, #{e.message}"
+       nil
      end
    end
  end
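Note the changed return value: post and crawl now block until the thread pool drains and return only the URLs that were actually posted (a failed post_url logs the error and returns nil, so failures are filtered out). A sketch:

    posted = WaybackArchiver::Archive.post(
      %w[http://example.com http://example.com/about],
      concurrency: 2
    )
    posted # => the subset of the input that was successfully posted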
lib/wayback_archiver/http_code.rb ADDED
@@ -0,0 +1,49 @@
+ module WaybackArchiver
+   # Convenience class for HTTP response codes
+   class HTTPCode
+     # Type of code as symbol
+     # @return [Symbol] code type
+     # @param [String, Integer] code the response code
+     # @example
+     #   HTTPCode.type('200')
+     def self.type(code)
+       code = code.to_s
+       return :success if success?(code)
+       return :redirect if redirect?(code)
+       return :error if error?(code)
+
+       :unknown
+     end
+
+     # Whether the code is a success type
+     # @return [Boolean] is success or not
+     # @param [String] code the response code
+     # @example
+     #   HTTPCode.success?('200') # => true
+     # @example
+     #   HTTPCode.success?(200) # => true
+     # @example
+     #   HTTPCode.success?(nil) # => false
+     def self.success?(code)
+       code.to_s.match?(/2\d\d/)
+     end
+
+     # Whether the code is a redirect type
+     # @return [Boolean] is redirect or not
+     # @param [String] code the response code
+     # @example
+     #   HTTPCode.redirect?('301')
+     def self.redirect?(code)
+       code.to_s.match?(/3\d\d/)
+     end
+
+     # Whether the code is an error type
+     # @return [Boolean] is error or not
+     # @param [String] code the response code
+     # @example
+     #   HTTPCode.error?('500')
+     def self.error?(code)
+       code.to_s.match?(/4\d\d/) || code.to_s.match?(/5\d\d/)
+     end
+   end
+ end
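A quick sketch of how the new helper classifies Net::HTTP status codes:

    WaybackArchiver::HTTPCode.type('200') # => :success
    WaybackArchiver::HTTPCode.type(302)   # => :redirect
    WaybackArchiver::HTTPCode.type('503') # => :error
    WaybackArchiver::HTTPCode.type('foo') # => :unknown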
lib/wayback_archiver/null_logger.rb CHANGED
@@ -1,9 +1,12 @@
  require 'logger'

- class NullLogger < Logger
-   def initialize(*args)
-   end
+ module WaybackArchiver
+   # Don't log anything / Send the logs to the abyss
+   class NullLogger < Logger
+     # Allow any and all params
+     def initialize(*args); end

-   def add(*args, &block)
+     # Allow any and all params and don't do anything
+     def add(*args, &block); end
    end
  end
lib/wayback_archiver/request.rb CHANGED
@@ -1,62 +1,224 @@
- require 'url_resolver' # TODO: Allow users to use any resolver
+ require 'net/http'
+ require 'openssl'
+ require 'stringio'
+ require 'timeout'
+ require 'uri'
+ require 'zlib'
+
+ require 'wayback_archiver/http_code'

  module WaybackArchiver
-   # Request and parse HTML & XML documents
+   # Make HTTP requests
    class Request
-     # Get and parse HTML & XML documents.
-     # @return [Array] with links sent to the Wayback Machine.
-     # @param [String] url to retrieve and parse.
-     # @example Request and parse example.com
-     #   Request.document('example.com')
-     # @example Request and parse google.com/sitemap.xml
-     #   Request.document('google.com/sitemap.xml')
-     def self.document(url)
-       response_body = Request.response(url).body
-       Nokogiri::HTML(response_body)
+     # General error, something went wrong
+     class Error < StandardError; end
+     # Client error, something went wrong on the local machine
+     class ClientError < Error; end
+     # Server error, the remote server did something wrong
+     class ServerError < Error; end
+     # Remote server responded with an HTTP error
+     class HTTPError < ServerError; end
+     # Remote server error
+     class ResponseError < ServerError; end
+     # Max redirects reached error
+     class MaxRedirectError < ServerError; end
+     # Remote server responded with an invalid redirect
+     class InvalidRedirectError < ServerError; end
+     # Remote server responded with an unknown HTTP code
+     class UnknownResponseCodeError < ServerError; end
+
+     # GET response wrapper
+     GETStruct = Struct.new(:response, :error)
+
+     # Max number of redirects before an error is raised
+     MAX_REDIRECTS = 10
+
+     # Response data struct
+     Response = Struct.new(:code, :message, :body, :uri, :error)
+     class Response
+       # Returns true if a successful response
+       # @example Check if Response was successful
+       #   response = Response.new('200', 'OK', 'buren', 'http://example.com')
+       #   response.success? # => true
+       def success?
+         HTTPCode.success?(code)
+       end
      end

      # Get response.
-     # @return [Net::HTTP*] the http response.
-     # @param [String] url URL to retrieve.
-     # @param [Boolean] resolve whether to resolve the URL.
-     # @example Resolve example.com and request
-     #   Request.response('example.com', true)
-     # @example Request http://example.com
-     #   Request.response('http://example.com', false)
-     def self.response(url, resolve = true)
-       resolved_url = resolve ? resolve_url(url) : url
-       uri = URI.parse(resolved_url)
-       http = Net::HTTP.new(uri.host, uri.port)
-       http.use_ssl = true if resolved_url.start_with?('https://')
-
-       request = Net::HTTP::Get.new(uri.request_uri)
-       request['User-Agent'] = WaybackArchiver::USER_AGENT
-       http.request(request)
-     end
-
-     # Resolve the URL, follows redirects.
-     # @return [String] the resolved URL.
-     # @param [String] url to retrieve.
-     # @example Resolve example.com and request
-     #   Request.resolve_url('example.com')
-     def self.resolve_url(url)
-       resolved = UrlResolver.resolve(url)
-       resolved = resolved.prepend('http://') unless protocol?(resolved)
-       resolved
-     end
-
-     # Resolve the URL, follows redirects.
-     # @return [Boolean] true if string includes protocol.
-     # @param [String] url to check.
-     # @example Check if string includes protocol
-     #   Request.protocol?('example.com')
-     #   # => false
-     #   Request.protocol?('https://example.com')
-     #   # => true
-     #   Request.protocol?('http://example.com')
-     #   # => true
-     def self.protocol?(url)
-       url.start_with?('http://') || url.start_with?('https://')
+     # @return [Response] the http response representation.
+     # @param [String, URI] uri to retrieve.
+     # @param max_redirects [Integer] max redirects (default: 10).
+     # @param follow_redirects [Boolean] follow redirects (default: true).
+     # @example Get example.com
+     #   Request.get('example.com')
+     # @example Get http://example.com and follow max 3 redirects
+     #   Request.get('http://example.com', max_redirects: 3)
+     # @example Get http://example.com and don't follow redirects
+     #   Request.get('http://example.com', follow_redirects: false)
+     # @raise [Error] super class of all exceptions that this method can raise
+     # @raise [ServerError] all server errors
+     # @raise [ClientError] all client errors
+     # @raise [HTTPError] all HTTP errors
+     # @raise [MaxRedirectError] too many redirects, subclass of ServerError
+     # @raise [ResponseError] server responded with a 4xx or 5xx HTTP status code, subclass of ServerError (only raised if the raise_on_http_error flag is true)
+     # @raise [UnknownResponseCodeError] server responded with an unknown HTTP status code, subclass of ServerError
+     # @raise [InvalidRedirectError] server responded with an invalid redirect, subclass of ServerError (only raised if follow_redirects is true)
+     def self.get(
+       uri,
+       max_redirects: MAX_REDIRECTS,
+       raise_on_http_error: false,
+       follow_redirects: true
+     )
+       uri = build_uri(uri)
+
+       redirect_count = 0
+       until redirect_count > max_redirects
+         WaybackArchiver.logger.debug "Requesting #{uri}"
+
+         http = Net::HTTP.new(uri.host, uri.port)
+         if uri.scheme == 'https'
+           http.use_ssl = true
+           http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+         end
+
+         request = Net::HTTP::Get.new(uri.request_uri)
+         request['User-Agent'] = WaybackArchiver.user_agent
+
+         result = perform_request(uri, http, request)
+         response = result.response
+         error = result.error
+
+         raise error if error
+
+         code = response.code
+         WaybackArchiver.logger.debug "[#{code}, #{response.message}] Requested #{uri}"
+
+         case HTTPCode.type(code)
+         when :success
+           return build_response(uri, response)
+         when :redirect
+           return build_response(uri, response) unless follow_redirects
+
+           uri = build_redirect_uri(uri, response)
+           redirect_count += 1
+           next
+         when :error
+           if raise_on_http_error
+             raise ResponseError, "Failed with response code: #{code} when requesting #{uri}"
+           end
+
+           return build_response(uri, response)
+         else
+           raise UnknownResponseCodeError, "Unknown HTTP response code #{code} when requesting #{uri}"
+         end
+       end
+
+       raise MaxRedirectError, "Redirected too many times when requesting #{uri}"
+     end
+
+     # Builds a Response object.
+     # @return [Response]
+     # @param [URI] uri that was requested.
+     # @param [Net::HTTPResponse] response the server response.
+     # @example Build Response object for example.com
+     #   Request.build_response(uri, net_http_response)
+     def self.build_response(uri, response)
+       Response.new(
+         response.code,
+         response.message,
+         parse_body(response.body),
+         uri.to_s
+       )
+     end
+
+     # Builds an URI for a redirect response.
+     # @return [URI] to redirect to.
+     # @param [URI] uri that was requested.
+     # @param [Net::HTTPResponse] response the server response.
+     # @example Build redirect URI for example.com (let's pretend it will redirect)
+     #   Request.build_redirect_uri('http://example.com', net_http_response)
+     def self.build_redirect_uri(uri, response)
+       location_header = response.header.fetch('location') do
+         raise InvalidRedirectError, "No location header found on redirect when requesting #{uri}"
+       end
+
+       location = URI.parse(location_header)
+       return build_uri(uri) + location_header if location.relative?
+
+       location
+     end
+
+     # Build URI.
+     # @return [URI] uri to redirect to.
+     # @param [URI, String] uri to build.
+     # @example Build URI for example.com
+     #   Request.build_uri('http://example.com')
+     # @example Build URI for #<URI::HTTP http://example.com>
+     #   uri = URI.parse('http://example.com')
+     #   Request.build_uri(uri)
+     def self.build_uri(uri)
+       return uri if uri.is_a?(URI)
+
+       uri = "http://#{uri}" unless uri =~ %r{^https?://}
+       URI.parse(uri)
+     end
+
+     # Parse response body, handles regular and gzipped response bodies.
+     # @return [String] the response body.
+     # @param [String] response_body the server response body.
+     # @example Return response body for response.
+     #   Request.parse_body(net_http_response.body)
+     def self.parse_body(response_body)
+       return '' unless response_body
+
+       Zlib::GzipReader.new(StringIO.new(response_body)).read
+     rescue Zlib::GzipFile::Error => _e
+       response_body
+     end
+
+     # Return whether a value is blank or not.
+     # @return [Boolean] whether the value is blank or not.
+     # @param [Object] value the value to check if its blank or not.
+     # @example Returns true for nil.
+     #   Request.blank?(nil)
+     # @example Returns true for empty string.
+     #   Request.blank?('')
+     # @example Returns true for a string with only spaces.
+     #   Request.blank?(' ')
+     def self.blank?(value)
+       return true unless value
+       return true if value.strip.empty?
+
+       false
+     end
+
+     private
+
+     def self.perform_request(uri, http, request)
+       # TODO: Consider retrying failed requests
+       response = http.request(request)
+       GETStruct.new(response)
+     rescue Timeout::Error,
+            OpenSSL::SSL::SSLError,
+            Net::HTTPBadResponse,
+            Zlib::Error => e
+
+       build_request_error(uri, e, ServerError)
+     rescue SystemCallError,
+            SocketError,
+            IOError => e
+
+       build_request_error(uri, e, ClientError)
+     end
+
+     def self.build_request_error(uri, error, error_wrapper_klass)
+       WaybackArchiver.logger.error "Request to #{uri} failed: #{error_wrapper_klass}, #{error.class}, #{error.message}"
+
+       GETStruct.new(
+         Response.new,
+         error_wrapper_klass.new("#{error.class}, #{error.message}")
+       )
      end
    end
  end
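The new Request.get wraps Net::HTTP with redirect following, gzip handling and a typed error hierarchy rooted at Request::Error. A calling sketch (URLs are placeholders):

    response = WaybackArchiver::Request.get('example.com', max_redirects: 3)
    response.success? # => true if the final status was 2xx
    response.code     # => e.g. '200'

    begin
      WaybackArchiver::Request.get('example.com', raise_on_http_error: true)
    rescue WaybackArchiver::Request::Error => e
      # ClientError wraps local failures (socket/IO), ServerError wraps remote ones
    end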
lib/wayback_archiver/sitemap.rb ADDED
@@ -0,0 +1,79 @@
+ require 'rexml/document'
+
+ module WaybackArchiver
+   # Parse Sitemaps, https://www.sitemaps.org
+   class Sitemap
+     attr_reader :document
+
+     def initialize(xml, strict: false)
+       @document = REXML::Document.new(xml)
+     rescue REXML::ParseException => _e
+       raise if strict
+
+       @document = REXML::Document.new('')
+     end
+
+     # Return all URLs defined in Sitemap.
+     # @return [Array<String>] of URLs defined in Sitemap.
+     # @example Get URLs defined in Sitemap
+     #   sitemap = Sitemap.new(xml)
+     #   sitemap.urls
+     def urls
+       @urls ||= extract_urls('url')
+     end
+
+     # Return all sitemap URLs defined in Sitemap.
+     # @return [Array<String>] of Sitemap URLs defined in Sitemap.
+     # @example Get Sitemap URLs defined in Sitemap
+     #   sitemap = Sitemap.new(xml)
+     #   sitemap.sitemaps
+     def sitemaps
+       @sitemaps ||= extract_urls('sitemap')
+     end
+
+     # Check if sitemap is a plain file
+     # @return [Boolean] whether document is plain
+     def plain_document?
+       document.elements.empty?
+     end
+
+     # Return the name of the document root (if there is one)
+     # @return [String] the document root name
+     def root_name
+       return unless document.root
+
+       document.root.name
+     end
+
+     # Returns true if Sitemap is a Sitemap index
+     # @return [Boolean] whether the Sitemap is a Sitemap index or not
+     # @example Check if Sitemap is a sitemap index
+     #   sitemap = Sitemap.new(xml)
+     #   sitemap.sitemap_index?
+     def sitemap_index?
+       root_name == 'sitemapindex'
+     end
+
+     # Returns true if Sitemap lists regular URLs
+     # @return [Boolean] whether the Sitemap is a regular URL list
+     # @example Check if Sitemap is a regular URL list
+     #   sitemap = Sitemap.new(xml)
+     #   sitemap.urlset?
+     def urlset?
+       root_name == 'urlset'
+     end
+
+     private
+
+     # Extract URLs from Sitemap
+     def extract_urls(node_name)
+       return document.to_s.each_line.map(&:strip) if plain_document?
+
+       urls = []
+       document.root.elements.each("#{node_name}/loc") do |element|
+         urls << element.text
+       end
+       urls
+     end
+   end
+ end
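A sketch of parsing a sitemap with the new class (inline XML for illustration; the sitemap namespace declaration is omitted for brevity):

    xml = <<~XML
      <urlset>
        <url><loc>http://example.com/</loc></url>
        <url><loc>http://example.com/about</loc></url>
      </urlset>
    XML

    sitemap = WaybackArchiver::Sitemap.new(xml)
    sitemap.urlset?        # => true
    sitemap.sitemap_index? # => false
    sitemap.urls           # => ["http://example.com/", "http://example.com/about"]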
lib/wayback_archiver/sitemapper.rb ADDED
@@ -0,0 +1,75 @@
+ require 'robots'
+
+ require 'wayback_archiver/sitemap'
+ require 'wayback_archiver/request'
+
+ module WaybackArchiver
+   # Fetch and parse sitemaps recursively
+   class Sitemapper
+     # Common locations for Sitemap(s)
+     COMMON_SITEMAP_LOCATIONS = %w[
+       sitemap_index.xml.gz
+       sitemap-index.xml.gz
+       sitemap_index.xml
+       sitemap-index.xml
+       sitemap.xml.gz
+       sitemap.xml
+     ].freeze
+
+     # Autodiscover the location of the Sitemap, then fetch and parse recursively.
+     # First it tries /robots.txt, then common locations for Sitemap and finally the supplied URL.
+     # @return [Array<String>] of URLs defined in Sitemap(s).
+     # @param [String] url to domain.
+     # @example Get URLs defined in Sitemap for google.com
+     #   Sitemapper.autodiscover('https://google.com/')
+     # @see http://www.sitemaps.org
+     def self.autodiscover(url)
+       WaybackArchiver.logger.info 'Looking for Sitemap(s) in /robots.txt'
+       robots = Robots.new(WaybackArchiver.user_agent)
+       sitemaps = robots.other_values(url)['Sitemap']
+       if sitemaps
+         return sitemaps.flat_map do |sitemap|
+           WaybackArchiver.logger.info "Fetching Sitemap at #{sitemap}"
+           urls(url: sitemap)
+         end
+       end
+
+       COMMON_SITEMAP_LOCATIONS.each do |path|
+         WaybackArchiver.logger.info "Looking for Sitemap at #{path}"
+         sitemap_url = [url, path].join(url.end_with?('/') ? '' : '/')
+         response = Request.get(sitemap_url, raise_on_http_error: false)
+         return urls(xml: response.body) if response.success?
+       end
+
+       WaybackArchiver.logger.info "Looking for Sitemap at #{url}"
+       urls(url: url)
+     rescue Request::Error => e
+       WaybackArchiver.logger.error "Error raised when requesting #{url}, #{e.class}, #{e.message}"
+       []
+     end
+
+     # Fetch and parse sitemaps recursively.
+     # @return [Array<String>] of URLs defined in Sitemap(s).
+     # @param url [String] URL to Sitemap.
+     # @param xml [String] Sitemap XML.
+     # @example Get URLs defined in Sitemap for google.com
+     #   Sitemapper.urls(url: 'https://google.com/sitemap.xml')
+     # @example Get URLs defined in Sitemap
+     #   Sitemapper.urls(xml: xml)
+     # @see http://www.sitemaps.org
+     def self.urls(url: nil, xml: nil)
+       xml = Request.get(url).body unless xml
+       sitemap = Sitemap.new(xml)
+
+       if sitemap.sitemap_index?
+         sitemap.sitemaps.flat_map { |sitemap_url| urls(url: sitemap_url) }
+       else
+         sitemap.urls
+       end
+     rescue Request::Error => e
+       WaybackArchiver.logger.error "Error raised when requesting #{url}, #{e.class}, #{e.message}"
+
+       []
+     end
+   end
+ end
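The autodiscovery order is: robots.txt Sitemap entries, then the common locations above, then the supplied URL itself. A sketch (domain is a placeholder):

    urls = WaybackArchiver::Sitemapper.autodiscover('https://example.com')
    # 1. Sitemap: entries in https://example.com/robots.txt
    # 2. common locations, /sitemap_index.xml.gz down to /sitemap.xml
    # 3. https://example.com itself, parsed as a sitemap
    urls # => all found <loc> URLs, following sitemap indexes recursively; [] on request errors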
lib/wayback_archiver/thread_pool.rb ADDED
@@ -0,0 +1,26 @@
+ require 'concurrent'
+
+ module WaybackArchiver
+   # Thread pool
+   class ThreadPool
+     # Build a thread pool
+     # @return [Concurrent::FixedThreadPool, Concurrent::ImmediateExecutor] an instance of a concurrent thread pool
+     # @param [Integer] concurrency the desired concurrency
+     # @example Build a thread pool with 10 as the desired concurrency
+     #   pool = ThreadPool.build(10) # => Concurrent::FixedThreadPool
+     #   pool.post { some_work }
+     # @example Build a thread pool with 1 as the desired concurrency
+     #   pool = ThreadPool.build(1) # => Concurrent::ImmediateExecutor
+     #   pool.post { some_work }
+     # @see https://github.com/ruby-concurrency/concurrent-ruby/blob/master/doc/thread_pools.md
+     def self.build(concurrency)
+       if concurrency == 1
+         Concurrent::ImmediateExecutor.new
+       elsif concurrency > 1
+         Concurrent::FixedThreadPool.new(concurrency)
+       else
+         raise ArgumentError, 'concurrency must be one or greater'
+       end
+     end
+   end
+ end
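Returning a Concurrent::ImmediateExecutor for concurrency 1 makes single-threaded runs execute inline and deterministically while keeping the same executor interface as the pooled case. A sketch:

    pool = WaybackArchiver::ThreadPool.build(5) # => Concurrent::FixedThreadPool
    pool.post { puts 'archiving...' }
    pool.shutdown
    pool.wait_for_termination

    WaybackArchiver::ThreadPool.build(1) # => Concurrent::ImmediateExecutor (runs blocks inline)
    WaybackArchiver::ThreadPool.build(0) # => raises ArgumentError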
lib/wayback_archiver/url_collector.rb CHANGED
@@ -1,51 +1,42 @@
  require 'spidr'
  require 'robots'

+ require 'wayback_archiver/sitemapper'
+ require 'wayback_archiver/request'
+
  module WaybackArchiver
    # Retrieve URLs from different sources
-   class UrlCollector
+   class URLCollector
      # Retrieve URLs from Sitemap.
-     # @return [Array] of URLs defined in Sitemap.
-     # @param [String] url domain to retrieve Sitemap from.
+     # @return [Array<String>] of URLs defined in Sitemap.
+     # @param [String] url to the Sitemap.
      # @example Get URLs defined in Sitemap for google.com
-     #   UrlCollector.sitemap('https://google.com')
+     #   URLCollector.sitemap('https://google.com/sitemap.xml')
      def self.sitemap(url)
-       resolved = Request.resolve_url("#{url}/sitemap.xml")
-       sitemap = Request.document(resolved)
-       sitemap.css('loc').map(&:text)
+       Sitemapper.urls(url: Request.build_uri(url))
      end

      # Retrieve URLs by crawling.
-     # @return [Array] of URLs defined found during crawl.
+     # @return [Array<String>] of URLs found during crawl.
      # @param [String] url domain to crawl URLs from.
      # @example Crawl URLs defined on example.com
-     #   UrlCollector.crawl('http://example.com')
+     #   URLCollector.crawl('http://example.com')
      def self.crawl(url)
        urls = []
-       resolved_url = Request.resolve_url(url)
-       Spidr.site(resolved_url, robots: true) do |spider|
+       start_at_url = Request.build_uri(url).to_s
+       options = {
+         robots: true,
+         user_agent: WaybackArchiver.user_agent
+       }
+       Spidr.site(start_at_url, options) do |spider|
          spider.every_html_page do |page|
            page_url = page.url.to_s
            urls << page_url
-           WaybackArchiver.logger.info "Found: #{page_url}"
+           WaybackArchiver.logger.debug "Found: #{page_url}"
            yield(page_url) if block_given?
          end
        end
        urls
      end
-
-     # Retrieve URLs listed in file.
-     # @return [Array] of URLs defined in file.
-     # @param [String] path to get URLs from.
-     # @example Get URLs defined in /path/to/file
-     #   UrlCollector.file('/path/to/file')
-     def self.file(path)
-       raise ArgumentError, "No such file: #{path}" unless File.exist?(path)
-       urls = []
-       File.open(path).read
-         .gsub(/\r\n?/, "\n")
-         .each_line { |line| urls << line.delete("\n").strip }
-       urls.reject(&:empty?)
-     end
    end
  end
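The collector is renamed from UrlCollector to URLCollector, sitemap parsing is delegated to Sitemapper, and the file source is dropped (superseded by WaybackArchiver.urls). A sketch of the remaining collectors:

    WaybackArchiver::URLCollector.sitemap('https://example.com/sitemap.xml')
    WaybackArchiver::URLCollector.crawl('https://example.com') do |url|
      puts url # yielded for every HTML page as it is found
    end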
lib/wayback_archiver/version.rb CHANGED
@@ -1,4 +1,4 @@
  module WaybackArchiver
    # Gem version
-   VERSION = '0.2.0'.freeze
+   VERSION = '1.0.0'.freeze
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: wayback_archiver
  version: !ruby/object:Gem::Version
-   version: 0.2.0
+   version: 1.0.0
  platform: ruby
  authors:
  - Jacob Burenstam
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2017-07-01 00:00:00.000000000 Z
+ date: 2017-08-01 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: spidr
@@ -38,20 +38,6 @@ dependencies:
      - - "~>"
        - !ruby/object:Gem::Version
          version: '0.1'
- - !ruby/object:Gem::Dependency
-   name: url_resolver
-   requirement: !ruby/object:Gem::Requirement
-     requirements:
-     - - "~>"
-       - !ruby/object:Gem::Version
-         version: '0.1'
-   type: :runtime
-   prerelease: false
-   version_requirements: !ruby/object:Gem::Requirement
-     requirements:
-     - - "~>"
-       - !ruby/object:Gem::Version
-         version: '0.1'
  - !ruby/object:Gem::Dependency
    name: concurrent-ruby
    requirement: !ruby/object:Gem::Requirement
@@ -128,14 +114,14 @@ dependencies:
      requirements:
      - - "~>"
        - !ruby/object:Gem::Version
-         version: '0.7'
+         version: '0.8'
    type: :development
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
      requirements:
      - - "~>"
        - !ruby/object:Gem::Version
-         version: '0.7'
+         version: '0.8'
  - !ruby/object:Gem::Dependency
    name: redcarpet
    requirement: !ruby/object:Gem::Requirement
@@ -150,6 +136,20 @@ dependencies:
      - - "~>"
        - !ruby/object:Gem::Version
          version: '3.2'
+ - !ruby/object:Gem::Dependency
+   name: webmock
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.0'
  - !ruby/object:Gem::Dependency
    name: byebug
    requirement: !ruby/object:Gem::Requirement
@@ -164,8 +164,8 @@ dependencies:
      - - ">"
        - !ruby/object:Gem::Version
          version: '0'
- description: Send URLs to Wayback Machine (Internet Archive). By crawling, sitemap,
-   file or single URL.
+ description: Post URLs to Wayback Machine (Internet Archive), using a crawler, from
+   Sitemap(s) or a list of URLs.
  email:
  - burenstam@gmail.com
  executables:
@@ -176,8 +176,12 @@ files:
  - bin/wayback_archiver
  - lib/wayback_archiver.rb
  - lib/wayback_archiver/archive.rb
+ - lib/wayback_archiver/http_code.rb
  - lib/wayback_archiver/null_logger.rb
  - lib/wayback_archiver/request.rb
+ - lib/wayback_archiver/sitemap.rb
+ - lib/wayback_archiver/sitemapper.rb
+ - lib/wayback_archiver/thread_pool.rb
  - lib/wayback_archiver/url_collector.rb
  - lib/wayback_archiver/version.rb
  homepage: https://github.com/buren/wayback_archiver
@@ -203,5 +207,5 @@ rubyforge_project:
  rubygems_version: 2.6.11
  signing_key:
  specification_version: 4
- summary: Send URLs to Wayback Machine (Internet Archive)
+ summary: Post URLs to Wayback Machine (Internet Archive)
  test_files: []