wayback_archiver 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 1f9f979d5fa0d31cfdf61660baa3464bdb1425e5
4
- data.tar.gz: 1d5701273bbe4d02b2ba5d88f9e75c9477058a28
3
+ metadata.gz: a4ba8820f2974f5a506c3fd125be8b6d6b429e3d
4
+ data.tar.gz: 00e101acf27d03d0fccc48b3031cca01aa4792ae
5
5
  SHA512:
6
- metadata.gz: e69883c975584b3120993371b29a0c0b7a71f3fd4210764b4caa712d4071ec175dc47fa63217950968da62485e927c05fcc977dfb076a317be4754cf4f16ec90
7
- data.tar.gz: 51f80b591f40f4bc22b5bf2e2b7d56273c3c495db43d8a9268f11048a410ad355a3ff238088a4abd83c69fd585db5e1e704d34d22dac3d4f4b9b2f73089458ac
6
+ metadata.gz: 8c0ec54efa365cdbf9ea9847d0289d67407fcc85b41100559fc768af6ea6bb3d1ab90594bd0578cc02c503fb5b2394a67e77e6cd624783314ec226aee3948424
7
+ data.tar.gz: b206a4c9e45fa42159d6cb4013149aa003f51dd70a41b6ddbbdb77978b8bdb09fe91852d3c67b5f56bff7eab9f9305f82b16dadae1487301e2344bf335f77600
@@ -9,6 +9,7 @@ strategy = 'auto'
9
9
  log = STDOUT
10
10
  log_level = Logger::INFO
11
11
  concurrency = WaybackArchiver.concurrency
12
+ limit = WaybackArchiver.max_limit
12
13
 
13
14
  optparse = OptionParser.new do |parser|
14
15
  parser.banner = 'Usage: wayback_archiver [<url>] [options]'
@@ -33,6 +34,10 @@ optparse = OptionParser.new do |parser|
33
34
  concurrency = value
34
35
  end
35
36
 
37
+ parser.on('--limit=5', Integer, 'Max number of URLs to archive') do |value|
38
+ limit = value
39
+ end
40
+
36
41
  parser.on('--log=output.log', String, 'Path to desired log file (if no argument is given it defaults to STDOUT)') do |path|
37
42
  log = path
38
43
  end
@@ -74,5 +79,10 @@ end
74
79
  # If no strategy has explicitly been given, then default to 'auto'
75
80
  strategy ||= 'auto'
76
81
  urls.each do |url|
77
- WaybackArchiver.archive(url, strategy: strategy, concurrency: concurrency)
82
+ WaybackArchiver.archive(
83
+ url,
84
+ strategy: strategy,
85
+ concurrency: concurrency,
86
+ limit: limit
87
+ )
78
88
  end
@@ -15,36 +15,43 @@ module WaybackArchiver
15
15
  # Default concurrency for archiving URLs
16
16
  DEFAULT_CONCURRENCY = 5
17
17
 
18
+ # Maximum number of links posted (-1 is no limit)
19
+ DEFAULT_MAX_LIMIT = -1
20
+
18
21
  # Send URLs to Wayback Machine.
19
- # @return [Array<String>] of URLs sent to the Wayback Machine.
22
+ # @return [Array<ArchiveResult>] of URLs sent to the Wayback Machine.
20
23
  # @param [String/Array<String>] source for URL(s).
21
24
  # @param [String/Symbol] strategy of source. Supported strategies: crawl, sitemap, url, urls, auto.
22
25
  # @example Crawl example.com and send all URLs of the same domain
23
26
  # WaybackArchiver.archive('example.com') # Default strategy is :auto
24
27
  # WaybackArchiver.archive('example.com', strategy: :auto)
25
28
  # WaybackArchiver.archive('example.com', strategy: :auto, concurrency: 10)
29
+ # WaybackArchiver.archive('example.com', strategy: :auto, limit: 100) # send max 100 URLs
26
30
  # WaybackArchiver.archive('example.com', :auto)
27
31
  # @example Crawl example.com and send all URLs of the same domain
28
32
  # WaybackArchiver.archive('example.com', strategy: :crawl)
29
33
  # WaybackArchiver.archive('example.com', strategy: :crawl, concurrency: 10)
34
+ # WaybackArchiver.archive('example.com', strategy: :crawl, limit: 100) # send max 100 URLs
30
35
  # WaybackArchiver.archive('example.com', :crawl)
31
36
  # @example Send example.com Sitemap URLs
32
37
  # WaybackArchiver.archive('example.com', strategy: :sitemap)
33
38
  # WaybackArchiver.archive('example.com', strategy: :sitemap, concurrency: 10)
39
+ # WaybackArchiver.archive('example.com', strategy: :sitemap, limit: 100) # send max 100 URLs
34
40
  # WaybackArchiver.archive('example.com', :sitemap)
35
41
  # @example Send only example.com
36
42
  # WaybackArchiver.archive('example.com', strategy: :url)
37
43
  # WaybackArchiver.archive('example.com', strategy: :url, concurrency: 10)
44
+ # WaybackArchiver.archive('example.com', strategy: :url, limit: 100) # send max 100 URLs
38
45
  # WaybackArchiver.archive('example.com', :url)
39
- def self.archive(source, legacy_strategy = nil, strategy: :auto, concurrency: WaybackArchiver.concurrency)
46
+ def self.archive(source, legacy_strategy = nil, strategy: :auto, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
40
47
  strategy = legacy_strategy || strategy
41
48
 
42
49
  case strategy.to_s
43
- when 'crawl' then crawl(source, concurrency: concurrency)
44
- when 'auto' then auto(source, concurrency: concurrency)
45
- when 'sitemap' then sitemap(source, concurrency: concurrency)
46
- when 'urls' then urls(source, concurrency: concurrency)
47
- when 'url' then urls(source, concurrency: concurrency)
50
+ when 'crawl' then crawl(source, concurrency: concurrency, limit: limit)
51
+ when 'auto' then auto(source, concurrency: concurrency, limit: limit)
52
+ when 'sitemap' then sitemap(source, concurrency: concurrency, limit: limit)
53
+ when 'urls' then urls(source, concurrency: concurrency, limit: limit)
54
+ when 'url' then urls(source, concurrency: concurrency, limit: limit)
48
55
  else
49
56
  raise ArgumentError, "Unknown strategy: '#{strategy}'. Allowed strategies: sitemap, urls, url, crawl"
50
57
  end
@@ -52,15 +59,17 @@ module WaybackArchiver
52
59
 
53
60
  # Look for Sitemap(s) and if nothing is found fallback to crawling.
54
61
  # Then send found URLs to the Wayback Machine.
55
- # @return [Array<String>] of URLs sent to the Wayback Machine.
62
+ # @return [Array<ArchiveResult>] of URLs sent to the Wayback Machine.
56
63
  # @param [String] source (must be a valid URL).
57
64
  # @param concurrency [Integer]
58
65
  # @example Auto archive example.com
59
66
  # WaybackArchiver.auto('example.com') # Default concurrency is 5
60
67
  # @example Auto archive example.com with low concurrency
61
68
  # WaybackArchiver.auto('example.com', concurrency: 1)
69
+ # @example Auto archive example.com and archive max 100 URLs
70
+ # WaybackArchiver.auto('example.com', limit: 100)
62
71
  # @see http://www.sitemaps.org
63
- def self.auto(source, concurrency: WaybackArchiver.concurrency)
72
+ def self.auto(source, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
64
73
  urls = Sitemapper.autodiscover(source)
65
74
  return urls(urls, concurrency: concurrency) if urls.any?
66
75
 
@@ -68,41 +77,47 @@ module WaybackArchiver
68
77
  end
69
78
 
70
79
  # Crawl site for URLs to send to the Wayback Machine.
71
- # @return [Array<String>] of URLs sent to the Wayback Machine.
80
+ # @return [Array<ArchiveResult>] of URLs sent to the Wayback Machine.
72
81
  # @param [String] url to start crawling from.
73
82
  # @param concurrency [Integer]
74
83
  # @example Crawl example.com and send all URLs of the same domain
75
84
  # WaybackArchiver.crawl('example.com') # Default concurrency is 5
76
85
  # @example Crawl example.com and send all URLs of the same domain with low concurrency
77
86
  # WaybackArchiver.crawl('example.com', concurrency: 1)
78
- def self.crawl(url, concurrency: WaybackArchiver.concurrency)
87
+ # @example Crawl example.com and archive max 100 URLs
88
+ # WaybackArchiver.crawl('example.com', limit: 100)
89
+ def self.crawl(url, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
79
90
  WaybackArchiver.logger.info "Crawling #{url}"
80
- Archive.crawl(url, concurrency: concurrency)
91
+ Archive.crawl(url, concurrency: concurrency, limit: limit)
81
92
  end
82
93
 
83
94
  # Get URLs from sitemap and send found URLs to the Wayback Machine.
84
- # @return [Array<String>] of URLs sent to the Wayback Machine.
95
+ # @return [Array<ArchiveResult>] of URLs sent to the Wayback Machine.
85
96
  # @param [String] url to the sitemap.
86
97
  # @param concurrency [Integer]
87
98
  # @example Get example.com sitemap and archive all found URLs
88
99
  # WaybackArchiver.sitemap('example.com/sitemap.xml') # Default concurrency is 5
89
100
  # @example Get example.com sitemap and archive all found URLs with low concurrency
90
101
  # WaybackArchiver.sitemap('example.com/sitemap.xml', concurrency: 1)
102
+ # @example Get example.com sitemap and archive max 100 URLs
103
+ # WaybackArchiver.sitemap('example.com/sitemap.xml', limit: 100)
91
104
  # @see http://www.sitemaps.org
92
- def self.sitemap(url, concurrency: WaybackArchiver.concurrency)
105
+ def self.sitemap(url, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
93
106
  WaybackArchiver.logger.info "Fetching Sitemap"
94
- Archive.post(URLCollector.sitemap(url), concurrency: concurrency)
107
+ Archive.post(URLCollector.sitemap(url), concurrency: concurrency, limit: limit)
95
108
  end
96
109
 
97
110
  # Send URL to the Wayback Machine.
98
- # @return [Array<String>] of URLs sent to the Wayback Machine.
111
+ # @return [Array<ArchiveResult>] of URLs sent to the Wayback Machine.
99
112
  # @param [Array<String>/String] urls or url.
100
113
  # @param concurrency [Integer]
101
114
  # @example Archive example.com
102
115
  # WaybackArchiver.urls('example.com')
103
116
  # @example Archive example.com and google.com
104
117
  # WaybackArchiver.urls(%w(example.com google.com))
105
- def self.urls(urls, concurrency: WaybackArchiver.concurrency)
118
+ # @example Archive example.com, max 100 URLs
119
+ # WaybackArchiver.urls(%w(example.com www.example.com), limit: 100)
120
+ def self.urls(urls, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
106
121
  Archive.post(Array(urls), concurrency: concurrency)
107
122
  end
108
123
 
@@ -152,4 +167,17 @@ module WaybackArchiver
152
167
  def self.concurrency
153
168
  @concurrency ||= DEFAULT_CONCURRENCY
154
169
  end
170
+
171
+ # Sets the default max_limit
172
+ # @return [Integer] the desired default max_limit
173
+ # @param [Integer] max_limit the desired default max_limit
174
+ def self.max_limit=(max_limit)
175
+ @max_limit = max_limit
176
+ end
177
+
178
+ # Returns the default max_limit
179
+ # @return [Integer] the configured or the default max_limit
180
+ def self.max_limit
181
+ @max_limit ||= DEFAULT_MAX_LIMIT
182
+ end
155
183
  end
@@ -1,5 +1,6 @@
1
1
  require 'concurrent'
2
2
 
3
+ require 'wayback_archiver/archive_result'
3
4
  require 'wayback_archiver/thread_pool'
4
5
  require 'wayback_archiver/request'
5
6
 
@@ -10,23 +11,40 @@ module WaybackArchiver
10
11
  WAYBACK_BASE_URL = 'https://web.archive.org/save/'.freeze
11
12
 
12
13
  # Send URLs to Wayback Machine.
13
- # @return [Array<String>] with sent URLs.
14
+ # @return [Array<ArchiveResult>] with sent URLs.
14
15
  # @param [Array<String>] urls to send to the Wayback Machine.
15
16
  # @param concurrency [Integer] the default is 5
17
+ # @yield [archive_result] If a block is given, each result will be yielded
18
+ # @yieldparam [ArchiveResult] archive_result
16
19
  # @example Archive urls, asynchronously
17
20
  # Archive.post(['http://example.com'])
21
+ # Archiver.post(['http://example.com']) do |result|
22
+ # puts [result.code || 'error', result.url] # print response status and URL
23
+ # end
18
24
  # @example Archive urls, using only 1 thread
19
25
  # Archive.post(['http://example.com'], concurrency: 1)
20
- def self.post(urls, concurrency: WaybackArchiver.concurrency)
26
+ # @example Stop after archiving 100 links
27
+ # Archive.post(['http://example.com'], limit: 100)
28
+ # @example Explicitly set no limit on how many links are posted
29
+ # Archive.post(['http://example.com'], limit: -1)
30
+ def self.post(urls, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
21
31
  WaybackArchiver.logger.info "Total URLs to be sent: #{urls.length}"
22
32
  WaybackArchiver.logger.info "Request are sent with up to #{concurrency} parallel threads"
23
33
 
34
+ urls_queue = if limit == -1
35
+ urls
36
+ else
37
+ urls[0...limit]
38
+ end
39
+
24
40
  posted_urls = Concurrent::Array.new
25
41
  pool = ThreadPool.build(concurrency)
26
- urls.each do |url|
42
+
43
+ urls_queue.each do |url|
27
44
  pool.post do
28
- posted_url = post_url(url)
29
- posted_urls << posted_url if posted_url
45
+ result = post_url(url)
46
+ yield(result) if block_given?
47
+ posted_urls << result unless result.errored?
30
48
  end
31
49
  end
32
50
 
@@ -38,23 +56,31 @@ module WaybackArchiver
38
56
  end
39
57
 
40
58
  # Send URLs to Wayback Machine by crawling the site.
41
- # @return [Array<String>] with URLs sent to the Wayback Machine.
59
+ # @return [Array<ArchiveResult>] with URLs sent to the Wayback Machine.
42
60
  # @param [String] source for URL to crawl.
43
61
  # @param concurrency [Integer] the default is 5
62
+ # @yield [archive_result] If a block is given, each result will be yielded
63
+ # @yieldparam [ArchiveResult] archive_result
44
64
  # @example Crawl example.com and send all URLs of the same domain
45
- # WaybackArchiver.crawl('example.com')
65
+ # Archiver.crawl('example.com')
66
+ # Archiver.crawl('example.com') do |result|
67
+ # puts [result.code || 'error', result.url] # print response status and URL
68
+ # end
46
69
  # @example Crawl example.com and send all URLs of the same domain with low concurrency
47
- # WaybackArchiver.crawl('example.com', concurrency: 1)
48
- def self.crawl(source, concurrency: WaybackArchiver.concurrency)
70
+ # Archiver.crawl('example.com', concurrency: 1)
71
+ # @example Stop after archiving 100 links
72
+ # Archiver.crawl('example.com', limit: 100)
73
+ def self.crawl(source, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
49
74
  WaybackArchiver.logger.info "Request are sent with up to #{concurrency} parallel threads"
50
75
 
51
76
  posted_urls = Concurrent::Array.new
52
77
  pool = ThreadPool.build(concurrency)
53
78
 
54
- found_urls = URLCollector.crawl(source) do |url|
79
+ found_urls = URLCollector.crawl(source, limit: limit) do |url|
55
80
  pool.post do
56
- posted_url = post_url(url)
57
- posted_urls << posted_url if posted_url
81
+ result = post_url(url)
82
+ yield(result) if block_given?
83
+ posted_urls << result unless result.errored?
58
84
  end
59
85
  end
60
86
  WaybackArchiver.logger.info "Crawling of #{source} finished, found #{found_urls.length} URL(s)"
@@ -66,18 +92,18 @@ module WaybackArchiver
66
92
  end
67
93
 
68
94
  # Send URL to Wayback Machine.
69
- # @return [String] the sent URL.
95
+ # @return [ArchiveResult] the sent URL.
70
96
  # @param [String] url to send.
71
97
  # @example Archive example.com, with default options
72
98
  # Archive.post_url('http://example.com')
73
99
  def self.post_url(url)
74
100
  request_url = "#{WAYBACK_BASE_URL}#{url}"
75
- response = Request.get(request_url, follow_redirects: false)
101
+ response = Request.get(request_url, follow_redirects: false)
76
102
  WaybackArchiver.logger.info "Posted [#{response.code}, #{response.message}] #{url}"
77
- url
103
+ ArchiveResult.new(url, response)
78
104
  rescue Request::Error => e
79
105
  WaybackArchiver.logger.error "Failed to archive #{url}: #{e.class}, #{e.message}"
80
- nil
106
+ ArchiveResult.new(url, nil, e)
81
107
  end
82
108
  end
83
109
  end
@@ -0,0 +1,32 @@
1
+ module WaybackArchiver
2
+ # Result data for posting URL to archive
3
+ ArchiveResult = Struct.new(:uri, :response, :error)
4
+ class ArchiveResult
5
+ # @return [String] the URL that was archived
6
+ def archived_url
7
+ uri
8
+ end
9
+
10
+ # @return [String] the requested URL
11
+ def request_url
12
+ return unless response?
13
+ response.uri
14
+ end
15
+
16
+ # @return [String] The HTTP status code if any
17
+ def code
18
+ return unless response?
19
+ response.code
20
+ end
21
+
22
+ # @return [Boolean] true if errored
23
+ def errored?
24
+ !!error
25
+ end
26
+
27
+ # @return [Boolean] true if response is present
28
+ def response?
29
+ !!response
30
+ end
31
+ end
32
+ end
@@ -25,7 +25,7 @@ module WaybackArchiver
25
25
  # @example
26
26
  # HttpCode.success?(nil) # => false
27
27
  def self.success?(code)
28
- code.to_s.match?(/2\d\d/)
28
+ !!code.to_s.match(/2\d\d/)
29
29
  end
30
30
 
31
31
  # Whether the code is a redirect type
@@ -34,7 +34,7 @@ module WaybackArchiver
34
34
  # @example
35
35
  # HttpCode.redirect?('301')
36
36
  def self.redirect?(code)
37
- code.to_s.match?(/3\d\d/)
37
+ !!code.to_s.match(/3\d\d/)
38
38
  end
39
39
 
40
40
  # Whether the code is a error type
@@ -43,7 +43,7 @@ module WaybackArchiver
43
43
  # @example
44
44
  # HttpCode.error?('301')
45
45
  def self.error?(code)
46
- code.to_s.match?(/4\d\d/) || code.to_s.match?(/5\d\d/)
46
+ !!code.to_s.match(/4\d\d/) || !!code.to_s.match(/5\d\d/)
47
47
  end
48
48
  end
49
49
  end
@@ -5,6 +5,7 @@ require 'uri'
5
5
  require 'zlib'
6
6
 
7
7
  require 'wayback_archiver/http_code'
8
+ require 'wayback_archiver/response'
8
9
 
9
10
  module WaybackArchiver
10
11
  # Make HTTP requests
@@ -32,18 +33,6 @@ module WaybackArchiver
32
33
  # Max number of redirects before an error is raised
33
34
  MAX_REDIRECTS = 10
34
35
 
35
- # Response data struct
36
- Response = Struct.new(:code, :message, :body, :uri, :error)
37
- class Response
38
- # Returns true if a successfull response
39
- # @example check if Response was successfull
40
- # response = Response.new('200', 'OK', 'buren', 'http://example.com')
41
- # response.success? # => true
42
- def success?
43
- HTTPCode.success?(code)
44
- end
45
- end
46
-
47
36
  # Get reponse.
48
37
  # @return [Response] the http response representation.
49
38
  # @param [String, URI] uri to retrieve.
@@ -0,0 +1,13 @@
1
+ module WaybackArchiver
2
+ # Response data struct
3
+ Response = Struct.new(:code, :message, :body, :uri, :error)
4
+ class Response
5
+ # Returns true if a successfull response
6
+ # @example check if Response was successfull
7
+ # response = Response.new('200', 'OK', 'buren', 'http://example.com')
8
+ # response.success? # => true
9
+ def success?
10
+ HTTPCode.success?(code)
11
+ end
12
+ end
13
+ end
@@ -38,7 +38,11 @@ module WaybackArchiver
38
38
  WaybackArchiver.logger.info "Looking for Sitemap at #{path}"
39
39
  sitemap_url = [url, path].join(url.end_with?('/') ? '' : '/')
40
40
  response = Request.get(sitemap_url, raise_on_http_error: false)
41
- return urls(xml: response.body) if response.success?
41
+
42
+ if response.success?
43
+ WaybackArchiver.logger.info "Sitemap found at #{sitemap_url}"
44
+ return urls(xml: response.body)
45
+ end
42
46
  end
43
47
 
44
48
  WaybackArchiver.logger.info "Looking for Sitemap at #{url}"
@@ -21,13 +21,19 @@ module WaybackArchiver
21
21
  # @param [String] url domain to crawl URLs from.
22
22
  # @example Crawl URLs defined on example.com
23
23
  # URLCollector.crawl('http://example.com')
24
- def self.crawl(url)
24
+ # @example Crawl URLs defined on example.com and limit the number of visited pages to 100
25
+ # URLCollector.crawl('http://example.com', limit: 100)
26
+ # @example Crawl URLs defined on example.com and explicitly set no upper limit on the number of visited pages
27
+ # URLCollector.crawl('http://example.com', limit: -1)
28
+ def self.crawl(url, limit: WaybackArchiver.max_limit)
25
29
  urls = []
26
30
  start_at_url = Request.build_uri(url).to_s
27
31
  options = {
28
32
  robots: true,
29
33
  user_agent: WaybackArchiver.user_agent
30
34
  }
35
+ options[:limit] = limit unless limit == -1
36
+
31
37
  Spidr.site(start_at_url, options) do |spider|
32
38
  spider.every_html_page do |page|
33
39
  page_url = page.url.to_s
@@ -1,4 +1,4 @@
1
1
  module WaybackArchiver
2
2
  # Gem version
3
- VERSION = '1.0.0'.freeze
3
+ VERSION = '1.1.0'.freeze
4
4
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_archiver
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jacob Burenstam
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-08-01 00:00:00.000000000 Z
11
+ date: 2017-08-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: spidr
@@ -176,9 +176,11 @@ files:
176
176
  - bin/wayback_archiver
177
177
  - lib/wayback_archiver.rb
178
178
  - lib/wayback_archiver/archive.rb
179
+ - lib/wayback_archiver/archive_result.rb
179
180
  - lib/wayback_archiver/http_code.rb
180
181
  - lib/wayback_archiver/null_logger.rb
181
182
  - lib/wayback_archiver/request.rb
183
+ - lib/wayback_archiver/response.rb
182
184
  - lib/wayback_archiver/sitemap.rb
183
185
  - lib/wayback_archiver/sitemapper.rb
184
186
  - lib/wayback_archiver/thread_pool.rb