wayback_archiver 1.3.0 → 1.4.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 4dba94d5ac29b57a1df0a6e98f5d9ea29c1905b9ffdef7ee9bf9c344448362fe
-  data.tar.gz: 79af5b2af8660dd5f0cafae521e7b73be36e528da7c8ce2f62502f76b434959b
+  metadata.gz: e5f39f42fe6d5a4f6fbded5ab460cd91a2c9411af28645a41b987bf01a31ea14
+  data.tar.gz: 8cc1f5dbdc7d55fb9a1ec358c354fd8db3bacd856a824e9090005445a966de39
 SHA512:
-  metadata.gz: d02a18809ebdc880bdcdb9a7d0d522da445c2a2329959080c897d1f7033f6a687c268a626abfa05a45791c5e1404a4e3e8025ea172494bca0b430033e4fc20b0
-  data.tar.gz: 316e8fe4e922b0aeb5047191fa6ddd8e242fa7bfd1ce8eb2b2a2e08d980999dd895c071f82f949a2690c4259bc5b3de00049e691b540040db79d002ffdf8a4f0
+  metadata.gz: 6e3edf351b7cda562d39120df2dff564a8248c41ecedae54deed2017c11cbc6c56311507a7dda87ce350b17ac9bdf3d265523dde3a79de751426d206a2f51741
+  data.tar.gz: 020cb49d6dfc204de93a035853a26448b20901dc962d772554c4cb9074ff14682c570afffe3c2cbc05984c2298a7a05ffe662f8aaccde80e510b97450dd270cc
data/bin/wayback_archiver CHANGED
@@ -10,6 +10,7 @@ log = STDOUT
 log_level = Logger::INFO
 concurrency = WaybackArchiver.concurrency
 limit = WaybackArchiver.max_limit
+hosts = []

 optparse = OptionParser.new do |parser|
   parser.banner = 'Usage: wayback_archiver [<url>] [options]'
@@ -30,7 +31,11 @@ optparse = OptionParser.new do |parser|
     strategy = 'urls'
   end

-  parser.on('--concurrency=5', Integer, 'Concurrency') do |value|
+  parser.on('--hosts=[example.com]', Array, 'Only spider links on certain hosts') do |value|
+    hosts = value.map { |v| Regexp.new(v) } if value
+  end
+
+  parser.on('--concurrency=1', Integer, 'Concurrency') do |value|
     concurrency = value
   end

@@ -81,6 +86,7 @@ strategy ||= 'auto'
 urls.each do |url|
   WaybackArchiver.archive(
     url,
+    hosts: hosts,
    strategy: strategy,
    concurrency: concurrency,
    limit: limit
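The new `--hosts` flag takes a comma-separated list (OptionParser's `Array` type) and compiles each entry into a `Regexp` before the crawl starts. A minimal sketch of the equivalent library call, assuming the gem is installed; the host values are hypothetical:

```ruby
require 'wayback_archiver'

# Roughly equivalent to: wayback_archiver https://example.com --hosts=example.com,blog\.example\.com
raw_hosts = ['example.com', 'blog\.example\.com'] # hypothetical values, as the CLI would receive them
hosts = raw_hosts.map { |v| Regexp.new(v) }       # mirrors the mapping in the bin script above

WaybackArchiver.archive(
  'https://example.com',
  strategy: :crawl,
  hosts: hosts,
  concurrency: 1, # the new default
  limit: -1       # -1 means no limit
)
```

Note that `hosts:` only takes effect for the `crawl` strategy; as the `case` statement in the library change below shows, the other strategies do not receive it.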
data/lib/wayback_archiver.rb CHANGED
@@ -11,9 +11,11 @@ module WaybackArchiver
   INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'.freeze
   # WaybackArchiver User-Agent
   USER_AGENT = "WaybackArchiver/#{WaybackArchiver::VERSION} (+#{INFO_LINK})".freeze
+  # Default for whether to respect robots txt files
+  DEFAULT_RESPECT_ROBOTS_TXT = false

   # Default concurrency for archiving URLs
-  DEFAULT_CONCURRENCY = 5
+  DEFAULT_CONCURRENCY = 1

   # Maxmium number of links posted (-1 is no limit)
   DEFAULT_MAX_LIMIT = -1
@@ -22,6 +24,7 @@ module WaybackArchiver
   # @return [Array<ArchiveResult>] of URLs sent to the Wayback Machine.
   # @param [String/Array<String>] source for URL(s).
   # @param [String/Symbol] strategy of source. Supported strategies: crawl, sitemap, url, urls, auto.
+  # @param [Array<String, Regexp>] hosts to crawl.
   # @example Crawl example.com and send all URLs of the same domain
   #   WaybackArchiver.archive('example.com') # Default strategy is :auto
   #   WaybackArchiver.archive('example.com', strategy: :auto)
@@ -43,11 +46,19 @@ module WaybackArchiver
   #   WaybackArchiver.archive('example.com', strategy: :url, concurrency: 10)
   #   WaybackArchiver.archive('example.com', strategy: :url, limit: 100) # send max 100 URLs
   #   WaybackArchiver.archive('example.com', :url)
-  def self.archive(source, legacy_strategy = nil, strategy: :auto, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
+  # @example Crawl multiple hosts
+  #   WaybackArchiver.archive(
+  #     'http://example.com',
+  #     hosts: [
+  #       'example.com',
+  #       /host[\d]+\.example\.com/
+  #     ]
+  #   )
+  def self.archive(source, legacy_strategy = nil, strategy: :auto, hosts: [], concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
     strategy = legacy_strategy || strategy

     case strategy.to_s
-    when 'crawl' then crawl(source, concurrency: concurrency, limit: limit, &block)
+    when 'crawl' then crawl(source, concurrency: concurrency, limit: limit, hosts: hosts, &block)
     when 'auto' then auto(source, concurrency: concurrency, limit: limit, &block)
     when 'sitemap' then sitemap(source, concurrency: concurrency, limit: limit, &block)
     when 'urls' then urls(source, concurrency: concurrency, limit: limit, &block)
@@ -63,7 +74,7 @@ module WaybackArchiver
   # @param [String] source (must be a valid URL).
   # @param concurrency [Integer]
   # @example Auto archive example.com
-  #   WaybackArchiver.auto('example.com') # Default concurrency is 5
+  #   WaybackArchiver.auto('example.com') # Default concurrency is 1
   # @example Auto archive example.com with low concurrency
   #   WaybackArchiver.auto('example.com', concurrency: 1)
   # @example Auto archive example.com and archive max 100 URLs
@@ -79,16 +90,25 @@ module WaybackArchiver
   # Crawl site for URLs to send to the Wayback Machine.
   # @return [Array<ArchiveResult>] of URLs sent to the Wayback Machine.
   # @param [String] url to start crawling from.
+  # @param [Array<String, Regexp>] hosts to crawl
   # @param concurrency [Integer]
   # @example Crawl example.com and send all URLs of the same domain
-  #   WaybackArchiver.crawl('example.com') # Default concurrency is 5
+  #   WaybackArchiver.crawl('example.com') # Default concurrency is 1
   # @example Crawl example.com and send all URLs of the same domain with low concurrency
   #   WaybackArchiver.crawl('example.com', concurrency: 1)
   # @example Crawl example.com and archive max 100 URLs
   #   WaybackArchiver.crawl('example.com', limit: 100)
-  def self.crawl(url, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
+  # @example Crawl multiple hosts
+  #   URLCollector.crawl(
+  #     'http://example.com',
+  #     hosts: [
+  #       'example.com',
+  #       /host[\d]+\.example\.com/
+  #     ]
+  #   )
+  def self.crawl(url, hosts: [], concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
     WaybackArchiver.logger.info "Crawling #{url}"
-    Archive.crawl(url, concurrency: concurrency, limit: limit, &block)
+    Archive.crawl(url, hosts: hosts, concurrency: concurrency, limit: limit, &block)
   end

   # Get URLs from sitemap and send found URLs to the Wayback Machine.
@@ -96,7 +116,7 @@ module WaybackArchiver
   # @param [String] url to the sitemap.
   # @param concurrency [Integer]
   # @example Get example.com sitemap and archive all found URLs
-  #   WaybackArchiver.sitemap('example.com/sitemap.xml') # Default concurrency is 5
+  #   WaybackArchiver.sitemap('example.com/sitemap.xml') # Default concurrency is 1
   # @example Get example.com sitemap and archive all found URLs with low concurrency
   #   WaybackArchiver.sitemap('example.com/sitemap.xml', concurrency: 1)
   # @example Get example.com sitemap archive max 100 URLs
@@ -155,6 +175,19 @@ module WaybackArchiver
     @user_agent ||= USER_AGENT
   end

+  # Sets the default respect_robots_txt
+  # @return [Boolean] the desired default for respect_robots_txt
+  # @param [Boolean] respect_robots_txt the desired default
+  def self.respect_robots_txt=(respect_robots_txt)
+    @respect_robots_txt = respect_robots_txt
+  end
+
+  # Returns the default respect_robots_txt
+  # @return [Boolean] the configured or the default respect_robots_txt
+  def self.respect_robots_txt
+    @respect_robots_txt ||= DEFAULT_RESPECT_ROBOTS_TXT
+  end
+
   # Sets the default concurrency
   # @return [Integer] the desired default concurrency
   # @param [Integer] concurrency the desired default concurrency
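The library side pairs the new `hosts:` keyword with a global `respect_robots_txt` toggle whose default is now `false`. A usage sketch based on the signatures above:

```ruby
require 'wayback_archiver'

# Opt back in to robots.txt compliance; 1.4.0 no longer honors it by default.
WaybackArchiver.respect_robots_txt = true

# Crawl the apex domain plus numbered subdomains, handling results as they arrive.
WaybackArchiver.archive(
  'http://example.com',
  strategy: :crawl,
  hosts: [
    'example.com',
    /host[\d]+\.example\.com/
  ]
) do |result|
  puts result.inspect # each ArchiveResult is yielded, per the @yield docs
end
```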
data/lib/wayback_archiver/archive.rb CHANGED
@@ -9,7 +9,7 @@ module WaybackArchiver
     # Send URLs to Wayback Machine.
     # @return [Array<ArchiveResult>] with sent URLs.
     # @param [Array<String>] urls to send to the Wayback Machine.
-    # @param concurrency [Integer] the default is 5
+    # @param concurrency [Integer] the default is 1
     # @yield [archive_result] If a block is given, each result will be yielded
     # @yieldparam [ArchiveResult] archive_result
     # @example Archive urls, asynchronously
@@ -54,7 +54,8 @@ module WaybackArchiver
     # Send URLs to Wayback Machine by crawling the site.
     # @return [Array<ArchiveResult>] with URLs sent to the Wayback Machine.
     # @param [String] source for URL to crawl.
-    # @param concurrency [Integer] the default is 5
+    # @param concurrency [Integer] the default is 1
+    # @param [Array<String, Regexp>] hosts to crawl
     # @yield [archive_result] If a block is given, each result will be yielded
     # @yieldparam [ArchiveResult] archive_result
     # @example Crawl example.com and send all URLs of the same domain
@@ -66,13 +67,21 @@ module WaybackArchiver
     #   Archiver.crawl('example.com', concurrency: 1)
     # @example Stop after archiving 100 links
     #   Archiver.crawl('example.com', limit: 100)
-    def self.crawl(source, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
+    # @example Crawl multiple hosts
+    #   URLCollector.crawl(
+    #     'http://example.com',
+    #     hosts: [
+    #       'example.com',
+    #       /host[\d]+\.example\.com/
+    #     ]
+    #   )
+    def self.crawl(source, hosts: [], concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
       WaybackArchiver.logger.info "Request are sent with up to #{concurrency} parallel threads"

       posted_urls = Concurrent::Array.new
       pool = ThreadPool.build(concurrency)

-      found_urls = URLCollector.crawl(source, limit: limit) do |url|
+      found_urls = URLCollector.crawl(source, hosts: hosts, limit: limit) do |url|
         pool.post do
           result = post_url(url)
           yield(result) if block_given?
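Since `yield(result)` happens inside `pool.post`, the caller's block runs on the worker threads. A sketch that collects failures, under the assumption that `ArchiveResult` exposes a `success?` predicate (it is not shown in this diff):

```ruby
require 'wayback_archiver'

# Thread-safe accumulator, because the block is invoked from pool threads.
failures = Concurrent::Array.new

WaybackArchiver.crawl('example.com', hosts: ['example.com']) do |result|
  failures << result unless result.success? # success? is assumed, not shown in this diff
end

puts "#{failures.length} URLs failed to archive"
```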
data/lib/wayback_archiver/url_collector.rb CHANGED
@@ -19,17 +19,27 @@ module WaybackArchiver
     # Retrieve URLs by crawling.
     # @return [Array<String>] of URLs defined found during crawl.
     # @param [String] url domain to crawl URLs from.
+    # @param [Array<String, Regexp>] hosts to crawl.
     # @example Crawl URLs defined on example.com
     #   URLCollector.crawl('http://example.com')
     # @example Crawl URLs defined on example.com and limit the number of visited pages to 100
     #   URLCollector.crawl('http://example.com', limit: 100)
     # @example Crawl URLs defined on example.com and explicitly set no upper limit on the number of visited pages to 100
     #   URLCollector.crawl('http://example.com', limit: -1)
-    def self.crawl(url, limit: WaybackArchiver.max_limit)
+    # @example Crawl multiple hosts
+    #   URLCollector.crawl(
+    #     'http://example.com',
+    #     hosts: [
+    #       'example.com',
+    #       /host[\d]+\.example\.com/
+    #     ]
+    #   )
+    def self.crawl(url, hosts: [], limit: WaybackArchiver.max_limit)
       urls = []
       start_at_url = Request.build_uri(url).to_s
       options = {
-        robots: true,
+        robots: WaybackArchiver.respect_robots_txt,
+        hosts: hosts,
         user_agent: WaybackArchiver.user_agent
       }
       options[:limit] = limit unless limit == -1
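`URLCollector.crawl` now forwards `:hosts` alongside the newly configurable `:robots` flag in the spider's options hash (spidr is the gem's crawler dependency, bumped to ~> 0.6.1 below), so a String restricts the crawl to that host while a Regexp matches a family of hosts. A collection-only sketch; nothing is posted to the Wayback Machine here:

```ruby
require 'wayback_archiver'

# Gather candidate URLs across the apex host and matching subdomains.
urls = WaybackArchiver::URLCollector.crawl(
  'http://example.com',
  hosts: ['example.com', /host[\d]+\.example\.com/],
  limit: 100
) do |url|
  puts "found: #{url}" # each URL is yielded as the spider reaches it
end

puts "collected #{urls.length} URLs in total"
```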
data/lib/wayback_archiver/version.rb CHANGED
@@ -1,4 +1,4 @@
 module WaybackArchiver
   # Gem version
-  VERSION = '1.3.0'.freeze
+  VERSION = '1.4.0'.freeze
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: wayback_archiver
 version: !ruby/object:Gem::Version
-  version: 1.3.0
+  version: 1.4.0
 platform: ruby
 authors:
 - Jacob Burenstam
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2019-01-24 00:00:00.000000000 Z
+date: 2021-04-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: spidr
@@ -16,14 +16,14 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.6.0
+        version: 0.6.1
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.6.0
+        version: 0.6.1
 - !ruby/object:Gem::Dependency
   name: robots
   requirement: !ruby/object:Gem::Requirement
@@ -58,28 +58,28 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '1.3'
+        version: '2.1'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '1.3'
+        version: '2.1'
 - !ruby/object:Gem::Dependency
   name: rake
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '10.3'
+        version: '12.3'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '10.3'
+        version: '12.3'
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
@@ -220,8 +220,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
   - !ruby/object:Gem::Version
     version: '0'
 requirements: []
-rubyforge_project:
-rubygems_version: 2.7.6
+rubygems_version: 3.1.4
 signing_key:
 specification_version: 4
 summary: Post URLs to Wayback Machine (Internet Archive)