wayback_archiver 0.0.11 → 0.0.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/wayback_archiver.rb +4 -5
- data/lib/wayback_archiver/archive.rb +35 -37
- data/lib/wayback_archiver/process_queue.rb +28 -0
- data/lib/wayback_archiver/request.rb +50 -54
- data/lib/wayback_archiver/url_collector.rb +41 -30
- data/lib/wayback_archiver/version.rb +1 -1
- metadata +35 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fd01b7c39f7432bba6bb0abf29cff5cced700e90
|
4
|
+
data.tar.gz: 887582fb5f46f8e42a126b09d7c31b3d005bb1cf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4f66455e9ddfdd2cb191910bacfe784a3822e31b0e7ef7bce395fd60614887c9c700b262e958b6acfff9fa59941068c023a0729d09cf00116f44b359c01b2213
|
7
|
+
data.tar.gz: 8c8f98adfeb91808296d8c1bc7e01d219918074b5680d2afccc22ac00cb02b6d6ff18b29129d3650fa660238b023e8948a851aeb7ce36b42c34b9be7bb913386
|
data/lib/wayback_archiver.rb
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
require 'site_mapper'
|
2
|
-
|
3
1
|
require 'uri'
|
4
2
|
require 'net/http'
|
5
3
|
|
@@ -7,13 +5,14 @@ require 'wayback_archiver/version'
|
|
7
5
|
require 'wayback_archiver/url_collector'
|
8
6
|
require 'wayback_archiver/archive'
|
9
7
|
require 'wayback_archiver/request'
|
8
|
+
require 'wayback_archiver/process_queue'
|
10
9
|
|
11
10
|
# WaybackArchiver, send URLs to Wayback Machine. By crawling, sitemap, file or single URL.
|
12
11
|
module WaybackArchiver
|
13
12
|
# Link to gem on rubygems.org, part of the sent User-Agent
|
14
|
-
INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'
|
13
|
+
INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'.freeze
|
15
14
|
# WaybackArchiver User-Agent
|
16
|
-
USER_AGENT = "WaybackArchiver/#{WaybackArchiver::VERSION} (+#{INFO_LINK})"
|
15
|
+
USER_AGENT = "WaybackArchiver/#{WaybackArchiver::VERSION} (+#{INFO_LINK})".freeze
|
17
16
|
|
18
17
|
# Send URLs to Wayback Machine.
|
19
18
|
# @return [Array] with URLs sent to the Wayback Machine.
|
@@ -29,7 +28,7 @@ module WaybackArchiver
|
|
29
28
|
def self.archive(source, type = :crawl)
|
30
29
|
case type.to_s
|
31
30
|
when 'file' then Archive.post(UrlCollector.file(source))
|
32
|
-
when 'crawl' then UrlCollector.crawl(source)
|
31
|
+
when 'crawl' then Archive.post(UrlCollector.crawl(source))
|
33
32
|
when 'sitemap' then Archive.post(UrlCollector.sitemap(source))
|
34
33
|
when 'url' then Archive.post_url(Request.resolve_url(source))
|
35
34
|
else
|
@@ -2,46 +2,44 @@ module WaybackArchiver
|
|
2
2
|
# Post URL(s) to Wayback Machine
|
3
3
|
class Archive
|
4
4
|
# Wayback Machine base URL.
|
5
|
-
WAYBACK_BASE_URL = 'https://web.archive.org/save/'
|
5
|
+
WAYBACK_BASE_URL = 'https://web.archive.org/save/'.freeze
|
6
6
|
# Default concurrency for archiving URLs
|
7
7
|
DEFAULT_CONCURRENCY = 10
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
concurrency = options[:concurrency]
|
20
|
-
puts "Request are sent with up to #{concurrency} parallel threads"
|
21
|
-
puts "Total urls to be sent: #{urls.length}"
|
22
|
-
group_size = (urls.length / concurrency) + 1
|
23
|
-
urls.each_slice(group_size).to_a.map! do |archive_urls|
|
24
|
-
Thread.new { archive_urls.each { |url| post_url(url) } }
|
25
|
-
end.each(&:join)
|
26
|
-
puts "#{urls.length} URLs sent to Internet archive"
|
27
|
-
urls
|
28
|
-
end
|
8
|
+
# Send URLs to Wayback Machine.
|
9
|
+
# @return [Array] with sent URLs.
|
10
|
+
# @param [Array] urls URLs to send.
|
11
|
+
# @param [Hash] options
|
12
|
+
# @example Archive example.com, with default options
|
13
|
+
# Archive.post(['http://example.com'])
|
14
|
+
# @example Archive example.com, using only 1 thread
|
15
|
+
# Archive.post(['http://example.com'], concurrency: 1)
|
16
|
+
def self.post(urls, options = {})
|
17
|
+
options = { concurrency: DEFAULT_CONCURRENCY }.merge!(options)
|
18
|
+
concurrency = options[:concurrency]
|
29
19
|
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
20
|
+
puts "=== WAYBACK ARCHIVER ==="
|
21
|
+
puts "Request are sent with up to #{concurrency} parallel threads"
|
22
|
+
puts "Total urls to be sent: #{urls.length}"
|
23
|
+
|
24
|
+
ProcessQueue.process(urls, threads_count: concurrency) { |url| post_url(url) }
|
25
|
+
|
26
|
+
puts "#{urls.length} URLs sent to Internet archive"
|
27
|
+
urls
|
28
|
+
end
|
29
|
+
|
30
|
+
# Send URL to Wayback Machine.
|
31
|
+
# @return [String] the sent URL.
|
32
|
+
# @param [String] url to send.
|
33
|
+
# @example Archive example.com, with default options
|
34
|
+
# Archive.post_url('http://example.com')
|
35
|
+
def self.post_url(url)
|
36
|
+
request_url = "#{WAYBACK_BASE_URL}#{url}"
|
37
|
+
response = Request.response(request_url)
|
38
|
+
puts "[#{response.code}, #{response.message}] #{url}"
|
39
|
+
url
|
40
|
+
rescue Exception => e
|
41
|
+
puts "Error message: #{e.message}"
|
42
|
+
puts "Failed to archive: #{url}"
|
45
43
|
end
|
46
44
|
end
|
47
45
|
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'thread'
|
2
|
+
|
3
|
+
module WaybackArchiver
|
4
|
+
class ProcessQueue
|
5
|
+
# Process enumerable data in parallel.
|
6
|
+
# @return [Array] the worker threads, after all have finished processing.
|
7
|
+
# @param [Object] data_array Enumerable object
|
8
|
+
# @example Print list of names in parallel
|
9
|
+
#    ProcessQueue.process(%w(jacob peter eva)) { |n| puts n }
|
10
|
+
# @example Print list of names using 2 threads
|
11
|
+
#    ProcessQueue.process(%w(jacob peter eva), threads_count: 2) { |n| puts n }
|
12
|
+
def self.process(data_array, threads_count: 5)
|
13
|
+
queue = Queue.new
|
14
|
+
data_array.each { |data| queue.push(data) }
|
15
|
+
workers = threads_count.times.map do
|
16
|
+
Thread.new do
|
17
|
+
begin
|
18
|
+
while data = queue.pop(true)
|
19
|
+
yield(data)
|
20
|
+
end
|
21
|
+
rescue ThreadError
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
workers.map(&:join)
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
@@ -3,64 +3,60 @@ require 'url_resolver' # TODO: Allow users to use any resolver
|
|
3
3
|
module WaybackArchiver
|
4
4
|
# Request and parse HTML & XML documents
|
5
5
|
class Request
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
end
|
18
|
-
|
19
|
-
# Get response.
|
20
|
-
# @return [Net::HTTP*] the http response.
|
21
|
-
# @param [String] url URL to retrieve.
|
22
|
-
# @param [Boolean] resolve whether to resolve the URL.
|
23
|
-
# @example Resolve example.com and request
|
24
|
-
# Request.response('example.com', true)
|
25
|
-
# @example Request http://example.com
|
26
|
-
# Request.response('http://example.com', false)
|
27
|
-
def response(url, resolve = true)
|
28
|
-
resolved_url = resolve ? resolve_url(url) : url
|
29
|
-
uri = URI.parse(resolved_url)
|
30
|
-
http = Net::HTTP.new(uri.host, uri.port)
|
31
|
-
http.use_ssl = true if resolved_url.start_with?('https://')
|
6
|
+
# Get and parse HTML & XML documents.
|
7
|
+
# @return [Nokogiri::HTML::Document] the parsed document.
|
8
|
+
# @param [String] url to retrieve and parse.
|
9
|
+
# @example Request and parse example.com
|
10
|
+
# Request.document('example.com')
|
11
|
+
# @example Request and parse google.com/sitemap.xml
|
12
|
+
# Request.document('google.com/sitemap.xml')
|
13
|
+
def self.document(url)
|
14
|
+
response_body = Request.response(url).body
|
15
|
+
Nokogiri::HTML(response_body)
|
16
|
+
end
|
32
17
|
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
18
|
+
# Get response.
|
19
|
+
# @return [Net::HTTP*] the http response.
|
20
|
+
# @param [String] url URL to retrieve.
|
21
|
+
# @param [Boolean] resolve whether to resolve the URL.
|
22
|
+
# @example Resolve example.com and request
|
23
|
+
# Request.response('example.com', true)
|
24
|
+
# @example Request http://example.com
|
25
|
+
# Request.response('http://example.com', false)
|
26
|
+
def self.response(url, resolve = true)
|
27
|
+
resolved_url = resolve ? resolve_url(url) : url
|
28
|
+
uri = URI.parse(resolved_url)
|
29
|
+
http = Net::HTTP.new(uri.host, uri.port)
|
30
|
+
http.use_ssl = true if resolved_url.start_with?('https://')
|
37
31
|
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
# Request.resolve_url('example.com')
|
43
|
-
def resolve_url(url)
|
44
|
-
resolved = UrlResolver.resolve(url)
|
45
|
-
resolved = resolved.prepend('http://') unless has_protocol?(resolved)
|
46
|
-
resolved
|
47
|
-
end
|
32
|
+
request = Net::HTTP::Get.new(uri.request_uri)
|
33
|
+
request['User-Agent'] = WaybackArchiver::USER_AGENT
|
34
|
+
http.request(request)
|
35
|
+
end
|
48
36
|
|
49
|
-
|
37
|
+
# Resolve the URL, follows redirects.
|
38
|
+
# @return [String] the resolved URL.
|
39
|
+
# @param [String] url to retrieve.
|
40
|
+
# @example Resolve example.com and request
|
41
|
+
# Request.resolve_url('example.com')
|
42
|
+
def self.resolve_url(url)
|
43
|
+
resolved = UrlResolver.resolve(url)
|
44
|
+
resolved = resolved.prepend('http://') unless protocol?(resolved)
|
45
|
+
resolved
|
46
|
+
end
|
50
47
|
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
end
|
48
|
+
# Check whether a URL string includes an HTTP(S) protocol.
|
49
|
+
# @return [Boolean] true if string includes protocol.
|
50
|
+
# @param [String] url to check.
|
51
|
+
# @example Check if string includes protocol
|
52
|
+
# Request.protocol?('example.com')
|
53
|
+
# # => false
|
54
|
+
# Request.protocol?('https://example.com')
|
55
|
+
# # => true
|
56
|
+
# Request.protocol?('http://example.com')
|
57
|
+
# # => true
|
58
|
+
def self.protocol?(url)
|
59
|
+
url.start_with?('http://') || url.start_with?('https://')
|
64
60
|
end
|
65
61
|
end
|
66
62
|
end
|
@@ -1,40 +1,51 @@
|
|
1
|
+
require 'spidr'
|
2
|
+
require 'robots'
|
3
|
+
|
1
4
|
module WaybackArchiver
|
2
5
|
# Retrieve URLs from different sources
|
3
6
|
class UrlCollector
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
end
|
7
|
+
# Retrieve URLs from Sitemap.
|
8
|
+
# @return [Array] of URLs defined in Sitemap.
|
9
|
+
# @param [String] url domain to retrieve Sitemap from.
|
10
|
+
# @example Get URLs defined in Sitemap for google.com
|
11
|
+
# UrlCollector.sitemap('https://google.com')
|
12
|
+
def self.sitemap(url)
|
13
|
+
resolved = Request.resolve_url("#{url}/sitemap.xml")
|
14
|
+
sitemap = Request.document(resolved)
|
15
|
+
sitemap.css('loc').map(&:text)
|
16
|
+
end
|
15
17
|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
18
|
+
# Retrieve URLs by crawling.
|
19
|
+
# @return [Array] of URLs found during crawl.
|
20
|
+
# @param [String] url domain to crawl URLs from.
|
21
|
+
# @example Crawl URLs defined on example.com
|
22
|
+
# UrlCollector.crawl('http://example.com')
|
23
|
+
def self.crawl(url)
|
24
|
+
urls = []
|
25
|
+
resolved_url = Request.resolve_url(url)
|
26
|
+
Spidr.site(resolved_url, robots: true) do |spider|
|
27
|
+
spider.every_html_page do |page|
|
28
|
+
page_url = page.url.to_s
|
29
|
+
urls << page_url
|
30
|
+
puts "Found: #{page_url}"
|
31
|
+
yield(page_url) if block_given?
|
32
|
+
end
|
23
33
|
end
|
34
|
+
urls
|
35
|
+
end
|
24
36
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
37
|
+
# Retrieve URLs listed in file.
|
38
|
+
# @return [Array] of URLs defined in file.
|
39
|
+
# @param [String] path to get URLs from.
|
40
|
+
# @example Get URLs defined in /path/to/file
|
41
|
+
# UrlCollector.file('/path/to/file')
|
42
|
+
def self.file(path)
|
43
|
+
raise ArgumentError, "No such file: #{path}" unless File.exist?(path)
|
44
|
+
urls = []
|
45
|
+
File.open(path).read
|
34
46
|
.gsub(/\r\n?/, "\n")
|
35
|
-
.each_line { |line| urls << line.
|
36
|
-
|
37
|
-
end
|
47
|
+
.each_line { |line| urls << line.delete("\n").strip }
|
48
|
+
urls.reject(&:empty?)
|
38
49
|
end
|
39
50
|
end
|
40
51
|
end
|
metadata
CHANGED
@@ -1,29 +1,43 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wayback_archiver
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.12
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jacob Burenstam
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2017-02-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
14
|
+
name: spidr
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
19
|
+
version: 0.6.0
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
26
|
+
version: 0.6.0
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: robots
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0.1'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0.1'
|
27
41
|
- !ruby/object:Gem::Dependency
|
28
42
|
name: url_resolver
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -122,6 +136,20 @@ dependencies:
|
|
122
136
|
- - "~>"
|
123
137
|
- !ruby/object:Gem::Version
|
124
138
|
version: '3.2'
|
139
|
+
- !ruby/object:Gem::Dependency
|
140
|
+
name: byebug
|
141
|
+
requirement: !ruby/object:Gem::Requirement
|
142
|
+
requirements:
|
143
|
+
- - ">"
|
144
|
+
- !ruby/object:Gem::Version
|
145
|
+
version: '0'
|
146
|
+
type: :development
|
147
|
+
prerelease: false
|
148
|
+
version_requirements: !ruby/object:Gem::Requirement
|
149
|
+
requirements:
|
150
|
+
- - ">"
|
151
|
+
- !ruby/object:Gem::Version
|
152
|
+
version: '0'
|
125
153
|
description: Send URLs to Wayback Machine. By crawling, sitemap, file or single URL.
|
126
154
|
email:
|
127
155
|
- burenstam@gmail.com
|
@@ -133,6 +161,7 @@ files:
|
|
133
161
|
- bin/wayback_archiver
|
134
162
|
- lib/wayback_archiver.rb
|
135
163
|
- lib/wayback_archiver/archive.rb
|
164
|
+
- lib/wayback_archiver/process_queue.rb
|
136
165
|
- lib/wayback_archiver/request.rb
|
137
166
|
- lib/wayback_archiver/url_collector.rb
|
138
167
|
- lib/wayback_archiver/version.rb
|
@@ -156,9 +185,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
156
185
|
version: '0'
|
157
186
|
requirements: []
|
158
187
|
rubyforge_project:
|
159
|
-
rubygems_version: 2.
|
188
|
+
rubygems_version: 2.5.2
|
160
189
|
signing_key:
|
161
190
|
specification_version: 4
|
162
191
|
summary: Send URLs to Wayback Machine
|
163
192
|
test_files: []
|
164
|
-
has_rdoc:
|