wayback_archiver 1.2.0 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/lib/wayback_archiver/request.rb +16 -12
- data/lib/wayback_archiver/sitemapper.rb +13 -2
- data/lib/wayback_archiver/version.rb +1 -1
- metadata +19 -5
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 | 
            -
             | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 2 | 
            +
            SHA256:
         | 
| 3 | 
            +
              metadata.gz: 1d97461d94b8ec02e1cae528f939be17991d054a1d29953c8672bfc8e29ea7cf
         | 
| 4 | 
            +
              data.tar.gz: 75bf5c3a7214001df417678c7a084ea3bc1e80dc87773a8da17c1c1defcdaff8
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: 00d2eef71ef692688249dc97cbaca906f6725679a19deab2fd3c5998e06319032ee9e9aff24bb68dd80254f1b18ef06d1aeadfeabd7fcaafa326f9bdcca10688
         | 
| 7 | 
            +
              data.tar.gz: d05a72cfb6fbb2636f43c8d8737f72aa0587415a8ce344d1c1d01db9a3cdcecfc278d81d27e2962868dab0674e4ffdcc33fe9115f0969e8c32f1c490473e13aa
         | 
| @@ -33,6 +33,19 @@ module WaybackArchiver | |
| 33 33 | 
             
                # Max number of redirects before an error is raised
         | 
| 34 34 | 
             
                MAX_REDIRECTS = 10
         | 
| 35 35 |  | 
| 36 | 
            +
                # Known request errors
         | 
| 37 | 
            +
                REQUEST_ERRORS = {
         | 
| 38 | 
            +
                  # server
         | 
| 39 | 
            +
                  Timeout::Error => ServerError,
         | 
| 40 | 
            +
                  OpenSSL::SSL::SSLError => ServerError,
         | 
| 41 | 
            +
                  Net::HTTPBadResponse => ServerError,
         | 
| 42 | 
            +
                  Zlib::Error => ServerError,
         | 
| 43 | 
            +
                  # client
         | 
| 44 | 
            +
                  SystemCallError => ClientError,
         | 
| 45 | 
            +
                  SocketError => ClientError,
         | 
| 46 | 
            +
                  IOError => ClientError
         | 
| 47 | 
            +
                }.freeze
         | 
| 48 | 
            +
             | 
| 36 49 | 
             
                # Get response.
         | 
| 37 50 | 
             
                # @return [Response] the http response representation.
         | 
| 38 51 | 
             
                # @param [String, URI] uri to retrieve.
         | 
| @@ -184,20 +197,11 @@ module WaybackArchiver | |
| 184 197 | 
             
                private
         | 
| 185 198 |  | 
| 186 199 | 
             
                def self.perform_request(uri, http, request)
         | 
| 187 | 
            -
                  # TODO: Consider retrying  | 
| 200 | 
            +
                  # TODO: Consider retrying on certain HTTP response codes, e.g. 429, 503
         | 
| 188 201 | 
             
                  response = http.request(request)
         | 
| 189 202 | 
             
                  GETStruct.new(response)
         | 
| 190 | 
            -
                rescue  | 
| 191 | 
            -
             | 
| 192 | 
            -
                       Net::HTTPBadResponse,
         | 
| 193 | 
            -
                       Zlib::Error => e
         | 
| 194 | 
            -
             | 
| 195 | 
            -
                  build_request_error(uri, e, ServerError)
         | 
| 196 | 
            -
                rescue SystemCallError,
         | 
| 197 | 
            -
                       SocketError,
         | 
| 198 | 
            -
                       IOError => e
         | 
| 199 | 
            -
             | 
| 200 | 
            -
                  build_request_error(uri, e, ClientError)
         | 
| 203 | 
            +
                rescue *REQUEST_ERRORS.keys => e
         | 
| 204 | 
            +
                  build_request_error(uri, e, REQUEST_ERRORS.fetch(e.class))
         | 
| 201 205 | 
             
                end
         | 
| 202 206 |  | 
| 203 207 | 
             
                def self.build_request_error(uri, error, error_wrapper_klass)
         | 
| @@ -1,3 +1,4 @@ | |
| 1 | 
            +
            require 'set'
         | 
| 1 2 | 
             
            require 'robots'
         | 
| 2 3 |  | 
| 3 4 | 
             
            require 'wayback_archiver/sitemap'
         | 
| @@ -27,6 +28,7 @@ module WaybackArchiver | |
| 27 28 | 
             
                  WaybackArchiver.logger.info 'Looking for Sitemap(s) in /robots.txt'
         | 
| 28 29 | 
             
                  robots = Robots.new(WaybackArchiver.user_agent)
         | 
| 29 30 | 
             
                  sitemaps = robots.other_values(url)['Sitemap']
         | 
| 31 | 
            +
             | 
| 30 32 | 
             
                  if sitemaps
         | 
| 31 33 | 
             
                    return sitemaps.flat_map do |sitemap|
         | 
| 32 34 | 
             
                      WaybackArchiver.logger.info "Fetching Sitemap at #{sitemap}"
         | 
| @@ -61,12 +63,21 @@ module WaybackArchiver | |
| 61 63 | 
             
                # @example Get URLs defined in Sitemap
         | 
| 62 64 | 
             
                #    Sitemapper.urls(xml: xml)
         | 
| 63 65 | 
             
                # @see http://www.sitemaps.org
         | 
| 64 | 
            -
                def self.urls(url: nil, xml: nil)
         | 
| 66 | 
            +
                def self.urls(url: nil, xml: nil, visited: Set.new)
         | 
| 67 | 
            +
                  if visited.include?(url)
         | 
| 68 | 
            +
                    WaybackArchiver.logger.debug "Already visited #{url} skipping.."
         | 
| 69 | 
            +
                    return []
         | 
| 70 | 
            +
                  end
         | 
| 71 | 
            +
             | 
| 72 | 
            +
                  visited << url if url
         | 
| 73 | 
            +
             | 
| 65 74 | 
             
                  xml = Request.get(url).body unless xml
         | 
| 66 75 | 
             
                  sitemap = Sitemap.new(xml)
         | 
| 67 76 |  | 
| 68 77 | 
             
                  if sitemap.sitemap_index?
         | 
| 69 | 
            -
                    sitemap.sitemaps.flat_map  | 
| 78 | 
            +
                    sitemap.sitemaps.flat_map do |sitemap_url|
         | 
| 79 | 
            +
                      urls(url: sitemap_url, visited: visited)
         | 
| 80 | 
            +
                    end
         | 
| 70 81 | 
             
                  else
         | 
| 71 82 | 
             
                    sitemap.urls
         | 
| 72 83 | 
             
                  end
         | 
    
        metadata
    CHANGED
    
    | @@ -1,14 +1,14 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: wayback_archiver
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 1.2.0 | 
| 4 | 
            +
              version: 1.2.1
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors:
         | 
| 7 7 | 
             
            - Jacob Burenstam
         | 
| 8 8 | 
             
            autorequire: 
         | 
| 9 9 | 
             
            bindir: bin
         | 
| 10 10 | 
             
            cert_chain: []
         | 
| 11 | 
            -
            date:  | 
| 11 | 
            +
            date: 2018-10-14 00:00:00.000000000 Z
         | 
| 12 12 | 
             
            dependencies:
         | 
| 13 13 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 14 14 | 
             
              name: spidr
         | 
| @@ -100,14 +100,28 @@ dependencies: | |
| 100 100 | 
             
                requirements:
         | 
| 101 101 | 
             
                - - "~>"
         | 
| 102 102 | 
             
                  - !ruby/object:Gem::Version
         | 
| 103 | 
            -
                    version: '0. | 
| 103 | 
            +
                    version: '0.9'
         | 
| 104 104 | 
             
              type: :development
         | 
| 105 105 | 
             
              prerelease: false
         | 
| 106 106 | 
             
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 107 107 | 
             
                requirements:
         | 
| 108 108 | 
             
                - - "~>"
         | 
| 109 109 | 
             
                  - !ruby/object:Gem::Version
         | 
| 110 | 
            -
                    version: '0. | 
| 110 | 
            +
                    version: '0.9'
         | 
| 111 | 
            +
            - !ruby/object:Gem::Dependency
         | 
| 112 | 
            +
              name: simplecov
         | 
| 113 | 
            +
              requirement: !ruby/object:Gem::Requirement
         | 
| 114 | 
            +
                requirements:
         | 
| 115 | 
            +
                - - "~>"
         | 
| 116 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 117 | 
            +
                    version: 0.14.1
         | 
| 118 | 
            +
              type: :development
         | 
| 119 | 
            +
              prerelease: false
         | 
| 120 | 
            +
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 121 | 
            +
                requirements:
         | 
| 122 | 
            +
                - - "~>"
         | 
| 123 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 124 | 
            +
                    version: 0.14.1
         | 
| 111 125 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 112 126 | 
             
              name: coveralls
         | 
| 113 127 | 
             
              requirement: !ruby/object:Gem::Requirement
         | 
| @@ -207,7 +221,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement | |
| 207 221 | 
             
                  version: '0'
         | 
| 208 222 | 
             
            requirements: []
         | 
| 209 223 | 
             
            rubyforge_project: 
         | 
| 210 | 
            -
            rubygems_version: 2.6 | 
| 224 | 
            +
            rubygems_version: 2.7.6
         | 
| 211 225 | 
             
            signing_key: 
         | 
| 212 226 | 
             
            specification_version: 4
         | 
| 213 227 | 
             
            summary: Post URLs to Wayback Machine (Internet Archive)
         |