aranha 0.15.0 → 0.16.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/aranha/address_processor.rb +48 -0
- data/lib/aranha/default_processor.rb +3 -7
- data/lib/aranha/processor.rb +10 -29
- data/lib/aranha/version.rb +1 -1
- metadata +14 -7
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA256:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: 33a45e4db83f3d2743b53386005803da35188866dad28e15313a56b747efd4d1
         | 
| 4 | 
            +
              data.tar.gz: 77ac8f2bcd922652fe62999f43d148a6557fb66d62e9edb21ba51d00ad5cb6d5
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: c9424bbcd6540d5627052427cdbc746db9f36470e35e3c6deed3b2668160de22b2163e3f0a8b5ef03c13681bb05bb42b20c1a7348b71e616facdd82bb464e1fc
         | 
| 7 | 
            +
              data.tar.gz: 54800ff8cc979371da63d2cea2543951f08f3f9a6423d0b20de7a9399cdc19338775f087d9a6791bcd3a0ec27d0500983fd47056f4438d9603e1fac29879a7cb
         | 
    
        data/lib/aranha/address_processor.rb
    ADDED
    
    | @@ -0,0 +1,48 @@ | |
| 1 | 
            +
            # frozen_string_literal: true
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            require 'eac_ruby_utils/core_ext'
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            module Aranha
         | 
| 6 | 
            +
              class AddressProcessor
         | 
| 7 | 
            +
                ARANHA_EXCEPTIONS = [::Aranha::Parsers::InvalidStateException].freeze
         | 
| 8 | 
            +
                CORE_EXCEPTIONS = [::SocketError].freeze
         | 
| 9 | 
            +
                ERRNO_EXCEPTIONS = [Errno::ECONNREFUSED, ::Errno::ECONNRESET].freeze
         | 
| 10 | 
            +
                HTTPCLIENT_EXCEPTIONS = [
         | 
| 11 | 
            +
                  ::HTTPClient::BadResponseError,
         | 
| 12 | 
            +
                  ::HTTPClient::ConnectTimeoutError,
         | 
| 13 | 
            +
                  ::HTTPClient::ReceiveTimeoutError
         | 
| 14 | 
            +
                ].freeze
         | 
| 15 | 
            +
                NET_EXCEPTIONS = [::Net::HTTPFatalError, ::Net::HTTPServerException, ::Net::OpenTimeout].freeze
         | 
| 16 | 
            +
             | 
| 17 | 
            +
                NETWORK_EXCEPTIONS = ARANHA_EXCEPTIONS + CORE_EXCEPTIONS + ERRNO_EXCEPTIONS +
         | 
| 18 | 
            +
                                     HTTPCLIENT_EXCEPTIONS + NET_EXCEPTIONS
         | 
| 19 | 
            +
             | 
| 20 | 
            +
                class << self
         | 
| 21 | 
            +
                  def rescuable_error?(error)
         | 
| 22 | 
            +
                    return true if NETWORK_EXCEPTIONS.any? { |klass| error.is_a?(klass) }
         | 
| 23 | 
            +
             | 
| 24 | 
            +
                    error.cause.present? ? network_error?(error.cause) : false
         | 
| 25 | 
            +
                  end
         | 
| 26 | 
            +
                end
         | 
| 27 | 
            +
             | 
| 28 | 
            +
                enable_simple_cache
         | 
| 29 | 
            +
                common_constructor :address
         | 
| 30 | 
            +
             | 
| 31 | 
            +
                def successful?
         | 
| 32 | 
            +
                  error.blank?
         | 
| 33 | 
            +
                end
         | 
| 34 | 
            +
             | 
| 35 | 
            +
                def rescuable_error?
         | 
| 36 | 
            +
                  self.class.rescuable_error?(error)
         | 
| 37 | 
            +
                end
         | 
| 38 | 
            +
             | 
| 39 | 
            +
                private
         | 
| 40 | 
            +
             | 
| 41 | 
            +
                def error_uncached
         | 
| 42 | 
            +
                  address.process
         | 
| 43 | 
            +
                  nil
         | 
| 44 | 
            +
                rescue ::StandardError => e
         | 
| 45 | 
            +
                  e
         | 
| 46 | 
            +
                end
         | 
| 47 | 
            +
              end
         | 
| 48 | 
            +
            end
         | 
    
        data/lib/aranha/default_processor.rb
    CHANGED
    
    | @@ -1,11 +1,10 @@ | |
| 1 1 | 
             
            # frozen_string_literal: true
         | 
| 2 2 |  | 
| 3 3 | 
             
            require 'addressable'
         | 
| 4 | 
            +
            require 'eac_ruby_utils/core_ext'
         | 
| 4 5 |  | 
| 5 6 | 
             
            module Aranha
         | 
| 6 7 | 
             
              class DefaultProcessor
         | 
| 7 | 
            -
                attr_reader :source_uri, :extra_data
         | 
| 8 | 
            -
             | 
| 9 8 | 
             
                class << self
         | 
| 10 9 | 
             
                  def sanitize_uri(uri)
         | 
| 11 10 | 
             
                    return uri if uri.is_a?(Hash)
         | 
| @@ -15,17 +14,14 @@ module Aranha | |
| 15 14 | 
             
                  end
         | 
| 16 15 | 
             
                end
         | 
| 17 16 |  | 
| 18 | 
            -
                 | 
| 19 | 
            -
                   | 
| 20 | 
            -
                  @extra_data = extra_data
         | 
| 17 | 
            +
                common_constructor :source_uri, :extra_data do
         | 
| 18 | 
            +
                  self.source_uri = self.class.sanitize_uri(source_uri)
         | 
| 21 19 | 
             
                end
         | 
| 22 20 |  | 
| 23 21 | 
             
                def process
         | 
| 24 22 | 
             
                  raise 'Implement method process'
         | 
| 25 23 | 
             
                end
         | 
| 26 24 |  | 
| 27 | 
            -
                protected
         | 
| 28 | 
            -
             | 
| 29 25 | 
             
                def target_uri
         | 
| 30 26 | 
             
                  source_uri
         | 
| 31 27 | 
             
                end
         | 
    
        data/lib/aranha/processor.rb
    CHANGED
    
    | @@ -7,19 +7,6 @@ require 'aranha/manager' | |
| 7 7 |  | 
| 8 8 | 
             
            module Aranha
         | 
| 9 9 | 
             
              class Processor
         | 
| 10 | 
            -
                ARANHA_EXCEPTIONS = [::Aranha::Parsers::InvalidStateException].freeze
         | 
| 11 | 
            -
                CORE_EXCEPTIONS = [::SocketError].freeze
         | 
| 12 | 
            -
                ERRNO_EXCEPTIONS = [Errno::ECONNREFUSED, ::Errno::ECONNRESET].freeze
         | 
| 13 | 
            -
                HTTPCLIENT_EXCEPTIONS = [
         | 
| 14 | 
            -
                  ::HTTPClient::BadResponseError,
         | 
| 15 | 
            -
                  ::HTTPClient::ConnectTimeoutError,
         | 
| 16 | 
            -
                  ::HTTPClient::ReceiveTimeoutError
         | 
| 17 | 
            -
                ].freeze
         | 
| 18 | 
            -
                NET_EXCEPTIONS = [::Net::HTTPFatalError, ::Net::HTTPServerException, ::Net::OpenTimeout].freeze
         | 
| 19 | 
            -
             | 
| 20 | 
            -
                NETWORK_EXCEPTIONS = ARANHA_EXCEPTIONS + CORE_EXCEPTIONS + ERRNO_EXCEPTIONS +
         | 
| 21 | 
            -
                                     HTTPCLIENT_EXCEPTIONS + NET_EXCEPTIONS
         | 
| 22 | 
            -
             | 
| 23 10 | 
             
                DEFAULT_MAX_TRIES = 3
         | 
| 24 11 |  | 
| 25 12 | 
             
                attr_reader :manager
         | 
| @@ -58,20 +45,20 @@ module Aranha | |
| 58 45 | 
             
                def process_address(address)
         | 
| 59 46 | 
             
                  manager.log_info("Processing #{address} (Try: #{@try}/#{max_tries_s}," \
         | 
| 60 47 | 
             
                      " Unprocessed: #{unprocessed.count}/#{::Aranha::Manager.default.addresses_count})")
         | 
| 61 | 
            -
                   | 
| 62 | 
            -
             | 
| 63 | 
            -
                    @failed.delete(address.id)
         | 
| 64 | 
            -
                   | 
| 65 | 
            -
                    process_exception( | 
| 48 | 
            +
                  ap = ::Aranha::AddressProcessor.new(address)
         | 
| 49 | 
            +
                  if ap.successful?
         | 
| 50 | 
            +
                    @failed.delete(ap.address.id)
         | 
| 51 | 
            +
                  else
         | 
| 52 | 
            +
                    process_exception(ap)
         | 
| 66 53 | 
             
                  end
         | 
| 67 54 | 
             
                end
         | 
| 68 55 |  | 
| 69 | 
            -
                def process_exception( | 
| 70 | 
            -
                  raise  | 
| 56 | 
            +
                def process_exception(address_processor)
         | 
| 57 | 
            +
                  raise address_processor.error unless address_processor.rescuable_error?
         | 
| 71 58 |  | 
| 72 | 
            -
                  @failed[address.id] ||= 0
         | 
| 73 | 
            -
                  @failed[address.id] += 1
         | 
| 74 | 
            -
                  manager.log_warn( | 
| 59 | 
            +
                  @failed[address_processor.address.id] ||= 0
         | 
| 60 | 
            +
                  @failed[address_processor.address.id] += 1
         | 
| 61 | 
            +
                  manager.log_warn(address_processor.error)
         | 
| 75 62 | 
             
                end
         | 
| 76 63 |  | 
| 77 64 | 
             
                def next_address
         | 
| @@ -82,12 +69,6 @@ module Aranha | |
| 82 69 | 
             
                  ::Aranha::Manager.default.unprocessed_addresses
         | 
| 83 70 | 
             
                end
         | 
| 84 71 |  | 
| 85 | 
            -
                def network_exception?(exception)
         | 
| 86 | 
            -
                  return true if NETWORK_EXCEPTIONS.any? { |klass| exception.is_a?(klass) }
         | 
| 87 | 
            -
             | 
| 88 | 
            -
                  exception.cause.present? ? network_exception?(exception.cause) : false
         | 
| 89 | 
            -
                end
         | 
| 90 | 
            -
             | 
| 91 72 | 
             
                def not_try_ids
         | 
| 92 73 | 
             
                  @failed.select { |_k, v| v > @try }.map { |k, _v| k }
         | 
| 93 74 | 
             
                end
         | 
    
        data/lib/aranha/version.rb
    CHANGED
    
    
    
        metadata
    CHANGED
    
    | @@ -1,14 +1,14 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: aranha
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 0.15.0
         | 
| 4 | 
            +
              version: 0.16.1
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors:
         | 
| 7 7 | 
             
            - Eduardo H. Bogoni
         | 
| 8 8 | 
             
            autorequire: 
         | 
| 9 9 | 
             
            bindir: bin
         | 
| 10 10 | 
             
            cert_chain: []
         | 
| 11 | 
            -
            date:  | 
| 11 | 
            +
            date: 2021-08-08 00:00:00.000000000 Z
         | 
| 12 12 | 
             
            dependencies:
         | 
| 13 13 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 14 14 | 
             
              name: aranha-parsers
         | 
| @@ -50,14 +50,14 @@ dependencies: | |
| 50 50 | 
             
                requirements:
         | 
| 51 51 | 
             
                - - "~>"
         | 
| 52 52 | 
             
                  - !ruby/object:Gem::Version
         | 
| 53 | 
            -
                    version: '0. | 
| 53 | 
            +
                    version: '0.72'
         | 
| 54 54 | 
             
              type: :runtime
         | 
| 55 55 | 
             
              prerelease: false
         | 
| 56 56 | 
             
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 57 57 | 
             
                requirements:
         | 
| 58 58 | 
             
                - - "~>"
         | 
| 59 59 | 
             
                  - !ruby/object:Gem::Version
         | 
| 60 | 
            -
                    version: '0. | 
| 60 | 
            +
                    version: '0.72'
         | 
| 61 61 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 62 62 | 
             
              name: httpclient
         | 
| 63 63 | 
             
              requirement: !ruby/object:Gem::Requirement
         | 
| @@ -78,14 +78,20 @@ dependencies: | |
| 78 78 | 
             
                requirements:
         | 
| 79 79 | 
             
                - - "~>"
         | 
| 80 80 | 
             
                  - !ruby/object:Gem::Version
         | 
| 81 | 
            -
                    version: '0. | 
| 81 | 
            +
                    version: '0.3'
         | 
| 82 | 
            +
                - - ">="
         | 
| 83 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 84 | 
            +
                    version: 0.3.1
         | 
| 82 85 | 
             
              type: :development
         | 
| 83 86 | 
             
              prerelease: false
         | 
| 84 87 | 
             
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 85 88 | 
             
                requirements:
         | 
| 86 89 | 
             
                - - "~>"
         | 
| 87 90 | 
             
                  - !ruby/object:Gem::Version
         | 
| 88 | 
            -
                    version: '0. | 
| 91 | 
            +
                    version: '0.3'
         | 
| 92 | 
            +
                - - ">="
         | 
| 93 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 94 | 
            +
                    version: 0.3.1
         | 
| 89 95 | 
             
            description: 
         | 
| 90 96 | 
             
            email:
         | 
| 91 97 | 
             
            - eduardobogoni@gmail.com
         | 
| @@ -96,6 +102,7 @@ files: | |
| 96 102 | 
             
            - MIT-LICENSE
         | 
| 97 103 | 
             
            - README.rdoc
         | 
| 98 104 | 
             
            - lib/aranha.rb
         | 
| 105 | 
            +
            - lib/aranha/address_processor.rb
         | 
| 99 106 | 
             
            - lib/aranha/default_processor.rb
         | 
| 100 107 | 
             
            - lib/aranha/manager.rb
         | 
| 101 108 | 
             
            - lib/aranha/processor.rb
         | 
| @@ -119,7 +126,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement | |
| 119 126 | 
             
                - !ruby/object:Gem::Version
         | 
| 120 127 | 
             
                  version: '0'
         | 
| 121 128 | 
             
            requirements: []
         | 
| 122 | 
            -
            rubygems_version: 3. | 
| 129 | 
            +
            rubygems_version: 3.1.6
         | 
| 123 130 | 
             
            signing_key: 
         | 
| 124 131 | 
             
            specification_version: 4
         | 
| 125 132 | 
             
            summary: Ruby utilities for web crawling.
         |