aranha-parsers 0.17.0 → 0.18.0
This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/aranha/parsers/base.rb +10 -5
- data/lib/aranha/parsers/html/base.rb +14 -0
- data/lib/aranha/parsers/html/node/default/string_support.rb +6 -6
- data/lib/aranha/parsers/html/node/default.rb +16 -2
- data/lib/aranha/parsers/source_address/hash_http_base.rb +13 -22
- data/lib/aranha/parsers/source_address/http_get.rb +6 -12
- data/lib/aranha/parsers/version.rb +1 -1
- metadata +14 -56
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA256:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: cb5cd7c64c21a8805a583f01c75efd58268003a8c8cc695ef809f938e79dc3ed
         | 
| 4 | 
            +
              data.tar.gz: '078397e90586fe403b39dc8821e30f36efcf86fa55d31058bd94d18a42cb8eb8'
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: d017b16cb135ad2968fb35e83c086d3e5bdfbc59a8be5281303f26d8fbdbdff6697d9d1f029d949c69bb6341ae7eb25509d924e82a99c26e0391df7322a6a095
         | 
| 7 | 
            +
              data.tar.gz: ada16700b30a9456f1140debf88f17020699d27ecc1dc6bcafbac43a1569d3bfe19415109b6f74688214bb94033dcb14ec2fc251fb3ae8060f5a45e62f8902c0
         | 
    
        data/lib/aranha/parsers/base.rb
    CHANGED
    
    | @@ -9,20 +9,25 @@ module Aranha | |
| 9 9 | 
             
              module Parsers
         | 
| 10 10 | 
             
                class Base
         | 
| 11 11 | 
             
                  class << self
         | 
| 12 | 
            +
                    # @deprecated Use {#from_string} instead.
         | 
| 13 | 
            +
                    # @param content [String]
         | 
| 14 | 
            +
                    # @return [Aranha::Parsers::Base]
         | 
| 12 15 | 
             
                    def from_content(content)
         | 
| 16 | 
            +
                      from_string(content)
         | 
| 17 | 
            +
                    end
         | 
| 18 | 
            +
             | 
| 19 | 
            +
                    # @param string [String]
         | 
| 20 | 
            +
                    # @return [Aranha::Parsers::Base]
         | 
| 21 | 
            +
                    def from_string(string)
         | 
| 13 22 | 
             
                      ::EacRubyUtils::Fs::Temp.on_file do |path|
         | 
| 14 23 | 
             
                        ::File.open(path.to_s, 'w:UTF-8') do |f|
         | 
| 15 | 
            -
                          f.write  | 
| 24 | 
            +
                          f.write string.dup.force_encoding('UTF-8')
         | 
| 16 25 | 
             
                        end
         | 
| 17 26 | 
             
                        r = new(path.to_path)
         | 
| 18 27 | 
             
                        r.content
         | 
| 19 28 | 
             
                        r
         | 
| 20 29 | 
             
                      end
         | 
| 21 30 | 
             
                    end
         | 
| 22 | 
            -
             | 
| 23 | 
            -
                    def parse_content(content)
         | 
| 24 | 
            -
                      from_content(content).data
         | 
| 25 | 
            -
                    end
         | 
| 26 31 | 
             
                  end
         | 
| 27 32 |  | 
| 28 33 | 
             
                  LOG_DIR_ENVVAR = 'ARANHA_PARSERS_LOG_DIR'
         | 
| @@ -19,6 +19,20 @@ module Aranha | |
| 19 19 | 
             
                        @fields << Field.new(name, type, xpath)
         | 
| 20 20 | 
             
                      end
         | 
| 21 21 |  | 
| 22 | 
            +
                      # @param node [Nokogiri::XML::Node]
         | 
| 23 | 
            +
                      # @return [Aranha::Parsers::Html::Base]
         | 
| 24 | 
            +
                      def from_node(node)
         | 
| 25 | 
            +
                        from_string(node.to_html)
         | 
| 26 | 
            +
                      end
         | 
| 27 | 
            +
             | 
| 28 | 
            +
                      # @param haystack [String]
         | 
| 29 | 
            +
                      # @param needle [String]
         | 
| 30 | 
            +
                      # @return [String]
         | 
| 31 | 
            +
                      def xpath_ends_with(haystack, needle)
         | 
| 32 | 
            +
                        "substring(#{haystack}, string-length(#{haystack}) - string-length(#{needle}) + 1) " \
         | 
| 33 | 
            +
                          "= #{needle}"
         | 
| 34 | 
            +
                      end
         | 
| 35 | 
            +
             | 
| 22 36 | 
             
                      Field = Struct.new(:name, :type, :xpath)
         | 
| 23 37 | 
             
                    end
         | 
| 24 38 |  | 
| @@ -27,16 +27,16 @@ module Aranha | |
| 27 27 | 
             
                          raise "Pattern \"#{pattern}\" not found in string \"#{s}\""
         | 
| 28 28 | 
             
                        end
         | 
| 29 29 |  | 
| 30 | 
            +
                        # @param node [Nokogiri::XML::Node]
         | 
| 31 | 
            +
                        # @param xpath [String]
         | 
| 32 | 
            +
                        # @return [String]
         | 
| 30 33 | 
             
                        def string_value(node, xpath)
         | 
| 31 | 
            -
                           | 
| 32 | 
            -
             | 
| 33 | 
            -
                          else
         | 
| 34 | 
            -
                            ''
         | 
| 35 | 
            -
                          end
         | 
| 34 | 
            +
                          found = node_value(node, xpath)
         | 
| 35 | 
            +
                          found ? sanitize_string(found.text) : ''
         | 
| 36 36 | 
             
                        end
         | 
| 37 37 |  | 
| 38 38 | 
             
                        def string_recursive_value(node, xpath, required = true)
         | 
| 39 | 
            -
                          root = node | 
| 39 | 
            +
                          root = node_value(node, xpath)
         | 
| 40 40 | 
             
                          if root.blank?
         | 
| 41 41 | 
             
                            return nil unless required
         | 
| 42 42 |  | 
| @@ -11,13 +11,13 @@ module Aranha | |
| 11 11 | 
             
                      require_sub __FILE__, include_modules: true
         | 
| 12 12 |  | 
| 13 13 | 
             
                      def array_value(node, xpath)
         | 
| 14 | 
            -
                        r = node | 
| 14 | 
            +
                        r = node_set_value(node, xpath).map { |n| n.text.strip }
         | 
| 15 15 | 
             
                        r.join('|')
         | 
| 16 16 | 
             
                      end
         | 
| 17 17 |  | 
| 18 18 | 
             
                      def join_value(node, xpath)
         | 
| 19 19 | 
             
                        m = ''
         | 
| 20 | 
            -
                        node | 
| 20 | 
            +
                        node_set_value(node, xpath).each do |n|
         | 
| 21 21 | 
             
                          m << n.text.strip
         | 
| 22 22 | 
             
                        end
         | 
| 23 23 | 
             
                        m
         | 
| @@ -27,6 +27,20 @@ module Aranha | |
| 27 27 | 
             
                        m = /(\d+) m/.match(join_value(node, xpath))
         | 
| 28 28 | 
             
                        m ? m[1].to_i : nil
         | 
| 29 29 | 
             
                      end
         | 
| 30 | 
            +
             | 
| 31 | 
            +
                      # @param node [Nokogiri::XML::Node]
         | 
| 32 | 
            +
                      # @param xpath [String]
         | 
| 33 | 
            +
                      # @return [Nokogiri::XML::NodeSet]
         | 
| 34 | 
            +
                      def node_set_value(node, xpath)
         | 
| 35 | 
            +
                        node.xpath(xpath)
         | 
| 36 | 
            +
                      end
         | 
| 37 | 
            +
             | 
| 38 | 
            +
                      # @param node [Nokogiri::XML::Node]
         | 
| 39 | 
            +
                      # @param xpath [String]
         | 
| 40 | 
            +
                      # @return [Nokogiri::XML::Node]
         | 
| 41 | 
            +
                      def node_value(node, xpath)
         | 
| 42 | 
            +
                        node.at_xpath(xpath)
         | 
| 43 | 
            +
                      end
         | 
| 30 44 | 
             
                    end
         | 
| 31 45 | 
             
                  end
         | 
| 32 46 | 
             
                end
         | 
| @@ -2,10 +2,9 @@ | |
| 2 2 |  | 
| 3 3 | 
             
            require 'aranha/parsers/source_address/fetch_content_error'
         | 
| 4 4 | 
             
            require 'aranha/parsers/source_address/hash_http_base'
         | 
| 5 | 
            +
            require 'eac_envs/http/error'
         | 
| 6 | 
            +
            require 'eac_envs/http/request'
         | 
| 5 7 | 
             
            require 'eac_ruby_utils/core_ext'
         | 
| 6 | 
            -
            require 'faraday'
         | 
| 7 | 
            -
            require 'faraday/follow_redirects'
         | 
| 8 | 
            -
            require 'faraday/gzip'
         | 
| 9 8 | 
             
            require 'yaml'
         | 
| 10 9 |  | 
| 11 10 | 
             
            module Aranha
         | 
| @@ -55,21 +54,11 @@ module Aranha | |
| 55 54 | 
             
                      source.to_yaml
         | 
| 56 55 | 
             
                    end
         | 
| 57 56 |  | 
| 58 | 
            -
                    # @return [Faraday]
         | 
| 59 | 
            -
                    def faraday_connection
         | 
| 60 | 
            -
                      ::Faraday.new do |f|
         | 
| 61 | 
            -
                        f.request :gzip
         | 
| 62 | 
            -
                        f.response :follow_redirects if follow_redirect?
         | 
| 63 | 
            -
                      end
         | 
| 64 | 
            -
                    end
         | 
| 65 | 
            -
             | 
| 66 57 | 
             
                    def content
         | 
| 67 | 
            -
                       | 
| 68 | 
            -
                       | 
| 69 | 
            -
             | 
| 70 | 
            -
                      raise ::Aranha::Parsers::SourceAddress::FetchContentError. | 
| 71 | 
            -
                        "Get #{url} returned #{req.status.to_i}", req
         | 
| 72 | 
            -
                      )
         | 
| 58 | 
            +
                      request = http_request
         | 
| 59 | 
            +
                      request.response.body_str
         | 
| 60 | 
            +
                    rescue ::EacEnvs::Http::Error => e
         | 
| 61 | 
            +
                      raise ::Aranha::Parsers::SourceAddress::FetchContentError, e.message, request
         | 
| 73 62 | 
             
                    end
         | 
| 74 63 |  | 
| 75 64 | 
             
                    def param(key, default_value)
         | 
| @@ -82,11 +71,13 @@ module Aranha | |
| 82 71 |  | 
| 83 72 | 
             
                    private
         | 
| 84 73 |  | 
| 85 | 
            -
                     | 
| 86 | 
            -
             | 
| 87 | 
            -
             | 
| 88 | 
            -
             | 
| 89 | 
            -
                       | 
| 74 | 
            +
                    # @return [EacEnvs::Http::Request]
         | 
| 75 | 
            +
                    def http_request
         | 
| 76 | 
            +
                      r = ::EacEnvs::Http::Request.new.verb(self.class.http_method).url(url)
         | 
| 77 | 
            +
                      r = headers.if_present(r) { |v| r.headers(v) }
         | 
| 78 | 
            +
                      r = body.if_present(r) { |v| r.body(v) }
         | 
| 79 | 
            +
                      r = r.follow_redirect(true) if follow_redirect?
         | 
| 80 | 
            +
                      r
         | 
| 90 81 | 
             
                    end
         | 
| 91 82 | 
             
                  end
         | 
| 92 83 | 
             
                end
         | 
| @@ -2,8 +2,8 @@ | |
| 2 2 |  | 
| 3 3 | 
             
            require 'addressable'
         | 
| 4 4 | 
             
            require 'aranha/parsers/source_address/fetch_content_error'
         | 
| 5 | 
            -
            require ' | 
| 6 | 
            -
            require ' | 
| 5 | 
            +
            require 'eac_envs/http/error'
         | 
| 6 | 
            +
            require 'eac_envs/http/request'
         | 
| 7 7 |  | 
| 8 8 | 
             
            module Aranha
         | 
| 9 9 | 
             
              module Parsers
         | 
| @@ -39,16 +39,10 @@ module Aranha | |
| 39 39 | 
             
                    end
         | 
| 40 40 |  | 
| 41 41 | 
             
                    def content
         | 
| 42 | 
            -
                       | 
| 43 | 
            -
             | 
| 44 | 
            -
             | 
| 45 | 
            -
                       | 
| 46 | 
            -
                      c = conn.get(url)
         | 
| 47 | 
            -
                      return c.body if c.status == 200
         | 
| 48 | 
            -
             | 
| 49 | 
            -
                      raise ::Aranha::Parsers::SourceAddress::FetchContentError.new(
         | 
| 50 | 
            -
                        "Get #{url} returned #{c.status.to_i}", c
         | 
| 51 | 
            -
                      )
         | 
| 42 | 
            +
                      request = ::EacEnvs::Http::Request.new.url(url).retry(true).follow_redirect(true)
         | 
| 43 | 
            +
                      request.response.body_str
         | 
| 44 | 
            +
                    rescue ::EacEnvs::Http::Error => e
         | 
| 45 | 
            +
                      raise ::Aranha::Parsers::SourceAddress::FetchContentError, e.message, request
         | 
| 52 46 | 
             
                    end
         | 
| 53 47 |  | 
| 54 48 | 
             
                    def serialize
         | 
    
        metadata
    CHANGED
    
    | @@ -1,14 +1,14 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: aranha-parsers
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 0. | 
| 4 | 
            +
              version: 0.18.0
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors:
         | 
| 7 7 | 
             
            - Esquilo Azul Company
         | 
| 8 8 | 
             
            autorequire: 
         | 
| 9 9 | 
             
            bindir: bin
         | 
| 10 10 | 
             
            cert_chain: []
         | 
| 11 | 
            -
            date: 2023- | 
| 11 | 
            +
            date: 2023-05-13 00:00:00.000000000 Z
         | 
| 12 12 | 
             
            dependencies:
         | 
| 13 13 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 14 14 | 
             
              name: activesupport
         | 
| @@ -33,7 +33,7 @@ dependencies: | |
| 33 33 | 
             
                    version: '2.8'
         | 
| 34 34 | 
             
                - - ">="
         | 
| 35 35 | 
             
                  - !ruby/object:Gem::Version
         | 
| 36 | 
            -
                    version: 2.8. | 
| 36 | 
            +
                    version: 2.8.4
         | 
| 37 37 | 
             
              type: :runtime
         | 
| 38 38 | 
             
              prerelease: false
         | 
| 39 39 | 
             
              version_requirements: !ruby/object:Gem::Requirement
         | 
| @@ -43,48 +43,17 @@ dependencies: | |
| 43 43 | 
             
                    version: '2.8'
         | 
| 44 44 | 
             
                - - ">="
         | 
| 45 45 | 
             
                  - !ruby/object:Gem::Version
         | 
| 46 | 
            -
                    version: 2.8. | 
| 46 | 
            +
                    version: 2.8.4
         | 
| 47 47 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 48 | 
            -
              name:  | 
| 49 | 
            -
              requirement: !ruby/object:Gem::Requirement
         | 
| 50 | 
            -
                requirements:
         | 
| 51 | 
            -
                - - "~>"
         | 
| 52 | 
            -
                  - !ruby/object:Gem::Version
         | 
| 53 | 
            -
                    version: '0.112'
         | 
| 54 | 
            -
              type: :runtime
         | 
| 55 | 
            -
              prerelease: false
         | 
| 56 | 
            -
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 57 | 
            -
                requirements:
         | 
| 58 | 
            -
                - - "~>"
         | 
| 59 | 
            -
                  - !ruby/object:Gem::Version
         | 
| 60 | 
            -
                    version: '0.112'
         | 
| 61 | 
            -
            - !ruby/object:Gem::Dependency
         | 
| 62 | 
            -
              name: faraday
         | 
| 48 | 
            +
              name: eac_envs-http
         | 
| 63 49 | 
             
              requirement: !ruby/object:Gem::Requirement
         | 
| 64 50 | 
             
                requirements:
         | 
| 65 51 | 
             
                - - "~>"
         | 
| 66 52 | 
             
                  - !ruby/object:Gem::Version
         | 
| 67 | 
            -
                    version: ' | 
| 68 | 
            -
                - - ">="
         | 
| 69 | 
            -
                  - !ruby/object:Gem::Version
         | 
| 70 | 
            -
                    version: 2.7.4
         | 
| 71 | 
            -
              type: :runtime
         | 
| 72 | 
            -
              prerelease: false
         | 
| 73 | 
            -
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 74 | 
            -
                requirements:
         | 
| 75 | 
            -
                - - "~>"
         | 
| 76 | 
            -
                  - !ruby/object:Gem::Version
         | 
| 77 | 
            -
                    version: '2.7'
         | 
| 53 | 
            +
                    version: '0.3'
         | 
| 78 54 | 
             
                - - ">="
         | 
| 79 55 | 
             
                  - !ruby/object:Gem::Version
         | 
| 80 | 
            -
                    version:  | 
| 81 | 
            -
            - !ruby/object:Gem::Dependency
         | 
| 82 | 
            -
              name: faraday-follow_redirects
         | 
| 83 | 
            -
              requirement: !ruby/object:Gem::Requirement
         | 
| 84 | 
            -
                requirements:
         | 
| 85 | 
            -
                - - "~>"
         | 
| 86 | 
            -
                  - !ruby/object:Gem::Version
         | 
| 87 | 
            -
                    version: '0.3'
         | 
| 56 | 
            +
                    version: 0.3.1
         | 
| 88 57 | 
             
              type: :runtime
         | 
| 89 58 | 
             
              prerelease: false
         | 
| 90 59 | 
             
              version_requirements: !ruby/object:Gem::Requirement
         | 
| @@ -92,34 +61,23 @@ dependencies: | |
| 92 61 | 
             
                - - "~>"
         | 
| 93 62 | 
             
                  - !ruby/object:Gem::Version
         | 
| 94 63 | 
             
                    version: '0.3'
         | 
| 95 | 
            -
            -  | 
| 96 | 
            -
              name: faraday-gzip
         | 
| 97 | 
            -
              requirement: !ruby/object:Gem::Requirement
         | 
| 98 | 
            -
                requirements:
         | 
| 99 | 
            -
                - - "~>"
         | 
| 100 | 
            -
                  - !ruby/object:Gem::Version
         | 
| 101 | 
            -
                    version: '0.1'
         | 
| 102 | 
            -
              type: :runtime
         | 
| 103 | 
            -
              prerelease: false
         | 
| 104 | 
            -
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 105 | 
            -
                requirements:
         | 
| 106 | 
            -
                - - "~>"
         | 
| 64 | 
            +
                - - ">="
         | 
| 107 65 | 
             
                  - !ruby/object:Gem::Version
         | 
| 108 | 
            -
                    version:  | 
| 66 | 
            +
                    version: 0.3.1
         | 
| 109 67 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 110 | 
            -
              name:  | 
| 68 | 
            +
              name: eac_ruby_utils
         | 
| 111 69 | 
             
              requirement: !ruby/object:Gem::Requirement
         | 
| 112 70 | 
             
                requirements:
         | 
| 113 71 | 
             
                - - "~>"
         | 
| 114 72 | 
             
                  - !ruby/object:Gem::Version
         | 
| 115 | 
            -
                    version: ' | 
| 73 | 
            +
                    version: '0.116'
         | 
| 116 74 | 
             
              type: :runtime
         | 
| 117 75 | 
             
              prerelease: false
         | 
| 118 76 | 
             
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 119 77 | 
             
                requirements:
         | 
| 120 78 | 
             
                - - "~>"
         | 
| 121 79 | 
             
                  - !ruby/object:Gem::Version
         | 
| 122 | 
            -
                    version: ' | 
| 80 | 
            +
                    version: '0.116'
         | 
| 123 81 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 124 82 | 
             
              name: nokogiri
         | 
| 125 83 | 
             
              requirement: !ruby/object:Gem::Requirement
         | 
| @@ -129,7 +87,7 @@ dependencies: | |
| 129 87 | 
             
                    version: '1.14'
         | 
| 130 88 | 
             
                - - ">="
         | 
| 131 89 | 
             
                  - !ruby/object:Gem::Version
         | 
| 132 | 
            -
                    version: 1.14. | 
| 90 | 
            +
                    version: 1.14.4
         | 
| 133 91 | 
             
              type: :runtime
         | 
| 134 92 | 
             
              prerelease: false
         | 
| 135 93 | 
             
              version_requirements: !ruby/object:Gem::Requirement
         | 
| @@ -139,7 +97,7 @@ dependencies: | |
| 139 97 | 
             
                    version: '1.14'
         | 
| 140 98 | 
             
                - - ">="
         | 
| 141 99 | 
             
                  - !ruby/object:Gem::Version
         | 
| 142 | 
            -
                    version: 1.14. | 
| 100 | 
            +
                    version: 1.14.4
         | 
| 143 101 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 144 102 | 
             
              name: ofx-parser
         | 
| 145 103 | 
             
              requirement: !ruby/object:Gem::Requirement
         |