dh_easy-core 0.2.2 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/doc/DhEasy.html +6 -6
- data/doc/DhEasy/Core.html +39 -40
- data/doc/DhEasy/Core/Config.html +6 -6
- data/doc/DhEasy/Core/Exception.html +6 -6
- data/doc/DhEasy/Core/Exception/OutdatedError.html +6 -6
- data/doc/DhEasy/Core/Helper.html +6 -6
- data/doc/DhEasy/Core/Helper/Cookie.html +6 -6
- data/doc/DhEasy/Core/Mock.html +6 -6
- data/doc/DhEasy/Core/Mock/FakeDb.html +963 -400
- data/doc/DhEasy/Core/Mock/FakeExecutor.html +26 -37
- data/doc/DhEasy/Core/Mock/FakeFinisher.html +6 -6
- data/doc/DhEasy/Core/Mock/FakeParser.html +6 -6
- data/doc/DhEasy/Core/Mock/FakeSeeder.html +6 -6
- data/doc/DhEasy/Core/Plugin.html +6 -6
- data/doc/DhEasy/Core/Plugin/CollectionVault.html +6 -6
- data/doc/DhEasy/Core/Plugin/ConfigBehavior.html +7 -7
- data/doc/DhEasy/Core/Plugin/ContextIntegrator.html +6 -6
- data/doc/DhEasy/Core/Plugin/Executor.html +6 -6
- data/doc/DhEasy/Core/Plugin/ExecutorBehavior.html +6 -6
- data/doc/DhEasy/Core/Plugin/Finisher.html +6 -6
- data/doc/DhEasy/Core/Plugin/FinisherBehavior.html +6 -6
- data/doc/DhEasy/Core/Plugin/InitializeHook.html +6 -6
- data/doc/DhEasy/Core/Plugin/Parser.html +6 -6
- data/doc/DhEasy/Core/Plugin/ParserBehavior.html +6 -6
- data/doc/DhEasy/Core/Plugin/Seeder.html +6 -6
- data/doc/DhEasy/Core/Plugin/SeederBehavior.html +6 -6
- data/doc/DhEasy/Core/SmartCollection.html +6 -6
- data/doc/_index.html +7 -7
- data/doc/class_list.html +2 -2
- data/doc/css/style.css +2 -2
- data/doc/file.README.html +9 -16
- data/doc/file_list.html +2 -2
- data/doc/frames.html +2 -2
- data/doc/index.html +9 -16
- data/doc/js/app.js +14 -3
- data/doc/method_list.html +80 -48
- data/doc/top-level-namespace.html +6 -6
- data/lib/dh_easy/core.rb +2 -1
- data/lib/dh_easy/core/mock/fake_db.rb +228 -35
- data/lib/dh_easy/core/mock/fake_executor.rb +0 -1
- data/lib/dh_easy/core/version.rb +1 -1
- metadata +3 -4
| @@ -18,6 +18,12 @@ module DhEasy | |
| 18 18 | 
             
                    }
         | 
| 19 19 | 
             
                    # Default collection for saved outputs
         | 
| 20 20 | 
             
                    DEFAULT_COLLECTION = 'default'
         | 
| 21 | 
            +
                    # Default page's fetch type
         | 
| 22 | 
            +
                    DEFAULT_FETCH_TYPE = 'standard'
         | 
| 23 | 
            +
                    # Default uuid algorithm
         | 
| 24 | 
            +
                    DEFAULT_UUID_ALGORITHM = :md5
         | 
| 25 | 
            +
                    # Valid uuid algorithms
         | 
| 26 | 
            +
                    VALID_UUID_ALGORITHMS = [:md5, :sha1, :sha256]
         | 
| 21 27 |  | 
| 22 28 | 
             
                    # Generate a smart collection with keys and initial values.
         | 
| 23 29 | 
             
                    #
         | 
| @@ -32,21 +38,31 @@ module DhEasy | |
| 32 38 | 
             
                    # Generate a fake UUID.
         | 
| 33 39 | 
             
                    #
         | 
| 34 40 | 
             
                    # @param seed (nil) Object to use as seed for uuid.
         | 
| 41 | 
            +
                    # @param [Symbol] algorithm (nil) Algorithm to use: :md5 (default), :sha1, :sha256.
         | 
| 35 42 | 
             
                    #
         | 
| 36 43 | 
             
                    # @return [String]
         | 
| 37 | 
            -
                    def self.fake_uuid seed = nil
         | 
| 44 | 
            +
                    def self.fake_uuid seed = nil, algorithm = nil
         | 
| 38 45 | 
             
                      seed ||= (Time.new.to_f + rand)
         | 
| 39 | 
            -
                       | 
| 46 | 
            +
                      algorithm ||= DEFAULT_UUID_ALGORITHM
         | 
| 47 | 
            +
                      case algorithm
         | 
| 48 | 
            +
                      when :sha256
         | 
| 49 | 
            +
                        Digest::SHA256.hexdigest seed.to_s
         | 
| 50 | 
            +
                      when :sha1
         | 
| 51 | 
            +
                        Digest::SHA1.hexdigest seed.to_s
         | 
| 52 | 
            +
                      else
         | 
| 53 | 
            +
                        Digest::MD5.hexdigest seed.to_s
         | 
| 54 | 
            +
                      end
         | 
| 40 55 | 
             
                    end
         | 
| 41 56 |  | 
| 42 57 | 
             
                    # Generate a fake UUID based on output fields without `_` prefix.
         | 
| 43 58 | 
             
                    #
         | 
| 44 59 | 
             
                    # @param [Hash] data Output data.
         | 
| 60 | 
            +
                    # @param [Symbol] uuid_algorithm (nil) Algorithm to use: :md5 (default), :sha1, :sha256.
         | 
| 45 61 | 
             
                    #
         | 
| 46 62 | 
             
                    # @return [String]
         | 
| 47 | 
            -
                    def self.output_uuid data
         | 
| 63 | 
            +
                    def self.output_uuid data, uuid_algorithm = nil
         | 
| 48 64 | 
             
                      seed = data.select{|k,v|k.to_s =~ /^[^_]/}.hash
         | 
| 49 | 
            -
                      fake_uuid seed
         | 
| 65 | 
            +
                      fake_uuid seed, uuid_algorithm
         | 
| 50 66 | 
             
                    end
         | 
| 51 67 |  | 
| 52 68 | 
             
                    # Build a page with defaults by using FakeDb engine.
         | 
| @@ -83,8 +99,8 @@ module DhEasy | |
| 83 99 | 
             
                    #
         | 
| 84 100 | 
             
                    # @param [String] raw_url URL to clean.
         | 
| 85 101 | 
             
                    #
         | 
| 86 | 
            -
                    # @return [ | 
| 87 | 
            -
                    def self. | 
| 102 | 
            +
                    # @return [URI::Generic] Parsed URI, e.g. URI::HTTP or URI::HTTPS.
         | 
| 103 | 
            +
                    def self.clean_uri_obj raw_url
         | 
| 88 104 | 
             
                      url = URI.parse(raw_url)
         | 
| 89 105 | 
             
                      url.hostname = url.hostname.downcase
         | 
| 90 106 | 
             
                      url.fragment = nil
         | 
| @@ -101,7 +117,17 @@ module DhEasy | |
| 101 117 | 
             
                        end
         | 
| 102 118 | 
             
                        url.query = data.join('&')
         | 
| 103 119 | 
             
                      end
         | 
| 104 | 
            -
                      url | 
| 120 | 
            +
                      url
         | 
| 121 | 
            +
                    end
         | 
| 122 | 
            +
             | 
| 123 | 
            +
                    # Clean a URL to remove fragment, lowercase scheme and host, and sort
         | 
| 124 | 
            +
                    #   query string.
         | 
| 125 | 
            +
                    #
         | 
| 126 | 
            +
                    # @param [String] raw_url URL to clean.
         | 
| 127 | 
            +
                    #
         | 
| 128 | 
            +
                    # @return [String]
         | 
| 129 | 
            +
                    def self.clean_uri raw_url
         | 
| 130 | 
            +
                      clean_uri_obj(raw_url).to_s
         | 
| 105 131 | 
             
                    end
         | 
| 106 132 |  | 
| 107 133 | 
             
                    # Format headers for gid generation.
         | 
| @@ -111,16 +137,87 @@ module DhEasy | |
| 111 137 | 
             
                    #
         | 
| 112 138 | 
             
                    # @return [String]
         | 
| 113 139 | 
             
                    def self.format_headers headers
         | 
| 114 | 
            -
                      return  | 
| 115 | 
            -
                      data =  | 
| 140 | 
            +
                      return '' if headers.nil?
         | 
| 141 | 
            +
                      data = []
         | 
| 116 142 | 
             
                      headers.each do |key, value|
         | 
| 117 143 | 
             
                        unless value.is_a? Array
         | 
| 118 | 
            -
                          data | 
| 144 | 
            +
                          data << "#{key.downcase}:#{value.to_s}"
         | 
| 119 145 | 
             
                          next
         | 
| 120 146 | 
             
                        end
         | 
| 121 | 
            -
                        data | 
| 147 | 
            +
                        data << "#{key.downcase}:#{value.sort.join ','}"
         | 
| 122 148 | 
             
                      end
         | 
| 123 | 
            -
                      data
         | 
| 149 | 
            +
                      data.sort.join ';'
         | 
| 150 | 
            +
                    end
         | 
| 151 | 
            +
             | 
| 152 | 
            +
                    # Identify whether a fetch type is the default fetch type.
         | 
| 153 | 
            +
                    # @private
         | 
| 154 | 
            +
                    #
         | 
| 155 | 
            +
                    # @param [String,nil] fetch_type Fetch type.
         | 
| 156 | 
            +
                    #
         | 
| 157 | 
            +
                    # @return [Boolean] `true` when default value, else `false`.
         | 
| 158 | 
            +
                    def self.is_default_fetch_type? fetch_type
         | 
| 159 | 
            +
                      return true if fetch_type.nil?
         | 
| 160 | 
            +
                      return true if fetch_type === DEFAULT_FETCH_TYPE
         | 
| 161 | 
            +
                      false
         | 
| 162 | 
            +
                    end
         | 
| 163 | 
            +
             | 
| 164 | 
            +
                    # Identify whether a driver hash is empty.
         | 
| 165 | 
            +
                    # @private
         | 
| 166 | 
            +
                    #
         | 
| 167 | 
            +
                    # @param [Hash,nil] driver Driver hash.
         | 
| 168 | 
            +
                    #
         | 
| 169 | 
            +
                    # @return [Boolean] `true` when empty, else `false`.
         | 
| 170 | 
            +
                    def self.is_driver_empty? driver
         | 
| 171 | 
            +
                      return true if driver.nil?
         | 
| 172 | 
            +
                      return true unless driver.is_a? Hash
         | 
| 173 | 
            +
                      return false if driver['name'].to_s.strip != ''
         | 
| 174 | 
            +
                      return false if driver['code'].to_s.strip != ''
         | 
| 175 | 
            +
                      return false if driver['pre_code'].to_s.strip != ''
         | 
| 176 | 
            +
                      return false if !driver['stealth'].nil? && !!driver['stealth']
         | 
| 177 | 
            +
                      return false if !driver['enable_images'].nil? && !!driver['enable_images']
         | 
| 178 | 
            +
                      return false if !driver['goto_options'].nil? && driver['goto_options'].is_a?(Hash) && driver['goto_options'].keys.length > 0
         | 
| 179 | 
            +
                      true
         | 
| 180 | 
            +
                    end
         | 
| 181 | 
            +
             | 
| 182 | 
            +
                    # Identify whether a display hash is empty.
         | 
| 183 | 
            +
                    # @private
         | 
| 184 | 
            +
                    #
         | 
| 185 | 
            +
                    # @param [Hash,nil] display Display hash.
         | 
| 186 | 
            +
                    #
         | 
| 187 | 
            +
                    # @return [Boolean] `true` when empty, else `false`.
         | 
| 188 | 
            +
                    def self.is_display_empty? display
         | 
| 189 | 
            +
                      return true if display.nil?
         | 
| 190 | 
            +
                      return true unless display.is_a? Hash
         | 
| 191 | 
            +
                      return false if !display['width'].nil? && display['width'].to_f.ceil > 0
         | 
| 192 | 
            +
                      return false if !display['height'].nil? && display['height'].to_f.ceil > 0
         | 
| 193 | 
            +
                      true
         | 
| 194 | 
            +
                    end
         | 
| 195 | 
            +
             | 
| 196 | 
            +
                    # Identify whether a screenshot hash is empty.
         | 
| 197 | 
            +
                    # @private
         | 
| 198 | 
            +
                    #
         | 
| 199 | 
            +
                    # @param [Hash,nil] screenshot Screenshot hash.
         | 
| 200 | 
            +
                    #
         | 
| 201 | 
            +
                    # @return [Boolean] `true` when empty, else `false`.
         | 
| 202 | 
            +
                    def self.is_screenshot_empty? screenshot
         | 
| 203 | 
            +
                      return true if screenshot.nil?
         | 
| 204 | 
            +
                      return true unless screenshot.is_a? Hash
         | 
| 205 | 
            +
                      return true if screenshot['take_screenshot'].nil? || !screenshot['take_screenshot']
         | 
| 206 | 
            +
                      return true if !screenshot['options'].nil? && !screenshot['options'].is_a?(Hash)
         | 
| 207 | 
            +
                      return false
         | 
| 208 | 
            +
                    end
         | 
| 209 | 
            +
             | 
| 210 | 
            +
                    # Identify whether a hash is empty.
         | 
| 211 | 
            +
                    # @private
         | 
| 212 | 
            +
                    #
         | 
| 213 | 
            +
                    # @param [Hash,nil] hash Hash to validate.
         | 
| 214 | 
            +
                    #
         | 
| 215 | 
            +
                    # @return [Boolean] `true` when empty, else `false`.
         | 
| 216 | 
            +
                    def self.is_hash_empty? hash
         | 
| 217 | 
            +
                      return true if hash.nil?
         | 
| 218 | 
            +
                      return true unless hash.is_a? Hash
         | 
| 219 | 
            +
                      return false if hash.keys.length > 0
         | 
| 220 | 
            +
                      true
         | 
| 124 221 | 
             
                    end
         | 
| 125 222 |  | 
| 126 223 | 
             
                    # Build a job with defaults by using FakeDb engine.
         | 
| @@ -159,7 +256,7 @@ module DhEasy | |
| 159 256 | 
             
                    # @return [String]
         | 
| 160 257 | 
             
                    def self.time_stamp time = nil
         | 
| 161 258 | 
             
                      time = Time.new if time.nil?
         | 
| 162 | 
            -
                      time.utc.strftime('% | 
| 259 | 
            +
                      time.utc.strftime('%FT%T.%6N').gsub(/[0.]+\Z/,'') << "Z"
         | 
| 163 260 | 
             
                    end
         | 
| 164 261 |  | 
| 165 262 | 
             
                    # Get current job or create new one from values.
         | 
| @@ -209,7 +306,7 @@ module DhEasy | |
| 209 306 | 
             
                    # Current fake page gid.
         | 
| 210 307 | 
             
                    # @return [String]
         | 
| 211 308 | 
             
                    def page_gid
         | 
| 212 | 
            -
                      @page_gid ||= self. | 
| 309 | 
            +
                      @page_gid ||= self.fake_uuid
         | 
| 213 310 | 
             
                    end
         | 
| 214 311 |  | 
| 215 312 | 
             
                    # Set current fake page gid value.
         | 
| @@ -217,6 +314,21 @@ module DhEasy | |
| 217 314 | 
             
                      @page_gid = value
         | 
| 218 315 | 
             
                    end
         | 
| 219 316 |  | 
| 317 | 
            +
                    # Current UUID algorithm.
         | 
| 318 | 
            +
                    # @return [Symbol]
         | 
| 319 | 
            +
                    def uuid_algorithm
         | 
| 320 | 
            +
                      @uuid_algorithm ||= DEFAULT_UUID_ALGORITHM
         | 
| 321 | 
            +
                    end
         | 
| 322 | 
            +
             | 
| 323 | 
            +
                    # Set current UUID algorithm value.
         | 
| 324 | 
            +
                    # @raise [ArgumentError] Whenever an invalid algorithm is provided
         | 
| 325 | 
            +
                    def uuid_algorithm= value
         | 
| 326 | 
            +
                      unless value.nil? || VALID_UUID_ALGORITHMS.include?(value)
         | 
| 327 | 
            +
                        raise ArgumentError.new("Invalid UUID algorithm, valid values are :md5, :sha1, :sha256")
         | 
| 328 | 
            +
                      end
         | 
| 329 | 
            +
                      @uuid_algorithm = value
         | 
| 330 | 
            +
                    end
         | 
| 331 | 
            +
             | 
| 220 332 | 
             
                    # Enable page gid override on page or output insert.
         | 
| 221 333 | 
             
                    def enable_page_gid_override
         | 
| 222 334 | 
             
                      @allow_page_gid_override = true
         | 
| @@ -263,14 +375,26 @@ module DhEasy | |
| 263 375 | 
             
                    #   whenever page gid can be overrided on page or output insert.
         | 
| 264 376 | 
             
                    # @option opts [Boolean, nil] :allow_job_id_override (false) Specify
         | 
| 265 377 | 
             
                    #   whenever job id can be overrided on page or output insert.
         | 
| 378 | 
            +
                    # @option opts [Symbol, nil] :uuid_algorithm (:md5) Specify the
         | 
| 379 | 
            +
                    #   algorithm to be used to generate UUID values.
         | 
| 266 380 | 
             
                    def initialize opts = {}
         | 
| 267 381 | 
             
                      self.job_id = opts[:job_id]
         | 
| 268 382 | 
             
                      self.scraper_name = opts[:scraper_name]
         | 
| 269 383 | 
             
                      self.page_gid = opts[:page_gid]
         | 
| 384 | 
            +
                      self.uuid_algorithm = opts[:uuid_algorithm]
         | 
| 270 385 | 
             
                      @allow_page_gid_override = opts[:allow_page_gid_override].nil? ? false : !!opts[:allow_page_gid_override]
         | 
| 271 386 | 
             
                      @allow_job_id_override = opts[:allow_job_id_override].nil? ? false : !!opts[:allow_job_id_override]
         | 
| 272 387 | 
             
                    end
         | 
| 273 388 |  | 
| 389 | 
            +
                    # Generate a fake UUID using the configured uuid algorithm.
         | 
| 390 | 
            +
                    #
         | 
| 391 | 
            +
                    # @param seed (nil) Object to use as seed for uuid.
         | 
| 392 | 
            +
                    #
         | 
| 393 | 
            +
                    # @return [String]
         | 
| 394 | 
            +
                    def fake_uuid seed = nil
         | 
| 395 | 
            +
                      self.class.fake_uuid seed, self.uuid_algorithm
         | 
| 396 | 
            +
                    end
         | 
| 397 | 
            +
             | 
| 274 398 | 
             
                    # Generate a fake scraper name.
         | 
| 275 399 | 
             
                    #
         | 
| 276 400 | 
             
                    # @return [String]
         | 
| @@ -329,23 +453,42 @@ module DhEasy | |
| 329 453 | 
             
                    #
         | 
| 330 454 | 
             
                    # @return [String]
         | 
| 331 455 | 
             
                    def generate_page_gid page_data
         | 
| 332 | 
            -
                       | 
| 333 | 
            -
             | 
| 334 | 
            -
             | 
| 335 | 
            -
             | 
| 336 | 
            -
             | 
| 337 | 
            -
             | 
| 338 | 
            -
             | 
| 339 | 
            -
             | 
| 340 | 
            -
             | 
| 341 | 
            -
                       | 
| 342 | 
            -
                       | 
| 343 | 
            -
                      data | 
| 344 | 
            -
                      data | 
| 345 | 
            -
                       | 
| 346 | 
            -
                       | 
| 347 | 
            -
                       | 
| 348 | 
            -
                      " | 
| 456 | 
            +
                      # ensure page url
         | 
| 457 | 
            +
                      return "" if page_data['url'].nil? || page_data['url'].to_s.strip === ''
         | 
| 458 | 
            +
             | 
| 459 | 
            +
                      # calculate extra fields, keep field order to match datahen
         | 
| 460 | 
            +
                      data = []
         | 
| 461 | 
            +
                      data << "method:#{page_data['method'].to_s.downcase}"
         | 
| 462 | 
            +
                      no_url_encode = (!page_data['no_url_encode'].nil? && !!page_data['no_url_encode'])
         | 
| 463 | 
            +
                      uri = self.class.clean_uri_obj(page_data['url'])
         | 
| 464 | 
            +
                      url = (no_url_encode ? page_data['url'].to_s.lstrip : uri.to_s)
         | 
| 465 | 
            +
                      data << "url:#{url}"
         | 
| 466 | 
            +
                      headers = self.class.format_headers page_data['headers']
         | 
| 467 | 
            +
                      data << "headers:#{headers}"
         | 
| 468 | 
            +
                      data << "body:#{page_data['body'].to_s}"
         | 
| 469 | 
            +
                      no_redirect = (!page_data['no_redirect'].nil? && !!page_data['no_redirect'])
         | 
| 470 | 
            +
                      data << "no_redirect:#{no_redirect.to_s}"
         | 
| 471 | 
            +
                      ua_type = (page_data['ua_type'].to_s === '') ? 'desktop' : page_data['ua_type']
         | 
| 472 | 
            +
                      data << "ua_type:#{ua_type}"
         | 
| 473 | 
            +
             | 
| 474 | 
            +
                      # complex fields
         | 
| 475 | 
            +
                      data << "fetch_type:#{page_data['fetch_type']}" unless self.class.is_default_fetch_type? page_data['fetch_type']
         | 
| 476 | 
            +
                      # keep this cookie logic to match datahen
         | 
| 477 | 
            +
                      data << "cookie:#{page_data['cookie'].split(/;\s*/).sort.join(';')}" if page_data['cookie'].to_s.strip != ''
         | 
| 478 | 
            +
                      data << "http2:true" if page_data.has_key?('http2') && !page_data['http2'].nil? && !!page_data['http2']
         | 
| 479 | 
            +
                      data << "driverName:#{page_data['driver']['name']}" unless self.class.is_driver_empty? page_data['driver']
         | 
| 480 | 
            +
                      unless self.class.is_display_empty? page_data['display']
         | 
| 481 | 
            +
                        data << "display:#{page_data['display']['width']}x#{page_data['display']['height']}"
         | 
| 482 | 
            +
                      end
         | 
| 483 | 
            +
                      unless self.class.is_screenshot_empty? page_data['screenshot']
         | 
| 484 | 
            +
                        checksum = self.fake_uuid JSON.generate(page_data['screenshot'])
         | 
| 485 | 
            +
                        data << "screenshot:#{checksum}"
         | 
| 486 | 
            +
                      end
         | 
| 487 | 
            +
             | 
| 488 | 
            +
                      # generate GID
         | 
| 489 | 
            +
                      seed = data.join('|')
         | 
| 490 | 
            +
                      checksum = self.fake_uuid seed
         | 
| 491 | 
            +
                      "#{uri.hostname}-#{checksum}"
         | 
| 349 492 | 
             
                    end
         | 
| 350 493 |  | 
| 351 494 | 
             
                    # Get page keys with key generators to emulate saving on db.
         | 
| @@ -354,18 +497,45 @@ module DhEasy | |
| 354 497 | 
             
                    # @return [Hash]
         | 
| 355 498 | 
             
                    def page_defaults
         | 
| 356 499 | 
             
                      @page_defaults ||= {
         | 
| 500 | 
            +
                        'job_id' => lambda{|page| job_id},
         | 
| 357 501 | 
             
                        'url' => nil,
         | 
| 358 502 | 
             
                        'status' => 'to_fetch',
         | 
| 359 | 
            -
                        ' | 
| 503 | 
            +
                        'page_type' => 'default',
         | 
| 360 504 | 
             
                        'method' => 'GET',
         | 
| 361 505 | 
             
                        'headers' => {},
         | 
| 362 | 
            -
                        'fetch_type' =>  | 
| 506 | 
            +
                        'fetch_type' => DEFAULT_FETCH_TYPE,
         | 
| 363 507 | 
             
                        'cookie' => nil,
         | 
| 364 508 | 
             
                        'no_redirect' => false,
         | 
| 365 509 | 
             
                        'body' => nil,
         | 
| 366 510 | 
             
                        'ua_type' => 'desktop',
         | 
| 367 511 | 
             
                        'no_url_encode' => false,
         | 
| 368 512 | 
             
                        'http2' => false,
         | 
| 513 | 
            +
                        'priority' => 0,
         | 
| 514 | 
            +
                        'parsing_try_count' => 0,
         | 
| 515 | 
            +
                        'parsing_fail_count' => 0,
         | 
| 516 | 
            +
                        'fetching_at' => '0001-01-01T00:00:00Z',
         | 
| 517 | 
            +
                        'fetching_try_count' => 0,
         | 
| 518 | 
            +
                        'refetch_count' => 0,
         | 
| 519 | 
            +
                        'fetched_from' => '',
         | 
| 520 | 
            +
                        'content_size' => 0,
         | 
| 521 | 
            +
                        'force_fetch' => false,
         | 
| 522 | 
            +
                        'driver' => {
         | 
| 523 | 
            +
                          'name' => '',
         | 
| 524 | 
            +
                          'pre_code' => '',
         | 
| 525 | 
            +
                          'code' => '',
         | 
| 526 | 
            +
                          'goto_options' => nil,
         | 
| 527 | 
            +
                          'stealth' => false,
         | 
| 528 | 
            +
                          'enable_images' => false
         | 
| 529 | 
            +
                        },
         | 
| 530 | 
            +
                        'display' => {
         | 
| 531 | 
            +
                          'width' => 0,
         | 
| 532 | 
            +
                          'height' => 0
         | 
| 533 | 
            +
                        },
         | 
| 534 | 
            +
                        'screenshot' => {
         | 
| 535 | 
            +
                          'take_screenshot' => false,
         | 
| 536 | 
            +
                          'options' => nil
         | 
| 537 | 
            +
                        },
         | 
| 538 | 
            +
                        'driver_log' => nil,
         | 
| 369 539 | 
             
                        'vars' => {}
         | 
| 370 540 | 
             
                      }
         | 
| 371 541 | 
             
                    end
         | 
| @@ -380,17 +550,40 @@ module DhEasy | |
| 380 550 | 
             
                    def pages
         | 
| 381 551 | 
             
                      return @pages unless @page.nil?
         | 
| 382 552 |  | 
| 553 | 
            +
                      defaults = self.page_defaults
         | 
| 383 554 | 
             
                      collection = self.class.new_collection PAGE_KEYS,
         | 
| 384 | 
            -
                        defaults:  | 
| 555 | 
            +
                        defaults: defaults
         | 
| 385 556 | 
             
                      collection.bind_event(:before_defaults) do |collection, raw_item|
         | 
| 386 557 | 
             
                        item = DhEasy::Core.deep_stringify_keys raw_item
         | 
| 558 | 
            +
                        if !item['driver'].nil? && item['driver'].is_a?(Hash)
         | 
| 559 | 
            +
                          item['driver'] = defaults['driver'].merge item['driver']
         | 
| 560 | 
            +
                        end
         | 
| 561 | 
            +
                        if !item['display'].nil? && item['display'].is_a?(Hash)
         | 
| 562 | 
            +
                          item['display'] = defaults['display'].merge item['display']
         | 
| 563 | 
            +
                        end
         | 
| 564 | 
            +
                        if !item['screenshot'].nil? && item['screenshot'].is_a?(Hash)
         | 
| 565 | 
            +
                          item['screenshot'] = defaults['screenshot'].merge item['screenshot']
         | 
| 566 | 
            +
                        end
         | 
| 387 567 | 
             
                        item.delete 'job_id' unless allow_job_id_override?
         | 
| 388 568 | 
             
                        item
         | 
| 389 569 | 
             
                      end
         | 
| 390 570 | 
             
                      collection.bind_event(:before_insert) do |collection, item, match|
         | 
| 571 | 
            +
                        item['driver'] = nil if self.class.is_driver_empty? item['driver']
         | 
| 572 | 
            +
                        item['display'] = nil if self.class.is_display_empty? item['display']
         | 
| 573 | 
            +
                        item['screenshot'] = nil if self.class.is_screenshot_empty? item['screenshot']
         | 
| 574 | 
            +
                        item['headers'] = nil if self.class.is_hash_empty? item['headers']
         | 
| 575 | 
            +
                        item['vars'] = nil if self.class.is_hash_empty? item['vars']
         | 
| 576 | 
            +
                        uri = self.class.clean_uri_obj(item['url'])
         | 
| 577 | 
            +
                        item['hostname'] = uri.hostname
         | 
| 578 | 
            +
                        uri = nil
         | 
| 391 579 | 
             
                        if item['gid'].nil? || !allow_page_gid_override?
         | 
| 392 580 | 
             
                          item['gid'] = generate_page_gid item
         | 
| 393 581 | 
             
                        end
         | 
| 582 | 
            +
             | 
| 583 | 
            +
                        # 30 days = 60 * 60 * 24 * 30 = 2592000
         | 
| 584 | 
            +
                        item['freshness'] ||= self.class.time_stamp (Time.now - 2592000)
         | 
| 585 | 
            +
                        item['to_fetch'] ||= self.class.time_stamp
         | 
| 586 | 
            +
                        item['created_at'] ||= self.class.time_stamp
         | 
| 394 587 | 
             
                        item
         | 
| 395 588 | 
             
                      end
         | 
| 396 589 | 
             
                      collection.bind_event(:after_insert) do |collection, item|
         | 
| @@ -406,7 +599,7 @@ module DhEasy | |
| 406 599 | 
             
                    # @return [String]
         | 
| 407 600 | 
             
                    def generate_output_id data
         | 
| 408 601 | 
             
                      # Generate random UUID to match Datahen behavior
         | 
| 409 | 
            -
                      self. | 
| 602 | 
            +
                      self.fake_uuid
         | 
| 410 603 | 
             
                    end
         | 
| 411 604 |  | 
| 412 605 | 
             
                    # Get output keys with key generators to emulate saving on db.
         | 
| @@ -293,7 +293,6 @@ module DhEasy | |
| 293 293 | 
             
                        raise ArgumentError.new("per_page needs to be an Integer between 1 and #{MAX_FIND_OUTPUTS_PER_PAGE}.")
         | 
| 294 294 | 
             
                      end
         | 
| 295 295 |  | 
| 296 | 
            -
                      count = 0
         | 
| 297 296 | 
             
                      offset = (page - 1) * per_page
         | 
| 298 297 | 
             
                      job = latest_job_by(opts[:scraper_name])
         | 
| 299 298 | 
             
                      fixed_query = query.merge(
         | 
    
        data/lib/dh_easy/core/version.rb
    CHANGED
    
    
    
        metadata
    CHANGED
    
    | @@ -1,14 +1,14 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: dh_easy-core
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 0. | 
| 4 | 
            +
              version: 0.3.1
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors:
         | 
| 7 7 | 
             
            - Eduardo Rosales
         | 
| 8 8 | 
             
            autorequire: 
         | 
| 9 9 | 
             
            bindir: bin
         | 
| 10 10 | 
             
            cert_chain: []
         | 
| 11 | 
            -
            date:  | 
| 11 | 
            +
            date: 2021-05-23 00:00:00.000000000 Z
         | 
| 12 12 | 
             
            dependencies:
         | 
| 13 13 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 14 14 | 
             
              name: datahen
         | 
| @@ -241,8 +241,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement | |
| 241 241 | 
             
                - !ruby/object:Gem::Version
         | 
| 242 242 | 
             
                  version: '0'
         | 
| 243 243 | 
             
            requirements: []
         | 
| 244 | 
            -
             | 
| 245 | 
            -
            rubygems_version: 2.7.6
         | 
| 244 | 
            +
            rubygems_version: 3.0.3
         | 
| 246 245 | 
             
            signing_key: 
         | 
| 247 246 | 
             
            specification_version: 4
         | 
| 248 247 | 
             
            summary: DataHen Easy toolkit core module
         |