scraper_utils 0.3.0 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/README.md +35 -7
- data/lib/scraper_utils/cycle_utils.rb +25 -0
- data/lib/scraper_utils/mechanize_utils/agent_config.rb +2 -2
- data/lib/scraper_utils/version.rb +1 -1
- metadata +2 -1
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA256:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: 6cd5b7c0fb03a762cec4ddd35ec354d0921273e7c9ee5c8c7f332b14339f3182
         | 
| 4 | 
            +
              data.tar.gz: afb80849b5c078eb8f7bf7de74bb537f8df692bcb59070f90bde29a4c68b9231
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: d6e0e27e93d1b7528e96d9bda350510f5513854778972f7276683acae538b2b70e8478c5f7db2d4eb5bc0b4ebfb28c502e5885896c67f0090e8d4b85648df6de
         | 
| 7 | 
            +
              data.tar.gz: e3ec0ca5d77bff719b32bbe91f6efa43fceed75c92bcc8450304d1f64be6f73f5372626b84a4541d3ed4920b3c6af889400d677fcb0c5a20fd0224fa2ec75613
         | 
    
        data/CHANGELOG.md
    CHANGED
    
    | @@ -1,5 +1,19 @@ | |
| 1 1 | 
             
            # Changelog
         | 
| 2 2 |  | 
| 3 | 
            +
            ## 0.4.1 - 2025-03-04
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            * Document `ScraperUtils::CycleUtils.pick(values)`
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            ## 0.4.0 - 2025-03-04
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            * Add Cycle Utils as an alternative to Date range utils
         | 
| 10 | 
            +
            * Update README.md with changed defaults
         | 
| 11 | 
            +
             | 
| 12 | 
            +
            ## 0.3.0 - 2025-03-04
         | 
| 13 | 
            +
             | 
| 14 | 
            +
            * Add date range utils
         | 
| 15 | 
            +
            * Flush $stdout and $stderr when logging to sync exception output and logging lines
         | 
| 16 | 
            +
            * Break out example code from README.md into docs dir
         | 
| 3 17 |  | 
| 4 18 | 
             
            ## 0.2.1 - 2025-02-28
         | 
| 5 19 |  | 
| @@ -12,3 +26,5 @@ Added FiberScheduler, enabled compliant mode with delays by default and simplifi | |
| 12 26 | 
             
            ## 0.1.0 - 2025-02-23
         | 
| 13 27 |  | 
| 14 28 | 
             
            First release for development
         | 
| 29 | 
            +
             | 
| 30 | 
            +
             | 
    
        data/README.md
    CHANGED
    
    | @@ -33,8 +33,7 @@ Even without specific configuration, our scrapers will, by default: | |
| 33 33 | 
             
              In the default "compliant mode" this defaults to a max load of 20% and is capped at 33%.
         | 
| 34 34 |  | 
| 35 35 | 
             
            - **Add randomized delays**: We add random delays between requests to further reduce our impact on servers, which should
         | 
| 36 | 
            -
              bring us
         | 
| 37 | 
            -
            - down to the load of a single industrious person.
         | 
| 36 | 
            +
              bring us down to the load of a single industrious person.
         | 
| 38 37 |  | 
| 39 38 | 
             
            Extra utilities provided for scrapers to further reduce your server load:
         | 
| 40 39 |  | 
| @@ -44,6 +43,9 @@ Extra utilities provided for scrapers to further reduce your server load: | |
| 44 43 | 
             
              checking the recent 4 days each day and reducing down to checking each 3 days by the end of the 33-day mark. This
         | 
| 45 44 | 
             
              replaces the simplistic check of the last 30 days each day.
         | 
| 46 45 |  | 
| 46 | 
            +
            - Alternative **Cycle Utilities** - a convenience class to cycle through short and longer search ranges to reduce server
         | 
| 47 | 
            +
              load.
         | 
| 48 | 
            +
             | 
| 47 49 | 
             
            Our goal is to access public planning information without negatively impacting your services.
         | 
| 48 50 |  | 
| 49 51 | 
             
            Installation
         | 
| @@ -123,15 +125,14 @@ The agent returned is configured using Mechanize hooks to implement the desired | |
| 123 125 | 
             
            By default, the Mechanize agent is configured with the following settings.
         | 
| 124 126 | 
             
            As you can see, the defaults can be changed using env variables.
         | 
| 125 127 |  | 
| 126 | 
            -
            Note - compliant mode forces max_load to be set to a value no greater than  | 
| 127 | 
            -
            PLEASE don't use our user agent string with a max_load higher than 33!
         | 
| 128 | 
            +
            Note - compliant mode forces max_load to be set to a value no greater than 50.
         | 
| 128 129 |  | 
| 129 130 | 
             
            ```ruby
         | 
| 130 131 | 
             
            ScraperUtils::MechanizeUtils::AgentConfig.configure do |config|
         | 
| 131 132 | 
             
              config.default_timeout = ENV.fetch('MORPH_TIMEOUT', 60).to_i # 60
         | 
| 132 133 | 
             
              config.default_compliant_mode = ENV.fetch('MORPH_NOT_COMPLIANT', nil).to_s.empty? # true
         | 
| 133 | 
            -
              config.default_random_delay = ENV.fetch('MORPH_RANDOM_DELAY',  | 
| 134 | 
            -
              config.default_max_load = ENV.fetch('MORPH_MAX_LOAD',  | 
| 134 | 
            +
              config.default_random_delay = ENV.fetch('MORPH_RANDOM_DELAY', 5).to_i # 5
         | 
| 135 | 
            +
              config.default_max_load = ENV.fetch('MORPH_MAX_LOAD', 33.3).to_f # 33.3
         | 
| 135 136 | 
             
              config.default_disable_ssl_certificate_check = !ENV.fetch('MORPH_DISABLE_SSL_CHECK', nil).to_s.empty? # false
         | 
| 136 137 | 
             
              config.default_australian_proxy = !ENV.fetch('MORPH_USE_PROXY', nil).to_s.empty? # false
         | 
| 137 138 | 
             
              config.default_user_agent = ENV.fetch('MORPH_USER_AGENT', nil) # Uses Mechanize user agent
         | 
| @@ -268,13 +269,40 @@ Typical server load reductions: | |
| 268 269 |  | 
| 269 270 | 
             
            See the class documentation for customizing defaults and passing options.
         | 
| 270 271 |  | 
| 272 | 
            +
            ### Other possibilities
         | 
| 273 | 
            +
             | 
| 274 | 
            +
            If the site uses tags like 'L28', 'L14' and 'L7' for the last 28, 14 and 7 days, an alternative solution
         | 
| 275 | 
            +
            is to cycle through ['L28', 'L7', 'L14', 'L7'] which would drop the load by 50% and be less Bot like.
         | 
| 276 | 
            +
             | 
| 277 | 
            +
            Cycle Utils
         | 
| 278 | 
            +
            -----------
         | 
| 279 | 
            +
            Simple utility for cycling through options based on Julian day number:
         | 
| 280 | 
            +
             | 
| 281 | 
            +
            ```ruby
         | 
| 282 | 
            +
            # Toggle between main and alternate behaviour
         | 
| 283 | 
            +
            alternate = ScraperUtils::CycleUtils.position(2).even?
         | 
| 284 | 
            +
             | 
| 285 | 
            +
            # OR cycle through a list of values day by day:
         | 
| 286 | 
            +
            period = ScraperUtils::CycleUtils.pick(['L28', 'L7', 'L14', 'L7'])
         | 
| 287 | 
            +
             | 
| 288 | 
            +
            # Use with any cycle size
         | 
| 289 | 
            +
            pos = ScraperUtils::CycleUtils.position(7) # 0-6 cycle
         | 
| 290 | 
            +
             | 
| 291 | 
            +
            # Test with specific date
         | 
| 292 | 
            +
            pos = ScraperUtils::CycleUtils.position(3, date: Date.new(2024, 1, 5))
         | 
| 293 | 
            +
             | 
| 294 | 
            +
            # Override for testing
         | 
| 295 | 
            +
            # CYCLE_POSITION=2 bundle exec ruby scraper.rb
         | 
| 296 | 
            +
            ```
         | 
| 297 | 
            +
             | 
| 271 298 | 
             
            Randomizing Requests
         | 
| 272 299 | 
             
            --------------------
         | 
| 273 300 |  | 
| 274 301 | 
             
            Pass a `Collection` or `Array` to `ScraperUtils::RandomizeUtils.randomize_order` to randomize it in production, but
         | 
| 275 302 | 
             
            receive it as is when testing.
         | 
| 276 303 |  | 
| 277 | 
            -
            Use this with the list of records scraped from an index to randomise any requests for further information to be less Bot | 
| 304 | 
            +
            Use this with the list of records scraped from an index to randomise any requests for further information to be less Bot
         | 
| 305 | 
            +
            like.
         | 
| 278 306 |  | 
| 279 307 | 
             
            ### Spec setup
         | 
| 280 308 |  | 
| @@ -0,0 +1,25 @@ | |
| 1 | 
            +
            # frozen_string_literal: true
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            module ScraperUtils
         | 
| 4 | 
            +
              # Provides utilities for cycling through a range of options day by day
         | 
| 5 | 
            +
              module CycleUtils
         | 
| 6 | 
            +
                # Returns position in cycle from zero onwards
         | 
| 7 | 
            +
                # @param cycle [Integer] Cycle size (2 onwards)
         | 
| 8 | 
            +
                # @param date [Date, nil] Optional date to use instead of today
         | 
| 9 | 
            +
                # @return [Integer] position in cycle progressing from zero to cycle-1 and then repeating day by day
         | 
| 10 | 
            +
                # Can override using CYCLE_POSITION ENV variable
         | 
| 11 | 
            +
                def self.position(cycle, date: nil)
         | 
| 12 | 
            +
                  day = ENV.fetch('CYCLE_POSITION', (date || Date.today).jd).to_i
         | 
| 13 | 
            +
                  day % cycle
         | 
| 14 | 
            +
                end
         | 
| 15 | 
            +
             | 
| 16 | 
            +
                # Returns one value per day, cycling through all possible values in order
         | 
| 17 | 
            +
                # @param values [Array] Values to cycle through
         | 
| 18 | 
            +
                # @param date [Date, nil] Optional date to use instead of today to calculate position
         | 
| 19 | 
            +
                # @return value from array
         | 
| 20 | 
            +
                # Can override using CYCLE_POSITION ENV variable
         | 
| 21 | 
            +
                def self.pick(values, date: nil)
         | 
| 22 | 
            +
                  values[position(values.size, date: date)]
         | 
| 23 | 
            +
                end
         | 
| 24 | 
            +
              end
         | 
| 25 | 
            +
            end
         | 
| @@ -69,8 +69,8 @@ module ScraperUtils | |
| 69 69 | 
             
                    def reset_defaults!
         | 
| 70 70 | 
             
                      @default_timeout = ENV.fetch('MORPH_TIMEOUT', DEFAULT_TIMEOUT).to_i # 60
         | 
| 71 71 | 
             
                      @default_compliant_mode = ENV.fetch('MORPH_NOT_COMPLIANT', nil).to_s.empty? # true
         | 
| 72 | 
            -
                      @default_random_delay = ENV.fetch('MORPH_RANDOM_DELAY', DEFAULT_RANDOM_DELAY).to_i #  | 
| 73 | 
            -
                      @default_max_load = ENV.fetch('MORPH_MAX_LOAD', DEFAULT_MAX_LOAD).to_f #  | 
| 72 | 
            +
                      @default_random_delay = ENV.fetch('MORPH_RANDOM_DELAY', DEFAULT_RANDOM_DELAY).to_i # 5
         | 
| 73 | 
            +
                      @default_max_load = ENV.fetch('MORPH_MAX_LOAD', DEFAULT_MAX_LOAD).to_f # 33.3
         | 
| 74 74 | 
             
                      @default_disable_ssl_certificate_check = !ENV.fetch('MORPH_DISABLE_SSL_CHECK', nil).to_s.empty? # false
         | 
| 75 75 | 
             
                      @default_australian_proxy = !ENV.fetch('MORPH_USE_PROXY', nil).to_s.empty? # false
         | 
| 76 76 | 
             
                      @default_user_agent = ENV.fetch('MORPH_USER_AGENT', nil) # Uses Mechanize user agent
         | 
    
        metadata
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: scraper_utils
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 0.3.0
         | 
| 4 | 
            +
              version: 0.4.1
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors:
         | 
| 7 7 | 
             
            - Ian Heggie
         | 
| @@ -79,6 +79,7 @@ files: | |
| 79 79 | 
             
            - lib/scraper_utils.rb
         | 
| 80 80 | 
             
            - lib/scraper_utils/adaptive_delay.rb
         | 
| 81 81 | 
             
            - lib/scraper_utils/authority_utils.rb
         | 
| 82 | 
            +
            - lib/scraper_utils/cycle_utils.rb
         | 
| 82 83 | 
             
            - lib/scraper_utils/data_quality_monitor.rb
         | 
| 83 84 | 
             
            - lib/scraper_utils/date_range_utils.rb
         | 
| 84 85 | 
             
            - lib/scraper_utils/db_utils.rb
         |