scraper_utils 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +3 -0
- data/.rubocop.yml +4 -0
- data/CHANGELOG.md +22 -1
- data/Gemfile +5 -2
- data/README.md +128 -149
- data/docs/example_scrape_with_fibers.rb +31 -0
- data/docs/example_scraper.rb +93 -0
- data/lib/scraper_utils/adaptive_delay.rb +55 -50
- data/lib/scraper_utils/cycle_utils.rb +25 -0
- data/lib/scraper_utils/data_quality_monitor.rb +28 -17
- data/lib/scraper_utils/date_range_utils.rb +159 -0
- data/lib/scraper_utils/db_utils.rb +0 -2
- data/lib/scraper_utils/debug_utils.rb +53 -6
- data/lib/scraper_utils/fiber_scheduler.rb +45 -22
- data/lib/scraper_utils/log_utils.rb +19 -17
- data/lib/scraper_utils/mechanize_utils/agent_config.rb +67 -46
- data/lib/scraper_utils/mechanize_utils.rb +12 -4
- data/lib/scraper_utils/randomize_utils.rb +34 -0
- data/lib/scraper_utils/robots_checker.rb +9 -4
- data/lib/scraper_utils/version.rb +1 -1
- data/lib/scraper_utils.rb +3 -10
- metadata +7 -2
 
data/lib/scraper_utils/log_utils.rb
CHANGED

@@ -13,8 +13,8 @@ module ScraperUtils
     # @param start_time [Time] When this scraping attempt was started
     # @param attempt [Integer] 1 for first run, 2 for first retry, 3 for last retry (without proxy)
     # @param authorities [Array<Symbol>] List of authorities attempted to scrape
-    # @param exceptions [Hash …
-    # DataQualityMonitor.stats is checked for :saved and :unprocessed entries
+    # @param exceptions [Hash{Symbol => Exception}] Any exceptions that occurred during scraping
+    # `DataQualityMonitor.stats` is checked for :saved and :unprocessed entries
     # @return [void]
     def self.log_scraping_run(start_time, attempt, authorities, exceptions)
       raise ArgumentError, "Invalid start time" unless start_time.is_a?(Time)
@@ -75,39 +75,39 @@ module ScraperUtils

     # Report on the results
     # @param authorities [Array<Symbol>] List of authorities attempted to scrape
-    # @param exceptions [Hash …
-    # DataQualityMonitor.stats is checked for :saved and :unprocessed entries
+    # @param exceptions [Hash{Symbol => Exception}] Any exceptions that occurred during scraping
+    # `DataQualityMonitor.stats` is checked for :saved and :unprocessed entries
     # @return [void]
     def self.report_on_results(authorities, exceptions)
-      …
+      if ENV["MORPH_EXPECT_BAD"]
+        expect_bad = ENV["MORPH_EXPECT_BAD"].split(",").map(&:strip).map(&:to_sym)
+      end
+      expect_bad ||= []

-      …
+      $stderr.flush
+      puts "MORPH_EXPECT_BAD=#{ENV.fetch('MORPH_EXPECT_BAD', nil)}"

       # Print summary table
       puts "\nScraping Summary:"
       summary_format = "%-20s %6s %6s %s"

-      puts summary_format …
-      puts summary_format …
+      puts format(summary_format, 'Authority', 'OK', 'Bad', 'Exception')
+      puts format(summary_format, "-" * 20, "-" * 6, "-" * 6, "-" * 50)

       authorities.each do |authority|
         stats = ScraperUtils::DataQualityMonitor.stats&.fetch(authority, {}) || {}
-
+
         ok_records = stats[:saved] || 0
         bad_records = stats[:unprocessed] || 0
-
+
         expect_bad_prefix = expect_bad.include?(authority) ? "[EXPECT BAD] " : ""
         exception_msg = if exceptions[authority]
                           "#{exceptions[authority].class} - #{exceptions[authority].message}"
                         else
                           "-"
                         end
-        puts summary_format …
-        …
-          ok_records,
-          bad_records,
-          "#{expect_bad_prefix}#{exception_msg}".slice(0, 70)
-        ]
+        puts format(summary_format, authority.to_s, ok_records, bad_records,
+                    "#{expect_bad_prefix}#{exception_msg}".slice(0, 70))
       end
       puts
@@ -120,7 +120,8 @@ module ScraperUtils
       end

       if unexpected_working.any?
-        errors << …
+        errors <<
+          "WARNING: Remove #{unexpected_working.join(',')} from MORPH_EXPECT_BAD as it now works!"
       end

       # Check for authorities with unexpected errors
@@ -137,6 +138,7 @@ module ScraperUtils
         end
       end

+      $stdout.flush
       if errors.any?
         errors << "See earlier output for details"
         raise errors.join("\n")
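Taken together, report_on_results now parses MORPH_EXPECT_BAD itself and flushes the output streams before printing. A minimal usage sketch; the authority names and the exception below are invented for illustration:

  require "scraper_utils"

  # MORPH_EXPECT_BAD lists known-broken authorities, comma separated.
  ENV["MORPH_EXPECT_BAD"] = "broken_town"

  authorities = %i[example_city broken_town]
  exceptions = { broken_town: RuntimeError.new("fetch failed") }

  # Prints the summary table, then raises for unexpected failures and
  # warns when an authority listed in MORPH_EXPECT_BAD works again.
  ScraperUtils::LogUtils.report_on_results(authorities, exceptions)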
data/lib/scraper_utils/mechanize_utils/agent_config.rb
CHANGED

@@ -23,6 +23,11 @@ module ScraperUtils
     #     random_delay: 10
     #   )
     class AgentConfig
+      DEFAULT_TIMEOUT = 60
+      DEFAULT_RANDOM_DELAY = 5
+      DEFAULT_MAX_LOAD = 33.3
+      MAX_LOAD_CAP = 50.0
+
       # Class-level defaults that can be modified
       class << self
         # @return [Integer] Default timeout in seconds for agent connections
@@ -62,65 +67,68 @@ module ScraperUtils
         # Reset all configuration options to their default values
         # @return [void]
         def reset_defaults!
-          @default_timeout = 60
-          @default_compliant_mode = true
-          @default_random_delay = …
-          @default_max_load = …
-          @default_disable_ssl_certificate_check = false
-          @default_australian_proxy = nil
-          @default_user_agent = nil
+          @default_timeout = ENV.fetch('MORPH_TIMEOUT', DEFAULT_TIMEOUT).to_i # 60
+          @default_compliant_mode = ENV.fetch('MORPH_NOT_COMPLIANT', nil).to_s.empty? # true
+          @default_random_delay = ENV.fetch('MORPH_RANDOM_DELAY', DEFAULT_RANDOM_DELAY).to_i # 5
+          @default_max_load = ENV.fetch('MORPH_MAX_LOAD', DEFAULT_MAX_LOAD).to_f # 33.3
+          @default_disable_ssl_certificate_check = !ENV.fetch('MORPH_DISABLE_SSL_CHECK', nil).to_s.empty? # false
+          @default_australian_proxy = !ENV.fetch('MORPH_USE_PROXY', nil).to_s.empty? # false
+          @default_user_agent = ENV.fetch('MORPH_USER_AGENT', nil) # Uses Mechanize user agent
         end
       end

       # Set defaults on load
       reset_defaults!

-
       # @return [String] User agent string
       attr_reader :user_agent

       # Give access for testing

-      attr_reader :max_load
-      attr_reader :min_random
-      attr_reader :max_random
+      attr_reader :max_load, :min_random, :max_random

-      # Creates …
-      # @param timeout [Integer, nil] Timeout for agent connections (default: 60 …
-      # @param compliant_mode [Boolean, nil] Comply with headers and robots.txt (default: true …
-      # @param random_delay [Integer, nil] Average random delay in seconds (default: 3 …
-      # @param max_load [Float, nil] Maximum server load percentage (nil = no …
+      # Creates Mechanize agent configuration with sensible defaults overridable via configure
+      # @param timeout [Integer, nil] Timeout for agent connections (default: 60)
+      # @param compliant_mode [Boolean, nil] Comply with headers and robots.txt (default: true)
+      # @param random_delay [Integer, nil] Average random delay in seconds (default: 3)
+      # @param max_load [Float, nil] Maximum server load percentage (nil = no delay, default: 20%)
       #                              When compliant_mode is true, max_load is capped at 33%
-      # @param disable_ssl_certificate_check [Boolean, nil] Skip SSL verification (default: false …
-      # @param australian_proxy [Boolean, nil] Use proxy if available (default: false …
+      # @param disable_ssl_certificate_check [Boolean, nil] Skip SSL verification (default: false)
+      # @param australian_proxy [Boolean, nil] Use proxy if available (default: false)
       # @param user_agent [String, nil] Configure Mechanize user agent
       def initialize(timeout: nil,
                      compliant_mode: nil,
                      random_delay: nil,
                      max_load: nil,
                      disable_ssl_certificate_check: nil,
-                     australian_proxy: …
+                     australian_proxy: nil,
                      user_agent: nil)
         @timeout = timeout.nil? ? self.class.default_timeout : timeout
         @compliant_mode = compliant_mode.nil? ? self.class.default_compliant_mode : compliant_mode
         @random_delay = random_delay.nil? ? self.class.default_random_delay : random_delay
         @max_load = max_load.nil? ? self.class.default_max_load : max_load
-        @max_load = [@max_load || …
+        @max_load = [@max_load || DEFAULT_MAX_LOAD, MAX_LOAD_CAP].min if @compliant_mode
         @user_agent = user_agent.nil? ? self.class.default_user_agent : user_agent

-        @disable_ssl_certificate_check = disable_ssl_certificate_check.nil? …
-                                           self.class.default_disable_ssl_certificate_check …
+        @disable_ssl_certificate_check = if disable_ssl_certificate_check.nil?
+                                           self.class.default_disable_ssl_certificate_check
+                                         else
                                            disable_ssl_certificate_check
-          …
+                                         end
+        @australian_proxy = if australian_proxy.nil?
+                              self.class.default_australian_proxy
+                            else
+                              australian_proxy
+                            end

         # Validate proxy URL format if proxy will be used
         @australian_proxy &&= !ScraperUtils.australian_proxy.to_s.empty?
         if @australian_proxy
           uri = begin
-            …
-            …
-            …
-            …
+            URI.parse(ScraperUtils.australian_proxy.to_s)
+          rescue URI::InvalidURIError => e
+            raise URI::InvalidURIError, "Invalid proxy URL format: #{e.message}"
+          end
           unless uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS)
             raise URI::InvalidURIError, "Proxy URL must start with http:// or https://"
           end
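Since reset_defaults! now reads the MORPH_* variables, class-level defaults can be tuned without touching scraper code. A hedged sketch of the interaction; the sample values are invented:

  # Sample values only; explicit keyword arguments still win over ENV defaults.
  ENV["MORPH_TIMEOUT"] = "30"    # instead of DEFAULT_TIMEOUT (60)
  ENV["MORPH_MAX_LOAD"] = "25"   # instead of DEFAULT_MAX_LOAD (33.3)

  ScraperUtils::MechanizeUtils::AgentConfig.reset_defaults!  # re-read ENV
  config = ScraperUtils::MechanizeUtils::AgentConfig.new
  config.max_load  # => 25.0 (compliant mode caps this at MAX_LOAD_CAP = 50.0)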
@@ -135,7 +143,7 @@ module ScraperUtils
         end

         today = Date.today.strftime("%Y-%m-%d")
-        @user_agent = ENV…
+        @user_agent = ENV.fetch("MORPH_USER_AGENT", nil)&.sub("TODAY", today)
         if @compliant_mode
           version = ScraperUtils::VERSION
           @user_agent ||= "Mozilla/5.0 (compatible; ScraperUtils/#{version} #{today}; +https://github.com/ianheggie-oaf/scraper_utils)"
@@ -159,7 +167,8 @@ module ScraperUtils
         if @compliant_mode
           agent.user_agent = user_agent
           agent.request_headers ||= {}
-          agent.request_headers["Accept"] = …
+          agent.request_headers["Accept"] =
+            "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
           agent.request_headers["Upgrade-Insecure-Requests"] = "1"
         end
         if @australian_proxy
@@ -178,32 +187,39 @@ module ScraperUtils
       def display_options
         display_args = []
         display_args << "timeout=#{@timeout}" if @timeout
-        if …
-          …
-          …
-          …
-          …
-          display_args << "australian_proxy=#{@australian_proxy.inspect}"
-        end
+        display_args << if ScraperUtils.australian_proxy.to_s.empty? && !@australian_proxy
+                          "#{ScraperUtils::AUSTRALIAN_PROXY_ENV_VAR} not set"
+                        else
+                          "australian_proxy=#{@australian_proxy.inspect}"
+                        end
         display_args << "compliant_mode" if @compliant_mode
         display_args << "random_delay=#{@random_delay}" if @random_delay
         display_args << "max_load=#{@max_load}%" if @max_load
         display_args << "disable_ssl_certificate_check" if @disable_ssl_certificate_check
         display_args << "default args" if display_args.empty?
-        ScraperUtils::FiberScheduler.log …
+        ScraperUtils::FiberScheduler.log(
+          "Configuring Mechanize agent with #{display_args.join(', ')}"
+        )
       end

       def pre_connect_hook(_agent, request)
         @connection_started_at = Time.now
-        …
+        return unless DebugUtils.verbose?
+
+        ScraperUtils::FiberScheduler.log(
+          "Pre Connect request: #{request.inspect} at #{@connection_started_at}"
+        )
       end

       def post_connect_hook(_agent, uri, response, _body)
         raise ArgumentError, "URI must be present in post-connect hook" unless uri

         response_time = Time.now - @connection_started_at
-        if …
-          ScraperUtils::FiberScheduler.log …
+        if DebugUtils.basic?
+          ScraperUtils::FiberScheduler.log(
+            "Post Connect uri: #{uri.inspect}, response: #{response.inspect} " \
+            "after #{response_time} seconds"
+          )
         end

         if @robots_checker&.disallowed?(uri)
@@ -214,18 +230,23 @@ module ScraperUtils
         delays = {
           robot_txt: @robots_checker&.crawl_delay&.round(3),
           max_load: @adaptive_delay&.next_delay(uri, response_time)&.round(3),
-          random: (@min_random ? (rand(@min_random..@max_random)…
+          random: (@min_random ? (rand(@min_random..@max_random)**2).round(3) : nil)
         }
         @delay = delays.values.compact.max
         if @delay&.positive?
-          …
-          …
+          $stderr.flush
+          ScraperUtils::FiberScheduler.log("Delaying #{@delay} seconds, max of #{delays.inspect}") if ENV["DEBUG"]
+          $stdout.flush
+          ScraperUtils::FiberScheduler.delay(@delay)
         end

         response
       end

       def verify_proxy_works(agent)
+        $stderr.flush
+        $stdout.flush
+        FiberScheduler.log "Checking proxy works..."
         my_ip = MechanizeUtils.public_ip(agent)
         begin
           IPAddr.new(my_ip)
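The delay applied after each response is the maximum of three candidate delays, with nil entries dropped; the random term squares a sample drawn between @min_random and @max_random (both derived from random_delay elsewhere in the class). A standalone sketch of the same arithmetic, with invented values:

  min_random, max_random = 1.0, 1.8  # invented for illustration
  delays = {
    robot_txt: 2.0,                                     # robots.txt Crawl-delay
    max_load: nil,                                      # AdaptiveDelay had nothing to add
    random: (rand(min_random..max_random)**2).round(3)  # squared random sample
  }
  delay = delays.values.compact.max  # => between 2.0 and 3.24 for this draw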
@@ -233,17 +254,17 @@ module ScraperUtils
           raise "Invalid public IP address returned by proxy check: #{my_ip.inspect}: #{e}"
         end
         ScraperUtils::FiberScheduler.log "Proxy is using IP address: #{my_ip.inspect}"
-        my_headers = MechanizeUtils…
+        my_headers = MechanizeUtils.public_headers(agent)
         begin
           # Check response is JSON just to be safe!
           headers = JSON.parse(my_headers)
           puts "Proxy is passing headers:"
-          puts JSON.pretty_generate(headers[…
+          puts JSON.pretty_generate(headers["headers"])
         rescue JSON::ParserError => e
           puts "Couldn't parse public_headers: #{e}! Raw response:"
           puts my_headers.inspect
         end
-      rescue …
+      rescue Timeout::Error => e # Includes Net::OpenTimeout
         raise "Proxy check timed out: #{e}"
       rescue Errno::ECONNREFUSED, Net::HTTP::Persistent::Error => e
         raise "Failed to connect to proxy: #{e}"
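For the proxy branch above to activate, MORPH_AUSTRALIAN_PROXY must hold a valid http(s) URL; MORPH_USE_PROXY (new in 0.4.0) flips the default. A sketch, assuming the mechanize_agent factory this gem documents in its README; the URL is a placeholder:

  # Placeholder credentials; on morph.io these come from secret env settings.
  ENV["MORPH_AUSTRALIAN_PROXY"] = "http://user:password@proxy.example.com:8888"
  ENV["MORPH_USE_PROXY"] = "1"  # default australian_proxy to true

  ScraperUtils::MechanizeUtils::AgentConfig.reset_defaults!
  agent = ScraperUtils::MechanizeUtils.mechanize_agent(australian_proxy: true)
  # verify_proxy_works then confirms the IP and echoed headers via the proxy.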
data/lib/scraper_utils/mechanize_utils.rb
CHANGED

@@ -45,20 +45,28 @@ module ScraperUtils
     #
     # @param agent [Mechanize, nil] Mechanize agent to use for IP lookup or nil when clearing cache
     # @param force [Boolean] Force a new IP lookup, by clearing cache first
-    # @return [String] The public IP address
+    # @return [String, nil] The public IP address
     def self.public_ip(agent = nil, force: false)
       @public_ip = nil if force
-      @public_ip ||= …
+      @public_ip ||= begin
+        response = agent&.get(PUBLIC_IP_URL)
+        response&.body&.strip
+      end
+      @public_ip
     end

     # Retrieves and logs the headers that make it through the proxy
     #
     # @param agent [Mechanize, nil] Mechanize agent to use for IP lookup or nil when clearing cache
     # @param force [Boolean] Force a new IP lookup, by clearing cache first
-    # @return [String] The list of headers in json format
+    # @return [String, nil] The list of headers in json format
     def self.public_headers(agent = nil, force: false)
       @public_headers = nil if force
-      @public_headers ||= …
+      @public_headers ||= begin
+        response = agent&.get(HEADERS_ECHO_URL)
+        response&.body&.strip
+      end
+      @public_headers
     end
   end
 end
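The rewritten memoisation caches the body of the first successful lookup: with no agent the nil-safe &. chain yields nil, which ||= will simply retry on the next call, and the explicit trailing @public_ip returns the cached-or-nil value. A usage sketch; any Mechanize agent works here:

  require "mechanize"

  agent = Mechanize.new  # normally the gem's own configured agent would be used
  ScraperUtils::MechanizeUtils.public_ip(agent)               # fetches and caches
  ScraperUtils::MechanizeUtils.public_ip                      # cached value (or nil)
  ScraperUtils::MechanizeUtils.public_ip(agent, force: true)  # clear cache, re-fetch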
data/lib/scraper_utils/randomize_utils.rb
ADDED

@@ -0,0 +1,34 @@
+# frozen_string_literal: true
+
+module ScraperUtils
+  # Provides utilities for randomizing processing order in scrapers,
+  # particularly helpful for distributing load and avoiding predictable patterns
+  module RandomizeUtils
+    # Returns a randomized version of the input collection when in production mode,
+    # or the original collection when in test/sequential mode
+    #
+    # @param collection [Array, Enumerable] Collection of items to potentially randomize
+    # @return [Array] Randomized or original collection depending on environment
+    def self.randomize_order(collection)
+      return collection.to_a if sequential?
+
+      collection.to_a.shuffle
+    end
+
+    # Checks if sequential processing is enabled
+    #
+    # @return [Boolean] true when in test mode or MORPH_PROCESS_SEQUENTIALLY is set
+    def self.sequential?
+      @sequential = !ENV["MORPH_PROCESS_SEQUENTIALLY"].to_s.empty? if @sequential.nil?
+      @sequential || false
+    end
+
+    # Explicitly set sequential mode for testing
+    #
+    # @param value [Boolean, nil] true to enable sequential mode, false to disable, nil to clear cache
+    # @return [Boolean, nil]
+    def self.sequential=(value)
+      @sequential = value
+    end
+  end
+end
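RandomizeUtils is new in this release: production runs get a shuffled work order, while tests (or any run with MORPH_PROCESS_SEQUENTIALLY set) stay deterministic. A usage sketch with an invented list:

  authorities = %i[alpha beta gamma]

  ScraperUtils::RandomizeUtils.randomize_order(authorities) # e.g. [:gamma, :alpha, :beta]

  ScraperUtils::RandomizeUtils.sequential = true            # force deterministic order
  ScraperUtils::RandomizeUtils.randomize_order(authorities) # => [:alpha, :beta, :gamma]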
data/lib/scraper_utils/robots_checker.rb
CHANGED

@@ -14,8 +14,10 @@ module ScraperUtils
     # * Crawl-delay from either User-agent: bot name or * (default)
     def initialize(user_agent)
       @user_agent = extract_user_agent(user_agent).downcase
-      if …
-        ScraperUtils::FiberScheduler.log …
+      if DebugUtils.basic?
+        ScraperUtils::FiberScheduler.log(
+          "Checking robots.txt for user agent prefix: #{@user_agent} (case insensitive)"
+        )
       end
       @rules = {} # domain -> {rules: [], delay: int}
       @delay = nil # Delay from last robots.txt check
@@ -73,7 +75,11 @@ module ScraperUtils
       @rules[domain] = rules
       rules
     rescue StandardError => e
-      …
+      if DebugUtils.basic?
+        ScraperUtils::FiberScheduler.log(
+          "WARNING: Failed to fetch robots.txt for #{domain}: #{e.message}"
+        )
+      end
       nil
     end
   end
@@ -141,4 +147,3 @@ module ScraperUtils
     end
   end
 end
-
    
        data/lib/scraper_utils.rb
    CHANGED
    
@@ -5,8 +5,11 @@ require "scraper_utils/authority_utils"
 require "scraper_utils/data_quality_monitor"
 require "scraper_utils/db_utils"
 require "scraper_utils/debug_utils"
+require "scraper_utils/fiber_scheduler"
 require "scraper_utils/log_utils"
+require "scraper_utils/mechanize_utils/agent_config"
 require "scraper_utils/mechanize_utils"
+require "scraper_utils/randomize_utils"
 require "scraper_utils/robots_checker"
 require "scraper_utils/version"
@@ -15,9 +18,6 @@ module ScraperUtils
   # Constants for configuration on Morph.io
   AUSTRALIAN_PROXY_ENV_VAR = "MORPH_AUSTRALIAN_PROXY"

-  # Enable debug locally, not on morph.io
-  DEBUG_ENV_VAR = "DEBUG"
-
   # Fatal Error
   class Error < StandardError
   end
@@ -31,13 +31,6 @@ module ScraperUtils
   class UnprocessableRecord < Error
   end

-  # Check if debug mode is enabled
-  #
-  # @return [Boolean] Whether debug mode is active
-  def self.debug?
-    !ENV[DEBUG_ENV_VAR].to_s.empty?
-  end
-
   def self.australian_proxy
     ap = ENV[AUSTRALIAN_PROXY_ENV_VAR].to_s
     ap.empty? ? nil : ap
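With ScraperUtils.debug? and DEBUG_ENV_VAR removed, callers migrate to the tiered DebugUtils predicates used throughout this diff; only basic? and verbose? are visible here, so treat anything beyond them as an assumption:

  # Before (0.2.0):
  #   puts "details" if ScraperUtils.debug?
  #
  # After (0.4.0):
  puts "details"      if ScraperUtils::DebugUtils.basic?    # routine debug output
  puts "more details" if ScraperUtils::DebugUtils.verbose?  # chattier tier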
    
        metadata
    CHANGED
    
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: scraper_utils
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.4.0
 platform: ruby
 authors:
 - Ian Heggie
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2025-…
+date: 2025-03-03 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize
@@ -74,16 +74,21 @@ files:
 - SPECS.md
 - bin/console
 - bin/setup
+- docs/example_scrape_with_fibers.rb
+- docs/example_scraper.rb
 - lib/scraper_utils.rb
 - lib/scraper_utils/adaptive_delay.rb
 - lib/scraper_utils/authority_utils.rb
+- lib/scraper_utils/cycle_utils.rb
 - lib/scraper_utils/data_quality_monitor.rb
+- lib/scraper_utils/date_range_utils.rb
 - lib/scraper_utils/db_utils.rb
 - lib/scraper_utils/debug_utils.rb
 - lib/scraper_utils/fiber_scheduler.rb
 - lib/scraper_utils/log_utils.rb
 - lib/scraper_utils/mechanize_utils.rb
 - lib/scraper_utils/mechanize_utils/agent_config.rb
+- lib/scraper_utils/randomize_utils.rb
 - lib/scraper_utils/robots_checker.rb
 - lib/scraper_utils/version.rb
 - scraper_utils.gemspec