scraper_utils 0.5.1 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. checksums.yaml +4 -4
  2. data/.yardopts +5 -0
  3. data/CHANGELOG.md +7 -0
  4. data/GUIDELINES.md +2 -1
  5. data/Gemfile +1 -0
  6. data/IMPLEMENTATION.md +40 -0
  7. data/README.md +29 -23
  8. data/SPECS.md +13 -1
  9. data/bin/rspec +27 -0
  10. data/docs/example_scrape_with_fibers.rb +4 -4
  11. data/docs/fibers_and_threads.md +72 -0
  12. data/docs/getting_started.md +6 -6
  13. data/docs/interleaving_requests.md +7 -7
  14. data/docs/parallel_requests.md +138 -0
  15. data/docs/randomizing_requests.md +12 -8
  16. data/docs/reducing_server_load.md +6 -6
  17. data/lib/scraper_utils/data_quality_monitor.rb +2 -3
  18. data/lib/scraper_utils/date_range_utils.rb +37 -78
  19. data/lib/scraper_utils/debug_utils.rb +5 -5
  20. data/lib/scraper_utils/log_utils.rb +15 -0
  21. data/lib/scraper_utils/mechanize_actions.rb +37 -8
  22. data/lib/scraper_utils/mechanize_utils/adaptive_delay.rb +79 -0
  23. data/lib/scraper_utils/mechanize_utils/agent_config.rb +31 -30
  24. data/lib/scraper_utils/mechanize_utils/robots_checker.rb +151 -0
  25. data/lib/scraper_utils/mechanize_utils.rb +8 -5
  26. data/lib/scraper_utils/randomize_utils.rb +22 -19
  27. data/lib/scraper_utils/scheduler/constants.rb +12 -0
  28. data/lib/scraper_utils/scheduler/operation_registry.rb +101 -0
  29. data/lib/scraper_utils/scheduler/operation_worker.rb +199 -0
  30. data/lib/scraper_utils/scheduler/process_request.rb +59 -0
  31. data/lib/scraper_utils/scheduler/thread_request.rb +51 -0
  32. data/lib/scraper_utils/scheduler/thread_response.rb +59 -0
  33. data/lib/scraper_utils/scheduler.rb +286 -0
  34. data/lib/scraper_utils/version.rb +1 -1
  35. data/lib/scraper_utils.rb +11 -14
  36. metadata +16 -6
  37. data/lib/scraper_utils/adaptive_delay.rb +0 -70
  38. data/lib/scraper_utils/fiber_scheduler.rb +0 -229
  39. data/lib/scraper_utils/robots_checker.rb +0 -149
data/lib/scraper_utils/mechanize_utils/robots_checker.rb
@@ -0,0 +1,151 @@
+ # frozen_string_literal: true
+
+ module ScraperUtils
+   module MechanizeUtils
+     # robots.txt checker with deliberately simplistic rules
+     class RobotsChecker
+       # @return [String] Lowercased user_agent for matching
+       attr_reader :user_agent
+
+       # Initialize with a full user agent string like:
+       # "Mozilla/5.0 (compatible; ScraperUtils/0.1.0 2025-02-22; +https://github.com/ianheggie-oaf/scraper_utils)"
+       # Extracts the bot name (e.g. "ScraperUtils") to check against robots.txt
+       # Checks for
+       # * Disallow for User-agent: bot_name and
+       # * Crawl-delay from either User-agent: bot name or * (default)
+       def initialize(user_agent)
+         @user_agent = extract_user_agent(user_agent).downcase
+         if DebugUtils.basic?
+           ScraperUtils::LogUtils.log(
+             "Checking robots.txt for user agent prefix: #{@user_agent} (case insensitive)"
+           )
+         end
+         @rules = {} # domain -> { rules: [], delay: int }
+         @delay = nil # Delay from the last robots.txt check
+       end
+
+       # Check if a URL is disallowed based on robots.txt rules specific to our user agent
+       # @param url [String] The full URL to check
+       # @return [Boolean] true if specifically blocked for our user agent, otherwise false
+       def disallowed?(url)
+         return false unless url
+
+         uri = URI(url)
+         domain = "#{uri.scheme}://#{uri.host}"
+         path = uri.path || "/"
+
+         # Get or fetch robots.txt rules
+         rules = get_rules(domain)
+         return false unless rules # If we can't get robots.txt, assume allowed
+
+         # Store any delay found for this domain
+         @delay = rules[:our_delay]
+
+         # Check rules specific to our user agent
+         matches_any_rule?(path, rules[:our_rules])
+       end
+
+       # Returns the crawl delay (if any) that applied to the last URL checked
+       # Should be called after disallowed? to get the relevant delay
+       # @return [Integer, nil] The delay in seconds, or nil if no delay is specified
+       def crawl_delay
+         @delay
+       end
+
+       private
+
+       def extract_user_agent(user_agent)
+         if user_agent =~ /^(.*compatible;\s*)?([-_a-z0-9]+)/i
+           user_agent = ::Regexp.last_match(2)&.strip
+         end
+         user_agent&.strip
+       end
+
+       def matches_any_rule?(path, rules)
+         rules&.any? { |rule| path.start_with?(rule) }
+       end
+
+       def get_rules(domain)
+         return @rules[domain] if @rules.key?(domain)
+
+         begin
+           response = Net::HTTP.get_response(URI("#{domain}/robots.txt"))
+           return nil unless response.code.start_with?("2") # 2xx response
+
+           rules = parse_robots_txt(response.body)
+           @rules[domain] = rules
+           rules
+         rescue StandardError => e
+           if DebugUtils.basic?
+             ScraperUtils::LogUtils.log(
+               "WARNING: Failed to fetch robots.txt for #{domain}: #{e.message}"
+             )
+           end
+           nil
+         end
+       end
+
+       # Parse robots.txt content into structured rules
+       # Only collects rules for our specific user agent and the generic crawl-delay
+       # @param content [String] The robots.txt content
+       # @return [Hash] Hash containing :our_rules and :our_delay
+       def parse_robots_txt(content)
+         sections = [] # Array of { agents: [], rules: [], delay: } hashes
+         current_section = nil
+
+         content.each_line do |line|
+           line = line.strip.downcase
+           next if line.empty? || line.start_with?("#")
+
+           if line.start_with?("user-agent:")
+             agent = line.split(":", 2).last.strip
+             # Check if this is a continuation of the previous section
+             if current_section && current_section[:rules].empty? && current_section[:delay].nil?
+               current_section[:agents] << agent
+             else
+               current_section = { agents: [agent], rules: [], delay: nil }
+               sections << current_section
+             end
+             next
+           end
+
+           next unless current_section # Skip rules before the first user-agent
+
+           if line.start_with?("disallow:")
+             path = line.split(":", 2).last.strip
+             current_section[:rules] << path unless path.empty?
+           elsif line.start_with?("crawl-delay:")
+             delay = line.split(":", 2).last.strip.to_i
+             current_section[:delay] = delay if delay.positive?
+           end
+         end
+
+         # Find the first section whose agent matches ours in either direction
+         # (handles "scraperutils" matching "scraperutils/1.0" and vice versa)
+         matched_section = sections.find do |section|
+           section[:agents].any? do |agent|
+             @user_agent.start_with?(agent) ||
+               agent.start_with?(@user_agent)
+           end
+         end
+
+         # Use the matched section or fall back to the wildcard default
+         if matched_section
+           {
+             our_rules: matched_section[:rules],
+             our_delay: matched_section[:delay]
+           }
+         else
+           # Find the default section
+           default_section = sections.find { |s| s[:agents].include?("*") }
+           {
+             our_rules: [],
+             our_delay: default_section&.dig(:delay)
+           }
+         end
+       end
+     end
+   end
+ end
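
The checker is meant to be consulted before each fetch: disallowed? caches the parsed rules per domain, and crawl_delay reports the delay that applied to the last URL checked. A minimal usage sketch (illustrative only; the user-agent string and URL are examples, not from the gem's docs):

  checker = ScraperUtils::MechanizeUtils::RobotsChecker.new(
    "Mozilla/5.0 (compatible; ScraperUtils/0.6.0; +https://github.com/ianheggie-oaf/scraper_utils)"
  )
  url = "https://example.com/planning/applications" # hypothetical URL
  unless checker.disallowed?(url)
    sleep(checker.crawl_delay || 0) # honour any Crawl-delay that applied to that check
    # ... fetch the page ...
  end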
data/lib/scraper_utils/mechanize_utils.rb
@@ -2,7 +2,10 @@
 
  require "mechanize"
  require "ipaddr"
- require "scraper_utils/mechanize_utils/agent_config"
+
+ require_relative "mechanize_utils/adaptive_delay"
+ require_relative "mechanize_utils/agent_config"
+ require_relative "mechanize_utils/robots_checker"
 
  module ScraperUtils
    # Utilities for configuring and using Mechanize for web scraping
@@ -43,8 +46,8 @@ module ScraperUtils
 
    # Retrieves and logs the public IP address
    #
-   # @param agent [Mechanize, nil] Mechanize agent to use for IP lookup or nil when clearing cache
-   # @param force [Boolean] Force a new IP lookup, by clearing cache first
+   # @param agent [Mechanize, nil] Mechanize agent to use to find the public IP, or nil when clearing the cache
+   # @param force [Boolean] Force a fresh find by clearing the cache first
    # @return [String, nil] The public IP address
    def self.public_ip(agent = nil, force: false)
      @public_ip = nil if force
@@ -57,8 +60,8 @@ module ScraperUtils
 
    # Retrieves and logs the headers that make it through the proxy
    #
-   # @param agent [Mechanize, nil] Mechanize agent to use for IP lookup or nil when clearing cache
-   # @param force [Boolean] Force a new IP lookup, by clearing cache first
+   # @param agent [Mechanize, nil] Mechanize agent to use to find the headers, or nil when clearing the cache
+   # @param force [Boolean] Force a fresh find by clearing the cache first
    # @return [String, nil] The list of headers in JSON format
    def self.public_headers(agent = nil, force: false)
      @public_headers = nil if force
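
As documented above, both helpers cache their result until force: true clears it. A hedged sketch of that behaviour (assuming the cached value is returned when no lookup is forced):

  agent = Mechanize.new
  ip    = ScraperUtils::MechanizeUtils.public_ip(agent)              # looked up once, then cached
  again = ScraperUtils::MechanizeUtils.public_ip                     # returns the cached value
  fresh = ScraperUtils::MechanizeUtils.public_ip(agent, force: true) # clears the cache, looks up again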
data/lib/scraper_utils/randomize_utils.rb
@@ -4,31 +4,34 @@ module ScraperUtils
   # Provides utilities for randomizing processing order in scrapers,
   # particularly helpful for distributing load and avoiding predictable patterns
   module RandomizeUtils
-   # Returns a randomized version of the input collection when in production mode,
-   # or the original collection when in test/sequential mode
-   #
-   # @param collection [Array, Enumerable] Collection of items to potentially randomize
-   # @return [Array] Randomized or original collection depending on environment
-   def self.randomize_order(collection)
-     return collection.to_a if sequential?
+   class << self
+     # Controls if processing order can be randomized
+     #
+     # @return [Boolean] true if processing order can be randomized, otherwise false
+     # @note Defaults to true unless the MORPH_DISABLE_RANDOM ENV variable is set
+     attr_accessor :random
 
-     collection.to_a.shuffle
+     # Reports if processing order will be randomized
+     #
+     # @return (see #random)
+     alias random? random
    end
 
-   # Checks if sequential processing is enabled
-   #
-   # @return [Boolean] true when in test mode or MORPH_PROCESS_SEQUENTIALLY is set
-   def self.sequential?
-     @sequential = !ENV["MORPH_PROCESS_SEQUENTIALLY"].to_s.empty? if @sequential.nil?
-     @sequential || false
+   def self.reset!
+     @random = ENV["MORPH_DISABLE_RANDOM"].to_s.empty?
    end
 
-   # Explicitly set sequential mode for testing
+   # reset on class load
+   reset!
+
+   # Returns a randomized version of the input collection unless {.random?} is false
    #
-   # @param value [Boolean, nil] true to enable sequential mode, false to disable, nil to clear cache
-   # @return [Boolean, nil]
-   def self.sequential=(value)
-     @sequential = value
+   # @param collection [Array, Enumerable] Collection of items
+   # @return [Array] Randomized when {.random?} is true, otherwise in the original order
+   def self.randomize_order(collection)
+     return collection.to_a.shuffle if random?
+
+     collection.to_a
    end
  end
 end
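
The net effect of the new API, sketched briefly (the test usage is an assumption, not taken from the gem's specs):

  ScraperUtils::RandomizeUtils.randomize_order([1, 2, 3]) # shuffled unless MORPH_DISABLE_RANDOM is set

  ScraperUtils::RandomizeUtils.random = false             # e.g. in a test setup block
  ScraperUtils::RandomizeUtils.randomize_order([1, 2, 3]) # => [1, 2, 3], original order
  ScraperUtils::RandomizeUtils.reset!                     # back to the ENV-derived default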
data/lib/scraper_utils/scheduler/constants.rb
@@ -0,0 +1,12 @@
+ module ScraperUtils
+   module Scheduler
+     module Constants
+       MAIN_FIBER = Fiber.current
+
+       # @!group Scheduler defaults
+       DEFAULT_MAX_WORKERS = 50
+       DEFAULT_TIMEOUT = 6 * 60 * 60 # 6 hours
+       POLL_PERIOD = 0.01
+     end
+   end
+ end
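
Note that MAIN_FIBER is captured when this file is first required, so the gem needs to be loaded from the main fiber. A sketch of how scheduler code can consult these constants (assumed usage, for orientation only):

  on_main  = Fiber.current == ScraperUtils::Scheduler::Constants::MAIN_FIBER
  deadline = Time.now + ScraperUtils::Scheduler::Constants::DEFAULT_TIMEOUT # 6 hours out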
data/lib/scraper_utils/scheduler/operation_registry.rb
@@ -0,0 +1,101 @@
+ # frozen_string_literal: true
+
+ require "fiber"
+
+ require_relative "operation_worker"
+
+ module ScraperUtils
+   module Scheduler
+     # Registry of all active OperationWorkers registered to be processed
+     class OperationRegistry
+
+       def initialize
+         @operations = {}
+         @fiber_ids = {}
+       end
+
+       def register(fiber, authority)
+         authority = authority.to_sym
+         operation = OperationWorker.new(fiber, authority, @response_queue)
+         @operations[authority] = operation
+         @fiber_ids[operation.fiber.object_id] = operation
+       end
+
+       # Remove yourself from the registry, called from the worker fiber
+       def deregister
+         operation = find
+         return unless operation
+
+         operation.close
+         # Remove the operation from the registry since close has done all it can to shut down the thread and fiber
+         @operations.delete(operation.authority)
+         @fiber_ids.delete(operation.fiber.object_id)
+       end
+
+       def current_authority
+         find(Fiber.current.object_id)&.authority
+       end
+
+       # Find an OperationWorker
+       # @param key [Integer, Symbol, nil] Fiber's object_id or authority (defaults to the current fiber's object_id)
+       # @return [OperationWorker, nil] Returns the worker, or nil if not found
+       def find(key = nil)
+         key ||= Fiber.current.object_id
+         if key.is_a?(Symbol)
+           @operations[key]
+         elsif key.is_a?(Integer)
+           @fiber_ids[key]
+         end
+       end
+
+       # Shuts down all registered operations
+       def shutdown
+         operations.each do |_key, operation|
+           operation.shutdown
+         end
+       end
+
+       # Returns true if there are no registered operations
+       def empty?
+         @operations.empty?
+       end
+
+       # Returns the number of registered operations
+       def size
+         @operations.size
+       end
+
+       # Find operations that can be resumed, in resume_at order (may include future resume_at)
+       #
+       # @return [Array<OperationWorker>] Operations that are alive and have a response to use with resume
+       def can_resume
+         @operations
+           .values
+           .select { |op| op.can_resume? }
+           .sort_by(&:resume_at)
+       end
+
+       # Clean up dead fibers that haven't removed themselves, so we don't loop forever
+       def cleanup_zombies
+         dead_operations = @operations.values.reject(&:alive?)
+
+         dead_operations.each do |operation|
+           LogUtils.log "WARNING: removing dead operation for #{operation.authority} - it should have cleaned up after itself!"
+           operation.shutdown
+           @operations.delete(operation.authority)
+           @fiber_ids.delete(operation.fiber.object_id)
+         end
+       end
+
+       # Save the thread response on the associated worker and mark that it can continue
+       def process_thread_response(response)
+         operation = find(response.authority)
+         operation&.save_thread_response response
+       end
+
+       private
+
+       attr_accessor :operations
+     end
+   end
+ end
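
find accepts both key types, which the following sketch illustrates (the fiber and authority here are placeholders):

  registry = ScraperUtils::Scheduler::OperationRegistry.new
  fiber = Fiber.new { |response| :done }        # placeholder worker fiber
  registry.register(fiber, "example_authority") # stored under :example_authority

  registry.find(:example_authority) # lookup by authority Symbol
  registry.find(fiber.object_id)    # lookup by the fiber's object_id
  registry.size                     # => 1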
data/lib/scraper_utils/scheduler/operation_worker.rb
@@ -0,0 +1,199 @@
+ # frozen_string_literal: true
+
+ require_relative "constants"
+ require_relative "process_request"
+
+ module ScraperUtils
+   module Scheduler
+     # Handles the processing of a registered operation and the associated fiber and thread state
+     class OperationWorker
+
+       class NotReadyError < RuntimeError; end
+
+       # @return [Fiber] The fiber
+       attr_reader :fiber
+
+       # @return [Symbol] The authority name associated with this fiber
+       attr_reader :authority
+
+       # @return [Time] When the fiber should be delayed till / ready to resume at
+       attr_accessor :resume_at
+
+       # @return [ThreadResponse, nil] The response to be passed on the next resume
+       attr_accessor :response
+
+       # @return [Boolean] Waiting for a response
+       attr_reader :waiting_for_response
+
+       # @return [Thread] Thread used
+       attr_reader :thread
+
+       # @return [Thread::Queue] The request queue for the thread
+       attr_reader :request_queue
+
+       def self.next_resume_at
+         @next_resume_at = [@next_resume_at, Time.now - 0.001].compact.max + 0.001
+       end
+
+       # Fiber has not finished running
+       def alive?
+         fiber.alive?
+       end
+
+       # Worker has the necessary state to be resumed
+       def can_resume?
+         !@response.nil? && !@resume_at.nil? && alive?
+       end
+
+       # Save the thread response, from the main or worker fiber
+       def save_thread_response(response)
+         raise "#{authority} wasn't waiting for a response! Got: #{response.inspect}" unless @waiting_for_response
+
+         @response = response
+         @waiting_for_response = false
+         @resume_at = [response&.delay_till, Time.now].compact.max
+         if DebugUtils.basic?
+           log "Received #{response&.class&.name || 'nil response'} from thread for fiber #{authority} in #{response&.time_taken&.round(3)}s"
+         end
+         response
+       end
+
+       # Close resources from the worker fiber
+       # Called by the worker fiber just before it exits
+       def close
+         validate_fiber(main: false)
+         # Signal the thread to finish processing, then wait for it
+         @request_queue&.close
+         @thread&.join(60)
+         # Drop references for GC
+         @request_queue = nil
+         @thread = nil
+         # Make can_resume? false
+         clear_resume_state
+       end
+
+       # ===================================================
+       # @!group Main Fiber API
+
+       # Initialize a new worker fiber and thread, called from the main fiber
+       #
+       # The thread executes ThreadRequest objects from the request_queue and pushes
+       # responses to the global response_queue.
+       #
+       # @param fiber [Fiber] Fiber to process the authority block
+       # @param authority [Symbol] Authority label
+       # @param response_queue [Thread::Queue, nil] Queue for thread responses if enabled
+       def initialize(fiber, authority, response_queue)
+         raise(ArgumentError, "Fiber and Authority must be provided") unless fiber && authority
+         validate_fiber(main: true)
+
+         @fiber = fiber
+         @authority = authority
+         @response_queue = response_queue
+         @fiber.instance_variable_set(:@operation_worker, self)
+         if response_queue
+           @request_queue = Thread::Queue.new
+           @thread = Thread.new do
+             Thread.current[:current_authority] = authority
+             while (request = @request_queue&.pop)
+               @response_queue.push request.execute
+             end
+           end
+         end
+         @resume_at = self.class.next_resume_at
+         @waiting_for_response = false
+         # The first resume response is ignored
+         @response = true
+       end
+
+       # Resume the operation's fiber from the main fiber, queuing any request it returns
+       #
+       # @return [ThreadRequest, nil] request returned by resume, or nil if finished
+       def resume
+         raise ClosedQueueError unless alive?
+         raise NotReadyError, "Cannot resume #{authority} without response!" unless @response
+         validate_fiber(main: true)
+
+         request = @fiber.resume(@response)
+         # Submit the next request for processing
+         submit_request(request) if request
+         request
+       end
+
+       # Shutdown the worker, called from the main fiber
+       def shutdown
+         validate_fiber(main: true)
+
+         clear_resume_state
+         if @fiber&.alive?
+           # Trigger the fiber to raise an error and thus call deregister
+           @fiber.resume(nil)
+         end
+       end
+
+       # ===================================================
+       # @!group Worker Fiber API
+
+       # Queue a thread request to be executed from the worker fiber,
+       # otherwise execute it locally if parallel processing is disabled
+       #
+       # Process flow if parallel processing is enabled:
+       # 1. This method:
+       #    a. pushes the request onto the local @request_queue
+       #    b. calls Fiber.yield(true) so the Scheduler can run other fibers
+       # 2. Meanwhile, this fiber's thread:
+       #    a. pops the request off the queue
+       #    b. processes the request
+       #    c. pushes the response onto the global response queue
+       # 3. Meanwhile, the Scheduler on the main fiber:
+       #    a. pops responses from the response queue as they arrive
+       #    b. calls {#save_thread_response} on the associated worker to save each response
+       #    c. calls {#resume} on the worker when it is its turn (based on resume_at) and it can_resume (has @response)
+       #
+       # If parallel processing is not enabled, then the processing occurs in the worker's fiber
+       #
+       # @param request [ThreadRequest] The request to be processed in the thread
+       def submit_request(request)
+         raise NotReadyError, "Cannot make a second request before the first has responded!" if @waiting_for_response
+         raise ArgumentError, "Must be passed a valid ThreadRequest! Got: #{request.inspect}" unless request.is_a? ThreadRequest
+         validate_fiber(main: false)
+
+         @response = nil
+         @waiting_for_response = true
+         if @request_queue
+           @request_queue&.push request
+           response = Fiber.yield true
+           raise "Terminated fiber for #{authority} as requested" unless response
+         else
+           response = save_thread_response request.execute
+         end
+         response
+       end
+
+       private
+
+       def validate_fiber(main: false)
+         required_fiber = main ? Constants::MAIN_FIBER : @fiber
+         current_id = Fiber.current.object_id
+         return if current_id == required_fiber.object_id
+
+         desc = main ? "main" : "worker"
+         we_are = if current_id == Constants::MAIN_FIBER.object_id
+                    "main"
+                  elsif current_id == @fiber.object_id
+                    "worker"
+                  else
+                    "other"
+                  end
+         raise ArgumentError,
+               "Must be run within the #{desc} fiber, not the #{we_are} fiber!"
+       end
+
+       # Clear the resume state so the operation won't be resumed
+       def clear_resume_state
+         @resume_at = nil
+         @response = nil
+         @waiting_for_response = false
+       end
+     end
+   end
+ end
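
One detail worth noting from the code above: next_resume_at hands out strictly increasing times roughly 1ms apart, so newly registered workers are initially resumed in registration order:

  t1 = ScraperUtils::Scheduler::OperationWorker.next_resume_at
  t2 = ScraperUtils::Scheduler::OperationWorker.next_resume_at
  t2 > t1 # => true; t2 is at least 0.001s after t1, and never in the past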
data/lib/scraper_utils/scheduler/process_request.rb
@@ -0,0 +1,59 @@
+ # frozen_string_literal: true
+
+ require_relative "thread_request"
+
+ module ScraperUtils
+   module Scheduler
+     # Encapsulates a request to be executed, usually asynchronously by the worker's thread
+     class ProcessRequest < ThreadRequest
+       # @return [Object] The object to call the method on
+       attr_reader :subject
+
+       # @return [Symbol] The method to call on the subject
+       attr_reader :method_name
+
+       # @return [Array] The arguments to pass to the method
+       attr_reader :args
+
+       # Initialize a new async request
+       #
+       # @param authority [Symbol, nil] Authority for correlating requests and responses;
+       #   nil is used when threads are disabled, to process locally without duplicating code
+       # @param subject [Object] The object to call the method on
+       # @param method_name [Symbol] The method to call on the subject
+       # @param args [Array] The arguments to pass to the method
+       # @raise [ArgumentError] If any required parameter is missing or invalid
+       def initialize(authority, subject, method_name, args)
+         super(authority)
+         @subject = subject
+         @method_name = method_name
+         @args = args
+
+         validate!
+       end
+
+       # Execute the request by calling the method on the subject
+       # If the subject has an instance variable @delay_till then that is added to the response
+       # @return [ThreadResponse] The result of the request
+       def execute
+         result = execute_block do
+           subject.send(method_name, *args)
+         end
+         result.delay_till = subject.instance_variable_get(:@delay_till)
+         result
+       end
+
+       private
+
+       # Validate that all required parameters are present and valid
+       #
+       # @raise [ArgumentError] If any parameter is missing or invalid
+       def validate!
+         raise ArgumentError, "Subject must be provided" unless @subject
+         raise ArgumentError, "Method name must be provided" unless @method_name
+         raise ArgumentError, "Args must be an array" unless @args.is_a?(Array)
+         raise ArgumentError, "Subject must respond to method" unless @subject&.respond_to?(@method_name)
+       end
+     end
+   end
+ end
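
A sketch of what a ProcessRequest wraps (the agent, authority and URL here are illustrative):

  # Wrap agent.get(url) so the worker's thread can execute it
  request = ScraperUtils::Scheduler::ProcessRequest.new(
    :example_authority, agent, :get, ["https://example.com/"]
  )
  response = request.execute # => ThreadResponse carrying the result or the captured error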
data/lib/scraper_utils/scheduler/thread_request.rb
@@ -0,0 +1,51 @@
+ # frozen_string_literal: true
+
+ require_relative "thread_response"
+
+ module ScraperUtils
+   module Scheduler
+     # Encapsulates a request pushed onto the fiber's request queue to be executed by the fiber's thread.
+     # The response is returned via the Scheduler's response queue.
+     # @see ProcessRequest
+     class ThreadRequest
+       # @return [Symbol] Authority for correlating requests and responses
+       attr_reader :authority
+
+       # Initialize a new thread request
+       #
+       # @param authority [Symbol, nil] Authority for correlating requests and responses
+       def initialize(authority)
+         @authority = authority
+       end
+
+       # Execute the request and return a ThreadResponse - subclasses should use the helper method {#execute_block}
+       def execute
+         raise NotImplementedError, "Implement in subclass"
+       end
+
+       # Execute a request by calling the block
+       # @return [ThreadResponse] The result of the request
+       def execute_block
+         start_time = Time.now
+         begin
+           result = yield
+           elapsed_time = Time.now - start_time
+           ThreadResponse.new(
+             authority,
+             result,
+             nil,
+             elapsed_time
+           )
+         rescue StandardError => e
+           elapsed_time = Time.now - start_time
+           ThreadResponse.new(
+             authority,
+             nil,
+             e,
+             elapsed_time
+           )
+         end
+       end
+     end
+   end
+ end
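
execute_block implies the pattern subclasses should follow; a minimal hypothetical subclass for illustration:

  # Hypothetical subclass - not part of the gem
  class SleepRequest < ScraperUtils::Scheduler::ThreadRequest
    def initialize(authority, seconds)
      super(authority)
      @seconds = seconds
    end

    def execute
      execute_block { sleep(@seconds) } # => ThreadResponse with time_taken ~= @seconds
    end
  end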