crawlr 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,232 @@
+ # frozen_string_literal: true
+
+ module Crawlr
+   # Configuration management class for Crawlr scraping sessions.
+   #
+   # The Config class centralizes all configuration options for the Crawlr framework,
+   # providing sensible defaults while allowing extensive customization of scraping
+   # behavior, networking settings, error handling, and crawling policies.
+   #
+   # @example Basic configuration
+   #   config = Crawlr::Config.new(
+   #     timeout: 15,
+   #     max_depth: 3,
+   #     max_parallelism: 5
+   #   )
+   #
+   # @example Advanced configuration with domain filtering and retries
+   #   config = Crawlr::Config.new(
+   #     allowed_domains: ['example.com', 'api.example.com'],
+   #     max_retries: 3,
+   #     retry_delay: 2.0,
+   #     retry_backoff: 1.5,
+   #     random_delay: 1.0,
+   #     allow_cookies: true,
+   #     ignore_robots_txt: false
+   #   )
+   #
+   # @example Proxy configuration
+   #   config = Crawlr::Config.new(
+   #     proxies: ['proxy1.com:8080', 'proxy2.com:8080'],
+   #     proxy_strategy: :random,
+   #     max_parallelism: 10
+   #   )
+   #
+   # @author [Your Name]
+   # @since 0.1.0
+   class Config
+     # @return [Integer] HTTP request timeout in seconds
+     # @return [Hash<String, String>] Default HTTP headers for all requests
+     # @return [Array<String>] Glob patterns for allowed domains
+     # @return [Array<String>] Explicit list of allowed domains
+     # @return [Boolean] Whether to enable cookie handling
+     # @return [Integer] Maximum crawling depth (0 for unlimited)
+     # @return [Float] Maximum random delay between requests in seconds
+     # @return [Integer] Maximum number of concurrent requests
+     # @return [Boolean] Whether to allow revisiting previously scraped URLs
+     # @return [Integer, nil] Maximum number of retry attempts (nil to disable)
+     # @return [Float] Base delay between retry attempts in seconds
+     # @return [Float] Exponential backoff multiplier for retry delays
+     # @return [Array<Class>] List of exception classes that trigger retries
+     # @return [Integer] Maximum number of URLs to track in visit history
+     # @return [Array<String>] List of proxy server addresses
+     # @return [Symbol] Strategy for selecting proxies (:round_robin, :random)
+     # @return [Boolean] Whether to ignore robots.txt restrictions
+     attr_accessor :timeout, :headers, :domain_glob, :allowed_domains, :allow_cookies,
+                   :max_depth, :random_delay, :max_parallelism, :allow_url_revisit,
+                   :max_retries, :retry_delay, :retry_backoff, :retryable_errors,
+                   :max_visited, :proxies, :proxy_strategy, :ignore_robots_txt
+
+     # Initializes a new Config instance with the provided options
+     #
+     # @param options [Hash] Configuration options hash
+     # @option options [Integer] :timeout (10) HTTP request timeout in seconds
+     # @option options [Hash<String, String>] :default_headers Default HTTP headers
+     # @option options [Array<String>] :allowed_domains ([]) Explicit list of allowed domains
+     # @option options [Array<String>] :domain_glob ([]) Glob patterns for domain filtering
+     # @option options [Boolean] :allow_cookies (false) Enable cookie handling
+     # @option options [Integer] :max_depth (0) Maximum crawling depth (0 = unlimited)
+     # @option options [Float] :random_delay (0) Maximum random delay between requests
+     # @option options [Integer] :max_parallelism (1) Maximum concurrent requests
+     # @option options [Boolean] :allow_url_revisit (false) Allow revisiting URLs
+     # @option options [Integer] :max_retries (0) Maximum retry attempts (0 = disabled)
+     # @option options [Float] :retry_delay (1.0) Base retry delay in seconds
+     # @option options [Float] :retry_backoff (2.0) Exponential backoff multiplier
+     # @option options [Array<Class>] :retryable_errors Custom list of retryable exceptions
+     # @option options [Integer] :max_visited (10000) Maximum URLs to track in history
+     # @option options [Array<String>] :proxies ([]) List of proxy servers
+     # @option options [Symbol] :proxy_strategy (:round_robin) Proxy selection strategy
+     # @option options [Boolean] :ignore_robots_txt (false) Ignore robots.txt restrictions
+     #
+     # @raise [StandardError] When both :allowed_domains and :domain_glob are specified
+     #
+     # @example Minimal configuration
+     #   config = Crawlr::Config.new
+     #
+     # @example Timeout and parallelism configuration
+     #   config = Crawlr::Config.new(
+     #     timeout: 30,
+     #     max_parallelism: 8
+     #   )
+     #
+     # @example Domain filtering with explicit domains
+     #   config = Crawlr::Config.new(
+     #     allowed_domains: ['site1.com', 'api.site1.com']
+     #   )
+     #
+     # @example Domain filtering with glob patterns
+     #   config = Crawlr::Config.new(
+     #     domain_glob: ['*.example.com', '*.api.example.com']
+     #   )
+     #
+     # @example Retry configuration with custom errors
+     #   config = Crawlr::Config.new(
+     #     max_retries: 5,
+     #     retry_delay: 0.5,
+     #     retry_backoff: 1.5,
+     #     retryable_errors: [Timeout::Error, Net::ReadTimeout]
+     #   )
+     def initialize(options = {})
+       initialize_domain_settings(options)
+       initialize_parallelism_settings(options)
+       initialize_throttle_settings(options)
+       initialize_http_settings(options)
+       initialize_retry_settings(options)
+       initialize_visit_settings(options)
+       initialize_proxy_settings(options)
+       initialize_robots_settings(options)
+
+       validate
+     end
+
+     # Converts the configuration to a hash representation
+     #
+     # This method is useful for serialization, debugging, or creating
+     # new Config instances with the same settings.
+     #
+     # @return [Hash<Symbol, Object>] Hash containing all configuration values
+     #
+     # @example
+     #   config = Crawlr::Config.new(timeout: 15, max_depth: 3)
+     #   hash = config.to_h
+     #   new_config = Crawlr::Config.new(hash)
+     #
+     # @example Inspect configuration
+     #   puts config.to_h.inspect
+     def to_h
+       attrs = %i[
+         timeout headers allowed_domains domain_glob allow_cookies max_depth
+         random_delay max_parallelism allow_url_revisit max_retries retry_delay
+         retry_backoff retryable_errors max_visited proxies proxy_strategy
+         ignore_robots_txt
+       ]
+
+       attrs.each_with_object({}) { |name, hash| hash[name] = instance_variable_get("@#{name}") }
+     end
+
+     private
+
+     def initialize_domain_settings(options)
+       @allowed_domains = Array(options[:allowed_domains])
+       @domain_glob = Array(options[:domain_glob])
+     end
+
+     def initialize_parallelism_settings(options)
+       @max_parallelism = options.fetch(:max_parallelism, 1)
+     end
+
+     def initialize_throttle_settings(options)
+       @random_delay = options.fetch(:random_delay, 0)
+     end
+
+     def initialize_http_settings(options)
+       @timeout = options.fetch(:timeout, 10)
+       @headers = options[:default_headers] || default_headers
+       @allow_cookies = options.fetch(:allow_cookies, false)
+       @max_depth = options.fetch(:max_depth, 0)
+     end
+
+     def initialize_retry_settings(options)
+       @max_retries = options[:max_retries]&.positive? ? options[:max_retries] : 0
+       @retry_delay = options.fetch(:retry_delay, 1.0)
+       @retry_backoff = options.fetch(:retry_backoff, 2.0)
+       @retryable_errors = options[:retryable_errors] || default_retryable_errors
+     end
+
+     def initialize_visit_settings(options)
+       @allow_url_revisit = options.fetch(:allow_url_revisit, false)
+       @max_visited = options.fetch(:max_visited, 10_000)
+     end
+
+     def initialize_proxy_settings(options)
+       @proxies = Array(options[:proxies])
+       @proxy_strategy = options.fetch(:proxy_strategy, :round_robin)
+     end
+
+     def initialize_robots_settings(options)
+       @ignore_robots_txt = options.fetch(:ignore_robots_txt, false)
+     end
+
+     # Returns the default HTTP headers for requests
+     #
+     # @return [Hash<String, String>] Default headers with User-Agent
+     # @api private
+     def default_headers
+       {
+         "User-Agent" => "Crawlr/#{Crawlr::VERSION}"
+       }
+     end
+
+     # Returns the default list of exceptions that should trigger retries
+     #
+     # These exceptions typically represent temporary network issues
+     # that may resolve on subsequent attempts.
+     #
+     # @return [Array<Class>] Array of exception classes for retry logic
+     # @api private
+     def default_retryable_errors
+       [
+         Async::TimeoutError,
+         Errno::ECONNREFUSED,
+         Errno::ECONNRESET,
+         Errno::EHOSTUNREACH,
+         Errno::ENETUNREACH,
+         SocketError
+       ]
+     end
+
+     # Validates the configuration for conflicting options
+     #
+     # Ensures that mutually exclusive configuration options are not
+     # specified simultaneously, which would create ambiguous behavior.
+     #
+     # @return [void]
+     # @raise [StandardError] When both allowed_domains and domain_glob are specified
+     # @api private
+     def validate
+       return unless !@allowed_domains.empty? && !@domain_glob.empty?
+
+       raise "Cannot specify both allowed_domains and domain_glob"
+     end
+   end
+ end
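
For orientation, a minimal usage sketch of the Config class documented above; this is not part of the package diff, and the require path and option values are assumptions for illustration:

  require "crawlr"  # assumed entry point for the gem

  # Build a configuration, overriding a few documented defaults.
  config = Crawlr::Config.new(
    timeout: 20,                      # documented default is 10 seconds
    max_parallelism: 4,
    max_retries: 3,
    retry_delay: 0.5,
    domain_glob: ["*.example.com"]    # mutually exclusive with allowed_domains; combining them raises
  )

  config.to_h[:timeout]  # => 20
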
@@ -0,0 +1,80 @@
+ # frozen_string_literal: true
+
+ module Crawlr
+   # The Context class holds metadata and shared data
+   # during a scraping session, such as URLs and crawl depth.
+   #
+   # It acts like a small key-value store (`@data`) and provides
+   # helper methods to manage depth and resolve relative URLs.
+   #
+   # @example Creating a new context
+   #   ctx = Crawlr::Context.new(base_url: "https://example.com")
+   #   ctx[:title] = "Home"
+   #   ctx.increment_depth
+   #   ctx.to_h
+   #   # => { base_url: "https://example.com", page_url: nil, current_depth: 1, title: "Home" }
+   #
+   class Context
+     # @return [String, nil] The base URL used for resolving relative links
+     # @return [String, nil] The current page URL
+     # @return [Integer] The current depth in the crawl hierarchy
+     attr_accessor :base_url, :page_url, :current_depth
+
+     # Create a new scraping context.
+     #
+     # @param [String, nil] base_url The root URL of the crawl
+     # @param [String, nil] page_url The current page URL
+     # @param [Integer] current_depth The crawl depth (default: 0)
+     def initialize(base_url: nil, page_url: nil, current_depth: 0)
+       @base_url = base_url
+       @page_url = page_url
+       @current_depth = current_depth
+       @data = {}
+     end
+
+     # Retrieve a stored value by key.
+     #
+     # @param [Symbol, String] key The key to fetch
+     # @return [Object, nil] The stored value, or nil if not found
+     def [](key)
+       @data[key]
+     end
+
+     # Assign a value to a key.
+     #
+     # @param [Symbol, String] key The key to set
+     # @param [Object] value The value to store
+     # @return [Object] The stored value
+     def []=(key, value)
+       @data[key] = value
+     end
+
+     # Convert the context to a Hash.
+     #
+     # Includes base_url, page_url, current_depth, and all stored data.
+     #
+     # @return [Hash] The full context data as a Hash
+     def to_h
+       {
+         base_url: @base_url,
+         page_url: @page_url,
+         current_depth: @current_depth
+       }.merge(@data)
+     end
+
+     # Increment the crawl depth by 1.
+     #
+     # @return [Integer] The updated depth value
+     def increment_depth
+       @current_depth += 1
+     end
+
+     # Resolve a relative URL using the base_url.
+     #
+     # @param [String] url The relative or absolute URL
+     # @return [String] The resolved absolute URL
+     def resolve_url(url)
+       URI.join(@base_url, url).to_s
+     end
+   end
+ end
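
A short illustrative sketch of the Context key-value store and URL resolution shown above (the values are placeholders, not package content):

  ctx = Crawlr::Context.new(base_url: "https://example.com", page_url: "https://example.com/blog")
  ctx[:links_found] = 12           # arbitrary per-page data stored in @data
  ctx.increment_depth              # => 1
  ctx.resolve_url("/about")        # => "https://example.com/about"
  ctx.to_h                         # includes base_url, page_url, current_depth and :links_found
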
@@ -0,0 +1,166 @@
+ # frozen_string_literal: true
+
+ module Crawlr
+   # Domain filtering and validation class for controlling scraping scope.
+   #
+   # The Domains class manages which domains are allowed to be scraped by
+   # implementing both explicit domain allowlists and glob pattern matching.
+   # It provides flexible domain filtering to restrict scraping to specific
+   # sites or domain patterns while normalizing domain names for consistent
+   # comparison.
+   #
+   # @example Allow specific domains
+   #   config = Crawlr::Config.new(
+   #     allowed_domains: ['example.com', 'api.example.com', 'subdomain.site.org']
+   #   )
+   #   domains = Crawlr::Domains.new(config)
+   #
+   #   domains.allowed?('https://example.com/page') #=> true
+   #   domains.allowed?('https://www.example.com/page') #=> true (www. stripped)
+   #   domains.allowed?('https://forbidden.com/page') #=> false
+   #
+   # @example Use glob patterns for flexible matching
+   #   config = Crawlr::Config.new(
+   #     domain_glob: ['*.example.com', '*.api.*.com', 'site?.org']
+   #   )
+   #   domains = Crawlr::Domains.new(config)
+   #
+   #   domains.allowed?('https://sub.example.com/path') #=> true
+   #   domains.allowed?('https://api.service.com/data') #=> true
+   #   domains.allowed?('https://site1.org/content') #=> true
+   #
+   # @example No restrictions (allow all domains)
+   #   config = Crawlr::Config.new # No domain restrictions
+   #   domains = Crawlr::Domains.new(config)
+   #
+   #   domains.allowed?('https://any-site.com') #=> true
+   #
+   # @author [Your Name]
+   # @since 0.1.0
+   class Domains
+     # Initializes a new Domains instance with the given configuration
+     #
+     # @param config [Crawlr::Config] Configuration object containing domain restrictions
+     #
+     # @example
+     #   config = Crawlr::Config.new(allowed_domains: ['site.com'])
+     #   domains = Crawlr::Domains.new(config)
+     def initialize(config)
+       @config = config
+       @allowed_domains = extract_allowed_domains(@config.allowed_domains)
+       @domain_glob = @config.domain_glob
+     end
+
+     # Checks if a URL is allowed based on configured domain restrictions
+     #
+     # The method performs the following checks in order:
+     # 1. If no restrictions are configured, allows all URLs
+     # 2. If glob patterns are configured, tests URL against each pattern
+     # 3. If explicit domains are configured, checks normalized domain name
+     # 4. Logs rejection for debugging purposes
+     #
+     # @param url [String] The URL to check for domain allowance
+     # @return [Boolean] true if the URL's domain is allowed, false otherwise
+     #
+     # @example With explicit domain allowlist
+     #   domains.allowed?('https://example.com/page') #=> true (if allowed)
+     #   domains.allowed?('https://www.example.com/page') #=> true (www. stripped)
+     #   domains.allowed?('https://subdomain.example.com') #=> false (unless explicitly allowed)
+     #
+     # @example With glob patterns
+     #   # config.domain_glob = ['*.example.com']
+     #   domains.allowed?('https://api.example.com') #=> true
+     #   domains.allowed?('https://cdn.example.com/asset') #=> true
+     #   domains.allowed?('https://other.com') #=> false
+     #
+     # @example No restrictions
+     #   # config.allowed_domains = [], config.domain_glob = []
+     #   domains.allowed?('https://any-domain.com') #=> true
+     def allowed?(url)
+       return true if @allowed_domains.empty? && @domain_glob.empty?
+
+       unless @domain_glob.empty?
+         @domain_glob.each do |glob|
+           return true if File.fnmatch?(glob, url)
+         end
+       end
+
+       uri = URI(url)
+       base_name = uri.host.sub("www.", "")
+       allowed = @allowed_domains.include?(base_name)
+
+       Crawlr.logger.info("URL not allowed: #{url}") unless allowed
+       allowed
+     end
+
+     # Returns statistics about the configured domain restrictions
+     #
+     # Provides metrics about the number of explicitly allowed domains
+     # and glob patterns configured for monitoring and debugging purposes.
+     #
+     # @return [Hash<Symbol, Integer>] Statistics hash containing domain counts
+     # @option return [Integer] :allowed_domains Number of explicitly allowed domains
+     # @option return [Integer] :domain_glob Number of configured glob patterns
+     #
+     # @example
+     #   stats = domains.domain_stats
+     #   puts "Allowing #{stats[:allowed_domains]} explicit domains"
+     #   puts "Using #{stats[:domain_glob]} glob patterns"
+     def domain_stats
+       {
+         allowed_domains: @allowed_domains.size,
+         domain_glob: @domain_glob.size
+       }
+     end
+
+     private
+
+     # Extracts and normalizes domain names from the configuration
+     #
+     # Processes the list of allowed domains by:
+     # 1. Handling nil/empty input gracefully
+     # 2. Normalizing each domain using base_domain method
+     # 3. Removing duplicates from the final list
+     #
+     # @param domains [Array<String>, nil] List of domain strings to process
+     # @return [Array<String>] Normalized, unique list of base domain names
+     # @api private
+     #
+     # @example
+     #   extract_allowed_domains(['https://www.example.com', 'api.example.com'])
+     #   #=> ['example.com', 'api.example.com']
+     def extract_allowed_domains(domains)
+       return [] if domains.nil? || domains.empty?
+
+       domains.map { |domain| base_domain(domain) }.uniq
+     end
+
+     # Normalizes a domain string to its base form for consistent comparison
+     #
+     # Performs the following normalization:
+     # 1. Parses as URI if it looks like a full URL
+     # 2. Ensures path is set to "/" if empty (for valid URI)
+     # 3. Extracts hostname and removes "www." prefix
+     # 4. Falls back to original string if URI parsing fails
+     #
+     # @param domain [String] Domain string or URL to normalize
+     # @return [String] Normalized base domain name without www prefix
+     # @api private
+     #
+     # @example URL normalization
+     #   base_domain('https://www.example.com/path') #=> 'example.com'
+     #   base_domain('http://api.site.org') #=> 'api.site.org'
+     #
+     # @example Domain name normalization
+     #   base_domain('www.example.com') #=> 'example.com'
+     #   base_domain('subdomain.example.com') #=> 'subdomain.example.com'
+     #
+     # @example Fallback behavior
+     #   base_domain('not-a-valid-uri') #=> 'not-a-valid-uri'
+     def base_domain(domain)
+       uri = URI(domain)
+       uri.path = "/" if uri.path.empty?
+       uri.host ? uri.host.sub("www.", "") : domain
+     end
+   end
+ end
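
An illustrative sketch of the explicit-allowlist path documented above (the URLs are placeholders):

  config  = Crawlr::Config.new(allowed_domains: ["example.com"])
  domains = Crawlr::Domains.new(config)

  domains.allowed?("https://www.example.com/page")  # => true; "www." is stripped before comparison
  domains.allowed?("https://other.com/page")        # => false; the rejection is logged via Crawlr.logger
  domains.domain_stats                              # => { allowed_domains: 1, domain_glob: 0 }
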
@@ -0,0 +1,161 @@
+ # frozen_string_literal: true
+
+ module Crawlr
+   # Event hook management system for scraping lifecycle customization.
+   #
+   # The Hooks class provides a flexible event-driven system that allows users
+   # to register custom behavior at specific points during the scraping process.
+   # It supports multiple hooks per event and validates event names to ensure
+   # consistency across the framework.
+   #
+   # @example Basic hook registration
+   #   hooks = Crawlr::Hooks.new
+   #
+   #   hooks.register(:before_visit) do |url, headers|
+   #     puts "About to visit: #{url}"
+   #     headers['X-Custom'] = 'value'
+   #   end
+   #
+   # @example Multiple hooks for the same event
+   #   hooks.register(:after_visit) do |url, response|
+   #     log_response_time(url, response)
+   #   end
+   #
+   #   hooks.register(:after_visit) do |url, response|
+   #     update_statistics(response.status)
+   #   end
+   #
+   # @example Error handling hooks
+   #   hooks.register(:on_error) do |url, error|
+   #     error_logger.warn("Failed to scrape #{url}: #{error.message}")
+   #     notify_monitoring_system(url, error)
+   #   end
+   #
+   # @author [Your Name]
+   # @since 0.1.0
+   class Hooks
+     # Supported lifecycle events for hook registration
+     #
+     # @return [Array<Symbol>] Array of valid event names
+     #   - `:before_visit` - Triggered before making HTTP request
+     #   - `:after_visit` - Triggered after receiving HTTP response
+     #   - `:on_error` - Triggered when an error occurs during scraping
+     ALLOWED_EVENTS = %i[before_visit after_visit on_error].freeze
+
+     # Initializes a new Hooks instance
+     #
+     # Creates an empty hook registry with auto-vivifying arrays for each event type.
+     #
+     # @example
+     #   hooks = Crawlr::Hooks.new
+     def initialize
+       @hooks = Hash.new { |h, k| h[k] = [] }
+     end
+
+     # Registers a hook for a specific scraping lifecycle event
+     #
+     # Hooks are executed in the order they were registered. Multiple hooks
+     # can be registered for the same event, and all will be executed when
+     # the event is triggered.
+     #
+     # @param event [Symbol] The lifecycle event to hook into
+     # @param block [Proc] The block to execute when the event occurs
+     # @yieldparam args [Array] Event-specific arguments passed to the hook
+     # @return [void]
+     # @raise [ArgumentError] When the event is not in ALLOWED_EVENTS
+     # @raise [ArgumentError] When no block is provided
+     #
+     # @example Before visit hook for request modification
+     #   register(:before_visit) do |url, headers|
+     #     headers['User-Agent'] = 'Custom Bot 1.0'
+     #     headers['Authorization'] = get_auth_token(url)
+     #   end
+     #
+     # @example After visit hook for response processing
+     #   register(:after_visit) do |url, response|
+     #     response_time = response.headers['X-Response-Time']
+     #     metrics.record_response_time(url, response_time)
+     #   end
+     #
+     # @example Error handling hook
+     #   register(:on_error) do |url, error|
+     #     if error.is_a?(Timeout::Error)
+     #       retry_queue.add(url, delay: 30)
+     #     end
+     #   end
+     def register(event, &block)
+       raise ArgumentError, "Invalid event #{event}" unless ALLOWED_EVENTS.include?(event)
+       raise ArgumentError, "Block required" unless block
+
+       @hooks[event] << block
+     end
+
+     # Triggers all registered hooks for a specific event
+     #
+     # Executes hooks in the order they were registered. If any hook raises
+     # an exception, it will be propagated and may prevent subsequent hooks
+     # from executing.
+     #
+     # @param event [Symbol] The event to trigger
+     # @param args [Array] Variable arguments to pass to the hook blocks
+     # @return [void]
+     # @raise [ArgumentError] When the event is not in ALLOWED_EVENTS
+     #
+     # @example Trigger before_visit hooks
+     #   trigger(:before_visit, 'https://example.com', headers_hash)
+     #
+     # @example Trigger after_visit hooks
+     #   trigger(:after_visit, 'https://example.com', response_object)
+     #
+     # @example Trigger error hooks
+     #   trigger(:on_error, 'https://example.com', exception_object)
+     def trigger(event, *args)
+       raise ArgumentError, "Invalid event #{event}" unless ALLOWED_EVENTS.include?(event)
+
+       @hooks[event].each { |blk| blk.call(*args) }
+     end
+
+     # Returns statistics about registered hooks
+     #
+     # Provides metrics about hook registration for monitoring, debugging,
+     # and ensuring expected hooks are properly configured.
+     #
+     # @return [Hash<Symbol, Object>] Statistics hash containing hook metrics
+     # @option return [Integer] :total_hooks Total number of registered hooks across all events
+     # @option return [Hash<Symbol, Integer>] :per_event Number of hooks per event type
+     #
+     # @example
+     #   stats = hooks.stats
+     #   puts "Total hooks: #{stats[:total_hooks]}"
+     #   puts "Before visit hooks: #{stats[:per_event][:before_visit]}"
+     #   puts "Error hooks: #{stats[:per_event][:on_error]}"
+     def stats
+       grouped = @hooks.transform_values(&:size)
+       { total_hooks: @hooks.values.flatten.size, per_event: grouped }
+     end
+
+     # Clears registered hooks for all events or a specific event
+     #
+     # Useful for testing, resetting hook configuration, or dynamically
+     # changing hook behavior during scraping sessions.
+     #
+     # @param event [Symbol, nil] Specific event to clear, or nil to clear all
+     # @return [void]
+     #
+     # @example Clear all hooks
+     #   hooks.clear
+     #
+     # @example Clear hooks for specific event
+     #   hooks.clear(:before_visit)
+     #
+     # @example Clear error hooks only
+     #   hooks.clear(:on_error)
+     def clear(event = nil)
+       if event
+         @hooks[event].clear
+       else
+         @hooks.clear
+       end
+     end
+   end
+ end
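
A minimal sketch of hook registration and triggering based on the API documented above (the block bodies are placeholders):

  hooks = Crawlr::Hooks.new

  hooks.register(:before_visit) { |url, headers| headers["X-Trace"] = url }
  hooks.register(:on_error)     { |url, error| warn("#{url}: #{error.message}") }

  hooks.trigger(:before_visit, "https://example.com", {})
  hooks.stats             # => { total_hooks: 2, per_event: { before_visit: 1, on_error: 1 } }
  hooks.clear(:on_error)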