crawlr 0.1.0
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the package versions as they appear in their public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +9 -0
- data/CHANGELOG.md +5 -0
- data/LICENSE.txt +21 -0
- data/README.md +326 -0
- data/Rakefile +12 -0
- data/lib/crawlr/callbacks.rb +177 -0
- data/lib/crawlr/collector.rb +632 -0
- data/lib/crawlr/config.rb +232 -0
- data/lib/crawlr/context.rb +80 -0
- data/lib/crawlr/domains.rb +166 -0
- data/lib/crawlr/hooks.rb +161 -0
- data/lib/crawlr/http_interface.rb +286 -0
- data/lib/crawlr/parser.rb +242 -0
- data/lib/crawlr/robots.rb +329 -0
- data/lib/crawlr/version.rb +5 -0
- data/lib/crawlr/visits.rb +190 -0
- data/lib/crawlr.rb +16 -0
- data/sig/crawlr.rbs +4 -0
- metadata +209 -0
data/lib/crawlr/config.rb
ADDED
@@ -0,0 +1,232 @@
# frozen_string_literal: true

module Crawlr
  # Configuration management class for Crawlr scraping sessions.
  #
  # The Config class centralizes all configuration options for the Crawlr framework,
  # providing sensible defaults while allowing extensive customization of scraping
  # behavior, networking settings, error handling, and crawling policies.
  #
  # @example Basic configuration
  #   config = Crawlr::Config.new(
  #     timeout: 15,
  #     max_depth: 3,
  #     max_parallelism: 5
  #   )
  #
  # @example Advanced configuration with domain filtering and retries
  #   config = Crawlr::Config.new(
  #     allowed_domains: ['example.com', 'api.example.com'],
  #     max_retries: 3,
  #     retry_delay: 2.0,
  #     retry_backoff: 1.5,
  #     random_delay: 1.0,
  #     allow_cookies: true,
  #     ignore_robots_txt: false
  #   )
  #
  # @example Proxy configuration
  #   config = Crawlr::Config.new(
  #     proxies: ['proxy1.com:8080', 'proxy2.com:8080'],
  #     proxy_strategy: :random,
  #     max_parallelism: 10
  #   )
  #
  # @author [Your Name]
  # @since 0.1.0
  class Config
    # @return [Integer] HTTP request timeout in seconds
    # @return [Hash<String, String>] Default HTTP headers for all requests
    # @return [Array<String>] Glob patterns for allowed domains
    # @return [Array<String>] Explicit list of allowed domains
    # @return [Boolean] Whether to enable cookie handling
    # @return [Integer] Maximum crawling depth (0 for unlimited)
    # @return [Float] Maximum random delay between requests in seconds
    # @return [Integer] Maximum number of concurrent requests
    # @return [Boolean] Whether to allow revisiting previously scraped URLs
    # @return [Integer, nil] Maximum number of retry attempts (nil to disable)
    # @return [Float] Base delay between retry attempts in seconds
    # @return [Float] Exponential backoff multiplier for retry delays
    # @return [Array<Class>] List of exception classes that trigger retries
    # @return [Integer] Maximum number of URLs to track in visit history
    # @return [Array<String>] List of proxy server addresses
    # @return [Symbol] Strategy for selecting proxies (:round_robin, :random)
    # @return [Boolean] Whether to ignore robots.txt restrictions
    attr_accessor :timeout, :headers, :domain_glob, :allowed_domains, :allow_cookies,
                  :max_depth, :random_delay, :max_parallelism, :allow_url_revisit,
                  :max_retries, :retry_delay, :retry_backoff, :retryable_errors,
                  :max_visited, :proxies, :proxy_strategy, :ignore_robots_txt

    # Initializes a new Config instance with the provided options
    #
    # @param options [Hash] Configuration options hash
    # @option options [Integer] :timeout (10) HTTP request timeout in seconds
    # @option options [Hash<String, String>] :default_headers Default HTTP headers
    # @option options [Array<String>] :allowed_domains ([]) Explicit list of allowed domains
    # @option options [Array<String>] :domain_glob ([]) Glob patterns for domain filtering
    # @option options [Boolean] :allow_cookies (false) Enable cookie handling
    # @option options [Integer] :max_depth (0) Maximum crawling depth (0 = unlimited)
    # @option options [Float] :random_delay (0) Maximum random delay between requests
    # @option options [Integer] :max_parallelism (1) Maximum concurrent requests
    # @option options [Boolean] :allow_url_revisit (false) Allow revisiting URLs
    # @option options [Integer] :max_retries (0) Maximum retry attempts (0 = disabled)
    # @option options [Float] :retry_delay (1.0) Base retry delay in seconds
    # @option options [Float] :retry_backoff (2.0) Exponential backoff multiplier
    # @option options [Array<Class>] :retryable_errors Custom list of retryable exceptions
    # @option options [Integer] :max_visited (10000) Maximum URLs to track in history
    # @option options [Array<String>] :proxies ([]) List of proxy servers
    # @option options [Symbol] :proxy_strategy (:round_robin) Proxy selection strategy
    # @option options [Boolean] :ignore_robots_txt (false) Ignore robots.txt restrictions
    #
    # @raise [StandardError] When both :allowed_domains and :domain_glob are specified
    #
    # @example Minimal configuration
    #   config = Crawlr::Config.new
    #
    # @example Timeout and parallelism configuration
    #   config = Crawlr::Config.new(
    #     timeout: 30,
    #     max_parallelism: 8
    #   )
    #
    # @example Domain filtering with explicit domains
    #   config = Crawlr::Config.new(
    #     allowed_domains: ['site1.com', 'api.site1.com']
    #   )
    #
    # @example Domain filtering with glob patterns
    #   config = Crawlr::Config.new(
    #     domain_glob: ['*.example.com', '*.api.example.com']
    #   )
    #
    # @example Retry configuration with custom errors
    #   config = Crawlr::Config.new(
    #     max_retries: 5,
    #     retry_delay: 0.5,
    #     retry_backoff: 1.5,
    #     retryable_errors: [Timeout::Error, Net::ReadTimeout]
    #   )
    def initialize(options = {})
      initialize_domain_settings(options)
      initialize_parallelism_settings(options)
      initialize_throttle_settings(options)
      initialize_http_settings(options)
      initialize_retry_settings(options)
      initialize_visit_settings(options)
      initialize_proxy_settings(options)
      initialize_robots_settings(options)

      validate
    end

    # Converts the configuration to a hash representation
    #
    # This method is useful for serialization, debugging, or creating
    # new Config instances with the same settings.
    #
    # @return [Hash<Symbol, Object>] Hash containing all configuration values
    #
    # @example
    #   config = Crawlr::Config.new(timeout: 15, max_depth: 3)
    #   hash = config.to_h
    #   new_config = Crawlr::Config.new(hash)
    #
    # @example Inspect configuration
    #   puts config.to_h.inspect
    def to_h
      attrs = %i[
        timeout headers allowed_domains domain_glob allow_cookies max_depth
        random_delay max_parallelism allow_url_revisit max_retries retry_delay
        retry_backoff retryable_errors max_visited proxies proxy_strategy
        ignore_robots_txt
      ]

      attrs.each_with_object({}) { |name, hash| hash[name] = instance_variable_get("@#{name}") }
    end

    private

    def initialize_domain_settings(options)
      @allowed_domains = Array(options[:allowed_domains])
      @domain_glob = Array(options[:domain_glob])
    end

    def initialize_parallelism_settings(options)
      @max_parallelism = options.fetch(:max_parallelism, 1)
    end

    def initialize_throttle_settings(options)
      @random_delay = options.fetch(:random_delay, 0)
    end

    def initialize_http_settings(options)
      @timeout = options.fetch(:timeout, 10)
      @headers = options[:default_headers] || default_headers
      @allow_cookies = options.fetch(:allow_cookies, false)
      @max_depth = options.fetch(:max_depth, 0)
    end

    def initialize_retry_settings(options)
      @max_retries = options[:max_retries]&.positive? ? options[:max_retries] : 0
      @retry_delay = options.fetch(:retry_delay, 1.0)
      @retry_backoff = options.fetch(:retry_backoff, 2.0)
      @retryable_errors = options[:retryable_errors] || default_retryable_errors
    end

    def initialize_visit_settings(options)
      @allow_url_revisit = options.fetch(:allow_url_revisit, false)
      @max_visited = options.fetch(:max_visited, 10_000)
    end

    def initialize_proxy_settings(options)
      @proxies = Array(options[:proxies])
      @proxy_strategy = options.fetch(:proxy_strategy, :round_robin)
    end

    def initialize_robots_settings(options)
      @ignore_robots_txt = options.fetch(:ignore_robots_txt, false)
    end

    # Returns the default HTTP headers for requests
    #
    # @return [Hash<String, String>] Default headers with User-Agent
    # @api private
    def default_headers
      {
        "User-Agent" => "Crawlr/#{Crawlr::VERSION}"
      }
    end

    # Returns the default list of exceptions that should trigger retries
    #
    # These exceptions typically represent temporary network issues
    # that may resolve on subsequent attempts.
    #
    # @return [Array<Class>] Array of exception classes for retry logic
    # @api private
    def default_retryable_errors
      [
        Async::TimeoutError,
        Errno::ECONNREFUSED,
        Errno::ECONNRESET,
        Errno::EHOSTUNREACH,
        Errno::ENETUNREACH,
        SocketError
      ]
    end

    # Validates the configuration for conflicting options
    #
    # Ensures that mutually exclusive configuration options are not
    # specified simultaneously, which would create ambiguous behavior.
    #
    # @return [void]
    # @raise [StandardError] When both allowed_domains and domain_glob are specified
    # @api private
    def validate
      return unless !@allowed_domains.empty? && !@domain_glob.empty?

      raise "Cannot specify both allowed_domains and domain_glob"
    end
  end
end
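The Config class above is self-contained, so a short usage sketch may help when reviewing this release. It is not taken from the package itself; the `require "crawlr"` entry point is an assumption (data/lib/crawlr.rb is added in this version but its contents are not shown here).

require "crawlr" # assumed entry point; lib/crawlr.rb is part of this release but not shown above

# Build a config with retries and a domain allowlist, then round-trip it via #to_h,
# which the class documents as a supported way to clone settings.
config = Crawlr::Config.new(
  timeout: 15,
  max_retries: 3,                  # kept only if positive, otherwise forced to 0
  retry_delay: 0.5,
  allowed_domains: ["example.com"]
)

copy = Crawlr::Config.new(config.to_h)
copy.timeout        #=> 15
copy.retry_backoff  #=> 2.0 (default)

# Mixing both domain-filtering mechanisms fails validation at construction time:
# Crawlr::Config.new(allowed_domains: ["a.com"], domain_glob: ["*.a.com"])
#   raises "Cannot specify both allowed_domains and domain_glob"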
data/lib/crawlr/context.rb
ADDED
@@ -0,0 +1,80 @@
# frozen_string_literal: true

module Crawlr
  # The Context class holds metadata and shared data
  # during a scraping session, such as URLs and crawl depth.
  #
  # It acts like a small key-value store (`@data`) and provides
  # helper methods to manage depth and resolve relative URLs.
  #
  # @example Creating a new context
  #   ctx = Crawlr::Context.new(base_url: "https://example.com")
  #   ctx[:title] = "Home"
  #   ctx.increment_depth
  #   ctx.to_h
  #   # => { base_url: "https://example.com", page_url: nil, current_depth: 1, title: "Home" }
  #
  class Context
    # @return [String, nil] The base URL used for resolving relative links
    # @return [String, nil] The current page URL
    # @return [Integer] The current depth in the crawl hierarchy
    attr_accessor :base_url, :page_url, :current_depth

    # Create a new scraping context.
    #
    # @param [String, nil] base_url The root URL of the crawl
    # @param [String, nil] page_url The current page URL
    # @param [Integer] current_depth The crawl depth (default: 0)
    def initialize(base_url: nil, page_url: nil, current_depth: 0)
      @base_url = base_url
      @page_url = page_url
      @current_depth = current_depth
      @data = {}
    end

    # Retrieve a stored value by key.
    #
    # @param [Symbol, String] key The key to fetch
    # @return [Object, nil] The stored value, or nil if not found
    def [](key)
      @data[key]
    end

    # Assign a value to a key.
    #
    # @param [Symbol, String] key The key to set
    # @param [Object] value The value to store
    # @return [Object] The stored value
    def []=(key, value)
      @data[key] = value
    end

    # Convert the context to a Hash.
    #
    # Includes base_url, page_url, current_depth, and all stored data.
    #
    # @return [Hash] The full context data as a Hash
    def to_h
      {
        base_url: @base_url,
        page_url: @page_url,
        current_depth: @current_depth
      }.merge(@data)
    end

    # Increment the crawl depth by 1.
    #
    # @return [Integer] The updated depth value
    def increment_depth
      @current_depth += 1
    end

    # Resolve a relative URL using the base_url.
    #
    # @param [String] url The relative or absolute URL
    # @return [String] The resolved absolute URL
    def resolve_url(url)
      URI.join(@base_url, url).to_s
    end
  end
end
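A quick sketch of Context in isolation, based only on the methods above (the gem is assumed to be loaded). Note that resolve_url delegates to URI.join, so base_url is assumed to be an absolute URL; the results below follow from that.

ctx = Crawlr::Context.new(base_url: "https://example.com/docs/", current_depth: 1)

# Arbitrary per-page data goes through the [] / []= key-value store.
ctx[:links_found] = 12

ctx.increment_depth            #=> 2
ctx.resolve_url("page.html")   #=> "https://example.com/docs/page.html"
ctx.resolve_url("/about")      #=> "https://example.com/about"

ctx.to_h
#=> { base_url: "https://example.com/docs/", page_url: nil, current_depth: 2, links_found: 12 }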
data/lib/crawlr/domains.rb
ADDED
@@ -0,0 +1,166 @@
# frozen_string_literal: true

module Crawlr
  # Domain filtering and validation class for controlling scraping scope.
  #
  # The Domains class manages which domains are allowed to be scraped by
  # implementing both explicit domain allowlists and glob pattern matching.
  # It provides flexible domain filtering to restrict scraping to specific
  # sites or domain patterns while normalizing domain names for consistent
  # comparison.
  #
  # @example Allow specific domains
  #   config = Crawlr::Config.new(
  #     allowed_domains: ['example.com', 'api.example.com', 'subdomain.site.org']
  #   )
  #   domains = Crawlr::Domains.new(config)
  #
  #   domains.allowed?('https://example.com/page')     #=> true
  #   domains.allowed?('https://www.example.com/page') #=> true (www. stripped)
  #   domains.allowed?('https://forbidden.com/page')   #=> false
  #
  # @example Use glob patterns for flexible matching
  #   config = Crawlr::Config.new(
  #     domain_glob: ['*.example.com', '*.api.*.com', 'site?.org']
  #   )
  #   domains = Crawlr::Domains.new(config)
  #
  #   domains.allowed?('https://sub.example.com/path') #=> true
  #   domains.allowed?('https://api.service.com/data') #=> true
  #   domains.allowed?('https://site1.org/content')    #=> true
  #
  # @example No restrictions (allow all domains)
  #   config = Crawlr::Config.new # No domain restrictions
  #   domains = Crawlr::Domains.new(config)
  #
  #   domains.allowed?('https://any-site.com') #=> true
  #
  # @author [Your Name]
  # @since 0.1.0
  class Domains
    # Initializes a new Domains instance with the given configuration
    #
    # @param config [Crawlr::Config] Configuration object containing domain restrictions
    #
    # @example
    #   config = Crawlr::Config.new(allowed_domains: ['site.com'])
    #   domains = Crawlr::Domains.new(config)
    def initialize(config)
      @config = config
      @allowed_domains = extract_allowed_domains(@config.allowed_domains)
      @domain_glob = @config.domain_glob
    end

    # Checks if a URL is allowed based on configured domain restrictions
    #
    # The method performs the following checks in order:
    # 1. If no restrictions are configured, allows all URLs
    # 2. If glob patterns are configured, tests URL against each pattern
    # 3. If explicit domains are configured, checks normalized domain name
    # 4. Logs rejection for debugging purposes
    #
    # @param url [String] The URL to check for domain allowance
    # @return [Boolean] true if the URL's domain is allowed, false otherwise
    #
    # @example With explicit domain allowlist
    #   domains.allowed?('https://example.com/page')      #=> true (if allowed)
    #   domains.allowed?('https://www.example.com/page')  #=> true (www. stripped)
    #   domains.allowed?('https://subdomain.example.com') #=> false (unless explicitly allowed)
    #
    # @example With glob patterns
    #   # config.domain_glob = ['*.example.com']
    #   domains.allowed?('https://api.example.com')       #=> true
    #   domains.allowed?('https://cdn.example.com/asset') #=> true
    #   domains.allowed?('https://other.com')             #=> false
    #
    # @example No restrictions
    #   # config.allowed_domains = [], config.domain_glob = []
    #   domains.allowed?('https://any-domain.com') #=> true
    def allowed?(url)
      return true if @allowed_domains.empty? && @domain_glob.empty?

      unless @domain_glob.empty?
        @domain_glob.each do |glob|
          return true if File.fnmatch?(glob, url)
        end
      end

      uri = URI(url)
      base_name = uri.host.sub("www.", "")
      allowed = @allowed_domains.include?(base_name)

      Crawlr.logger.info("URL not allowed: #{url}") unless allowed
      allowed
    end

    # Returns statistics about the configured domain restrictions
    #
    # Provides metrics about the number of explicitly allowed domains
    # and glob patterns configured for monitoring and debugging purposes.
    #
    # @return [Hash<Symbol, Integer>] Statistics hash containing domain counts
    # @option return [Integer] :allowed_domains Number of explicitly allowed domains
    # @option return [Integer] :domain_glob Number of configured glob patterns
    #
    # @example
    #   stats = domains.domain_stats
    #   puts "Allowing #{stats[:allowed_domains]} explicit domains"
    #   puts "Using #{stats[:domain_glob]} glob patterns"
    def domain_stats
      {
        allowed_domains: @allowed_domains.size,
        domain_glob: @domain_glob.size
      }
    end

    private

    # Extracts and normalizes domain names from the configuration
    #
    # Processes the list of allowed domains by:
    # 1. Handling nil/empty input gracefully
    # 2. Normalizing each domain using base_domain method
    # 3. Removing duplicates from the final list
    #
    # @param domains [Array<String>, nil] List of domain strings to process
    # @return [Array<String>] Normalized, unique list of base domain names
    # @api private
    #
    # @example
    #   extract_allowed_domains(['https://www.example.com', 'api.example.com'])
    #   #=> ['example.com', 'api.example.com']
    def extract_allowed_domains(domains)
      return [] if domains.nil? || domains.empty?

      domains.map { |domain| base_domain(domain) }.uniq
    end

    # Normalizes a domain string to its base form for consistent comparison
    #
    # Performs the following normalization:
    # 1. Parses as URI if it looks like a full URL
    # 2. Ensures path is set to "/" if empty (for valid URI)
    # 3. Extracts hostname and removes "www." prefix
    # 4. Falls back to original string if URI parsing fails
    #
    # @param domain [String] Domain string or URL to normalize
    # @return [String] Normalized base domain name without www prefix
    # @api private
    #
    # @example URL normalization
    #   base_domain('https://www.example.com/path') #=> 'example.com'
    #   base_domain('http://api.site.org')          #=> 'api.site.org'
    #
    # @example Domain name normalization
    #   base_domain('www.example.com')       #=> 'example.com'
    #   base_domain('subdomain.example.com') #=> 'subdomain.example.com'
    #
    # @example Fallback behavior
    #   base_domain('not-a-valid-uri') #=> 'not-a-valid-uri'
    def base_domain(domain)
      uri = URI(domain)
      uri.path = "/" if uri.path.empty?
      uri.host ? uri.host.sub("www.", "") : domain
    end
  end
end
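A sketch of the two filtering modes, based on the code above (the gem is assumed to be loaded, and the rejection log line assumes Crawlr.logger is set up by the gem's entry file, which is not shown here). One caveat worth noting when reading allowed?: glob patterns are applied with File.fnmatch? against the full URL string, not just the host, so patterns generally need to account for the scheme and path; the pattern below is a hypothetical one chosen with that in mind.

# Explicit allowlist: entries are normalized (URL parsed, leading "www." stripped) before comparison.
config  = Crawlr::Config.new(allowed_domains: ["https://www.example.com", "api.example.com"])
domains = Crawlr::Domains.new(config)

domains.allowed?("https://example.com/page")      #=> true
domains.allowed?("https://www.example.com/page")  #=> true
domains.allowed?("https://blog.example.com/page") #=> false (subdomain not listed; rejection is logged)
domains.domain_stats                              #=> { allowed_domains: 2, domain_glob: 0 }

# Glob patterns: matched against the whole URL, so cover the scheme and path in the pattern.
glob_config  = Crawlr::Config.new(domain_glob: ["*example.com*"])
glob_domains = Crawlr::Domains.new(glob_config)
glob_domains.allowed?("https://api.example.com/data") #=> true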
data/lib/crawlr/hooks.rb
ADDED
@@ -0,0 +1,161 @@
# frozen_string_literal: true

module Crawlr
  # Event hook management system for scraping lifecycle customization.
  #
  # The Hooks class provides a flexible event-driven system that allows users
  # to register custom behavior at specific points during the scraping process.
  # It supports multiple hooks per event and validates event names to ensure
  # consistency across the framework.
  #
  # @example Basic hook registration
  #   hooks = Crawlr::Hooks.new
  #
  #   hooks.register(:before_visit) do |url, headers|
  #     puts "About to visit: #{url}"
  #     headers['X-Custom'] = 'value'
  #   end
  #
  # @example Multiple hooks for the same event
  #   hooks.register(:after_visit) do |url, response|
  #     log_response_time(url, response)
  #   end
  #
  #   hooks.register(:after_visit) do |url, response|
  #     update_statistics(response.status)
  #   end
  #
  # @example Error handling hooks
  #   hooks.register(:on_error) do |url, error|
  #     error_logger.warn("Failed to scrape #{url}: #{error.message}")
  #     notify_monitoring_system(url, error)
  #   end
  #
  # @author [Your Name]
  # @since 0.1.0
  class Hooks
    # Supported lifecycle events for hook registration
    #
    # @return [Array<Symbol>] Array of valid event names
    #   - `:before_visit` - Triggered before making HTTP request
    #   - `:after_visit` - Triggered after receiving HTTP response
    #   - `:on_error` - Triggered when an error occurs during scraping
    ALLOWED_EVENTS = %i[before_visit after_visit on_error].freeze

    # Initializes a new Hooks instance
    #
    # Creates an empty hook registry with auto-vivifying arrays for each event type.
    #
    # @example
    #   hooks = Crawlr::Hooks.new
    def initialize
      @hooks = Hash.new { |h, k| h[k] = [] }
    end

    # Registers a hook for a specific scraping lifecycle event
    #
    # Hooks are executed in the order they were registered. Multiple hooks
    # can be registered for the same event, and all will be executed when
    # the event is triggered.
    #
    # @param event [Symbol] The lifecycle event to hook into
    # @param block [Proc] The block to execute when the event occurs
    # @yieldparam args [Array] Event-specific arguments passed to the hook
    # @return [void]
    # @raise [ArgumentError] When the event is not in ALLOWED_EVENTS
    # @raise [ArgumentError] When no block is provided
    #
    # @example Before visit hook for request modification
    #   register(:before_visit) do |url, headers|
    #     headers['User-Agent'] = 'Custom Bot 1.0'
    #     headers['Authorization'] = get_auth_token(url)
    #   end
    #
    # @example After visit hook for response processing
    #   register(:after_visit) do |url, response|
    #     response_time = response.headers['X-Response-Time']
    #     metrics.record_response_time(url, response_time)
    #   end
    #
    # @example Error handling hook
    #   register(:on_error) do |url, error|
    #     if error.is_a?(Timeout::Error)
    #       retry_queue.add(url, delay: 30)
    #     end
    #   end
    def register(event, &block)
      raise ArgumentError, "Invalid event #{event}" unless ALLOWED_EVENTS.include?(event)
      raise ArgumentError, "Block required" unless block

      @hooks[event] << block
    end

    # Triggers all registered hooks for a specific event
    #
    # Executes hooks in the order they were registered. If any hook raises
    # an exception, it will be propagated and may prevent subsequent hooks
    # from executing.
    #
    # @param event [Symbol] The event to trigger
    # @param args [Array] Variable arguments to pass to the hook blocks
    # @return [void]
    # @raise [ArgumentError] When the event is not in ALLOWED_EVENTS
    #
    # @example Trigger before_visit hooks
    #   trigger(:before_visit, 'https://example.com', headers_hash)
    #
    # @example Trigger after_visit hooks
    #   trigger(:after_visit, 'https://example.com', response_object)
    #
    # @example Trigger error hooks
    #   trigger(:on_error, 'https://example.com', exception_object)
    def trigger(event, *args)
      raise ArgumentError, "Invalid event #{event}" unless ALLOWED_EVENTS.include?(event)

      @hooks[event].each { |blk| blk.call(*args) }
    end

    # Returns statistics about registered hooks
    #
    # Provides metrics about hook registration for monitoring, debugging,
    # and ensuring expected hooks are properly configured.
    #
    # @return [Hash<Symbol, Object>] Statistics hash containing hook metrics
    # @option return [Integer] :total_hooks Total number of registered hooks across all events
    # @option return [Hash<Symbol, Integer>] :per_event Number of hooks per event type
    #
    # @example
    #   stats = hooks.stats
    #   puts "Total hooks: #{stats[:total_hooks]}"
    #   puts "Before visit hooks: #{stats[:per_event][:before_visit]}"
    #   puts "Error hooks: #{stats[:per_event][:on_error]}"
    def stats
      grouped = @hooks.transform_values(&:size)
      { total_hooks: @hooks.values.flatten.size, per_event: grouped }
    end

    # Clears registered hooks for all events or a specific event
    #
    # Useful for testing, resetting hook configuration, or dynamically
    # changing hook behavior during scraping sessions.
    #
    # @param event [Symbol, nil] Specific event to clear, or nil to clear all
    # @return [void]
    #
    # @example Clear all hooks
    #   hooks.clear
    #
    # @example Clear hooks for specific event
    #   hooks.clear(:before_visit)
    #
    # @example Clear error hooks only
    #   hooks.clear(:on_error)
    def clear(event = nil)
      if event
        @hooks[event].clear
      else
        @hooks.clear
      end
    end
  end
end
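To round off the hunk above, a small sketch of the hook lifecycle based on the Hooks class shown (gem assumed loaded). In normal use the collector triggers these events internally; calling trigger directly here only illustrates the flow, and the header and error values are made up.

hooks = Crawlr::Hooks.new

hooks.register(:before_visit) do |url, headers|
  headers["X-Request-Id"] = rand(10_000).to_s # mutate outgoing headers
  puts "visiting #{url}"
end

hooks.register(:on_error) do |url, error|
  warn "#{url} failed: #{error.class}"
end

headers = {}
hooks.trigger(:before_visit, "https://example.com", headers)
hooks.trigger(:on_error, "https://example.com", StandardError.new("boom"))

hooks.stats             #=> { total_hooks: 2, per_event: { before_visit: 1, on_error: 1 } }
hooks.clear(:on_error)  # drop only the error hooks

# Unknown events are rejected up front:
# hooks.register(:on_redirect) { } #=> ArgumentError: Invalid event on_redirect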