crawlr 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,632 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "http_interface"
4
+ require_relative "hooks"
5
+ require_relative "context"
6
+ require_relative "callbacks"
7
+ require_relative "parser"
8
+ require_relative "config"
9
+ require_relative "visits"
10
+ require_relative "domains"
11
+ require_relative "robots"
12
+ require "async/semaphore"
13
+
14
+ module Crawlr
15
+ # Main orchestrator class that manages scraping sessions.
16
+ #
17
+ # The Collector is the central component of the Crawlr framework, responsible for:
18
+ # - Managing URL visits with configurable depth control
19
+ # - Handling concurrent requests with parallelism limits
20
+ # - Respecting robots.txt and implementing polite crawling delays
21
+ # - Executing registered callbacks on scraped content
22
+ # - Maintaining visit history and domain filtering
23
+ # - Providing hooks for custom behavior during scraping lifecycle
24
+ #
25
+ # @example Basic scraping setup
26
+ # collector = Crawlr::Collector.new(max_depth: 3, max_parallelism: 5)
27
+ #
28
+ # collector.on_html(:css, '.product-title') do |node, ctx|
29
+ # puts "Found: #{node.text} at #{ctx.page_url}"
30
+ # end
31
+ #
32
+ # collector.visit('https://example.com')
33
+ #
34
+ # @example Paginated scraping
35
+ # collector.paginated_visit(
36
+ # 'https://api.example.com/items',
37
+ # batch_size: 10,
38
+ # start_page: 1
39
+ # )
40
+ #
41
+ # @example With hooks and configuration
42
+ # collector = Crawlr::Collector.new(
43
+ # max_retries: 3,
44
+ # random_delay: 2.0,
45
+ # ignore_robots_txt: false
46
+ # )
47
+ #
48
+ # collector.hook(:before_visit) do |url, headers|
49
+ # puts "About to visit: #{url}"
50
+ # end
51
+ #
52
+ # collector.hook(:on_error) do |url, error|
53
+ # puts "Failed to scrape #{url}: #{error.message}"
54
+ # end
55
+ #
57
+ # @since 0.1.0
58
+ class Collector
59
+ # @return [Crawlr::Config] The configuration object for this collector
60
+ attr_reader :config
61
+
62
+ # @return [Crawlr::Context] The current scraping context
63
+ # @return [Crawlr::HTTPInterface] The HTTP interface for making requests
64
+ # @return [Crawlr::Visits] The visit tracking system
65
+ attr_accessor :context, :http, :visits
66
+
67
+ # Initializes a new Collector instance with the given configuration
68
+ #
69
+ # @param options [Hash] Configuration options for the collector
70
+ # @option options [Integer] :max_depth Maximum crawling depth (default: nil for unlimited)
71
+ # @option options [Integer] :max_parallelism Maximum concurrent requests (default: 1)
72
+ # @option options [Float] :random_delay Maximum random delay between requests in seconds
73
+ # @option options [Boolean] :ignore_robots_txt Whether to ignore robots.txt (default: false)
74
+ # @option options [Integer] :max_retries Maximum retry attempts for failed requests
75
+ # @option options [Boolean] :allow_url_revisit Allow revisiting previously scraped URLs
76
+ #
77
+ # @example
78
+ # collector = Crawlr::Collector.new(
79
+ # max_depth: 5,
80
+ # max_parallelism: 3,
81
+ # random_delay: 1.5
82
+ # )
83
+ def initialize(options = {})
84
+ @config = Crawlr::Config.new(options)
85
+ @http = Crawlr::HTTPInterface.new(@config)
86
+ @visits = Crawlr::Visits.new(@config)
87
+ @domains = Crawlr::Domains.new(@config)
88
+ @hooks = Crawlr::Hooks.new
89
+ @callbacks = Crawlr::Callbacks.new
90
+ @robots = Crawlr::Robots.new
91
+ end
92
+
93
+ # Registers a callback for HTML content using CSS or XPath selectors
94
+ #
95
+ # @param selector_type [Symbol] The type of selector (:css or :xpath)
96
+ # @param selector [String] The selector string to match elements
97
+ # @param block [Proc] The callback block to execute when elements match
98
+ # @yieldparam node [Nokogiri::XML::Node] The matched DOM node
99
+ # @yieldparam ctx [Crawlr::Context] The scraping context
100
+ # @return [void]
101
+ #
102
+ # @example Register CSS selector for HTML
103
+ # on_html(:css, '.article-title') do |node, ctx|
104
+ # ctx.titles << node.text.strip
105
+ # end
106
+ #
107
+ # @example Register XPath selector for HTML
108
+ # on_html(:xpath, '//a[@class="next-page"]') do |link, ctx|
109
+ # next_url = URI.join(ctx.base_url, link['href'])
110
+ # ctx.queue_url(next_url.to_s)
111
+ # end
112
+ def on_html(selector_type, selector, &block)
113
+ @callbacks.register(:html, selector_type, selector, &block)
114
+ end
115
+
116
+ # Registers a callback for XML content using CSS or XPath selectors
117
+ #
118
+ # @param selector_type [Symbol] The type of selector (:css or :xpath)
119
+ # @param selector [String] The selector string to match elements
120
+ # @param block [Proc] The callback block to execute when elements match
121
+ # @yieldparam node [Nokogiri::XML::Node] The matched DOM node
122
+ # @yieldparam ctx [Crawlr::Context] The scraping context
123
+ # @return [void]
124
+ #
125
+ # @example Register XPath selector for XML feeds
126
+ # on_xml(:xpath, '//item/title') do |title_node, ctx|
127
+ # ctx.feed_titles << title_node.text
128
+ # end
129
+ #
130
+ # @example Register CSS selector for XML
131
+ # on_xml(:css, 'product[price]') do |product, ctx|
132
+ # ctx.products << parse_product(product)
133
+ # end
134
+ def on_xml(selector_type, selector, &block)
135
+ @callbacks.register(:xml, selector_type, selector, &block)
136
+ end
137
+
138
+ # Visits one or more URLs and processes them according to registered callbacks
139
+ #
140
+ # This method handles the core scraping workflow including:
141
+ # - robots.txt checking (unless disabled)
142
+ # - URL validation and filtering
143
+ # - Concurrent processing with parallelism limits
144
+ # - Depth tracking and limits
145
+ # - Error handling and retry logic
146
+ #
147
+ # @param input [String, Array<String>] Single URL or array of URLs to visit
148
+ # @param current_depth [Integer] Current depth level for recursive crawling
149
+ # @param block [Proc] Optional block to configure the collector before visiting
150
+ # @yieldparam collector [Crawlr::Collector] The collector instance for configuration
151
+ # @return [void]
152
+ #
153
+ # @example Visit a single URL
154
+ # visit('https://example.com/products')
155
+ #
156
+ # @example Visit multiple URLs
157
+ # visit(['https://site1.com', 'https://site2.com'])
158
+ #
159
+ # @example Visit with configuration block
160
+ # visit('https://example.com') do |collector|
161
+ # collector.on_html(:css, '.product') do |node, ctx|
162
+ # # Process products
163
+ # end
164
+ # end
165
+ #
166
+ # @example Recursive crawling with depth control
167
+ # visit('https://example.com', 0) # Start at depth 0
168
+ def visit(input, current_depth = 0)
169
+ yield self if block_given?
170
+
171
+ urls = normalize_urls(input)
172
+ return if exceeded_max_depth?(urls, current_depth)
173
+
174
+ process_robots(urls) unless @config.ignore_robots_txt
175
+ urls = filter_urls(urls)
176
+ return if urls.empty?
177
+
178
+ perform_visits(urls, current_depth)
179
+ end
180
+
181
+ # Performs paginated scraping by automatically generating page URLs
182
+ #
183
+ # This method is specifically designed for APIs or websites that use
184
+ # query parameter pagination (e.g., ?page=1, ?page=2). It automatically
186
+ # generates page URLs and stops when too many pages in a batch fail or return 404.
186
+ #
187
+ # @param url [String] Base URL for pagination
188
+ # @param current_depth [Integer] Starting depth for crawling limits
189
+ # @param query [String] Query parameter name for pagination (default: "page")
190
+ # @param batch_size [Integer] Number of pages to process in parallel batches (default: 5)
191
+ # @param start_page [Integer] Starting page number (default: 1)
192
+ # @param block [Proc] Optional block to configure the collector before visiting
193
+ # @yieldparam collector [Crawlr::Collector] The collector instance for configuration
194
+ # @return [void]
195
+ #
196
+ # @example Basic pagination
197
+ # paginated_visit('https://api.example.com/items', batch_size: 10)
198
+ #
199
+ # @example Custom query parameter and start page
200
+ # paginated_visit(
201
+ # 'https://example.com/products',
202
+ # query: 'p',
203
+ # start_page: 2,
204
+ # batch_size: 3
205
+ # )
206
+ #
207
+ # @example With configuration block
208
+ # paginated_visit('https://api.site.com/data') do |collector|
209
+ # collector.on_xml(:css, 'item') do |node, ctx|
210
+ # process_item(node, ctx)
211
+ # end
212
+ # end
213
+ def paginated_visit(url, current_depth: 0, query: "page", batch_size: 5, start_page: 1)
214
+ return unless valid_url?(url)
215
+
216
+ yield self if block_given?
217
+ process_robots([url]) unless @config.ignore_robots_txt
218
+ return unless can_visit?(url, @config.headers)
219
+
220
+ pages_to_visit = build_initial_pages(url, query, batch_size, start_page)
221
+ process_page_batches(pages_to_visit, current_depth, batch_size, query)
222
+ end
223
+
224
+ # Registers a hook for specific scraping lifecycle events
225
+ #
226
+ # Hooks allow you to execute custom code at specific points during
227
+ # the scraping process, such as before/after visits or on errors.
228
+ #
229
+ # @param event [Symbol] The event to hook into (:before_visit, :after_visit, :on_error)
230
+ # @param block [Proc] The block to execute when the event occurs
231
+ # @yieldparam args [Array] Event-specific arguments passed to the block
232
+ # @return [void]
233
+ #
234
+ # @example Hook before each visit
235
+ # hook(:before_visit) do |url, headers|
236
+ # puts "About to visit: #{url}"
237
+ # headers['Custom-Header'] = 'value'
238
+ # end
239
+ #
240
+ # @example Hook after each visit
241
+ # hook(:after_visit) do |url, response|
242
+ # puts "Visited #{url}, got status: #{response.status}"
243
+ # end
244
+ #
245
+ # @example Hook for error handling
246
+ # hook(:on_error) do |url, error|
247
+ # logger.error "Failed to scrape #{url}: #{error.message}"
248
+ # end
249
+ def hook(event, &block)
250
+ @hooks.register(event, &block)
251
+ end
252
+
253
+ # Creates a clone of the current collector with shared HTTP and visit state
254
+ #
255
+ # This is useful for creating multiple collectors that share the same
256
+ # HTTP connection pool and visit history while having independent
257
+ # callback and hook configurations.
258
+ #
259
+ # @return [Crawlr::Collector] A new collector instance sharing HTTP and visits
260
+ #
261
+ # @example
262
+ # main_collector = Crawlr::Collector.new(max_parallelism: 10)
263
+ # product_collector = main_collector.clone
264
+ #
265
+ # product_collector.on_html(:css, '.product') do |node, ctx|
266
+ # # Process products with shared visit history
267
+ # end
268
+ def clone
269
+ new_collector = self.class.new(@config.to_h)
270
+ new_collector.http = @http
271
+ new_collector.visits = @visits
272
+
273
+ new_collector
274
+ end
275
+
276
+ # Returns comprehensive statistics about the collector's state and activity
277
+ #
278
+ # Provides metrics about configuration, registered hooks/callbacks,
279
+ # visit history, and retry behavior for monitoring and debugging.
280
+ #
281
+ # @return [Hash<Symbol, Object>] Statistics hash containing various metrics
282
+ # @option return [Integer] :max_depth Maximum configured crawling depth
283
+ # @option return [Boolean] :allow_url_revisit Whether URL revisiting is allowed
284
+ # @option return [Integer] :hooks_count Number of registered hooks
285
+ # @option return [Integer] :callbacks_count Number of registered callbacks
286
+ # @option return [Integer] :total_visits Number of URLs visited
287
+ # @option return [Integer] :unique_visits Number of unique URLs visited
288
+ # @option return [Integer] :max_retries Maximum retry attempts (if configured)
289
+ # @option return [Float] :retry_delay Base retry delay in seconds (if configured)
290
+ # @option return [Float] :retry_backoff Retry backoff multiplier (if configured)
291
+ #
292
+ # @example
293
+ # stats = collector.stats
294
+ # puts "Visited #{stats[:total_visits]} pages"
295
+ # puts "Registered #{stats[:callbacks_count]} callbacks"
296
+ def stats
297
+ base = {
298
+ max_depth: @config.max_depth,
299
+ allow_url_revisit: @config.allow_url_revisit
300
+ }
301
+
302
+ base.merge!(@hooks.stats)
303
+ base.merge!(@callbacks.stats)
304
+ base.merge!(@visits.stats)
305
+ base.merge!(retry_stats) if @config.max_retries
306
+ base
307
+ end
308
+
309
+ private
310
+
311
+ # Performs concurrent visits to multiple URLs with parallelism control
312
+ #
313
+ # @param urls [Array<String>] URLs to visit
314
+ # @param current_depth [Integer] Current crawling depth
315
+ # @return [Array<HTTP::Response>, nil] Array of responses or nil if depth exceeded
316
+ # @api private
317
+ def perform_visits(urls, current_depth)
318
+ return if exceeded_max_depth?(urls, current_depth)
319
+
320
+ responses = []
321
+
322
+ Sync do |parent| # embedded execution
323
+ semaphore = Async::Semaphore.new(@config.max_parallelism || urls.size)
324
+
325
+ tasks = urls.map do |url|
326
+ parent.async do
327
+ semaphore.acquire do
328
+ execute_visit(url, current_depth)
329
+ end
330
+ end
331
+ end
332
+
333
+ # Wait for all tasks and collect results
334
+ responses = tasks.map(&:wait)
335
+ end
336
+
337
+ responses
338
+ end
339
+
340
+ # Executes a single URL visit with error handling and context setup
341
+ #
342
+ # @param url [String] URL to visit
343
+ # @param depth [Integer] Current depth level
344
+ # @return [HTTP::Response, nil] HTTP response or nil on error
345
+ # @api private
346
+ def execute_visit(url, depth)
347
+ apply_random_delay(url)
348
+
349
+ begin
350
+ response = fetch_response(url)
351
+ raise StandardError unless response
352
+
353
+ ctx = setup_context(url, depth)
354
+ scrape_response(response, ctx)
355
+ response
356
+ rescue StandardError => e
357
+ handle_visit_error(url, e)
358
+ nil
359
+ end
360
+ end
361
+
362
+ # Fetches a URL with retry logic and error handling
363
+ #
364
+ # @param url [String] URL to fetch
365
+ # @return [HTTP::Response] HTTP response object
366
+ # @raise [StandardError] When all retry attempts are exhausted
367
+ # @api private
368
+ def fetch(url)
369
+ attempt = 0
370
+ begin
371
+ attempt += 1
372
+ @http.get(url) { |url, headers| @hooks.trigger(:before_visit, url, headers) }
373
+ rescue *@config.retryable_errors => e
374
+ if @config.max_retries.positive? && attempt <= @config.max_retries
375
+ delay = calculate_retry_delay(attempt)
376
+ Crawlr.logger.warn "Attempt #{attempt}/#{@config.max_retries + 1} failed for #{url}: #{e.class} - #{e.message}"
377
+ Crawlr.logger.info "Sleeping for #{delay.round(2)}sec before retry"
378
+ sleep(delay)
379
+ retry
380
+ else
381
+ Crawlr.logger.warn "#{@config.max_retries + 1}/#{@config.max_retries + 1} failed attempts for #{url}"
382
+ raise
383
+ end
384
+ end
385
+ end
386
+
387
+ # Calculates exponential backoff delay with jitter for retries
388
+ #
389
+ # @param attempt [Integer] Current retry attempt number
390
+ # @return [Float] Calculated delay in seconds
391
+ # @api private
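+ #
+ # @example Assuming retry_delay: 1.0 and retry_backoff: 2.0
+ #   calculate_retry_delay(1) # => ~1.0s plus 10-30% random jitter
+ #   calculate_retry_delay(2) # => ~2.0s plus 10-30% random jitter
+ #   calculate_retry_delay(3) # => ~4.0s plus 10-30% random jitter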
392
+ def calculate_retry_delay(attempt)
393
+ base_delay = @config.retry_delay * (@config.retry_backoff**(attempt - 1))
394
+ jitter = rand(0.1..0.3) * base_delay
395
+ base_delay + jitter
396
+ end
397
+
398
+ # Applies a random delay before visiting URLs to be polite
399
+ #
400
+ # @param url [String] URL being visited (for logging)
401
+ # @return [void]
402
+ # @api private
403
+ def apply_random_delay(url)
404
+ return unless @visits.blank?
405
+
406
+ time_to_sleep = rand * @config.random_delay
407
+ return unless time_to_sleep.positive?
408
+
409
+ Crawlr.logger.debug "Sleeping for #{time_to_sleep.round(2)}sec before visiting #{url}"
410
+ sleep(time_to_sleep)
411
+ end
412
+
413
+ # Fetches a URL response and triggers appropriate hooks
414
+ #
415
+ # @param url [String] URL to fetch
416
+ # @return [HTTP::Response, nil] Response object or nil on failure
417
+ # @api private
418
+ def fetch_response(url)
419
+ response = fetch(url)
420
+ @hooks.trigger(:after_visit, url, response)
421
+
422
+ return unless response&.body
423
+
424
+ @visits.register(url)
425
+ response
426
+ end
427
+
428
+ # Applies registered callbacks to scraped response content
429
+ #
430
+ # @param response [HTTP::Response] HTTP response to process
431
+ # @param context [Crawlr::Context] Scraping context
432
+ # @return [void]
433
+ # @api private
434
+ def scrape_response(response, context)
435
+ Crawlr::Parser.apply_callbacks(content: response.body, callbacks: @callbacks.all, context: context)
436
+ end
437
+
438
+ # Handles errors that occur during URL visits
439
+ #
440
+ # @param url [String] URL that failed
441
+ # @param error [StandardError] The error that occurred
442
+ # @return [void]
443
+ # @api private
444
+ def handle_visit_error(url, error)
445
+ @hooks.trigger(:on_error, url, error)
446
+ Crawlr.logger.error "Error visiting #{url}: #{error.class} - #{error.message}"
447
+ end
448
+
449
+ # Sets up scraping context for a specific URL and depth
450
+ #
451
+ # @param url [String] Current page URL
452
+ # @param depth [Integer] Current crawling depth
453
+ # @return [Crawlr::Context] Configured context object
454
+ # @api private
455
+ def setup_context(url, depth)
456
+ uri = URI(url)
457
+ Crawlr::Context.new(
458
+ page_url: url,
459
+ base_url: uri.origin,
460
+ current_depth: depth
461
+ )
462
+ end
463
+
464
+ # Checks if the maximum crawling depth has been exceeded
465
+ #
466
+ # @param input [Array<String>] URLs being processed
467
+ # @param depth [Integer] Current depth level
468
+ # @return [Boolean] true if max depth exceeded
469
+ # @api private
470
+ def exceeded_max_depth?(input, depth)
471
+ if @config.max_depth && depth > @config.max_depth
472
+ Crawlr.logger.debug "Exceeded max depth; Skipping visit to #{input}"
473
+ true
474
+ else
475
+ false
476
+ end
477
+ end
478
+
479
+ # Converts various input types to a normalized array of URLs
480
+ #
481
+ # @param input [String, Array<String>] Input URLs
482
+ # @return [Array<String>] Normalized array of unique, non-nil URLs
483
+ # @api private
484
+ def input_to_url_array(input)
485
+ urls = case input
486
+ when String then [input]
487
+ when Array then input
488
+ else
489
+ Crawlr.logger.warn "Unsupported input type: #{input.class}"
490
+ return []
491
+ end
492
+
493
+ urls.compact.uniq
494
+ end
495
+
496
+ # Determines if a URL can be visited based on domain, visit history, and robots.txt
497
+ #
498
+ # @param url [String] URL to check
499
+ # @param headers [Hash] HTTP headers for robots.txt checking
500
+ # @return [Boolean] true if URL can be visited
501
+ # @api private
502
+ def can_visit?(url, headers = {})
503
+ return false if url.nil? || url.empty?
504
+
505
+ @domains.allowed?(url) &&
506
+ @visits.new?(url) &&
507
+ @robots.allowed?(url, headers["User-Agent"])
508
+ end
509
+
510
+ # Validates that a URL is a proper HTTP/HTTPS URL
511
+ #
512
+ # @param url [String] URL to validate
513
+ # @return [Boolean] true if URL is valid HTTP/HTTPS
514
+ # @api private
515
+ def valid_url?(url)
516
+ uri = URI.parse(url)
517
+ uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS)
518
+ rescue URI::InvalidURIError
519
+ false
520
+ end
521
+
522
+ # Returns retry-related statistics when retry is configured
523
+ #
524
+ # @return [Hash<Symbol, Object>] Hash containing retry configuration
525
+ # @api private
526
+ def retry_stats
527
+ {
528
+ max_retries: @config.max_retries,
529
+ retry_delay: @config.retry_delay,
530
+ retry_backoff: @config.retry_backoff
531
+ }
532
+ end
533
+
534
+ # Fetches robots.txt for a given URL origin
535
+ #
536
+ # @param url [String] URL to get robots.txt for
537
+ # @return [HTTP::Response, nil] robots.txt response or nil on error
538
+ # @api private
539
+ def fetch_robots_txt(url)
540
+ uri = URI.parse(url)
541
+ robots_link = "#{uri.origin}/robots.txt"
542
+ response = fetch_response(robots_link)
543
+ raise StandardError unless response
544
+
545
+ response
546
+ rescue StandardError => e
547
+ handle_visit_error(robots_link, e)
548
+ nil
549
+ end
550
+
551
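+ # Normalizes raw visit input into a deduplicated array of URLs
+ #
+ # @param input [String, Array<String>] Input URL(s)
+ # @return [Array<String>] Array of unique, non-nil URLs
+ # @api private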
+ def normalize_urls(input)
552
+ input_to_url_array(input)
553
+ end
554
+
555
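+ # Fetches and parses robots.txt for each unique origin in the given URLs
+ #
+ # Origins whose robots rules are already cached are skipped; URLs that
+ # fail to parse are ignored.
+ #
+ # @param urls [Array<String>] URLs whose origins should be checked
+ # @return [void]
+ # @api private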
+ def process_robots(urls)
556
+ origins = urls.map do |url|
557
+ URI.parse(url).origin
558
+ rescue StandardError
559
+ nil
560
+ end.compact.uniq
561
+ origins.each do |origin|
562
+ next if @robots.exists?(origin)
563
+
564
+ response = fetch_robots_txt(origin)
565
+ @robots.parse(origin, response.body) if response&.body
566
+ end
567
+ end
568
+
569
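+ # Filters out URLs that cannot be visited (disallowed domain, already
+ # visited, or blocked by robots.txt)
+ #
+ # @param urls [Array<String>] Candidate URLs
+ # @return [Array<String>] URLs that pass the can_visit? checks
+ # @api private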
+ def filter_urls(urls)
570
+ urls.select { |url| can_visit?(url, @config.headers) }
571
+ end
572
+
573
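+ # Builds the first batch of page URLs for paginated_visit
+ #
+ # When start_page is 1 the bare URL serves as the first page and the query
+ # parameter is appended for the remaining pages; otherwise every page gets
+ # an explicit query parameter.
+ #
+ # @example Assuming an effective batch size of 3 and start_page: 1
+ #   build_initial_pages('https://example.com/items', 'page', 3, 1)
+ #   # => ["https://example.com/items",
+ #   #     "https://example.com/items?page=2",
+ #   #     "https://example.com/items?page=3"]
+ #
+ # @param url [String] Base URL for pagination
+ # @param query [String] Query parameter name
+ # @param batch_size [Integer] Requested batch size
+ # @param start_page [Integer] First page number
+ # @return [Array<String>] Page URLs for the initial batch
+ # @api private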
+ def build_initial_pages(url, query, batch_size, start_page)
574
+ max_batch = [@config.max_depth, batch_size].compact.min
575
+ if start_page == 1
576
+ [url] + (max_batch - 1).times.map { |i| "#{url}?#{query}=#{i + 2}" }
577
+ else
578
+ max_batch.times.map { |i| "#{url}?#{query}=#{i + start_page}" }
579
+ end
580
+ end
581
+
582
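+ # Visits page batches in a loop, advancing the scheduled depth by one batch
+ # per iteration, until the max depth is reached, the page queue is empty, or
+ # a batch does not return enough successful responses
+ #
+ # @param pages [Array<String>] Initial page URLs
+ # @param current_depth [Integer] Starting depth
+ # @param batch_size [Integer] Requested batch size
+ # @param query [String] Query parameter name for pagination
+ # @return [void]
+ # @api private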
+ def process_page_batches(pages, current_depth, batch_size, query)
583
+ scheduled_depth = current_depth
584
+ max_batch = [@config.max_depth, batch_size].compact.min
585
+
586
+ loop do
587
+ break if reached_max_depth?(scheduled_depth)
588
+
589
+ batch = next_batch(pages, max_batch)
590
+ break if batch.empty?
591
+
592
+ break unless batch_successful?(batch, scheduled_depth)
593
+
594
+ scheduled_depth = update_depth(scheduled_depth, max_batch)
595
+ pages = generate_next_pages(batch, scheduled_depth, max_batch, query)
596
+ end
597
+ end
598
+
599
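+ # Checks whether a set of responses contains enough successes (non-404)
+ # to continue paginating
+ #
+ # @param responses [Array<HTTP::Response, nil>, nil] Responses from a batch
+ # @param batch_size [Integer] Size of the batch
+ # @return [Boolean] true if at least batch_size / 2 responses succeeded
+ # @api private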
+ def valid_batch?(responses, batch_size)
600
+ return false unless responses
601
+
602
+ success_count = responses.count { |r| r && r.status != 404 }
603
+ success_count.positive? && success_count >= batch_size / 2
604
+ end
605
+
606
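+ # @param depth [Integer] Depth to check
+ # @return [Boolean] true if a max depth is configured and has been reached
+ # @api private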
+ def reached_max_depth?(depth)
607
+ return false unless @config.max_depth
608
+
609
+ depth >= @config.max_depth
610
+ end
611
+
612
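+ # Removes and returns up to max_batch pages from the front of the queue
+ #
+ # @param pages [Array<String>] Remaining page URLs
+ # @param max_batch [Integer] Batch size
+ # @return [Array<String>] The next batch (empty when no pages remain)
+ # @api private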
+ def next_batch(pages, max_batch)
613
+ pages.shift(max_batch)
614
+ end
615
+
616
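+ # Visits a batch of pages and checks whether enough of them succeeded
+ #
+ # @param batch [Array<String>] Page URLs to visit
+ # @param depth [Integer] Depth for this batch
+ # @return [Boolean] true if at least batch.size / 2 responses were non-404
+ # @api private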
+ def batch_successful?(batch, depth)
617
+ responses = perform_visits(batch, depth)
618
+ return false unless responses
619
+
620
+ success_count = responses.count { |r| r && r.status != 404 }
621
+ success_count.positive? && success_count >= batch.size / 2
622
+ end
623
+
624
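+ # @param current [Integer] Current scheduled depth
+ # @param max_batch [Integer] Batch size
+ # @return [Integer] Depth advanced by one batch
+ # @api private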
+ def update_depth(current, max_batch)
625
+ current + max_batch
626
+ end
627
+
628
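+ # Generates page URLs for the batch that follows the one just visited
+ #
+ # @param batch [Array<String>] The batch that was just visited
+ # @param scheduled_depth [Integer] Depth scheduled for the next batch
+ # @param max_batch [Integer] Batch size
+ # @param query [String] Query parameter name for pagination
+ # @return [Array<String>] URLs for the next batch
+ # @api private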
+ def generate_next_pages(batch, scheduled_depth, max_batch, query)
629
+ max_batch.times.map { |i| "#{batch.first.split('?').first}?#{query}=#{i + scheduled_depth + 1}" }
630
+ end
631
+ end
632
+ end