rubycrawl 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +56 -17
- data/lib/rubycrawl/robots_parser.rb +86 -0
- data/lib/rubycrawl/site_crawler.rb +15 -1
- data/lib/rubycrawl/tasks/install.rake +6 -5
- data/lib/rubycrawl/version.rb +1 -1
- data/lib/rubycrawl.rb +9 -7
- metadata +2 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c38e6b7b377a04d6baec4756a7bdf749580e5391d42483b9f6f7e50ee0cbd25f
+  data.tar.gz: 8323d9dbe93915b2f81fb6adbd6056b0007ef3ac58a828feb3492d14e02b7423
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 2905355938f1f18c747c83bdcc1360f88c887026d8b2242c00a87727cb32ab9954927a24ea12975d8c89a1f0e358ffab22ed458b2cac1d8a9acfb9537bb03eca
+  data.tar.gz: 556b1d58707d72698a8e537dc41e8a0b4d47656b501b1cbdcff9db1532180d4d28b4f443f505789454da381efa52adf722c501eb85212564809b6571d504ee55
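The values above are plain hex digests that Ruby's stdlib can recompute. A minimal sketch, using a stand-in string rather than a real unpacked gem member:

```ruby
require 'digest'

# The checksums above are SHA256/SHA512 hex digests of the gem's members
# (metadata.gz and data.tar.gz). Shown here on a stand-in string; for a real
# check you would unpack the .gem (a tar archive) and hash the member file,
# e.g. Digest::SHA256.file('data.tar.gz').hexdigest
sha256 = Digest::SHA256.hexdigest('stand-in for data.tar.gz bytes')
sha512 = Digest::SHA512.hexdigest('stand-in for data.tar.gz bytes')
```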
data/README.md
CHANGED
@@ -1,6 +1,7 @@
 # RubyCrawl 🎭
 
 [](https://rubygems.org/gems/rubycrawl)
+[](https://github.com/craft-wise/rubycrawl/actions/workflows/ci.yml)
 [](https://opensource.org/licenses/MIT)
 [](https://www.ruby-lang.org/)
 
@@ -189,13 +190,38 @@ puts "Indexed #{pages_crawled} pages"
 
 #### Multi-Page Options
 
-| Option
-|
-| `max_pages`
-| `max_depth`
-| `same_host_only`
-| `wait_until`
-| `block_resources`
+| Option | Default | Description |
+| ---------------------- | --------- | --------------------------------------------------- |
+| `max_pages` | 50 | Maximum number of pages to crawl |
+| `max_depth` | 3 | Maximum link depth from start URL |
+| `same_host_only` | true | Only follow links on the same domain |
+| `wait_until` | inherited | Page load strategy |
+| `block_resources` | inherited | Block images/fonts/CSS |
+| `respect_robots_txt` | false | Honour robots.txt rules and auto-sleep `Crawl-delay` |
+
+#### robots.txt Support
+
+When `respect_robots_txt: true`, RubyCrawl fetches `robots.txt` once at the start of the crawl and:
+
+- Skips any URL disallowed for `User-agent: *`
+- Automatically sleeps the `Crawl-delay` specified in robots.txt between pages
+
+```ruby
+RubyCrawl.crawl_site("https://example.com",
+  respect_robots_txt: true,
+  max_pages: 100
+) do |page|
+  puts page.url
+end
+```
+
+Or enable globally:
+
+```ruby
+RubyCrawl.configure(respect_robots_txt: true)
+```
+
+If robots.txt is unreachable or missing, crawling proceeds normally (fail open).
 
 #### Page Result Object
 
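The `Disallow`/`Crawl-delay` semantics the new README section describes can be sketched standalone; this is an illustration of how such lines are read, not the gem's internals, and the robots.txt content is made up:

```ruby
# Standalone sketch of the robots.txt semantics described above
# (illustration only; sample file content is hypothetical).
robots = <<~TXT
  User-agent: *
  Disallow: /admin/
  Crawl-delay: 2
TXT

# Paths matching a Disallow rule are skipped during the crawl
disallowed = robots.lines
                   .grep(/\Adisallow:/i)
                   .map { |l| l.split(':', 2).last.strip }

# The crawler sleeps this many seconds between pages
delay = robots[/^crawl-delay:\s*([\d.]+)/i, 1]&.to_f
```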
@@ -247,11 +273,12 @@ result = RubyCrawl.crawl(
 
 | Option | Values | Default | Description |
 | ----------------- | ----------------------------------------------------------- | ------- | --------------------------------------------------- |
-| `wait_until`
-| `block_resources`
-| `max_attempts`
-| `timeout`
-| `headless`
+| `wait_until` | `"load"`, `"domcontentloaded"`, `"networkidle"`, `"commit"` | `nil` | When to consider page loaded (nil = Ferrum default) |
+| `block_resources` | `true`, `false` | `nil` | Block images, fonts, CSS, media for faster crawls |
+| `max_attempts` | Integer | `3` | Total number of attempts (including the first) |
+| `timeout` | Integer (seconds) | `30` | Browser navigation timeout |
+| `headless` | `true`, `false` | `true` | Run Chrome headlessly |
+| `respect_robots_txt` | `true`, `false` | `false` | Honour robots.txt rules and auto-sleep Crawl-delay |
 
 **Wait strategies explained:**
 
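The `max_attempts` row counts the first try as an attempt. That counting rule can be shown with a small retry sketch (an illustration of the documented semantics, not the gem's `with_retries` implementation):

```ruby
# Illustration of "total attempts include the first try" semantics.
def attempt(max_attempts)
  tries = 0
  begin
    tries += 1
    yield
  rescue StandardError
    retry if tries < max_attempts
    raise
  end
end

calls = 0
begin
  attempt(3) { calls += 1; raise 'always fails' }
rescue StandardError
  # gives up after exactly 3 attempts
end
# calls == 3
```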
@@ -497,9 +524,24 @@ Readability.js → heuristic fallback ← content extraction (inside browse
 
 - **Resource blocking**: Keep `block_resources: true` (default: nil) to skip images/fonts/CSS for 2-3x faster crawls
 - **Wait strategy**: Use `wait_until: "load"` for static sites, `"networkidle"` for SPAs
-- **Concurrency**: Use background jobs (Sidekiq, GoodJob, etc.) for parallel crawling
 - **Browser reuse**: The first crawl is slower (~2s) due to Chrome launch; subsequent crawls are much faster (~200-500ms)
 
+### Parallelism
+
+RubyCrawl does not support parallel page loading within a single process — Ferrum uses one Chrome instance and concurrent access is not thread-safe.
+
+The recommended pattern is **job-level parallelism**: each background job gets its own `RubyCrawl` instance and Chrome process, with natural rate limiting via your job queue's concurrency setting:
+
+```ruby
+# Enqueue independent crawls — each job runs its own Chrome
+urls.each { |url| CrawlJob.perform_later(url) }
+
+# Control concurrency via your queue worker config (Sidekiq, GoodJob, etc.)
+# e.g. Sidekiq concurrency: 3 → 3 Chrome processes crawling in parallel
+```
+
+This also works naturally with `respect_robots_txt: true` — each job respects Crawl-delay independently.
+
 ## Development
 
 ```bash
@@ -507,12 +549,9 @@ git clone git@github.com:craft-wise/rubycrawl.git
 cd rubycrawl
 bin/setup
 
-# Run
+# Run all tests (Chrome required — installed as a gem dependency)
 bundle exec rspec
 
-# Run integration tests (requires Chrome)
-INTEGRATION=1 bundle exec rspec
-
 # Manual testing
 bin/console
 > RubyCrawl.crawl("https://example.com")
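The "concurrency: 3 → 3 Chrome processes" idea from the README's new Parallelism section can be sketched with a plain thread pool. This is only an illustration of capping concurrency over a URL queue; in the documented setup each worker would be a separate job process with its own crawler, and the URLs here are made up:

```ruby
# Thread-pool illustration of queue-capped concurrency (not the gem's code).
CONCURRENCY = 3
queue = Queue.new
(1..9).each { |i| queue << "https://example.com/page#{i}" }
queue.close

done = Queue.new
workers = Array.new(CONCURRENCY) do
  Thread.new do
    # Queue#pop returns nil once the queue is closed and drained
    while (url = queue.pop)
      # a real worker would run its own RubyCrawl/Chrome instance here
      done << url
    end
  end
end
workers.each(&:join)
# done.size == 9
```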
data/lib/rubycrawl/robots_parser.rb
ADDED
@@ -0,0 +1,86 @@
+# frozen_string_literal: true
+
+require 'net/http'
+require 'uri'
+
+class RubyCrawl
+  # Fetches and parses robots.txt for a given site.
+  # Supports User-agent: *, Disallow, Allow, and Crawl-delay directives.
+  # Fails open — any fetch/parse error allows all URLs.
+  class RobotsParser
+    # Fetch robots.txt from base_url and return a parser instance.
+    # Returns a permissive (allow-all) instance on any network error.
+    def self.fetch(base_url)
+      uri = URI.join(base_url, '/robots.txt')
+      response = Net::HTTP.start(uri.host, uri.port,
+                                 use_ssl: uri.scheme == 'https',
+                                 open_timeout: 5,
+                                 read_timeout: 5) do |http|
+        http.get(uri.request_uri)
+      end
+      new(response.is_a?(Net::HTTPOK) ? response.body : '')
+    rescue StandardError
+      new('') # network error or invalid URL → allow everything
+    end
+
+    def initialize(content)
+      @rules = parse(content.to_s)
+    end
+
+    # Returns true if the given URL is allowed to be crawled.
+    def allowed?(url)
+      path = URI.parse(url).path
+      path = '/' if path.nil? || path.empty?
+
+      # Allow rules take precedence over Disallow when both match.
+      return true if @rules[:allow].any? { |rule| path_matches?(path, rule) }
+      return false if @rules[:disallow].any? { |rule| path_matches?(path, rule) }
+
+      true
+    rescue URI::InvalidURIError
+      true
+    end
+
+    # Returns the Crawl-delay value in seconds, or nil if not specified.
+    def crawl_delay
+      @rules[:crawl_delay]
+    end
+
+    private
+
+    def parse(content)
+      rules = { allow: [], disallow: [], crawl_delay: nil }
+      in_relevant_section = false
+
+      content.each_line do |raw_line|
+        line = raw_line.strip.sub(/#.*$/, '').strip
+        next if line.empty?
+
+        key, value = line.split(':', 2).map(&:strip)
+        next unless key && value
+
+        case key.downcase
+        when 'user-agent'
+          in_relevant_section = (value == '*')
+        when 'disallow'
+          rules[:disallow] << value if in_relevant_section && !value.empty?
+        when 'allow'
+          rules[:allow] << value if in_relevant_section && !value.empty?
+        when 'crawl-delay'
+          rules[:crawl_delay] = value.to_f if in_relevant_section && value.match?(/\A\d+(\.\d+)?\z/)
+        end
+      end
+
+      rules
+    end
+
+    # Matches a URL path against a robots.txt rule pattern.
+    # Supports * (wildcard) and $ (end-of-string anchor).
+    def path_matches?(path, rule)
+      return false if rule.empty?
+
+      pattern = Regexp.escape(rule).gsub('\*', '.*').gsub('\$', '\z')
+      path.match?(/\A#{pattern}/)
+    end
+  end
+end
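The wildcard translation in `path_matches?` above can be exercised in isolation. A standalone copy of the matching rule, renamed to make clear it is an extract for illustration:

```ruby
# Standalone copy of the pattern translation used by RobotsParser#path_matches?:
# * becomes .*, a trailing $ becomes an end-of-string anchor (\z), and rules
# always match from the start of the path.
def robots_match?(path, rule)
  return false if rule.empty?

  pattern = Regexp.escape(rule).gsub('\*', '.*').gsub('\$', '\z')
  path.match?(/\A#{pattern}/)
end

robots_match?('/private/data.html', '/private/') # => true  (prefix match)
robots_match?('/files/report.pdf', '/*.pdf$')    # => true  (wildcard + anchor)
robots_match?('/files/report.pdfx', '/*.pdf$')   # => false ($ anchors the end)
```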
data/lib/rubycrawl/site_crawler.rb
CHANGED
@@ -1,6 +1,7 @@
 # frozen_string_literal: true
 
 require 'set'
+require_relative 'robots_parser'
 
 class RubyCrawl
   # BFS crawler that follows links with deduplication.
@@ -46,7 +47,8 @@ class RubyCrawl
     @same_host_only = options.fetch(:same_host_only, true)
     @wait_until = options.fetch(:wait_until, nil)
     @block_resources = options.fetch(:block_resources, nil)
-    @max_attempts
+    @max_attempts = options.fetch(:max_attempts, nil)
+    @respect_robots_txt = options.fetch(:respect_robots_txt, false)
     @visited = Set.new
     @queue = []
   end
@@ -58,6 +60,7 @@ class RubyCrawl
     raise ConfigurationError, "Invalid start URL: #{start_url}" unless normalized
 
     @base_url = normalized
+    @robots = @respect_robots_txt ? RobotsParser.fetch(@base_url) : nil
     enqueue(normalized, 0)
     process_queue(&block)
   end
@@ -71,6 +74,8 @@ class RubyCrawl
       url, depth = item
       next if @visited.include?(url)
 
+      sleep(@robots.crawl_delay) if @robots&.crawl_delay && pages_crawled.positive?
+
       result = process_page(url, depth)
       next unless result
 
@@ -130,11 +135,20 @@ class RubyCrawl
       next unless normalized
       next if @visited.include?(normalized)
       next if @same_host_only && !UrlNormalizer.same_host?(normalized, @base_url)
+      next if robots_disallowed?(normalized)
 
       enqueue(normalized, depth)
     end
   end
 
+  def robots_disallowed?(url)
+    return false unless @robots
+    return false if @robots.allowed?(url)
+
+    warn "[rubycrawl] Skipping #{url} (disallowed by robots.txt)"
+    true
+  end
+
   def enqueue(url, depth)
     return if @visited.include?(url)
 
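The class comment above calls this a BFS crawler with deduplication, and the changes hook into that visited-set/queue bookkeeping. A standalone sketch of the traversal (the link graph is made up, and this is not the gem's code):

```ruby
require 'set'

# Standalone sketch of the crawler's BFS bookkeeping: a visited Set plus a
# FIFO queue of [url, depth] pairs. The link graph below is hypothetical.
links = {
  'https://example.com/'  => ['https://example.com/a', 'https://example.com/b'],
  'https://example.com/a' => ['https://example.com/'], # cycle back to the root
  'https://example.com/b' => []
}

visited = Set.new
queue = [['https://example.com/', 0]]
order = []

until queue.empty?
  url, depth = queue.shift
  next if visited.include?(url) # dedup: each URL is processed once

  visited << url
  order << url
  links.fetch(url, []).each do |link|
    queue << [link, depth + 1] unless visited.include?(link)
  end
end
# order == ["https://example.com/", "https://example.com/a", "https://example.com/b"]
```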
data/lib/rubycrawl/tasks/install.rake
CHANGED
@@ -27,11 +27,12 @@ namespace :rubycrawl do
 
     # RubyCrawl Configuration
     RubyCrawl.configure(
-      # wait_until: "load",
-      # block_resources: true,
-      # max_attempts: 3,
-      # timeout: 30,
-      # headless: true,
+      # wait_until: "load",          # "load", "domcontentloaded", "networkidle"
+      # block_resources: true,       # block images/fonts/CSS/media for speed
+      # max_attempts: 3,             # retry count with exponential backoff
+      # timeout: 30,                 # browser navigation timeout in seconds
+      # headless: true,              # set false to see the browser (debugging)
+      # respect_robots_txt: false,   # set true to honour robots.txt and Crawl-delay
     )
   RUBY
 
data/lib/rubycrawl/version.rb
CHANGED
data/lib/rubycrawl.rb
CHANGED
@@ -81,7 +81,8 @@ class RubyCrawl
     @max_attempts = options.fetch(:max_attempts, 3)
     @timeout = options.fetch(:timeout, 30)
     @headless = options.fetch(:headless, true)
-    @browser_options
+    @browser_options = options.fetch(:browser_options, {})
+    @respect_robots_txt = options.fetch(:respect_robots_txt, false)
   end
 
   def with_retries(max_attempts)
@@ -101,12 +102,13 @@ class RubyCrawl
 
   def build_crawler_options(options)
     {
-      max_pages:
-      max_depth:
-      same_host_only:
-      wait_until:
-      block_resources:
-      max_attempts:
+      max_pages: options.fetch(:max_pages, 50),
+      max_depth: options.fetch(:max_depth, 3),
+      same_host_only: options.fetch(:same_host_only, true),
+      wait_until: options.fetch(:wait_until, @wait_until),
+      block_resources: options.fetch(:block_resources, @block_resources),
+      max_attempts: options.fetch(:max_attempts, @max_attempts),
+      respect_robots_txt: options.fetch(:respect_robots_txt, @respect_robots_txt)
     }
   end
 end
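The rewritten `build_crawler_options` resolves each option as the per-call value first, then the configured instance default, via `Hash#fetch` with a fallback. That precedence can be shown in isolation:

```ruby
# Illustration of the options.fetch precedence used in build_crawler_options:
# a per-call value wins; otherwise the configured default applies.
configured_default = 5 # stands in for an instance default such as @max_attempts

per_call = { max_attempts: 1 }.fetch(:max_attempts, configured_default)
# per_call == 1  (explicit per-call value wins)

fallback = {}.fetch(:max_attempts, configured_default)
# fallback == 5  (no per-call value, configured default applies)
```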
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: rubycrawl
 version: !ruby/object:Gem::Version
-  version: 0.
+  version: 0.4.0
 platform: ruby
 authors:
 - RubyCrawl contributors
@@ -58,6 +58,7 @@ files:
 - lib/rubycrawl/markdown_converter.rb
 - lib/rubycrawl/railtie.rb
 - lib/rubycrawl/result.rb
+- lib/rubycrawl/robots_parser.rb
 - lib/rubycrawl/site_crawler.rb
 - lib/rubycrawl/tasks/install.rake
 - lib/rubycrawl/url_normalizer.rb