rubycrawl 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +81 -32
- data/lib/rubycrawl/browser/extraction.rb +34 -12
- data/lib/rubycrawl/browser/readability.js +2786 -0
- data/lib/rubycrawl/browser.rb +1 -1
- data/lib/rubycrawl/robots_parser.rb +86 -0
- data/lib/rubycrawl/site_crawler.rb +15 -1
- data/lib/rubycrawl/tasks/install.rake +6 -5
- data/lib/rubycrawl/version.rb +1 -1
- data/lib/rubycrawl.rb +9 -7
- metadata +4 -2
data/lib/rubycrawl/browser.rb
CHANGED
|
@@ -99,7 +99,7 @@ class RubyCrawl
|
|
|
99
99
|
raw_text: raw_text.to_s,
|
|
100
100
|
clean_html: content['cleanHtml'].to_s,
|
|
101
101
|
links: Array(links),
|
|
102
|
-
metadata: { 'final_url' => final_url }.merge(metadata || {})
|
|
102
|
+
metadata: { 'final_url' => final_url, 'extractor' => content['extractor'] }.merge(metadata || {})
|
|
103
103
|
)
|
|
104
104
|
end
|
|
105
105
|
end
|
|
# frozen_string_literal: true

require 'net/http'
require 'uri'

class RubyCrawl
  # Fetches and parses robots.txt for a given site.
  #
  # Implements the subset of RFC 9309 this crawler needs: the rules of
  # every group that applies to all agents (User-agent: *), with support
  # for Disallow, Allow, the * wildcard, the $ end anchor, and the
  # widely used (non-standard) Crawl-delay directive.
  # Fails open — any fetch/parse error allows all URLs.
  class RobotsParser
    # Fetch robots.txt from base_url and return a parser instance.
    #
    # Returns a permissive (allow-all) instance on any network error,
    # on a non-200 response, or when base_url is not a valid URL.
    def self.fetch(base_url)
      uri = URI.join(base_url, '/robots.txt')
      response = Net::HTTP.start(uri.host, uri.port,
                                 use_ssl: uri.scheme == 'https',
                                 open_timeout: 5,
                                 read_timeout: 5) do |http|
        http.get(uri.request_uri)
      end
      new(response.is_a?(Net::HTTPOK) ? response.body : '')
    rescue StandardError
      new('') # network error or invalid URL → allow everything
    end

    def initialize(content)
      @rules = parse(content.to_s)
    end

    # Returns true if the given URL is allowed to be crawled.
    #
    # FIX: matches against the path *plus query string* — RFC 9309 rules
    # match the full request target, so `Disallow: /search?q=` must be
    # able to match. Unparseable URLs are allowed (fail open).
    def allowed?(url)
      uri = URI.parse(url)
      target = uri.path
      target = '/' if target.nil? || target.empty?
      target = "#{target}?#{uri.query}" if uri.query

      # Allow rules take precedence over Disallow when both match.
      return true if @rules[:allow].any? { |rule| path_matches?(target, rule) }
      return false if @rules[:disallow].any? { |rule| path_matches?(target, rule) }

      true
    rescue URI::InvalidURIError
      true
    end

    # Returns the Crawl-delay value in seconds, or nil if not specified.
    def crawl_delay
      @rules[:crawl_delay]
    end

    private

    # Parses robots.txt content into { allow:, disallow:, crawl_delay: }.
    #
    # FIX: per RFC 9309 §2.2.1, consecutive User-agent lines form one
    # group and share the rules that follow; a User-agent line appearing
    # after a rule line starts a new group. Resetting the flag on every
    # User-agent line (as before) dropped rules from groups such as
    # "User-agent: *" followed by "User-agent: googlebot".
    def parse(content)
      rules = { allow: [], disallow: [], crawl_delay: nil }
      in_relevant_section = false
      starting_new_group = true # the next User-agent line opens a new group

      content.each_line do |raw_line|
        line = raw_line.strip.sub(/#.*$/, '').strip
        next if line.empty?

        key, value = line.split(':', 2).map(&:strip)
        next unless key && value

        case key.downcase
        when 'user-agent'
          if starting_new_group
            in_relevant_section = (value == '*')
            starting_new_group = false
          else
            # Additional agent line in the same group — the group stays
            # relevant if ANY of its agent lines is '*'.
            in_relevant_section ||= (value == '*')
          end
        when 'disallow'
          starting_new_group = true
          rules[:disallow] << value if in_relevant_section && !value.empty?
        when 'allow'
          starting_new_group = true
          rules[:allow] << value if in_relevant_section && !value.empty?
        when 'crawl-delay'
          starting_new_group = true
          rules[:crawl_delay] = value.to_f if in_relevant_section && value.match?(/\A\d+(\.\d+)?\z/)
        end
      end

      rules
    end

    # Matches a URL target against a robots.txt rule pattern.
    # Supports * (any character run) and $ (end-of-string anchor);
    # everything else is literal, anchored at the start of the target.
    def path_matches?(path, rule)
      return false if rule.empty?

      pattern = Regexp.escape(rule).gsub('\*', '.*').gsub('\$', '\z')
      path.match?(/\A#{pattern}/)
    end
  end
end
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require 'set'
|
|
4
|
+
require_relative 'robots_parser'
|
|
4
5
|
|
|
5
6
|
class RubyCrawl
|
|
6
7
|
# BFS crawler that follows links with deduplication.
|
|
@@ -46,7 +47,8 @@ class RubyCrawl
|
|
|
46
47
|
@same_host_only = options.fetch(:same_host_only, true)
|
|
47
48
|
@wait_until = options.fetch(:wait_until, nil)
|
|
48
49
|
@block_resources = options.fetch(:block_resources, nil)
|
|
49
|
-
@max_attempts
|
|
50
|
+
@max_attempts = options.fetch(:max_attempts, nil)
|
|
51
|
+
@respect_robots_txt = options.fetch(:respect_robots_txt, false)
|
|
50
52
|
@visited = Set.new
|
|
51
53
|
@queue = []
|
|
52
54
|
end
|
|
@@ -58,6 +60,7 @@ class RubyCrawl
|
|
|
58
60
|
raise ConfigurationError, "Invalid start URL: #{start_url}" unless normalized
|
|
59
61
|
|
|
60
62
|
@base_url = normalized
|
|
63
|
+
@robots = @respect_robots_txt ? RobotsParser.fetch(@base_url) : nil
|
|
61
64
|
enqueue(normalized, 0)
|
|
62
65
|
process_queue(&block)
|
|
63
66
|
end
|
|
@@ -71,6 +74,8 @@ class RubyCrawl
|
|
|
71
74
|
url, depth = item
|
|
72
75
|
next if @visited.include?(url)
|
|
73
76
|
|
|
77
|
+
sleep(@robots.crawl_delay) if @robots&.crawl_delay && pages_crawled.positive?
|
|
78
|
+
|
|
74
79
|
result = process_page(url, depth)
|
|
75
80
|
next unless result
|
|
76
81
|
|
|
@@ -130,11 +135,20 @@ class RubyCrawl
|
|
|
130
135
|
next unless normalized
|
|
131
136
|
next if @visited.include?(normalized)
|
|
132
137
|
next if @same_host_only && !UrlNormalizer.same_host?(normalized, @base_url)
|
|
138
|
+
next if robots_disallowed?(normalized)
|
|
133
139
|
|
|
134
140
|
enqueue(normalized, depth)
|
|
135
141
|
end
|
|
136
142
|
end
|
|
137
143
|
|
|
144
|
+
def robots_disallowed?(url)
|
|
145
|
+
return false unless @robots
|
|
146
|
+
return false if @robots.allowed?(url)
|
|
147
|
+
|
|
148
|
+
warn "[rubycrawl] Skipping #{url} (disallowed by robots.txt)"
|
|
149
|
+
true
|
|
150
|
+
end
|
|
151
|
+
|
|
138
152
|
def enqueue(url, depth)
|
|
139
153
|
return if @visited.include?(url)
|
|
140
154
|
|
|
@@ -27,11 +27,12 @@ namespace :rubycrawl do
|
|
|
27
27
|
|
|
28
28
|
# RubyCrawl Configuration
|
|
29
29
|
RubyCrawl.configure(
|
|
30
|
-
# wait_until: "load",
|
|
31
|
-
# block_resources: true,
|
|
32
|
-
# max_attempts: 3,
|
|
33
|
-
# timeout: 30,
|
|
34
|
-
# headless: true,
|
|
30
|
+
# wait_until: "load", # "load", "domcontentloaded", "networkidle"
|
|
31
|
+
# block_resources: true, # block images/fonts/CSS/media for speed
|
|
32
|
+
# max_attempts: 3, # retry count with exponential backoff
|
|
33
|
+
# timeout: 30, # browser navigation timeout in seconds
|
|
34
|
+
# headless: true, # set false to see the browser (debugging)
|
|
35
|
+
# respect_robots_txt: false, # set true to honour robots.txt and Crawl-delay
|
|
35
36
|
)
|
|
36
37
|
RUBY
|
|
37
38
|
|
data/lib/rubycrawl/version.rb
CHANGED
data/lib/rubycrawl.rb
CHANGED
|
@@ -81,7 +81,8 @@ class RubyCrawl
|
|
|
81
81
|
@max_attempts = options.fetch(:max_attempts, 3)
|
|
82
82
|
@timeout = options.fetch(:timeout, 30)
|
|
83
83
|
@headless = options.fetch(:headless, true)
|
|
84
|
-
@browser_options
|
|
84
|
+
@browser_options = options.fetch(:browser_options, {})
|
|
85
|
+
@respect_robots_txt = options.fetch(:respect_robots_txt, false)
|
|
85
86
|
end
|
|
86
87
|
|
|
87
88
|
def with_retries(max_attempts)
|
|
@@ -101,12 +102,13 @@ class RubyCrawl
|
|
|
101
102
|
|
|
102
103
|
def build_crawler_options(options)
|
|
103
104
|
{
|
|
104
|
-
max_pages:
|
|
105
|
-
max_depth:
|
|
106
|
-
same_host_only:
|
|
107
|
-
wait_until:
|
|
108
|
-
block_resources:
|
|
109
|
-
max_attempts:
|
|
105
|
+
max_pages: options.fetch(:max_pages, 50),
|
|
106
|
+
max_depth: options.fetch(:max_depth, 3),
|
|
107
|
+
same_host_only: options.fetch(:same_host_only, true),
|
|
108
|
+
wait_until: options.fetch(:wait_until, @wait_until),
|
|
109
|
+
block_resources: options.fetch(:block_resources, @block_resources),
|
|
110
|
+
max_attempts: options.fetch(:max_attempts, @max_attempts),
|
|
111
|
+
respect_robots_txt: options.fetch(:respect_robots_txt, @respect_robots_txt)
|
|
110
112
|
}
|
|
111
113
|
end
|
|
112
114
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: rubycrawl
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.4.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- RubyCrawl contributors
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-03-
|
|
11
|
+
date: 2026-03-17 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: ferrum
|
|
@@ -52,11 +52,13 @@ files:
|
|
|
52
52
|
- lib/rubycrawl.rb
|
|
53
53
|
- lib/rubycrawl/browser.rb
|
|
54
54
|
- lib/rubycrawl/browser/extraction.rb
|
|
55
|
+
- lib/rubycrawl/browser/readability.js
|
|
55
56
|
- lib/rubycrawl/errors.rb
|
|
56
57
|
- lib/rubycrawl/helpers.rb
|
|
57
58
|
- lib/rubycrawl/markdown_converter.rb
|
|
58
59
|
- lib/rubycrawl/railtie.rb
|
|
59
60
|
- lib/rubycrawl/result.rb
|
|
61
|
+
- lib/rubycrawl/robots_parser.rb
|
|
60
62
|
- lib/rubycrawl/site_crawler.rb
|
|
61
63
|
- lib/rubycrawl/tasks/install.rake
|
|
62
64
|
- lib/rubycrawl/url_normalizer.rb
|