rubycrawl 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -99,7 +99,7 @@ class RubyCrawl
99
99
  raw_text: raw_text.to_s,
100
100
  clean_html: content['cleanHtml'].to_s,
101
101
  links: Array(links),
102
- metadata: { 'final_url' => final_url }.merge(metadata || {})
102
+ metadata: { 'final_url' => final_url, 'extractor' => content['extractor'] }.merge(metadata || {})
103
103
  )
104
104
  end
105
105
  end
@@ -0,0 +1,86 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'net/http'
4
+ require 'uri'
5
+
6
class RubyCrawl
  # Fetches and parses robots.txt for a given site.
  # Supports User-agent: *, Disallow, Allow, and Crawl-delay directives.
  # Fails open — any fetch/parse error allows all URLs.
  class RobotsParser
    # Fetch robots.txt from base_url and return a parser instance.
    # Returns a permissive (allow-all) instance on any network error.
    #
    # @param base_url [String] site root (scheme + host) to resolve /robots.txt against
    # @return [RobotsParser]
    def self.fetch(base_url)
      uri = URI.join(base_url, '/robots.txt')
      response = Net::HTTP.start(uri.host, uri.port,
                                 use_ssl: uri.scheme == 'https',
                                 open_timeout: 5,
                                 read_timeout: 5) do |http|
        http.get(uri.request_uri)
      end
      # Any non-200 response (including redirects) is treated as "no rules".
      new(response.is_a?(Net::HTTPOK) ? response.body : '')
    rescue StandardError
      new('') # network error or invalid URL → allow everything
    end

    # @param content [String, nil] raw robots.txt body; nil is treated as empty
    def initialize(content)
      @rules = parse(content.to_s)
    end

    # Returns true if the given URL is allowed to be crawled.
    #
    # Per RFC 9309 the match target is the URL's path plus its query string
    # (if any), so rules such as "Disallow: /*?sort=" are honoured.
    # Unparseable URLs fail open.
    def allowed?(url)
      uri = URI.parse(url)
      target = uri.path
      target = '/' if target.nil? || target.empty?
      target += "?#{uri.query}" if uri.query

      # Allow rules take precedence over Disallow when both match.
      return true if @rules[:allow].any? { |rule| path_matches?(target, rule) }
      return false if @rules[:disallow].any? { |rule| path_matches?(target, rule) }

      true
    rescue URI::InvalidURIError
      true
    end

    # Returns the Crawl-delay value in seconds, or nil if not specified.
    def crawl_delay
      @rules[:crawl_delay]
    end

    private

    # Parses robots.txt content into
    # { allow: [...], disallow: [...], crawl_delay: Float | nil },
    # keeping only rules from User-agent: * sections.
    def parse(content)
      rules = { allow: [], disallow: [], crawl_delay: nil }
      in_relevant_section = false

      content.each_line do |raw_line|
        # Strip inline comments ("Disallow: /x # why") and surrounding whitespace.
        line = raw_line.strip.sub(/#.*$/, '').strip
        next if line.empty?

        key, value = line.split(':', 2).map(&:strip)
        next unless key && value

        case key.downcase
        when 'user-agent'
          in_relevant_section = (value == '*')
        when 'disallow'
          # An empty Disallow value means "allow everything" — nothing to record.
          rules[:disallow] << value if in_relevant_section && !value.empty?
        when 'allow'
          rules[:allow] << value if in_relevant_section && !value.empty?
        when 'crawl-delay'
          rules[:crawl_delay] = value.to_f if in_relevant_section && value.match?(/\A\d+(\.\d+)?\z/)
        end
      end

      rules
    end

    # Matches a URL path against a robots.txt rule pattern.
    # Supports * (wildcard anywhere) and $ (end-of-string anchor).
    # Per RFC 9309, $ is only special when it is the LAST character of the
    # rule; anywhere else it is a literal character.
    def path_matches?(path, rule)
      return false if rule.empty?

      anchored = rule.end_with?('$')
      body = anchored ? rule[0..-2] : rule
      pattern = Regexp.escape(body).gsub('\*', '.*')
      pattern += '\z' if anchored
      path.match?(/\A#{pattern}/)
    end
  end
end
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'set'
4
+ require_relative 'robots_parser'
4
5
 
5
6
  class RubyCrawl
6
7
  # BFS crawler that follows links with deduplication.
@@ -46,7 +47,8 @@ class RubyCrawl
46
47
  @same_host_only = options.fetch(:same_host_only, true)
47
48
  @wait_until = options.fetch(:wait_until, nil)
48
49
  @block_resources = options.fetch(:block_resources, nil)
49
- @max_attempts = options.fetch(:max_attempts, nil)
50
+ @max_attempts = options.fetch(:max_attempts, nil)
51
+ @respect_robots_txt = options.fetch(:respect_robots_txt, false)
50
52
  @visited = Set.new
51
53
  @queue = []
52
54
  end
@@ -58,6 +60,7 @@ class RubyCrawl
58
60
  raise ConfigurationError, "Invalid start URL: #{start_url}" unless normalized
59
61
 
60
62
  @base_url = normalized
63
+ @robots = @respect_robots_txt ? RobotsParser.fetch(@base_url) : nil
61
64
  enqueue(normalized, 0)
62
65
  process_queue(&block)
63
66
  end
@@ -71,6 +74,8 @@ class RubyCrawl
71
74
  url, depth = item
72
75
  next if @visited.include?(url)
73
76
 
77
+ sleep(@robots.crawl_delay) if @robots&.crawl_delay && pages_crawled.positive?
78
+
74
79
  result = process_page(url, depth)
75
80
  next unless result
76
81
 
@@ -130,11 +135,20 @@ class RubyCrawl
130
135
  next unless normalized
131
136
  next if @visited.include?(normalized)
132
137
  next if @same_host_only && !UrlNormalizer.same_host?(normalized, @base_url)
138
+ next if robots_disallowed?(normalized)
133
139
 
134
140
  enqueue(normalized, depth)
135
141
  end
136
142
  end
137
143
 
144
+ def robots_disallowed?(url)
145
+ return false unless @robots
146
+ return false if @robots.allowed?(url)
147
+
148
+ warn "[rubycrawl] Skipping #{url} (disallowed by robots.txt)"
149
+ true
150
+ end
151
+
138
152
  def enqueue(url, depth)
139
153
  return if @visited.include?(url)
140
154
 
@@ -27,11 +27,12 @@ namespace :rubycrawl do
27
27
 
28
28
  # RubyCrawl Configuration
29
29
  RubyCrawl.configure(
30
- # wait_until: "load", # "load", "domcontentloaded", "networkidle"
31
- # block_resources: true, # block images/fonts/CSS/media for speed
32
- # max_attempts: 3, # retry count with exponential backoff
33
- # timeout: 30, # browser navigation timeout in seconds
34
- # headless: true, # set false to see the browser (debugging)
30
+ # wait_until: "load", # "load", "domcontentloaded", "networkidle"
31
+ # block_resources: true, # block images/fonts/CSS/media for speed
32
+ # max_attempts: 3, # retry count with exponential backoff
33
+ # timeout: 30, # browser navigation timeout in seconds
34
+ # headless: true, # set false to see the browser (debugging)
35
+ # respect_robots_txt: false, # set true to honour robots.txt and Crawl-delay
35
36
  )
36
37
  RUBY
37
38
 
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  class RubyCrawl
4
- VERSION = '0.2.0'
4
+ VERSION = '0.4.0'
5
5
  end
data/lib/rubycrawl.rb CHANGED
@@ -81,7 +81,8 @@ class RubyCrawl
81
81
  @max_attempts = options.fetch(:max_attempts, 3)
82
82
  @timeout = options.fetch(:timeout, 30)
83
83
  @headless = options.fetch(:headless, true)
84
- @browser_options = options.fetch(:browser_options, {})
84
+ @browser_options = options.fetch(:browser_options, {})
85
+ @respect_robots_txt = options.fetch(:respect_robots_txt, false)
85
86
  end
86
87
 
87
88
  def with_retries(max_attempts)
@@ -101,12 +102,13 @@ class RubyCrawl
101
102
 
102
103
  def build_crawler_options(options)
103
104
  {
104
- max_pages: options.fetch(:max_pages, 50),
105
- max_depth: options.fetch(:max_depth, 3),
106
- same_host_only: options.fetch(:same_host_only, true),
107
- wait_until: options.fetch(:wait_until, @wait_until),
108
- block_resources: options.fetch(:block_resources, @block_resources),
109
- max_attempts: options.fetch(:max_attempts, @max_attempts)
105
+ max_pages: options.fetch(:max_pages, 50),
106
+ max_depth: options.fetch(:max_depth, 3),
107
+ same_host_only: options.fetch(:same_host_only, true),
108
+ wait_until: options.fetch(:wait_until, @wait_until),
109
+ block_resources: options.fetch(:block_resources, @block_resources),
110
+ max_attempts: options.fetch(:max_attempts, @max_attempts),
111
+ respect_robots_txt: options.fetch(:respect_robots_txt, @respect_robots_txt)
110
112
  }
111
113
  end
112
114
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rubycrawl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - RubyCrawl contributors
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2026-03-16 00:00:00.000000000 Z
11
+ date: 2026-03-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ferrum
@@ -52,11 +52,13 @@ files:
52
52
  - lib/rubycrawl.rb
53
53
  - lib/rubycrawl/browser.rb
54
54
  - lib/rubycrawl/browser/extraction.rb
55
+ - lib/rubycrawl/browser/readability.js
55
56
  - lib/rubycrawl/errors.rb
56
57
  - lib/rubycrawl/helpers.rb
57
58
  - lib/rubycrawl/markdown_converter.rb
58
59
  - lib/rubycrawl/railtie.rb
59
60
  - lib/rubycrawl/result.rb
61
+ - lib/rubycrawl/robots_parser.rb
60
62
  - lib/rubycrawl/site_crawler.rb
61
63
  - lib/rubycrawl/tasks/install.rake
62
64
  - lib/rubycrawl/url_normalizer.rb