rubycrawl 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +81 -32
- data/lib/rubycrawl/browser/extraction.rb +34 -12
- data/lib/rubycrawl/browser/readability.js +2786 -0
- data/lib/rubycrawl/browser.rb +1 -1
- data/lib/rubycrawl/robots_parser.rb +86 -0
- data/lib/rubycrawl/site_crawler.rb +15 -1
- data/lib/rubycrawl/tasks/install.rake +6 -5
- data/lib/rubycrawl/version.rb +1 -1
- data/lib/rubycrawl.rb +9 -7
- metadata +4 -2
data/lib/rubycrawl/browser.rb
CHANGED
|
@@ -99,7 +99,7 @@ class RubyCrawl
|
|
|
99
99
|
raw_text: raw_text.to_s,
|
|
100
100
|
clean_html: content['cleanHtml'].to_s,
|
|
101
101
|
links: Array(links),
|
|
102
|
-
metadata: { 'final_url' => final_url }.merge(metadata || {})
|
|
102
|
+
metadata: { 'final_url' => final_url, 'extractor' => content['extractor'] }.merge(metadata || {})
|
|
103
103
|
)
|
|
104
104
|
end
|
|
105
105
|
end
|
|
# frozen_string_literal: true

require 'net/http'
require 'uri'

class RubyCrawl
  # Fetches and parses robots.txt for a given site.
  #
  # Implements the subset of RFC 9309 this crawler needs: the rules of
  # every group that applies to all agents (User-agent: *), with support
  # for Disallow, Allow, the * wildcard, the $ end anchor, and the
  # widely used (non-standard) Crawl-delay directive.
  # Fails open — any fetch/parse error allows all URLs.
  class RobotsParser
    # Fetch robots.txt from base_url and return a parser instance.
    #
    # Returns a permissive (allow-all) instance on any network error,
    # on a non-200 response, or when base_url is not a valid URL.
    def self.fetch(base_url)
      uri = URI.join(base_url, '/robots.txt')
      response = Net::HTTP.start(uri.host, uri.port,
                                 use_ssl: uri.scheme == 'https',
                                 open_timeout: 5,
                                 read_timeout: 5) do |http|
        http.get(uri.request_uri)
      end
      new(response.is_a?(Net::HTTPOK) ? response.body : '')
    rescue StandardError
      new('') # network error or invalid URL → allow everything
    end

    def initialize(content)
      @rules = parse(content.to_s)
    end

    # Returns true if the given URL is allowed to be crawled.
    #
    # FIX: matches against the path *plus query string* — RFC 9309 rules
    # match the full request target, so `Disallow: /search?q=` must be
    # able to match. Unparseable URLs are allowed (fail open).
    def allowed?(url)
      uri = URI.parse(url)
      target = uri.path
      target = '/' if target.nil? || target.empty?
      target = "#{target}?#{uri.query}" if uri.query

      # Allow rules take precedence over Disallow when both match.
      return true if @rules[:allow].any? { |rule| path_matches?(target, rule) }
      return false if @rules[:disallow].any? { |rule| path_matches?(target, rule) }

      true
    rescue URI::InvalidURIError
      true
    end

    # Returns the Crawl-delay value in seconds, or nil if not specified.
    def crawl_delay
      @rules[:crawl_delay]
    end

    private

    # Parses robots.txt content into { allow:, disallow:, crawl_delay: }.
    #
    # FIX: per RFC 9309 §2.2.1, consecutive User-agent lines form one
    # group and share the rules that follow; a User-agent line appearing
    # after a rule line starts a new group. Resetting the flag on every
    # User-agent line (as before) dropped rules from groups such as
    # "User-agent: *" followed by "User-agent: googlebot".
    def parse(content)
      rules = { allow: [], disallow: [], crawl_delay: nil }
      in_relevant_section = false
      starting_new_group = true # the next User-agent line opens a new group

      content.each_line do |raw_line|
        line = raw_line.strip.sub(/#.*$/, '').strip
        next if line.empty?

        key, value = line.split(':', 2).map(&:strip)
        next unless key && value

        case key.downcase
        when 'user-agent'
          if starting_new_group
            in_relevant_section = (value == '*')
            starting_new_group = false
          else
            # Additional agent line in the same group — the group stays
            # relevant if ANY of its agent lines is '*'.
            in_relevant_section ||= (value == '*')
          end
        when 'disallow'
          starting_new_group = true
          rules[:disallow] << value if in_relevant_section && !value.empty?
        when 'allow'
          starting_new_group = true
          rules[:allow] << value if in_relevant_section && !value.empty?
        when 'crawl-delay'
          starting_new_group = true
          rules[:crawl_delay] = value.to_f if in_relevant_section && value.match?(/\A\d+(\.\d+)?\z/)
        end
      end

      rules
    end

    # Matches a URL target against a robots.txt rule pattern.
    # Supports * (any character run) and $ (end-of-string anchor);
    # everything else is literal, anchored at the start of the target.
    def path_matches?(path, rule)
      return false if rule.empty?

      pattern = Regexp.escape(rule).gsub('\*', '.*').gsub('\$', '\z')
      path.match?(/\A#{pattern}/)
    end
  end
end
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require 'set'
|
|
4
|
+
require_relative 'robots_parser'
|
|
4
5
|
|
|
5
6
|
class RubyCrawl
|
|
6
7
|
# BFS crawler that follows links with deduplication.
|
|
@@ -46,7 +47,8 @@ class RubyCrawl
|
|
|
46
47
|
@same_host_only = options.fetch(:same_host_only, true)
|
|
47
48
|
@wait_until = options.fetch(:wait_until, nil)
|
|
48
49
|
@block_resources = options.fetch(:block_resources, nil)
|
|
49
|
-
@max_attempts
|
|
50
|
+
@max_attempts = options.fetch(:max_attempts, nil)
|
|
51
|
+
@respect_robots_txt = options.fetch(:respect_robots_txt, false)
|
|
50
52
|
@visited = Set.new
|
|
51
53
|
@queue = []
|
|
52
54
|
end
|
|
@@ -58,6 +60,7 @@ class RubyCrawl
|
|
|
58
60
|
raise ConfigurationError, "Invalid start URL: #{start_url}" unless normalized
|
|
59
61
|
|
|
60
62
|
@base_url = normalized
|
|
63
|
+
@robots = @respect_robots_txt ? RobotsParser.fetch(@base_url) : nil
|
|
61
64
|
enqueue(normalized, 0)
|
|
62
65
|
process_queue(&block)
|
|
63
66
|
end
|
|
@@ -71,6 +74,8 @@ class RubyCrawl
|
|
|
71
74
|
url, depth = item
|
|
72
75
|
next if @visited.include?(url)
|
|
73
76
|
|
|
77
|
+
sleep(@robots.crawl_delay) if @robots&.crawl_delay && pages_crawled.positive?
|
|
78
|
+
|
|
74
79
|
result = process_page(url, depth)
|
|
75
80
|
next unless result
|
|
76
81
|
|
|
@@ -130,11 +135,20 @@ class RubyCrawl
|
|
|
130
135
|
next unless normalized
|
|
131
136
|
next if @visited.include?(normalized)
|
|
132
137
|
next if @same_host_only && !UrlNormalizer.same_host?(normalized, @base_url)
|
|
138
|
+
next if robots_disallowed?(normalized)
|
|
133
139
|
|
|
134
140
|
enqueue(normalized, depth)
|
|
135
141
|
end
|
|
136
142
|
end
|
|
137
143
|
|
|
144
|
+
def robots_disallowed?(url)
|
|
145
|
+
return false unless @robots
|
|
146
|
+
return false if @robots.allowed?(url)
|
|
147
|
+
|
|
148
|
+
warn "[rubycrawl] Skipping #{url} (disallowed by robots.txt)"
|
|
149
|
+
true
|
|
150
|
+
end
|
|
151
|
+
|
|
138
152
|
def enqueue(url, depth)
|
|
139
153
|
return if @visited.include?(url)
|
|
140
154
|
|
|
@@ -27,11 +27,12 @@ namespace :rubycrawl do
|
|
|
27
27
|
|
|
28
28
|
# RubyCrawl Configuration
|
|
29
29
|
RubyCrawl.configure(
|
|
30
|
-
# wait_until: "load",
|
|
31
|
-
# block_resources: true,
|
|
32
|
-
# max_attempts: 3,
|
|
33
|
-
# timeout: 30,
|
|
34
|
-
# headless: true,
|
|
30
|
+
# wait_until: "load", # "load", "domcontentloaded", "networkidle"
|
|
31
|
+
# block_resources: true, # block images/fonts/CSS/media for speed
|
|
32
|
+
# max_attempts: 3, # retry count with exponential backoff
|
|
33
|
+
# timeout: 30, # browser navigation timeout in seconds
|
|
34
|
+
# headless: true, # set false to see the browser (debugging)
|
|
35
|
+
# respect_robots_txt: false, # set true to honour robots.txt and Crawl-delay
|
|
35
36
|
)
|
|
36
37
|
RUBY
|
|
37
38
|
|
data/lib/rubycrawl/version.rb
CHANGED
data/lib/rubycrawl.rb
CHANGED
|
@@ -81,7 +81,8 @@ class RubyCrawl
|
|
|
81
81
|
@max_attempts = options.fetch(:max_attempts, 3)
|
|
82
82
|
@timeout = options.fetch(:timeout, 30)
|
|
83
83
|
@headless = options.fetch(:headless, true)
|
|
84
|
-
@browser_options
|
|
84
|
+
@browser_options = options.fetch(:browser_options, {})
|
|
85
|
+
@respect_robots_txt = options.fetch(:respect_robots_txt, false)
|
|
85
86
|
end
|
|
86
87
|
|
|
87
88
|
def with_retries(max_attempts)
|
|
@@ -101,12 +102,13 @@ class RubyCrawl
|
|
|
101
102
|
|
|
102
103
|
def build_crawler_options(options)
|
|
103
104
|
{
|
|
104
|
-
max_pages:
|
|
105
|
-
max_depth:
|
|
106
|
-
same_host_only:
|
|
107
|
-
wait_until:
|
|
108
|
-
block_resources:
|
|
109
|
-
max_attempts:
|
|
105
|
+
max_pages: options.fetch(:max_pages, 50),
|
|
106
|
+
max_depth: options.fetch(:max_depth, 3),
|
|
107
|
+
same_host_only: options.fetch(:same_host_only, true),
|
|
108
|
+
wait_until: options.fetch(:wait_until, @wait_until),
|
|
109
|
+
block_resources: options.fetch(:block_resources, @block_resources),
|
|
110
|
+
max_attempts: options.fetch(:max_attempts, @max_attempts),
|
|
111
|
+
respect_robots_txt: options.fetch(:respect_robots_txt, @respect_robots_txt)
|
|
110
112
|
}
|
|
111
113
|
end
|
|
112
114
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: rubycrawl
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.4.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- RubyCrawl contributors
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-03-
|
|
11
|
+
date: 2026-03-17 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: ferrum
|
|
@@ -52,11 +52,13 @@ files:
|
|
|
52
52
|
- lib/rubycrawl.rb
|
|
53
53
|
- lib/rubycrawl/browser.rb
|
|
54
54
|
- lib/rubycrawl/browser/extraction.rb
|
|
55
|
+
- lib/rubycrawl/browser/readability.js
|
|
55
56
|
- lib/rubycrawl/errors.rb
|
|
56
57
|
- lib/rubycrawl/helpers.rb
|
|
57
58
|
- lib/rubycrawl/markdown_converter.rb
|
|
58
59
|
- lib/rubycrawl/railtie.rb
|
|
59
60
|
- lib/rubycrawl/result.rb
|
|
61
|
+
- lib/rubycrawl/robots_parser.rb
|
|
60
62
|
- lib/rubycrawl/site_crawler.rb
|
|
61
63
|
- lib/rubycrawl/tasks/install.rake
|
|
62
64
|
- lib/rubycrawl/url_normalizer.rb
|