rubycrawl 0.1.3 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +263 -311
- data/lib/rubycrawl/browser/extraction.rb +106 -0
- data/lib/rubycrawl/browser.rb +106 -0
- data/lib/rubycrawl/errors.rb +1 -1
- data/lib/rubycrawl/helpers.rb +9 -41
- data/lib/rubycrawl/markdown_converter.rb +5 -5
- data/lib/rubycrawl/result.rb +55 -25
- data/lib/rubycrawl/site_crawler.rb +46 -20
- data/lib/rubycrawl/tasks/install.rake +17 -56
- data/lib/rubycrawl/url_normalizer.rb +5 -1
- data/lib/rubycrawl/version.rb +1 -1
- data/lib/rubycrawl.rb +37 -66
- data/rubycrawl.gemspec +5 -5
- metadata +20 -6
- data/Gemfile +0 -11
- data/lib/rubycrawl/service_client.rb +0 -86
data/lib/rubycrawl.rb
CHANGED
|
@@ -3,25 +3,26 @@
|
|
|
3
3
|
require_relative 'rubycrawl/version'
|
|
4
4
|
require_relative 'rubycrawl/errors'
|
|
5
5
|
require_relative 'rubycrawl/helpers'
|
|
6
|
-
require_relative 'rubycrawl/service_client'
|
|
6
|
+
require_relative 'rubycrawl/browser'
|
|
7
7
|
require_relative 'rubycrawl/url_normalizer'
|
|
8
8
|
require_relative 'rubycrawl/markdown_converter'
|
|
9
9
|
require_relative 'rubycrawl/result'
|
|
10
10
|
require_relative 'rubycrawl/site_crawler'
|
|
11
11
|
require_relative 'rubycrawl/railtie' if defined?(Rails)
|
|
12
12
|
|
|
13
|
-
# RubyCrawl
|
|
13
|
+
# RubyCrawl — pure Ruby web crawler with full JavaScript rendering via Ferrum.
|
|
14
14
|
class RubyCrawl
|
|
15
15
|
include Helpers
|
|
16
16
|
|
|
17
|
-
DEFAULT_HOST = '127.0.0.1'
|
|
18
|
-
DEFAULT_PORT = 3344
|
|
19
|
-
|
|
20
17
|
class << self
|
|
21
18
|
def client
|
|
22
19
|
@client ||= new
|
|
23
20
|
end
|
|
24
21
|
|
|
22
|
+
# Crawl a single URL and return a Result.
|
|
23
|
+
# @param url [String]
|
|
24
|
+
# @param options [Hash] wait_until:, block_resources:, max_attempts:
|
|
25
|
+
# @return [RubyCrawl::Result]
|
|
25
26
|
def crawl(url, **options)
|
|
26
27
|
client.crawl(url, **options)
|
|
27
28
|
end
|
|
@@ -34,12 +35,12 @@ class RubyCrawl
|
|
|
34
35
|
# @param max_depth [Integer] Maximum link depth from start URL (default: 3)
|
|
35
36
|
# @param same_host_only [Boolean] Only follow links on the same host (default: true)
|
|
36
37
|
# @yield [page] Yields each page result as it is crawled
|
|
37
|
-
# @yieldparam page [SiteCrawler::PageResult]
|
|
38
|
+
# @yieldparam page [SiteCrawler::PageResult]
|
|
38
39
|
# @return [Integer] Number of pages crawled
|
|
39
40
|
#
|
|
40
|
-
# @example
|
|
41
|
+
# @example
|
|
41
42
|
# RubyCrawl.crawl_site("https://example.com", max_pages: 100) do |page|
|
|
42
|
-
# Page.create!(url: page.url,
|
|
43
|
+
# Page.create!(url: page.url, content: page.clean_text, depth: page.depth)
|
|
43
44
|
# end
|
|
44
45
|
def crawl_site(url, ...)
|
|
45
46
|
client.crawl_site(url, ...)
|
|
@@ -52,90 +53,60 @@ class RubyCrawl
|
|
|
52
53
|
|
|
53
54
|
def initialize(**options)
|
|
54
55
|
load_options(options)
|
|
55
|
-
|
|
56
|
+
@browser = Browser.new(
|
|
57
|
+
timeout: @timeout,
|
|
58
|
+
headless: @headless,
|
|
59
|
+
browser_options: @browser_options
|
|
60
|
+
)
|
|
56
61
|
end
|
|
57
62
|
|
|
58
|
-
def crawl(url, wait_until: @wait_until, block_resources: @block_resources,
|
|
63
|
+
def crawl(url, wait_until: @wait_until, block_resources: @block_resources, max_attempts: @max_attempts)
|
|
59
64
|
validate_url!(url)
|
|
60
|
-
|
|
61
|
-
with_retries(
|
|
62
|
-
|
|
63
|
-
response = @service_client.post_json('/crawl', payload)
|
|
64
|
-
raise_node_error!(response)
|
|
65
|
-
build_result(response)
|
|
65
|
+
validate_wait_until!(wait_until)
|
|
66
|
+
with_retries(max_attempts) do
|
|
67
|
+
@browser.crawl(url, wait_until: wait_until, block_resources: block_resources)
|
|
66
68
|
end
|
|
67
69
|
end
|
|
68
70
|
|
|
69
|
-
# Crawl multiple pages starting from a URL, following links.
|
|
70
|
-
# @see RubyCrawl.crawl_site
|
|
71
71
|
def crawl_site(url, **options, &block)
|
|
72
|
-
@service_client.ensure_running
|
|
73
72
|
crawler_options = build_crawler_options(options)
|
|
74
|
-
|
|
75
|
-
crawler.crawl(url, &block)
|
|
73
|
+
SiteCrawler.new(self, crawler_options).crawl(url, &block)
|
|
76
74
|
end
|
|
77
75
|
|
|
78
76
|
private
|
|
79
77
|
|
|
80
|
-
def
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
78
|
+
def load_options(options)
|
|
79
|
+
@wait_until = options.fetch(:wait_until, nil)
|
|
80
|
+
@block_resources = options.fetch(:block_resources, nil)
|
|
81
|
+
@max_attempts = options.fetch(:max_attempts, 3)
|
|
82
|
+
@timeout = options.fetch(:timeout, 30)
|
|
83
|
+
@headless = options.fetch(:headless, true)
|
|
84
|
+
@browser_options = options.fetch(:browser_options, {})
|
|
86
85
|
end
|
|
87
86
|
|
|
88
|
-
def with_retries(
|
|
87
|
+
def with_retries(max_attempts)
|
|
89
88
|
attempt = 0
|
|
90
89
|
begin
|
|
91
90
|
yield
|
|
92
91
|
rescue ServiceError, TimeoutError => e
|
|
93
92
|
attempt += 1
|
|
94
|
-
raise unless attempt <
|
|
93
|
+
raise unless attempt < max_attempts
|
|
95
94
|
|
|
96
|
-
|
|
95
|
+
backoff = 2**attempt
|
|
96
|
+
warn "[rubycrawl] Attempt #{attempt + 1}/#{max_attempts} failed, retrying in #{backoff}s: #{e.message}"
|
|
97
|
+
sleep(backoff)
|
|
97
98
|
retry
|
|
98
99
|
end
|
|
99
100
|
end
|
|
100
101
|
|
|
101
|
-
def load_options(options)
|
|
102
|
-
@host = options.fetch(:host, DEFAULT_HOST)
|
|
103
|
-
@port = Integer(options.fetch(:port, DEFAULT_PORT))
|
|
104
|
-
@node_dir = options.fetch(:node_dir, default_node_dir)
|
|
105
|
-
@node_bin = options.fetch(:node_bin, ENV.fetch('RUBYCRAWL_NODE_BIN', nil)) || 'node'
|
|
106
|
-
@node_log = options.fetch(:node_log, ENV.fetch('RUBYCRAWL_NODE_LOG', nil))
|
|
107
|
-
@wait_until = options.fetch(:wait_until, nil)
|
|
108
|
-
@block_resources = options.fetch(:block_resources, nil)
|
|
109
|
-
@max_retries = options.fetch(:max_retries, 3)
|
|
110
|
-
end
|
|
111
|
-
|
|
112
|
-
def build_service_client
|
|
113
|
-
@service_client = ServiceClient.new(
|
|
114
|
-
host: @host,
|
|
115
|
-
port: @port,
|
|
116
|
-
node_dir: @node_dir,
|
|
117
|
-
node_bin: @node_bin,
|
|
118
|
-
node_log: @node_log
|
|
119
|
-
)
|
|
120
|
-
end
|
|
121
|
-
|
|
122
|
-
def retry_with_backoff(attempt, retries, error)
|
|
123
|
-
backoff_seconds = 2**attempt
|
|
124
|
-
warn "[rubycrawl] Retry #{attempt}/#{retries - 1} after #{backoff_seconds}s: #{error.message}"
|
|
125
|
-
sleep(backoff_seconds)
|
|
126
|
-
end
|
|
127
|
-
|
|
128
102
|
def build_crawler_options(options)
|
|
129
103
|
{
|
|
130
|
-
max_pages:
|
|
131
|
-
max_depth:
|
|
132
|
-
same_host_only:
|
|
133
|
-
wait_until:
|
|
134
|
-
block_resources: options.fetch(:block_resources, @block_resources)
|
|
104
|
+
max_pages: options.fetch(:max_pages, 50),
|
|
105
|
+
max_depth: options.fetch(:max_depth, 3),
|
|
106
|
+
same_host_only: options.fetch(:same_host_only, true),
|
|
107
|
+
wait_until: options.fetch(:wait_until, @wait_until),
|
|
108
|
+
block_resources: options.fetch(:block_resources, @block_resources),
|
|
109
|
+
max_attempts: options.fetch(:max_attempts, @max_attempts)
|
|
135
110
|
}
|
|
136
111
|
end
|
|
137
|
-
|
|
138
|
-
def default_node_dir
|
|
139
|
-
File.expand_path('../node', __dir__)
|
|
140
|
-
end
|
|
141
112
|
end
|
data/rubycrawl.gemspec
CHANGED
|
@@ -8,21 +8,21 @@ Gem::Specification.new do |spec|
|
|
|
8
8
|
spec.authors = ['RubyCrawl contributors']
|
|
9
9
|
spec.email = ['ganesh.navale@zohomail.in']
|
|
10
10
|
|
|
11
|
-
spec.summary = '
|
|
12
|
-
spec.description = '
|
|
11
|
+
spec.summary = 'Pure Ruby web crawler with full JavaScript rendering'
|
|
12
|
+
spec.description = 'rubycrawl uses Ferrum (Chrome DevTools Protocol) for JS rendering.'
|
|
13
13
|
spec.homepage = 'https://github.com/craft-wise/rubycrawl'
|
|
14
14
|
spec.license = 'MIT'
|
|
15
15
|
|
|
16
16
|
spec.required_ruby_version = '>= 3.0'
|
|
17
17
|
|
|
18
|
-
spec.files
|
|
19
|
-
spec.files += %w[README.md LICENSE
|
|
18
|
+
spec.files = Dir.glob('{lib}/**/*', File::FNM_DOTMATCH).reject { |f| File.directory?(f) }
|
|
19
|
+
spec.files += %w[README.md LICENSE Rakefile rubycrawl.gemspec .rspec]
|
|
20
20
|
|
|
21
21
|
spec.bindir = 'bin'
|
|
22
22
|
spec.executables = []
|
|
23
23
|
spec.require_paths = ['lib']
|
|
24
24
|
|
|
25
|
-
|
|
25
|
+
spec.add_dependency 'ferrum', '~> 0.15'
|
|
26
26
|
spec.add_dependency 'reverse_markdown', '~> 2.1'
|
|
27
27
|
|
|
28
28
|
spec.metadata['rubygems_mfa_required'] = 'true'
|
metadata
CHANGED
|
@@ -1,15 +1,29 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: rubycrawl
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.3
|
|
4
|
+
version: 0.2.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- RubyCrawl contributors
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-
|
|
11
|
+
date: 2026-03-16 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
|
+
- !ruby/object:Gem::Dependency
|
|
14
|
+
name: ferrum
|
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
|
16
|
+
requirements:
|
|
17
|
+
- - "~>"
|
|
18
|
+
- !ruby/object:Gem::Version
|
|
19
|
+
version: '0.15'
|
|
20
|
+
type: :runtime
|
|
21
|
+
prerelease: false
|
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
23
|
+
requirements:
|
|
24
|
+
- - "~>"
|
|
25
|
+
- !ruby/object:Gem::Version
|
|
26
|
+
version: '0.15'
|
|
13
27
|
- !ruby/object:Gem::Dependency
|
|
14
28
|
name: reverse_markdown
|
|
15
29
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -24,7 +38,7 @@ dependencies:
|
|
|
24
38
|
- - "~>"
|
|
25
39
|
- !ruby/object:Gem::Version
|
|
26
40
|
version: '2.1'
|
|
27
|
-
description:
|
|
41
|
+
description: rubycrawl uses Ferrum (Chrome DevTools Protocol) for JS rendering.
|
|
28
42
|
email:
|
|
29
43
|
- ganesh.navale@zohomail.in
|
|
30
44
|
executables: []
|
|
@@ -32,17 +46,17 @@ extensions: []
|
|
|
32
46
|
extra_rdoc_files: []
|
|
33
47
|
files:
|
|
34
48
|
- ".rspec"
|
|
35
|
-
- Gemfile
|
|
36
49
|
- LICENSE
|
|
37
50
|
- README.md
|
|
38
51
|
- Rakefile
|
|
39
52
|
- lib/rubycrawl.rb
|
|
53
|
+
- lib/rubycrawl/browser.rb
|
|
54
|
+
- lib/rubycrawl/browser/extraction.rb
|
|
40
55
|
- lib/rubycrawl/errors.rb
|
|
41
56
|
- lib/rubycrawl/helpers.rb
|
|
42
57
|
- lib/rubycrawl/markdown_converter.rb
|
|
43
58
|
- lib/rubycrawl/railtie.rb
|
|
44
59
|
- lib/rubycrawl/result.rb
|
|
45
|
-
- lib/rubycrawl/service_client.rb
|
|
46
60
|
- lib/rubycrawl/site_crawler.rb
|
|
47
61
|
- lib/rubycrawl/tasks/install.rake
|
|
48
62
|
- lib/rubycrawl/url_normalizer.rb
|
|
@@ -71,5 +85,5 @@ requirements: []
|
|
|
71
85
|
rubygems_version: 3.5.22
|
|
72
86
|
signing_key:
|
|
73
87
|
specification_version: 4
|
|
74
|
-
summary:
|
|
88
|
+
summary: Pure Ruby web crawler with full JavaScript rendering
|
|
75
89
|
test_files: []
|
data/lib/rubycrawl/service_client.rb
DELETED
|
@@ -1,86 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'json'
|
|
4
|
-
require 'net/http'
|
|
5
|
-
require 'uri'
|
|
6
|
-
|
|
7
|
-
class RubyCrawl
|
|
8
|
-
# Handles node service lifecycle and HTTP requests.
|
|
9
|
-
class ServiceClient
|
|
10
|
-
def initialize(host:, port:, node_dir:, node_bin:, node_log:)
|
|
11
|
-
@host = host
|
|
12
|
-
@port = Integer(port)
|
|
13
|
-
@node_dir = node_dir
|
|
14
|
-
@node_bin = node_bin
|
|
15
|
-
@node_log = node_log
|
|
16
|
-
@node_pid = nil
|
|
17
|
-
end
|
|
18
|
-
|
|
19
|
-
def ensure_running
|
|
20
|
-
return if healthy?
|
|
21
|
-
|
|
22
|
-
start_service
|
|
23
|
-
wait_until_healthy
|
|
24
|
-
end
|
|
25
|
-
|
|
26
|
-
def post_json(path, body)
|
|
27
|
-
uri = URI("http://#{@host}:#{@port}#{path}")
|
|
28
|
-
request = build_request(uri, body)
|
|
29
|
-
response = perform_request(uri, request)
|
|
30
|
-
JSON.parse(response.body)
|
|
31
|
-
rescue JSON::ParserError => e
|
|
32
|
-
raise ServiceError, "Node service returned invalid JSON: #{e.message}"
|
|
33
|
-
rescue Errno::ECONNREFUSED, Errno::ECONNRESET => e
|
|
34
|
-
raise ServiceError, "Cannot connect to node service at #{uri}: #{e.message}"
|
|
35
|
-
rescue Net::OpenTimeout, Net::ReadTimeout => e
|
|
36
|
-
raise TimeoutError, "Request to node service timed out: #{e.message}"
|
|
37
|
-
end
|
|
38
|
-
|
|
39
|
-
private
|
|
40
|
-
|
|
41
|
-
def build_request(uri, body)
|
|
42
|
-
request = Net::HTTP::Post.new(uri)
|
|
43
|
-
request['Content-Type'] = 'application/json'
|
|
44
|
-
request.body = JSON.generate(body)
|
|
45
|
-
request
|
|
46
|
-
end
|
|
47
|
-
|
|
48
|
-
def perform_request(uri, request)
|
|
49
|
-
Net::HTTP.start(uri.host, uri.port, open_timeout: 5, read_timeout: 30) do |http|
|
|
50
|
-
http.request(request)
|
|
51
|
-
end
|
|
52
|
-
end
|
|
53
|
-
|
|
54
|
-
def start_service
|
|
55
|
-
raise ServiceError, "rubycrawl node service directory not found: #{@node_dir}" unless Dir.exist?(@node_dir)
|
|
56
|
-
|
|
57
|
-
env = { 'RUBYCRAWL_NODE_PORT' => @port.to_s }
|
|
58
|
-
out = @node_log ? File.open(@node_log, 'a') : File::NULL
|
|
59
|
-
err = @node_log ? out : File::NULL
|
|
60
|
-
@node_pid = Process.spawn(env, @node_bin, 'src/index.js', chdir: @node_dir, out: out, err: err)
|
|
61
|
-
Process.detach(@node_pid)
|
|
62
|
-
end
|
|
63
|
-
|
|
64
|
-
def wait_until_healthy(timeout: 5)
|
|
65
|
-
deadline = Time.now + timeout
|
|
66
|
-
until Time.now > deadline
|
|
67
|
-
return true if healthy?
|
|
68
|
-
|
|
69
|
-
sleep 0.2
|
|
70
|
-
end
|
|
71
|
-
|
|
72
|
-
raise ServiceError, "rubycrawl node service failed to start within #{timeout}s. " \
|
|
73
|
-
"Check logs at #{@node_log || 'RUBYCRAWL_NODE_LOG'}"
|
|
74
|
-
end
|
|
75
|
-
|
|
76
|
-
def healthy?
|
|
77
|
-
uri = URI("http://#{@host}:#{@port}/health")
|
|
78
|
-
response = Net::HTTP.start(uri.host, uri.port, open_timeout: 1, read_timeout: 1) do |http|
|
|
79
|
-
http.get(uri.request_uri)
|
|
80
|
-
end
|
|
81
|
-
response.is_a?(Net::HTTPSuccess)
|
|
82
|
-
rescue StandardError
|
|
83
|
-
false
|
|
84
|
-
end
|
|
85
|
-
end
|
|
86
|
-
end
|