rubycrawl 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +427 -210
- data/lib/rubycrawl/helpers.rb +15 -11
- data/lib/rubycrawl/markdown_converter.rb +3 -3
- data/lib/rubycrawl/result.rb +10 -11
- data/lib/rubycrawl/service_client.rb +25 -3
- data/lib/rubycrawl/site_crawler.rb +14 -6
- data/lib/rubycrawl/version.rb +1 -1
- data/lib/rubycrawl.rb +33 -7
- data/node/.gitignore +2 -0
- data/node/.npmrc +1 -0
- data/node/README.md +19 -0
- data/node/package-lock.json +72 -0
- data/node/package.json +14 -0
- data/node/src/index.js +389 -0
- data/rubycrawl.gemspec +3 -2
- metadata +8 -3
- data/Gemfile +0 -11
data/lib/rubycrawl/helpers.rb
CHANGED
|
@@ -17,14 +17,22 @@ class RubyCrawl
|
|
|
17
17
|
if uri.host&.match?(/^(localhost|127\.|192\.168\.|10\.|172\.(1[6-9]|2[0-9]|3[01]))/)
|
|
18
18
|
warn '[rubycrawl] Warning: Crawling internal/private IP addresses'
|
|
19
19
|
end
|
|
20
|
-
rescue URI::InvalidURIError => e
|
|
20
|
+
rescue URI::InvalidURIError, TypeError => e
|
|
21
21
|
raise ConfigurationError, "Invalid URL: #{e.message}"
|
|
22
22
|
end
|
|
23
23
|
|
|
24
|
-
|
|
24
|
+
VALID_WAIT_UNTIL = %w[load domcontentloaded networkidle commit].freeze
|
|
25
|
+
|
|
26
|
+
def build_payload(url, wait_until, block_resources, session_id = nil)
|
|
27
|
+
if wait_until && !VALID_WAIT_UNTIL.include?(wait_until.to_s)
|
|
28
|
+
raise ConfigurationError,
|
|
29
|
+
"Invalid wait_until: #{wait_until.inspect}. Must be one of: #{VALID_WAIT_UNTIL.join(', ')}"
|
|
30
|
+
end
|
|
31
|
+
|
|
25
32
|
payload = { url: url }
|
|
26
33
|
payload[:wait_until] = wait_until if wait_until
|
|
27
34
|
payload[:block_resources] = block_resources unless block_resources.nil?
|
|
35
|
+
payload[:session_id] = session_id if session_id
|
|
28
36
|
payload
|
|
29
37
|
end
|
|
30
38
|
|
|
@@ -39,11 +47,9 @@ class RubyCrawl
|
|
|
39
47
|
|
|
40
48
|
def error_class_for(error_code)
|
|
41
49
|
case error_code
|
|
42
|
-
when '
|
|
43
|
-
TimeoutError
|
|
44
|
-
when 'navigation_failed', 'crawl_failed'
|
|
50
|
+
when 'crawl_failed'
|
|
45
51
|
NavigationError
|
|
46
|
-
when 'invalid_json', '
|
|
52
|
+
when 'invalid_json', 'session_create_failed', 'session_destroy_failed'
|
|
47
53
|
ServiceError
|
|
48
54
|
else
|
|
49
55
|
Error
|
|
@@ -52,12 +58,10 @@ class RubyCrawl
|
|
|
52
58
|
|
|
53
59
|
def error_message_for(error_code, error_message)
|
|
54
60
|
case error_code
|
|
55
|
-
when '
|
|
56
|
-
"Crawl timeout: #{error_message}"
|
|
57
|
-
when 'navigation_failed', 'crawl_failed'
|
|
61
|
+
when 'crawl_failed'
|
|
58
62
|
"Navigation failed: #{error_message}"
|
|
59
|
-
when 'invalid_json', '
|
|
60
|
-
"
|
|
63
|
+
when 'invalid_json', 'session_create_failed', 'session_destroy_failed'
|
|
64
|
+
"Service error [#{error_code}]: #{error_message}"
|
|
61
65
|
else
|
|
62
66
|
"Crawl error [#{error_code}]: #{error_message}"
|
|
63
67
|
end
|
|
@@ -15,10 +15,10 @@ class RubyCrawl
|
|
|
15
15
|
|
|
16
16
|
# Convert HTML to Markdown with resolved URLs.
|
|
17
17
|
#
|
|
18
|
-
# @param html [String] The HTML
|
|
18
|
+
# @param html [String] The page HTML to convert
|
|
19
19
|
# @param base_url [String, nil] Base URL to resolve relative URLs
|
|
20
|
-
# @param options [Hash] Options
|
|
21
|
-
# @return [String]
|
|
20
|
+
# @param options [Hash] Options passed to ReverseMarkdown
|
|
21
|
+
# @return [String] Markdown content with absolute URLs
|
|
22
22
|
def convert(html, base_url: nil, **options)
|
|
23
23
|
return '' if html.nil? || html.empty?
|
|
24
24
|
|
data/lib/rubycrawl/result.rb
CHANGED
|
@@ -1,38 +1,37 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
class RubyCrawl
|
|
4
|
-
# Result object with lazy
|
|
4
|
+
# Result object with lazy clean_markdown conversion.
|
|
5
5
|
class Result
|
|
6
6
|
attr_reader :text, :html, :links, :metadata
|
|
7
7
|
|
|
8
|
-
def initialize(text:, html:, links:, metadata
|
|
8
|
+
def initialize(text:, html:, links:, metadata:)
|
|
9
9
|
@text = text
|
|
10
10
|
@html = html
|
|
11
11
|
@links = links
|
|
12
12
|
@metadata = metadata
|
|
13
|
-
@markdown = markdown unless markdown.to_s.empty?
|
|
14
13
|
end
|
|
15
14
|
|
|
16
|
-
# Returns markdown
|
|
15
|
+
# Returns clean markdown converted from the page HTML.
|
|
17
16
|
# Relative URLs are resolved using the page's final_url.
|
|
18
17
|
#
|
|
19
18
|
# @return [String] Markdown content with absolute URLs
|
|
20
|
-
def
|
|
21
|
-
@
|
|
19
|
+
def clean_markdown
|
|
20
|
+
@clean_markdown ||= MarkdownConverter.convert(html, base_url: final_url)
|
|
22
21
|
end
|
|
23
22
|
|
|
24
23
|
# The final URL after redirects.
|
|
25
24
|
#
|
|
26
25
|
# @return [String, nil]
|
|
27
26
|
def final_url
|
|
28
|
-
metadata['final_url']
|
|
27
|
+
metadata['final_url']
|
|
29
28
|
end
|
|
30
29
|
|
|
31
|
-
# Check if
|
|
30
|
+
# Check if clean_markdown has been computed.
|
|
32
31
|
#
|
|
33
32
|
# @return [Boolean]
|
|
34
|
-
def
|
|
35
|
-
!@
|
|
33
|
+
def clean_markdown?
|
|
34
|
+
!@clean_markdown.nil?
|
|
36
35
|
end
|
|
37
36
|
|
|
38
37
|
def to_h
|
|
@@ -41,7 +40,7 @@ class RubyCrawl
|
|
|
41
40
|
html: html,
|
|
42
41
|
links: links,
|
|
43
42
|
metadata: metadata,
|
|
44
|
-
|
|
43
|
+
clean_markdown: @clean_markdown
|
|
45
44
|
}
|
|
46
45
|
end
|
|
47
46
|
end
|
|
@@ -36,6 +36,24 @@ class RubyCrawl
|
|
|
36
36
|
raise TimeoutError, "Request to node service timed out: #{e.message}"
|
|
37
37
|
end
|
|
38
38
|
|
|
39
|
+
# Create a session for reusing browser context across multiple crawls.
|
|
40
|
+
# @return [String] session_id
|
|
41
|
+
def create_session
|
|
42
|
+
response = post_json('/session/create', {})
|
|
43
|
+
raise ServiceError, "Failed to create session: #{response['error']}" if response['error']
|
|
44
|
+
|
|
45
|
+
response['session_id']
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
# Destroy a session and close its browser context.
|
|
49
|
+
# @param session_id [String]
|
|
50
|
+
def destroy_session(session_id)
|
|
51
|
+
post_json('/session/destroy', { session_id: session_id })
|
|
52
|
+
rescue StandardError
|
|
53
|
+
# Ignore errors on destroy - context may already be closed
|
|
54
|
+
nil
|
|
55
|
+
end
|
|
56
|
+
|
|
39
57
|
private
|
|
40
58
|
|
|
41
59
|
def build_request(uri, body)
|
|
@@ -55,9 +73,13 @@ class RubyCrawl
|
|
|
55
73
|
raise ServiceError, "rubycrawl node service directory not found: #{@node_dir}" unless Dir.exist?(@node_dir)
|
|
56
74
|
|
|
57
75
|
env = { 'RUBYCRAWL_NODE_PORT' => @port.to_s }
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
76
|
+
if @node_log
|
|
77
|
+
out = File.open(@node_log, 'a')
|
|
78
|
+
@node_pid = Process.spawn(env, @node_bin, 'src/index.js', chdir: @node_dir, out: out, err: out)
|
|
79
|
+
out.close
|
|
80
|
+
else
|
|
81
|
+
@node_pid = Process.spawn(env, @node_bin, 'src/index.js', chdir: @node_dir, out: File::NULL, err: File::NULL)
|
|
82
|
+
end
|
|
61
83
|
Process.detach(@node_pid)
|
|
62
84
|
end
|
|
63
85
|
|
|
@@ -5,7 +5,7 @@ require 'set'
|
|
|
5
5
|
class RubyCrawl
|
|
6
6
|
# BFS crawler that follows links with deduplication.
|
|
7
7
|
class SiteCrawler
|
|
8
|
-
# Page result yielded to the block with lazy
|
|
8
|
+
# Page result yielded to the block with lazy clean_markdown.
|
|
9
9
|
class PageResult
|
|
10
10
|
attr_reader :url, :html, :links, :metadata, :depth
|
|
11
11
|
|
|
@@ -17,14 +17,15 @@ class RubyCrawl
|
|
|
17
17
|
@depth = depth
|
|
18
18
|
end
|
|
19
19
|
|
|
20
|
-
#
|
|
21
|
-
|
|
22
|
-
|
|
20
|
+
# Returns clean markdown converted from the page HTML.
|
|
21
|
+
# Relative URLs are resolved using the page's final_url.
|
|
22
|
+
def clean_markdown
|
|
23
|
+
@clean_markdown ||= MarkdownConverter.convert(html, base_url: final_url)
|
|
23
24
|
end
|
|
24
25
|
|
|
25
26
|
# The final URL after redirects.
|
|
26
27
|
def final_url
|
|
27
|
-
metadata['final_url'] ||
|
|
28
|
+
metadata['final_url'] || url
|
|
28
29
|
end
|
|
29
30
|
end
|
|
30
31
|
|
|
@@ -35,8 +36,10 @@ class RubyCrawl
|
|
|
35
36
|
@same_host_only = options.fetch(:same_host_only, true)
|
|
36
37
|
@wait_until = options.fetch(:wait_until, nil)
|
|
37
38
|
@block_resources = options.fetch(:block_resources, nil)
|
|
39
|
+
@max_attempts = options.fetch(:max_attempts, nil)
|
|
38
40
|
@visited = Set.new
|
|
39
41
|
@queue = []
|
|
42
|
+
@session_id = nil
|
|
40
43
|
end
|
|
41
44
|
|
|
42
45
|
def crawl(start_url, &block)
|
|
@@ -46,8 +49,11 @@ class RubyCrawl
|
|
|
46
49
|
raise ConfigurationError, "Invalid start URL: #{start_url}" unless normalized
|
|
47
50
|
|
|
48
51
|
@base_url = normalized
|
|
52
|
+
@session_id = @client.create_session
|
|
49
53
|
enqueue(normalized, 0)
|
|
50
54
|
process_queue(&block)
|
|
55
|
+
ensure
|
|
56
|
+
@client.destroy_session(@session_id) if @session_id
|
|
51
57
|
end
|
|
52
58
|
|
|
53
59
|
private
|
|
@@ -77,7 +83,9 @@ class RubyCrawl
|
|
|
77
83
|
end
|
|
78
84
|
|
|
79
85
|
def crawl_page(url, depth)
|
|
80
|
-
|
|
86
|
+
opts = { wait_until: @wait_until, block_resources: @block_resources, session_id: @session_id }
|
|
87
|
+
opts[:max_attempts] = @max_attempts if @max_attempts
|
|
88
|
+
result = @client.crawl(url, **opts)
|
|
81
89
|
build_page_result(url, depth, result)
|
|
82
90
|
rescue Error => e
|
|
83
91
|
warn "[rubycrawl] Failed to crawl #{url}: #{e.message}"
|
data/lib/rubycrawl/version.rb
CHANGED
data/lib/rubycrawl.rb
CHANGED
|
@@ -45,6 +45,18 @@ class RubyCrawl
|
|
|
45
45
|
client.crawl_site(url, ...)
|
|
46
46
|
end
|
|
47
47
|
|
|
48
|
+
# Create a session for reusing browser context across multiple crawls.
|
|
49
|
+
# @return [String] session_id
|
|
50
|
+
def create_session
|
|
51
|
+
client.create_session
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
# Destroy a session and close its browser context.
|
|
55
|
+
# @param session_id [String]
|
|
56
|
+
def destroy_session(session_id)
|
|
57
|
+
client.destroy_session(session_id)
|
|
58
|
+
end
|
|
59
|
+
|
|
48
60
|
def configure(**options)
|
|
49
61
|
@client = new(**options)
|
|
50
62
|
end
|
|
@@ -55,17 +67,30 @@ class RubyCrawl
|
|
|
55
67
|
build_service_client
|
|
56
68
|
end
|
|
57
69
|
|
|
58
|
-
def crawl(url, wait_until: @wait_until, block_resources: @block_resources,
|
|
70
|
+
def crawl(url, wait_until: @wait_until, block_resources: @block_resources, max_attempts: @max_attempts, session_id: nil)
|
|
59
71
|
validate_url!(url)
|
|
60
72
|
@service_client.ensure_running
|
|
61
|
-
with_retries(
|
|
62
|
-
payload = build_payload(url, wait_until, block_resources)
|
|
73
|
+
with_retries(max_attempts) do
|
|
74
|
+
payload = build_payload(url, wait_until, block_resources, session_id)
|
|
63
75
|
response = @service_client.post_json('/crawl', payload)
|
|
64
76
|
raise_node_error!(response)
|
|
65
77
|
build_result(response)
|
|
66
78
|
end
|
|
67
79
|
end
|
|
68
80
|
|
|
81
|
+
# Create a session for reusing browser context.
|
|
82
|
+
# @return [String] session_id
|
|
83
|
+
def create_session
|
|
84
|
+
@service_client.ensure_running
|
|
85
|
+
@service_client.create_session
|
|
86
|
+
end
|
|
87
|
+
|
|
88
|
+
# Destroy a session.
|
|
89
|
+
# @param session_id [String]
|
|
90
|
+
def destroy_session(session_id)
|
|
91
|
+
@service_client.destroy_session(session_id)
|
|
92
|
+
end
|
|
93
|
+
|
|
69
94
|
# Crawl multiple pages starting from a URL, following links.
|
|
70
95
|
# @see RubyCrawl.crawl_site
|
|
71
96
|
def crawl_site(url, **options, &block)
|
|
@@ -106,7 +131,7 @@ class RubyCrawl
|
|
|
106
131
|
@node_log = options.fetch(:node_log, ENV.fetch('RUBYCRAWL_NODE_LOG', nil))
|
|
107
132
|
@wait_until = options.fetch(:wait_until, nil)
|
|
108
133
|
@block_resources = options.fetch(:block_resources, nil)
|
|
109
|
-
@
|
|
134
|
+
@max_attempts = options.fetch(:max_attempts, 3)
|
|
110
135
|
end
|
|
111
136
|
|
|
112
137
|
def build_service_client
|
|
@@ -119,9 +144,9 @@ class RubyCrawl
|
|
|
119
144
|
)
|
|
120
145
|
end
|
|
121
146
|
|
|
122
|
-
def retry_with_backoff(attempt,
|
|
147
|
+
def retry_with_backoff(attempt, max_attempts, error)
|
|
123
148
|
backoff_seconds = 2**attempt
|
|
124
|
-
warn "[rubycrawl]
|
|
149
|
+
warn "[rubycrawl] Attempt #{attempt + 1}/#{max_attempts} failed, retrying in #{backoff_seconds}s: #{error.message}"
|
|
125
150
|
sleep(backoff_seconds)
|
|
126
151
|
end
|
|
127
152
|
|
|
@@ -131,7 +156,8 @@ class RubyCrawl
|
|
|
131
156
|
max_depth: options.fetch(:max_depth, 3),
|
|
132
157
|
same_host_only: options.fetch(:same_host_only, true),
|
|
133
158
|
wait_until: options.fetch(:wait_until, @wait_until),
|
|
134
|
-
block_resources: options.fetch(:block_resources, @block_resources)
|
|
159
|
+
block_resources: options.fetch(:block_resources, @block_resources),
|
|
160
|
+
max_attempts: options.fetch(:max_attempts, @max_attempts)
|
|
135
161
|
}
|
|
136
162
|
end
|
|
137
163
|
|
data/node/.gitignore
ADDED
data/node/.npmrc
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
fund=false
|
data/node/README.md
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# rubycrawl Node Service
|
|
2
|
+
|
|
3
|
+
Local Playwright-backed HTTP service used by the Ruby gem.
|
|
4
|
+
|
|
5
|
+
## Run
|
|
6
|
+
|
|
7
|
+
```
|
|
8
|
+
npm install
|
|
9
|
+
npm start
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
## Environment
|
|
13
|
+
|
|
14
|
+
Create a `.env` file (or copy from `.env.example`) if you need custom settings.
|
|
15
|
+
|
|
16
|
+
## Endpoints
|
|
17
|
+
|
|
18
|
+
- `POST /crawl` JSON body: `{ "url": "https://example.com" }`
|
|
19
|
+
- `GET /health`
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "rubycrawl-node",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"lockfileVersion": 3,
|
|
5
|
+
"requires": true,
|
|
6
|
+
"packages": {
|
|
7
|
+
"": {
|
|
8
|
+
"name": "rubycrawl-node",
|
|
9
|
+
"version": "0.1.0",
|
|
10
|
+
"dependencies": {
|
|
11
|
+
"dotenv": "^16.4.5",
|
|
12
|
+
"playwright": "^1.41.0"
|
|
13
|
+
}
|
|
14
|
+
},
|
|
15
|
+
"node_modules/dotenv": {
|
|
16
|
+
"version": "16.6.1",
|
|
17
|
+
"resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.6.1.tgz",
|
|
18
|
+
"integrity": "sha512-uBq4egWHTcTt33a72vpSG0z3HnPuIl6NqYcTrKEg2azoEyl2hpW0zqlxysq2pK9HlDIHyHyakeYaYnSAwd8bow==",
|
|
19
|
+
"license": "BSD-2-Clause",
|
|
20
|
+
"engines": {
|
|
21
|
+
"node": ">=12"
|
|
22
|
+
},
|
|
23
|
+
"funding": {
|
|
24
|
+
"url": "https://dotenvx.com"
|
|
25
|
+
}
|
|
26
|
+
},
|
|
27
|
+
"node_modules/fsevents": {
|
|
28
|
+
"version": "2.3.2",
|
|
29
|
+
"resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz",
|
|
30
|
+
"integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==",
|
|
31
|
+
"hasInstallScript": true,
|
|
32
|
+
"license": "MIT",
|
|
33
|
+
"optional": true,
|
|
34
|
+
"os": [
|
|
35
|
+
"darwin"
|
|
36
|
+
],
|
|
37
|
+
"engines": {
|
|
38
|
+
"node": "^8.16.0 || ^10.6.0 || >=11.0.0"
|
|
39
|
+
}
|
|
40
|
+
},
|
|
41
|
+
"node_modules/playwright": {
|
|
42
|
+
"version": "1.58.0",
|
|
43
|
+
"resolved": "https://registry.npmjs.org/playwright/-/playwright-1.58.0.tgz",
|
|
44
|
+
"integrity": "sha512-2SVA0sbPktiIY/MCOPX8e86ehA/e+tDNq+e5Y8qjKYti2Z/JG7xnronT/TXTIkKbYGWlCbuucZ6dziEgkoEjQQ==",
|
|
45
|
+
"license": "Apache-2.0",
|
|
46
|
+
"dependencies": {
|
|
47
|
+
"playwright-core": "1.58.0"
|
|
48
|
+
},
|
|
49
|
+
"bin": {
|
|
50
|
+
"playwright": "cli.js"
|
|
51
|
+
},
|
|
52
|
+
"engines": {
|
|
53
|
+
"node": ">=18"
|
|
54
|
+
},
|
|
55
|
+
"optionalDependencies": {
|
|
56
|
+
"fsevents": "2.3.2"
|
|
57
|
+
}
|
|
58
|
+
},
|
|
59
|
+
"node_modules/playwright-core": {
|
|
60
|
+
"version": "1.58.0",
|
|
61
|
+
"resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.58.0.tgz",
|
|
62
|
+
"integrity": "sha512-aaoB1RWrdNi3//rOeKuMiS65UCcgOVljU46At6eFcOFPFHWtd2weHRRow6z/n+Lec0Lvu0k9ZPKJSjPugikirw==",
|
|
63
|
+
"license": "Apache-2.0",
|
|
64
|
+
"bin": {
|
|
65
|
+
"playwright-core": "cli.js"
|
|
66
|
+
},
|
|
67
|
+
"engines": {
|
|
68
|
+
"node": ">=18"
|
|
69
|
+
}
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
}
|
data/node/package.json
ADDED