rubycrawl 0.1.4 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +167 -432
- data/lib/rubycrawl/browser/extraction.rb +106 -0
- data/lib/rubycrawl/browser.rb +106 -0
- data/lib/rubycrawl/errors.rb +1 -1
- data/lib/rubycrawl/helpers.rb +8 -44
- data/lib/rubycrawl/markdown_converter.rb +2 -2
- data/lib/rubycrawl/result.rb +49 -18
- data/lib/rubycrawl/site_crawler.rb +40 -22
- data/lib/rubycrawl/tasks/install.rake +17 -56
- data/lib/rubycrawl/url_normalizer.rb +5 -1
- data/lib/rubycrawl/version.rb +1 -1
- data/lib/rubycrawl.rb +35 -90
- data/rubycrawl.gemspec +3 -4
- metadata +19 -10
- data/lib/rubycrawl/service_client.rb +0 -108
- data/node/.gitignore +0 -2
- data/node/.npmrc +0 -1
- data/node/README.md +0 -19
- data/node/package-lock.json +0 -72
- data/node/package.json +0 -14
- data/node/src/index.js +0 -389
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
# frozen_string_literal: true

# Explicit dependency: NOISE_SELECTORS.to_json below runs at load time.
# Previously this file relied on ferrum having loaded json transitively.
require 'json'

class RubyCrawl
  class Browser
    # JavaScript extraction constants, evaluated inside Chromium via page.evaluate().
    # Ported verbatim from node/src/index.js — logic is unchanged.
    # NOISE_SELECTORS is interpolated directly into EXTRACT_CONTENT_JS (no need to
    # pass as a JS argument as the Node version did).
    module Extraction
      # All constants are IIFEs — Ferrum's page.evaluate() evaluates an expression,
      # it does NOT call function definitions. Wrapping as (() => { ... })() ensures
      # the function is immediately invoked and its return value is captured.
      #
      # NOTE: `${...}` inside the heredocs is JavaScript template interpolation;
      # Ruby only interpolates `#{'#{...}'}`, so the JS embeds safely as-is.
      # All heredocs are explicitly frozen for consistency (the frozen_string_literal
      # magic comment is not relied on for heredoc bodies).
      EXTRACT_METADATA_JS = <<~JS.freeze
        (() => {
          const getMeta = (name) => {
            const meta = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`);
            return meta?.getAttribute("content") || null;
          };
          const getLink = (rel) => {
            const link = document.querySelector(`link[rel="${rel}"]`);
            return link?.getAttribute("href") || null;
          };
          return {
            title: document.title || null,
            description: getMeta("description") || getMeta("og:description") || null,
            keywords: getMeta("keywords"),
            author: getMeta("author"),
            og_title: getMeta("og:title"),
            og_description: getMeta("og:description"),
            og_image: getMeta("og:image"),
            og_url: getMeta("og:url"),
            og_type: getMeta("og:type"),
            twitter_card: getMeta("twitter:card"),
            twitter_title: getMeta("twitter:title"),
            twitter_description: getMeta("twitter:description"),
            twitter_image: getMeta("twitter:image"),
            canonical: getLink("canonical"),
            lang: document.documentElement.lang || null,
            charset: document.characterSet || null,
          };
        })()
      JS

      # Every <a href> on the page with its text, title and rel attributes.
      EXTRACT_LINKS_JS = <<~JS.freeze
        (() => Array.from(document.querySelectorAll("a[href]")).map(link => ({
          url: link.href,
          text: (link.textContent || "").trim(),
          title: link.getAttribute("title") || null,
          rel: link.getAttribute("rel") || null,
        })))()
      JS

      # Visible body text exactly as the browser renders it.
      EXTRACT_RAW_TEXT_JS = <<~JS.freeze
        (() => (document.body?.innerText || "").trim())()
      JS

      # Semantic noise selectors — covers standard HTML5 elements and ARIA roles.
      # Interpolated directly into EXTRACT_CONTENT_JS as a string literal.
      NOISE_SELECTORS = [
        'nav', 'header', 'footer', 'aside',
        '[role="navigation"]', '[role="banner"]', '[role="contentinfo"]',
        '[role="complementary"]', '[role="dialog"]', '[role="tooltip"]',
        '[role="alert"]', '[aria-hidden="true"]',
        'script', 'style', 'noscript', 'iframe'
      ].join(', ').freeze

      # Removes semantic noise (nav/header/footer/aside + ARIA roles) and high
      # link-density containers, then returns both clean plain text and clean HTML.
      # DOM mutations are reversed after extraction so the page is unchanged.
      EXTRACT_CONTENT_JS = <<~JS.freeze
        (() => {
          const noiseSelectors = #{NOISE_SELECTORS.to_json};
          function linkDensity(el) {
            const total = (el.innerText || "").trim().length;
            if (!total) return 1;
            const linked = Array.from(el.querySelectorAll("a"))
              .reduce((sum, a) => sum + (a.innerText || "").trim().length, 0);
            return linked / total;
          }
          const removed = [];
          function stash(el) {
            if (el.parentNode) {
              removed.push({ el, parent: el.parentNode, next: el.nextSibling });
              el.parentNode.removeChild(el);
            }
          }
          document.body.querySelectorAll(noiseSelectors).forEach(stash);
          const blockTags = new Set(["script", "style", "noscript", "link", "meta"]);
          const topChildren = Array.from(document.body.children)
            .filter(el => !blockTags.has(el.tagName.toLowerCase()));
          const roots = topChildren.length === 1
            ? [document.body, topChildren[0]] : [document.body];
          for (const root of roots) {
            for (const el of Array.from(root.children)) {
              const text = (el.innerText || "").trim();
              if (text.length >= 20 && linkDensity(el) > 0.5) stash(el);
            }
          }
          const cleanHtml = document.body.innerHTML;
          removed.reverse().forEach(({ el, parent, next }) => parent.insertBefore(el, next));
          return { cleanHtml };
        })()
      JS
    end
  end
end
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'ferrum'
|
|
4
|
+
require_relative 'result'
|
|
5
|
+
require_relative 'errors'
|
|
6
|
+
require_relative 'browser/extraction'
|
|
7
|
+
|
|
8
|
+
class RubyCrawl
  # Wraps Ferrum to provide a simple crawl interface.
  # Each crawl gets its own isolated page (new_context: true => own cookies/storage).
  # The Chrome process is launched once, lazily, and reused across crawls.
  class Browser
    # Resource types aborted when block_resources is enabled (speed optimisation).
    BLOCKED_RESOURCE_TYPES = %w[image media font stylesheet].freeze

    # @param timeout [Integer] navigation timeout in seconds, passed to Ferrum
    # @param headless [Boolean] run Chrome without a visible window
    # @param browser_options [Hash] extra Chrome CLI switches forwarded to Ferrum
    def initialize(timeout: 30, headless: true, browser_options: {})
      @timeout = timeout
      @headless = headless
      @browser_options = browser_options
      @browser = nil
      @mutex = Mutex.new
    end

    # Crawl a URL and return a RubyCrawl::Result.
    #
    # @param url [String]
    # @param wait_until [String, Symbol, nil] "load", "domcontentloaded", "networkidle", "commit"
    # @param block_resources [Boolean] block images/fonts/CSS/media for speed
    # @return [RubyCrawl::Result]
    # @raise [RubyCrawl::TimeoutError] when navigation exceeds the timeout
    # @raise [RubyCrawl::NavigationError] when navigation fails with an error status
    # @raise [RubyCrawl::ServiceError] for any other browser-level failure
    def crawl(url, wait_until: nil, block_resources: true)
      page = nil
      # FIX: create_page now runs inside the rescued body, so a ::Ferrum::Error
      # raised while opening the page is wrapped like every other browser
      # failure (previously it escaped to the caller as a raw Ferrum exception).
      page = lazy_browser.create_page(new_context: true)
      setup_resource_blocking(page) if block_resources
      navigate(page, url, wait_until.to_s)
      extract(page)
    rescue ::Ferrum::TimeoutError => e
      raise RubyCrawl::TimeoutError, "Navigation timed out: #{e.message}"
    rescue ::Ferrum::StatusError => e
      raise RubyCrawl::NavigationError, "Navigation failed: #{e.message}"
    rescue ::Ferrum::Error => e
      raise RubyCrawl::ServiceError, "Browser error: #{e.message}"
    ensure
      begin
        page&.close
      rescue StandardError
        nil # never let page teardown mask the primary error
      end
    end

    private

    # Lazy-initialise the Ferrum::Browser singleton.
    # Mutex prevents double-init in threaded environments (Puma).
    # Chrome launches after fork in forking servers (Unicorn) since @browser is nil in each worker.
    def lazy_browser
      @mutex.synchronize { @browser ||= launch_browser }
    rescue ::Ferrum::Error => e
      raise RubyCrawl::ServiceError, "Failed to launch browser: #{e.message}"
    end

    # Launch Chrome via Ferrum and register a best-effort shutdown hook.
    def launch_browser
      b = Ferrum::Browser.new(
        headless: @headless,
        timeout: @timeout,
        browser_options: @browser_options
      )
      at_exit do
        b.quit
      rescue StandardError
        nil # process is exiting anyway
      end
      b
    end

    # Abort requests for heavy resource types; let everything else continue.
    def setup_resource_blocking(page)
      page.network.intercept
      page.on(:request) do |request|
        BLOCKED_RESOURCE_TYPES.include?(request.resource_type) ? request.abort : request.continue
      end
    end

    def navigate(page, url, wait_until)
      page.go_to(url)
      # go_to waits for load by default. networkidle needs an extra wait.
      page.network.wait_for_idle(connections: 0, duration: 0.5) if wait_until == 'networkidle'
    end

    # Pull html, links, text and metadata out of the loaded page into a Result.
    def extract(page)
      html = page.body
      final_url = page.current_url
      metadata = page.evaluate(Extraction::EXTRACT_METADATA_JS)
      links = page.evaluate(Extraction::EXTRACT_LINKS_JS)
      raw_text = page.evaluate(Extraction::EXTRACT_RAW_TEXT_JS)
      content = page.evaluate(Extraction::EXTRACT_CONTENT_JS)

      Result.new(
        html: html,
        raw_text: raw_text.to_s,
        clean_html: content['cleanHtml'].to_s,
        links: Array(links),
        metadata: { 'final_url' => final_url }.merge(metadata || {})
      )
    end
  end
end
|
data/lib/rubycrawl/errors.rb
CHANGED
|
@@ -4,7 +4,7 @@ class RubyCrawl
|
|
|
4
4
|
# Base error class for all RubyCrawl errors
|
|
5
5
|
class Error < StandardError; end
|
|
6
6
|
|
|
7
|
-
# Raised when the
|
|
7
|
+
# Raised when the browser fails to launch or is unavailable
|
|
8
8
|
class ServiceError < Error; end
|
|
9
9
|
|
|
10
10
|
# Raised when page navigation fails (timeout, DNS, SSL, etc.)
|
data/lib/rubycrawl/helpers.rb
CHANGED
|
@@ -3,8 +3,10 @@
|
|
|
3
3
|
require 'uri'
|
|
4
4
|
|
|
5
5
|
class RubyCrawl
|
|
6
|
-
#
|
|
6
|
+
# Validation helpers mixed into RubyCrawl.
|
|
7
7
|
module Helpers
|
|
8
|
+
VALID_WAIT_UNTIL = %w[load domcontentloaded networkidle commit].freeze
|
|
9
|
+
|
|
8
10
|
private
|
|
9
11
|
|
|
10
12
|
def validate_url!(url)
|
|
@@ -21,50 +23,12 @@ class RubyCrawl
|
|
|
21
23
|
raise ConfigurationError, "Invalid URL: #{e.message}"
|
|
22
24
|
end
|
|
23
25
|
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
if wait_until && !VALID_WAIT_UNTIL.include?(wait_until.to_s)
|
|
28
|
-
raise ConfigurationError,
|
|
29
|
-
"Invalid wait_until: #{wait_until.inspect}. Must be one of: #{VALID_WAIT_UNTIL.join(', ')}"
|
|
30
|
-
end
|
|
31
|
-
|
|
32
|
-
payload = { url: url }
|
|
33
|
-
payload[:wait_until] = wait_until if wait_until
|
|
34
|
-
payload[:block_resources] = block_resources unless block_resources.nil?
|
|
35
|
-
payload[:session_id] = session_id if session_id
|
|
36
|
-
payload
|
|
37
|
-
end
|
|
26
|
+
# Ensures +wait_until+ is nil/absent or one of the supported readiness events.
#
# @param wait_until [String, Symbol, nil]
# @return [nil] when the value is acceptable
# @raise [ConfigurationError] for any unrecognised value
def validate_wait_until!(wait_until)
  return if !wait_until || VALID_WAIT_UNTIL.include?(wait_until.to_s)

  message = "Invalid wait_until: #{wait_until.inspect}. " \
            "Must be one of: #{VALID_WAIT_UNTIL.join(', ')}"
  raise ConfigurationError, message
end
|
|
69
33
|
end
|
|
70
34
|
end
|
data/lib/rubycrawl/result.rb
CHANGED
|
@@ -1,34 +1,47 @@
|
|
|
1
1
|
# frozen_string_literal: true

require 'cgi'

class RubyCrawl
  # Immutable result object returned from every crawl.
  # clean_text and clean_markdown are both derived lazily from clean_html so
  # they have consistent content coverage (including hidden/collapsed elements).
  class Result
    attr_reader :raw_text, :clean_html, :html, :links, :metadata

    # @param raw_text [String] visible page text as the browser rendered it
    # @param clean_html [String] noise-stripped body HTML ("" when unavailable)
    # @param html [String] full original page HTML
    # @param links [Array] extracted anchors
    # @param metadata [Hash] page metadata, including 'final_url'
    def initialize(raw_text:, clean_html:, html:, links:, metadata:)
      @raw_text = raw_text
      @clean_html = clean_html
      @html = html
      @links = links
      @metadata = metadata
    end

    # Plain text derived from noise-stripped HTML (falls back to the full HTML
    # when no clean HTML was captured). Captures hidden/collapsed content
    # (accordions, tabs) that innerText misses. Lazy — computed on first access.
    #
    # @return [String]
    def clean_text
      @clean_text ||= begin
        markup = clean_html.empty? ? html : clean_html
        html_to_text(markup)
      end
    end

    # Markdown derived from noise-stripped HTML (same fallback as clean_text).
    # Preserves document structure (headings, lists, links).
    # Lazy — computed on first access.
    #
    # @return [String]
    def clean_markdown
      @clean_markdown ||= begin
        markup = clean_html.empty? ? html : clean_html
        MarkdownConverter.convert(markup, base_url: final_url)
      end
    end

    # The final URL after redirects.
    # @return [String, nil]
    def final_url
      metadata['final_url']
    end

    # Whether clean_markdown has already been computed.
    # @return [Boolean]
    def clean_markdown?
      !@clean_markdown.nil?
    end

    # Hash form of the result. Lazy fields appear as nil until first accessed.
    def to_h
      {
        raw_text: raw_text,
        clean_text: @clean_text,
        clean_html: clean_html,
        html: html,
        links: links,
        metadata: metadata,
        clean_markdown: @clean_markdown
      }
    end

    private

    # Convert HTML to plain text without any external dependencies.
    # Block-level elements (p, div, h1-h6, li, br, etc.) become newlines
    # so paragraph structure is preserved. HTML entities are unescaped.
    def html_to_text(markup)
      with_breaks = markup.gsub(%r{</?(p|div|h[1-6]|li|br|tr|section|article|blockquote|pre)[^>]*>}i, "\n")
      tagless = with_breaks.gsub(/<[^>]+>/, '')
      decoded = CGI.unescapeHTML(tagless)
      decoded
        .gsub(/[ \t]+/, ' ')
        .gsub(/ *\n */, "\n")
        .gsub(/\n{3,}/, "\n\n")
        .strip
    end
  end
end
|
|
@@ -7,20 +7,30 @@ class RubyCrawl
|
|
|
7
7
|
class SiteCrawler
|
|
8
8
|
# Page result yielded to the block with lazy clean_markdown.
|
|
9
9
|
class PageResult
|
|
10
|
-
attr_reader :url, :html, :links, :metadata, :depth
|
|
11
|
-
|
|
12
|
-
def initialize(url:, html:, links:, metadata:, depth:)
|
|
13
|
-
@url
|
|
14
|
-
@html
|
|
15
|
-
@
|
|
16
|
-
@
|
|
17
|
-
@
|
|
10
|
+
attr_reader :url, :html, :raw_text, :clean_html, :links, :metadata, :depth
|
|
11
|
+
|
|
12
|
+
def initialize(url:, html:, raw_text:, clean_html:, links:, metadata:, depth:)
|
|
13
|
+
@url = url
|
|
14
|
+
@html = html
|
|
15
|
+
@raw_text = raw_text
|
|
16
|
+
@clean_html = clean_html
|
|
17
|
+
@links = links
|
|
18
|
+
@metadata = metadata
|
|
19
|
+
@depth = depth
|
|
18
20
|
end
|
|
19
21
|
|
|
20
|
-
#
|
|
21
|
-
|
|
22
|
+
# Plain text derived from noise-stripped HTML. Lazy — same as Result#clean_text.
|
|
23
|
+
def clean_text
|
|
24
|
+
@clean_text ||= Result.new(
|
|
25
|
+
html: html, raw_text: raw_text, clean_html: clean_html,
|
|
26
|
+
links: links, metadata: metadata
|
|
27
|
+
).clean_text
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
# Markdown derived from noise-stripped HTML. Lazy — same as Result#clean_markdown.
|
|
22
31
|
def clean_markdown
|
|
23
|
-
|
|
32
|
+
source = clean_html.empty? ? html : clean_html
|
|
33
|
+
@clean_markdown ||= MarkdownConverter.convert(source, base_url: final_url)
|
|
24
34
|
end
|
|
25
35
|
|
|
26
36
|
# The final URL after redirects.
|
|
@@ -39,7 +49,6 @@ class RubyCrawl
|
|
|
39
49
|
@max_attempts = options.fetch(:max_attempts, nil)
|
|
40
50
|
@visited = Set.new
|
|
41
51
|
@queue = []
|
|
42
|
-
@session_id = nil
|
|
43
52
|
end
|
|
44
53
|
|
|
45
54
|
def crawl(start_url, &block)
|
|
@@ -49,11 +58,8 @@ class RubyCrawl
|
|
|
49
58
|
raise ConfigurationError, "Invalid start URL: #{start_url}" unless normalized
|
|
50
59
|
|
|
51
60
|
@base_url = normalized
|
|
52
|
-
@session_id = @client.create_session
|
|
53
61
|
enqueue(normalized, 0)
|
|
54
62
|
process_queue(&block)
|
|
55
|
-
ensure
|
|
56
|
-
@client.destroy_session(@session_id) if @session_id
|
|
57
63
|
end
|
|
58
64
|
|
|
59
65
|
private
|
|
@@ -78,12 +84,22 @@ class RubyCrawl
|
|
|
78
84
|
def process_page(url, depth)
|
|
79
85
|
@visited.add(url)
|
|
80
86
|
result = crawl_page(url, depth)
|
|
81
|
-
|
|
87
|
+
return unless result
|
|
88
|
+
|
|
89
|
+
# Mark final_url visited to prevent re-crawling after redirects
|
|
90
|
+
# e.g. axonchat.ai → www.axonchat.ai should not crawl www again.
|
|
91
|
+
final = UrlNormalizer.normalize(result.final_url)
|
|
92
|
+
@visited.add(final) if final
|
|
93
|
+
|
|
94
|
+
# Update base_url on first crawl so same_host checks use the canonical host.
|
|
95
|
+
@base_url = final if depth.zero? && final
|
|
96
|
+
|
|
97
|
+
enqueue_links(result.links, depth + 1) if depth < @max_depth
|
|
82
98
|
result
|
|
83
99
|
end
|
|
84
100
|
|
|
85
101
|
def crawl_page(url, depth)
|
|
86
|
-
opts = { wait_until: @wait_until, block_resources: @block_resources
|
|
102
|
+
opts = { wait_until: @wait_until, block_resources: @block_resources }
|
|
87
103
|
opts[:max_attempts] = @max_attempts if @max_attempts
|
|
88
104
|
result = @client.crawl(url, **opts)
|
|
89
105
|
build_page_result(url, depth, result)
|
|
@@ -94,11 +110,13 @@ class RubyCrawl
|
|
|
94
110
|
|
|
95
111
|
def build_page_result(url, depth, result)
|
|
96
112
|
PageResult.new(
|
|
97
|
-
url:
|
|
98
|
-
html:
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
113
|
+
url: url,
|
|
114
|
+
html: result.html,
|
|
115
|
+
raw_text: result.raw_text,
|
|
116
|
+
clean_html: result.clean_html,
|
|
117
|
+
links: extract_urls(result.links),
|
|
118
|
+
metadata: result.metadata,
|
|
119
|
+
depth: depth
|
|
102
120
|
)
|
|
103
121
|
end
|
|
104
122
|
|
|
@@ -1,85 +1,46 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
# rubocop:disable Metrics/BlockLength
|
|
4
3
|
namespace :rubycrawl do
  desc 'Check system dependencies and generate Rails initializer'
  task :install do
    require 'fileutils'

    # Ferrum manages Chrome automatically, but warn if not found in common locations.
    # FIX: multi-arg Kernel#system avoids spawning a shell for the lookup
    # (no string interpolation into a shell command).
    # NOTE(review): `which` assumes a POSIX environment; Windows users rely on
    # Ferrum's own binary detection.
    chrome_found = %w[google-chrome chromium-browser chromium].any? do |cmd|
      system('which', cmd, out: File::NULL, err: File::NULL)
    end

    unless chrome_found
      warn '[rubycrawl] Chrome/Chromium not found in PATH. Ferrum will attempt to locate it automatically.'
      warn '[rubycrawl] macOS: brew install --cask google-chrome'
      warn '[rubycrawl] Ubuntu: sudo apt-get install -y chromium-browser'
      warn '[rubycrawl] See README for Docker examples.'
    end

    if defined?(Rails)
      initializer_path = Rails.root.join('config', 'initializers', 'rubycrawl.rb')
      if File.exist?(initializer_path)
        puts "[rubycrawl] Initializer already exists at #{initializer_path}"
      else
        content = <<~RUBY
          # frozen_string_literal: true

          # RubyCrawl Configuration
          RubyCrawl.configure(
            # wait_until: "load", # "load", "domcontentloaded", "networkidle"
            # block_resources: true, # block images/fonts/CSS/media for speed
            # max_attempts: 3, # retry count with exponential backoff
            # timeout: 30, # browser navigation timeout in seconds
            # headless: true, # set false to see the browser (debugging)
          )
        RUBY

        FileUtils.mkdir_p(File.dirname(initializer_path))
        File.write(initializer_path, content)
        puts "[rubycrawl] Created initializer at #{initializer_path}"
      end
    else
      puts '[rubycrawl] Rails not detected. Skipping initializer creation.'
    end
  end
end
|
|
85
|
-
# rubocop:enable Metrics/BlockLength
|
|
@@ -29,11 +29,15 @@ class RubyCrawl
|
|
|
29
29
|
# True when both URLs point at the same logical host.
# Hosts are compared case-insensitively with a leading "www." ignored,
# so example.com and www.EXAMPLE.com count as the same site.
#
# @return [Boolean] false when either URL cannot be parsed
def same_host?(url, base_url)
  left = URI.parse(url).host
  right = URI.parse(base_url).host
  canonical_host(left) == canonical_host(right)
rescue URI::InvalidURIError
  false
end

# Normalises a hostname for comparison: lowercase, without a "www." prefix.
# Returns nil for a nil host (e.g. mailto: links).
def canonical_host(host)
  return nil if host.nil?

  host.downcase.delete_prefix('www.')
end
|
|
40
|
+
|
|
37
41
|
def parse_uri(url, base_url)
|
|
38
42
|
uri = URI.parse(url)
|
|
39
43
|
return uri if uri.absolute?
|
data/lib/rubycrawl/version.rb
CHANGED