rubycrawl 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,106 @@
1
+ # frozen_string_literal: true
2
+
3
+ class RubyCrawl
4
+ class Browser
5
+ # JavaScript extraction constants, evaluated inside Chromium via page.evaluate().
6
+ # Ported verbatim from node/src/index.js — logic is unchanged.
7
+ # NOISE_SELECTORS is interpolated directly into EXTRACT_CONTENT_JS (no need to
8
+ # pass as a JS argument as the Node version did).
9
+ module Extraction
10
+ # All *_JS constants are IIFEs — Ferrum's page.evaluate() evaluates an expression,
11
+ # it does NOT call function definitions. Wrapping as (() => { ... })() ensures
12
+ # the function is immediately invoked and its return value is captured.
13
+ EXTRACT_METADATA_JS = <<~JS
14
+ (() => {
15
+ const getMeta = (name) => {
16
+ const meta = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`);
17
+ return meta?.getAttribute("content") || null;
18
+ };
19
+ const getLink = (rel) => {
20
+ const link = document.querySelector(`link[rel="${rel}"]`);
21
+ return link?.getAttribute("href") || null;
22
+ };
23
+ return {
24
+ title: document.title || null,
25
+ description: getMeta("description") || getMeta("og:description") || null,
26
+ keywords: getMeta("keywords"),
27
+ author: getMeta("author"),
28
+ og_title: getMeta("og:title"),
29
+ og_description: getMeta("og:description"),
30
+ og_image: getMeta("og:image"),
31
+ og_url: getMeta("og:url"),
32
+ og_type: getMeta("og:type"),
33
+ twitter_card: getMeta("twitter:card"),
34
+ twitter_title: getMeta("twitter:title"),
35
+ twitter_description: getMeta("twitter:description"),
36
+ twitter_image: getMeta("twitter:image"),
37
+ canonical: getLink("canonical"),
38
+ lang: document.documentElement.lang || null,
39
+ charset: document.characterSet || null,
40
+ };
41
+ })()
42
+ JS
43
+
44
+ EXTRACT_LINKS_JS = <<~JS
45
+ (() => Array.from(document.querySelectorAll("a[href]")).map(link => ({
46
+ url: link.href,
47
+ text: (link.textContent || "").trim(),
48
+ title: link.getAttribute("title") || null,
49
+ rel: link.getAttribute("rel") || null,
50
+ })))()
51
+ JS
52
+
53
+ EXTRACT_RAW_TEXT_JS = <<~JS
54
+ (() => (document.body?.innerText || "").trim())()
55
+ JS
56
+
57
+ # Semantic noise selectors — covers standard HTML5 elements and ARIA roles.
58
+ # Interpolated directly into EXTRACT_CONTENT_JS as a string literal.
59
+ NOISE_SELECTORS = [
60
+ 'nav', 'header', 'footer', 'aside',
61
+ '[role="navigation"]', '[role="banner"]', '[role="contentinfo"]',
62
+ '[role="complementary"]', '[role="dialog"]', '[role="tooltip"]',
63
+ '[role="alert"]', '[aria-hidden="true"]',
64
+ 'script', 'style', 'noscript', 'iframe'
65
+ ].join(', ').freeze
66
+
67
+ # Removes semantic noise (nav/header/footer/aside + ARIA roles) and high
68
+ # link-density containers, then returns the cleaned HTML (as cleanHtml).
69
+ # DOM mutations are reversed after extraction so the page is unchanged.
70
+ EXTRACT_CONTENT_JS = <<~JS.freeze
71
+ (() => {
72
+ const noiseSelectors = #{NOISE_SELECTORS.to_json};
73
+ function linkDensity(el) {
74
+ const total = (el.innerText || "").trim().length;
75
+ if (!total) return 1;
76
+ const linked = Array.from(el.querySelectorAll("a"))
77
+ .reduce((sum, a) => sum + (a.innerText || "").trim().length, 0);
78
+ return linked / total;
79
+ }
80
+ const removed = [];
81
+ function stash(el) {
82
+ if (el.parentNode) {
83
+ removed.push({ el, parent: el.parentNode, next: el.nextSibling });
84
+ el.parentNode.removeChild(el);
85
+ }
86
+ }
87
+ document.body.querySelectorAll(noiseSelectors).forEach(stash);
88
+ const blockTags = new Set(["script", "style", "noscript", "link", "meta"]);
89
+ const topChildren = Array.from(document.body.children)
90
+ .filter(el => !blockTags.has(el.tagName.toLowerCase()));
91
+ const roots = topChildren.length === 1
92
+ ? [document.body, topChildren[0]] : [document.body];
93
+ for (const root of roots) {
94
+ for (const el of Array.from(root.children)) {
95
+ const text = (el.innerText || "").trim();
96
+ if (text.length >= 20 && linkDensity(el) > 0.5) stash(el);
97
+ }
98
+ }
99
+ const cleanHtml = document.body.innerHTML;
100
+ removed.reverse().forEach(({ el, parent, next }) => parent.insertBefore(el, next));
101
+ return { cleanHtml };
102
+ })()
103
+ JS
104
+ end
105
+ end
106
+ end
@@ -0,0 +1,106 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'ferrum'
4
+ require_relative 'result'
5
+ require_relative 'errors'
6
+ require_relative 'browser/extraction'
7
+
8
+ class RubyCrawl
9
+ # Wraps Ferrum to provide a simple crawl interface.
10
+ # Each crawl gets its own isolated page (own context = own cookies/storage).
11
+ # Browser (Chrome) is launched once lazily and reused across crawls.
12
+ class Browser
13
+ BLOCKED_RESOURCE_TYPES = %w[image media font stylesheet].freeze
14
+
15
+ def initialize(timeout: 30, headless: true, browser_options: {})
16
+ @timeout = timeout
17
+ @headless = headless
18
+ @browser_options = browser_options
19
+ @browser = nil
20
+ @mutex = Mutex.new
21
+ end
22
+
23
+ # Crawl a URL and return a RubyCrawl::Result.
24
+ #
25
+ # @param url [String]
26
+ # @param wait_until [String, nil] "load", "domcontentloaded", "networkidle", "commit" (only "networkidle" adds an extra idle wait)
27
+ # @param block_resources [Boolean] block images/fonts/CSS/media for speed
28
+ # @return [RubyCrawl::Result]
29
+ def crawl(url, wait_until: nil, block_resources: true)
30
+ page = lazy_browser.create_page(new_context: true)
31
+
32
+ begin
33
+ setup_resource_blocking(page) if block_resources
34
+ navigate(page, url, wait_until.to_s)
35
+ extract(page)
36
+ rescue ::Ferrum::TimeoutError => e
37
+ raise RubyCrawl::TimeoutError, "Navigation timed out: #{e.message}"
38
+ rescue ::Ferrum::StatusError => e
39
+ raise RubyCrawl::NavigationError, "Navigation failed: #{e.message}"
40
+ rescue ::Ferrum::Error => e
41
+ raise RubyCrawl::ServiceError, "Browser error: #{e.message}"
42
+ ensure
43
+ begin
44
+ page&.close
45
+ rescue StandardError
46
+ nil
47
+ end
48
+ end
49
+ end
50
+
51
+ private
52
+
53
+ # Lazy-initialise the Ferrum::Browser singleton.
54
+ # Mutex prevents double-init in threaded environments (Puma).
55
+ # In forking servers (Unicorn) Chrome launches after fork, as long as no crawl ran before the fork and @browser is still nil in each worker.
56
+ def lazy_browser
57
+ @mutex.synchronize { @browser ||= launch_browser }
58
+ rescue ::Ferrum::Error => e
59
+ raise RubyCrawl::ServiceError, "Failed to launch browser: #{e.message}"
60
+ end
61
+
62
+ def launch_browser
63
+ b = Ferrum::Browser.new(
64
+ headless: @headless,
65
+ timeout: @timeout,
66
+ browser_options: @browser_options
67
+ )
68
+ at_exit do
69
+ b.quit
70
+ rescue StandardError
71
+ nil # process is exiting anyway
72
+ end
73
+ b
74
+ end
75
+
76
+ def setup_resource_blocking(page)
77
+ page.network.intercept
78
+ page.on(:request) do |request|
79
+ BLOCKED_RESOURCE_TYPES.include?(request.resource_type) ? request.abort : request.continue
80
+ end
81
+ end
82
+
83
+ def navigate(page, url, wait_until)
84
+ page.go_to(url)
85
+ # go_to waits for load by default. networkidle needs an extra wait.
86
+ page.network.wait_for_idle(connections: 0, duration: 0.5) if wait_until == 'networkidle'
87
+ end
88
+
89
+ def extract(page)
90
+ html = page.body
91
+ final_url = page.current_url
92
+ metadata = page.evaluate(Extraction::EXTRACT_METADATA_JS)
93
+ links = page.evaluate(Extraction::EXTRACT_LINKS_JS)
94
+ raw_text = page.evaluate(Extraction::EXTRACT_RAW_TEXT_JS)
95
+ content = page.evaluate(Extraction::EXTRACT_CONTENT_JS)
96
+
97
+ Result.new(
98
+ html: html,
99
+ raw_text: raw_text.to_s,
100
+ clean_html: content['cleanHtml'].to_s,
101
+ links: Array(links),
102
+ metadata: { 'final_url' => final_url }.merge(metadata || {})
103
+ )
104
+ end
105
+ end
106
+ end
@@ -4,7 +4,7 @@ class RubyCrawl
4
4
  # Base error class for all RubyCrawl errors
5
5
  class Error < StandardError; end
6
6
 
7
- # Raised when the Node.js service fails to start or is unavailable
7
+ # Raised when the browser fails to launch or is unavailable
8
8
  class ServiceError < Error; end
9
9
 
10
10
  # Raised when page navigation fails (timeout, DNS, SSL, etc.)
@@ -3,8 +3,10 @@
3
3
  require 'uri'
4
4
 
5
5
  class RubyCrawl
6
- # Helper methods for payloads, validation, and errors.
6
+ # Validation helpers mixed into RubyCrawl.
7
7
  module Helpers
8
+ VALID_WAIT_UNTIL = %w[load domcontentloaded networkidle commit].freeze
9
+
8
10
  private
9
11
 
10
12
  def validate_url!(url)
@@ -21,50 +23,12 @@ class RubyCrawl
21
23
  raise ConfigurationError, "Invalid URL: #{e.message}"
22
24
  end
23
25
 
24
- VALID_WAIT_UNTIL = %w[load domcontentloaded networkidle commit].freeze
25
-
26
- def build_payload(url, wait_until, block_resources, session_id = nil)
27
- if wait_until && !VALID_WAIT_UNTIL.include?(wait_until.to_s)
28
- raise ConfigurationError,
29
- "Invalid wait_until: #{wait_until.inspect}. Must be one of: #{VALID_WAIT_UNTIL.join(', ')}"
30
- end
31
-
32
- payload = { url: url }
33
- payload[:wait_until] = wait_until if wait_until
34
- payload[:block_resources] = block_resources unless block_resources.nil?
35
- payload[:session_id] = session_id if session_id
36
- payload
37
- end
26
+ def validate_wait_until!(wait_until)
27
+ return unless wait_until
28
+ return if VALID_WAIT_UNTIL.include?(wait_until.to_s)
38
29
 
39
- def build_result(response)
40
- Result.new(
41
- text: response['text'].to_s,
42
- html: response['html'].to_s,
43
- links: Array(response['links']),
44
- metadata: response['metadata'].is_a?(Hash) ? response['metadata'] : {}
45
- )
46
- end
47
-
48
- def error_class_for(error_code)
49
- case error_code
50
- when 'crawl_failed'
51
- NavigationError
52
- when 'invalid_json', 'session_create_failed', 'session_destroy_failed'
53
- ServiceError
54
- else
55
- Error
56
- end
57
- end
58
-
59
- def error_message_for(error_code, error_message)
60
- case error_code
61
- when 'crawl_failed'
62
- "Navigation failed: #{error_message}"
63
- when 'invalid_json', 'session_create_failed', 'session_destroy_failed'
64
- "Service error [#{error_code}]: #{error_message}"
65
- else
66
- "Crawl error [#{error_code}]: #{error_message}"
67
- end
30
+ raise ConfigurationError,
31
+ "Invalid wait_until: #{wait_until.inspect}. Must be one of: #{VALID_WAIT_UNTIL.join(', ')}"
68
32
  end
69
33
  end
70
34
  end
@@ -55,9 +55,9 @@ class RubyCrawl
55
55
 
56
56
  def default_options
57
57
  {
58
- unknown_tags: :bypass,
58
+ unknown_tags: :bypass,
59
59
  github_flavored: true,
60
- tag_border: ''
60
+ tag_border: ''
61
61
  }
62
62
  end
63
63
  end
@@ -1,34 +1,47 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require 'cgi'
4
+
3
5
  class RubyCrawl
4
- # Result object with lazy clean_markdown conversion.
6
+ # Read-only result object returned from every crawl (lazy fields are memoized on first access).
7
+ # clean_text and clean_markdown are both derived lazily from clean_html so
8
+ # they have consistent content coverage (including hidden/collapsed elements).
5
9
  class Result
6
- attr_reader :text, :html, :links, :metadata
10
+ attr_reader :raw_text, :clean_html, :html, :links, :metadata
7
11
 
8
- def initialize(text:, html:, links:, metadata:)
9
- @text = text
10
- @html = html
11
- @links = links
12
- @metadata = metadata
12
+ def initialize(raw_text:, clean_html:, html:, links:, metadata:)
13
+ @raw_text = raw_text
14
+ @clean_html = clean_html
15
+ @html = html
16
+ @links = links
17
+ @metadata = metadata
13
18
  end
14
19
 
15
- # Returns clean markdown converted from the page HTML.
16
- # Relative URLs are resolved using the page's final_url.
20
+ # Plain text derived from noise-stripped HTML (or from the full HTML when clean_html is empty).
21
+ # Captures hidden/collapsed content (accordions, tabs) that innerText misses.
22
+ # Lazy — computed on first access.
17
23
  #
18
- # @return [String] Markdown content with absolute URLs
24
+ # @return [String]
25
+ def clean_text
26
+ @clean_text ||= html_to_text(clean_html.empty? ? html : clean_html)
27
+ end
28
+
29
+ # Markdown derived from noise-stripped HTML (or from the full HTML when clean_html is empty).
30
+ # Preserves document structure (headings, lists, links).
31
+ # Lazy — computed on first access.
32
+ #
33
+ # @return [String]
19
34
  def clean_markdown
20
- @clean_markdown ||= MarkdownConverter.convert(html, base_url: final_url)
35
+ source = clean_html.empty? ? html : clean_html
36
+ @clean_markdown ||= MarkdownConverter.convert(source, base_url: final_url)
21
37
  end
22
38
 
23
39
  # The final URL after redirects.
24
- #
25
40
  # @return [String, nil]
26
41
  def final_url
27
42
  metadata['final_url']
28
43
  end
29
44
 
30
- # Check if clean_markdown has been computed.
31
- #
32
45
  # @return [Boolean]
33
46
  def clean_markdown?
34
47
  !@clean_markdown.nil?
@@ -36,12 +49,30 @@ class RubyCrawl
36
49
 
37
50
  def to_h
38
51
  {
39
- text: text,
40
- html: html,
41
- links: links,
42
- metadata: metadata,
52
+ raw_text: raw_text,
53
+ clean_text: @clean_text,
54
+ clean_html: clean_html,
55
+ html: html,
56
+ links: links,
57
+ metadata: metadata,
43
58
  clean_markdown: @clean_markdown
44
59
  }
45
60
  end
61
+
62
+ private
63
+
64
+ # Convert HTML to plain text without any external dependencies.
65
+ # Block-level elements (p, div, h1-h6, li, br, etc.) become newlines
66
+ # so paragraph structure is preserved. HTML entities are unescaped.
67
+ def html_to_text(source)
68
+ text = source
69
+ .gsub(%r{</?(p|div|h[1-6]|li|br|tr|section|article|blockquote|pre)[^>]*>}i, "\n")
70
+ .gsub(/<[^>]+>/, '')
71
+ CGI.unescapeHTML(text)
72
+ .gsub(/[ \t]+/, ' ')
73
+ .gsub(/ *\n */, "\n")
74
+ .gsub(/\n{3,}/, "\n\n")
75
+ .strip
76
+ end
46
77
  end
47
78
  end
@@ -7,20 +7,30 @@ class RubyCrawl
7
7
  class SiteCrawler
8
8
  # Page result yielded to the block with lazy clean_markdown.
9
9
  class PageResult
10
- attr_reader :url, :html, :links, :metadata, :depth
11
-
12
- def initialize(url:, html:, links:, metadata:, depth:)
13
- @url = url
14
- @html = html
15
- @links = links
16
- @metadata = metadata
17
- @depth = depth
10
+ attr_reader :url, :html, :raw_text, :clean_html, :links, :metadata, :depth
11
+
12
+ def initialize(url:, html:, raw_text:, clean_html:, links:, metadata:, depth:)
13
+ @url = url
14
+ @html = html
15
+ @raw_text = raw_text
16
+ @clean_html = clean_html
17
+ @links = links
18
+ @metadata = metadata
19
+ @depth = depth
18
20
  end
19
21
 
20
- # Returns clean markdown converted from the page HTML.
21
- # Relative URLs are resolved using the page's final_url.
22
+ # Plain text derived from noise-stripped HTML. Lazy — same as Result#clean_text.
23
+ def clean_text
24
+ @clean_text ||= Result.new(
25
+ html: html, raw_text: raw_text, clean_html: clean_html,
26
+ links: links, metadata: metadata
27
+ ).clean_text
28
+ end
29
+
30
+ # Markdown derived from noise-stripped HTML. Lazy — same as Result#clean_markdown.
22
31
  def clean_markdown
23
- @clean_markdown ||= MarkdownConverter.convert(html, base_url: final_url)
32
+ source = clean_html.empty? ? html : clean_html
33
+ @clean_markdown ||= MarkdownConverter.convert(source, base_url: final_url)
24
34
  end
25
35
 
26
36
  # The final URL after redirects.
@@ -39,7 +49,6 @@ class RubyCrawl
39
49
  @max_attempts = options.fetch(:max_attempts, nil)
40
50
  @visited = Set.new
41
51
  @queue = []
42
- @session_id = nil
43
52
  end
44
53
 
45
54
  def crawl(start_url, &block)
@@ -49,11 +58,8 @@ class RubyCrawl
49
58
  raise ConfigurationError, "Invalid start URL: #{start_url}" unless normalized
50
59
 
51
60
  @base_url = normalized
52
- @session_id = @client.create_session
53
61
  enqueue(normalized, 0)
54
62
  process_queue(&block)
55
- ensure
56
- @client.destroy_session(@session_id) if @session_id
57
63
  end
58
64
 
59
65
  private
@@ -78,12 +84,22 @@ class RubyCrawl
78
84
  def process_page(url, depth)
79
85
  @visited.add(url)
80
86
  result = crawl_page(url, depth)
81
- enqueue_links(result.links, depth + 1) if result && depth < @max_depth
87
+ return unless result
88
+
89
+ # Mark final_url visited to prevent re-crawling after redirects
90
+ # e.g. axonchat.ai → www.axonchat.ai should not crawl www again.
91
+ final = UrlNormalizer.normalize(result.final_url)
92
+ @visited.add(final) if final
93
+
94
+ # Update base_url on first crawl so same_host checks use the canonical host.
95
+ @base_url = final if depth.zero? && final
96
+
97
+ enqueue_links(result.links, depth + 1) if depth < @max_depth
82
98
  result
83
99
  end
84
100
 
85
101
  def crawl_page(url, depth)
86
- opts = { wait_until: @wait_until, block_resources: @block_resources, session_id: @session_id }
102
+ opts = { wait_until: @wait_until, block_resources: @block_resources }
87
103
  opts[:max_attempts] = @max_attempts if @max_attempts
88
104
  result = @client.crawl(url, **opts)
89
105
  build_page_result(url, depth, result)
@@ -94,11 +110,13 @@ class RubyCrawl
94
110
 
95
111
  def build_page_result(url, depth, result)
96
112
  PageResult.new(
97
- url: url,
98
- html: result.html,
99
- links: extract_urls(result.links),
100
- metadata: result.metadata,
101
- depth: depth
113
+ url: url,
114
+ html: result.html,
115
+ raw_text: result.raw_text,
116
+ clean_html: result.clean_html,
117
+ links: extract_urls(result.links),
118
+ metadata: result.metadata,
119
+ depth: depth
102
120
  )
103
121
  end
104
122
 
@@ -1,85 +1,46 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- # rubocop:disable Metrics/BlockLength
4
3
  namespace :rubycrawl do
5
- desc 'Install Node dependencies and create initializer'
4
+ desc 'Check system dependencies and generate Rails initializer'
6
5
  task :install do
7
6
  require 'fileutils'
8
7
 
9
- # Check Node.js is installed
10
- unless system('node', '--version', out: File::NULL, err: File::NULL)
11
- abort <<~MSG
12
- [rubycrawl] ERROR: Node.js is not installed or not in PATH.
13
-
14
- RubyCrawl requires Node.js (v18+ recommended) for browser automation.
15
-
16
- Install Node.js:
17
- - macOS: brew install node
18
- - Ubuntu: curl -fsSL https://deb.nodesource.com/setup_lts.x | sudo -E bash - && sudo apt-get install -y nodejs
19
- - Windows: https://nodejs.org/en/download/
20
-
21
- After installing, run this task again:
22
- bundle exec rake rubycrawl:install
23
- MSG
8
+ # Ferrum locates Chrome/Chromium on its own; warn if none of the usual executables is on PATH
9
+ chrome_found = %w[google-chrome chromium-browser chromium].any? do |cmd|
10
+ system("which #{cmd}", out: File::NULL, err: File::NULL)
24
11
  end
25
12
 
26
- gem_root = File.expand_path('../../../', __dir__)
27
- node_dir = File.join(gem_root, 'node')
28
-
29
- abort("[rubycrawl] ERROR: node directory not found at #{node_dir}") unless Dir.exist?(node_dir)
30
-
31
- Dir.chdir(node_dir) do
32
- puts('[rubycrawl] Installing Node dependencies...')
33
- system('npm', 'install') || abort('[rubycrawl] ERROR: npm install failed')
34
-
35
- puts('[rubycrawl] Installing Playwright browsers...')
36
- system('npx', 'playwright', 'install') || abort('[rubycrawl] ERROR: playwright install failed')
13
+ unless chrome_found
14
+ warn '[rubycrawl] Chrome/Chromium not found in PATH. Ferrum will attempt to locate it automatically.'
15
+ warn '[rubycrawl] macOS: brew install --cask google-chrome'
16
+ warn '[rubycrawl] Ubuntu: sudo apt-get install -y chromium-browser'
17
+ warn '[rubycrawl] See README for Docker examples.'
37
18
  end
38
19
 
39
20
  if defined?(Rails)
40
21
  initializer_path = Rails.root.join('config', 'initializers', 'rubycrawl.rb')
41
22
  if File.exist?(initializer_path)
42
- puts("[rubycrawl] Initializer already exists at #{initializer_path}")
23
+ puts "[rubycrawl] Initializer already exists at #{initializer_path}"
43
24
  else
44
25
  content = <<~RUBY
45
26
  # frozen_string_literal: true
46
27
 
47
28
  # RubyCrawl Configuration
48
- # =======================
49
- # Uncomment and modify options as needed.
50
-
51
29
  RubyCrawl.configure(
52
- # wait_until - Page load strategy:
53
- # "load" - Wait for load event (fastest, good for static sites)
54
- # "domcontentloaded" - Wait for DOM ready (medium speed)
55
- # "networkidle" - Wait until no network requests for 500ms (best for SPAs)
56
- # wait_until: "load",
57
-
58
- # Block images, fonts, CSS, media for faster crawls (2-3x speedup)
59
- # block_resources: true,
60
-
61
- # Maximum retry attempts for transient failures (with exponential backoff)
62
- # max_retries: 3,
63
-
64
- # Node service settings (usually no need to change)
65
- # host: "127.0.0.1",
66
- # port: 3344,
67
-
68
- # Custom Node.js binary path (if not in PATH)
69
- # node_bin: "/usr/local/bin/node",
70
-
71
- # Log file for Node service output (useful for debugging)
72
- # node_log: Rails.root.join("log", "rubycrawl.log").to_s
30
+ # wait_until: "load", # "load", "domcontentloaded", "networkidle"
31
+ # block_resources: true, # block images/fonts/CSS/media for speed
32
+ # max_attempts: 3, # retry count with exponential backoff
33
+ # timeout: 30, # browser navigation timeout in seconds
34
+ # headless: true, # set false to see the browser (debugging)
73
35
  )
74
36
  RUBY
75
37
 
76
38
  FileUtils.mkdir_p(File.dirname(initializer_path))
77
39
  File.write(initializer_path, content)
78
- puts("[rubycrawl] Created initializer at #{initializer_path}")
40
+ puts "[rubycrawl] Created initializer at #{initializer_path}"
79
41
  end
80
42
  else
81
- puts('[rubycrawl] Rails not detected. Skipping initializer creation.')
43
+ puts '[rubycrawl] Rails not detected. Skipping initializer creation.'
82
44
  end
83
45
  end
84
46
  end
85
- # rubocop:enable Metrics/BlockLength
@@ -29,11 +29,15 @@ class RubyCrawl
29
29
  def same_host?(url, base_url)
30
30
  uri = URI.parse(url)
31
31
  base_uri = URI.parse(base_url)
32
- uri.host&.downcase == base_uri.host&.downcase
32
+ canonical_host(uri.host) == canonical_host(base_uri.host)
33
33
  rescue URI::InvalidURIError
34
34
  false
35
35
  end
36
36
 
37
+ def canonical_host(host)
38
+ host&.downcase&.delete_prefix('www.')
39
+ end
40
+
37
41
  def parse_uri(url, base_url)
38
42
  uri = URI.parse(url)
39
43
  return uri if uri.absolute?
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  class RubyCrawl
4
- VERSION = '0.1.4'
4
+ VERSION = '0.2.0'
5
5
  end