RubyGems - crawlscope - Versions diffs - 0.1.0 → 0.3.0 - Mend

crawlscope 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +7 -8
data/README.md +21 -14
data/lib/crawlscope/browser.rb +8 -0
data/lib/crawlscope/cli.rb +15 -10
data/lib/crawlscope/configuration.rb +20 -5
data/lib/crawlscope/context.rb +9 -0
data/lib/crawlscope/{audit.rb → crawl.rb} +68 -58
data/lib/crawlscope/crawler.rb +19 -1
data/lib/crawlscope/http.rb +1 -1
data/lib/crawlscope/rake_tasks.rb +28 -0
data/lib/crawlscope/rules/links.rb +99 -48
data/lib/crawlscope/rules/metadata.rb +57 -11
data/lib/crawlscope/rules/structured_data.rb +61 -1
data/lib/crawlscope/run.rb +60 -0
data/lib/crawlscope/schema_registry.rb +3 -349
data/lib/crawlscope/schemas.rb +406 -0
data/lib/crawlscope/sitemap.rb +18 -6
data/lib/crawlscope/structured_data/audit.rb +7 -7
data/lib/crawlscope/structured_data/check.rb +35 -0
data/lib/crawlscope/structured_data/reporter.rb +69 -0
data/lib/crawlscope/url.rb +14 -0
data/lib/crawlscope/version.rb +1 -1
data/lib/tasks/crawlscope_tasks.rake +12 -23
data/test/crawlscope/browser_test.rb +155 -0
data/test/crawlscope/cli_test.rb +143 -7
data/test/crawlscope/configuration_test.rb +49 -0
data/test/crawlscope/{audit_test.rb → crawl_test.rb} +23 -7
data/test/crawlscope/crawler_test.rb +34 -0
data/test/crawlscope/http_test.rb +56 -0
data/test/crawlscope/links_rule_test.rb +149 -5
data/test/crawlscope/metadata_rule_test.rb +77 -0
data/test/crawlscope/rule_registry_test.rb +32 -0
data/test/crawlscope/{task_test.rb → run_test.rb} +28 -33
data/test/crawlscope/schema_registry_test.rb +19 -0
data/test/crawlscope/sitemap_test.rb +55 -0
data/test/crawlscope/structured_data_document_test.rb +36 -0
data/test/crawlscope/structured_data_report_test.rb +3 -3
data/test/crawlscope/structured_data_reporter_test.rb +2 -2
data/test/crawlscope/structured_data_rule_test.rb +111 -0
data/test/crawlscope/structured_data_writer_test.rb +2 -2
data/test/crawlscope/url_test.rb +31 -0
metadata +15 -5
data/lib/crawlscope/task.rb +0 -131

data/lib/tasks/crawlscope_tasks.rake CHANGED Viewed

@@ -1,44 +1,33 @@
 namespace :crawlscope do
-  desc "Validate sitemap URLs with the default Crawlscope rules. ENV: BASE_URL, SITEMAP, RULES, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, CONCURRENCY"
+  desc "Validate URLs with all default Crawlscope rules. ENV: URL, SITEMAP, RULES, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, CONCURRENCY"
   task validate: :environment do
-    status = Crawlscope::Cli.start(["validate"], out: $stdout, err: $stderr)
-    exit(status) unless status.zero?
+    Crawlscope::RakeTasks.validate
   end
   namespace :validate do
-    desc "Validate JSON-LD on one or more URLs. ENV: URL (required, semicolon-separated), DEBUG=1, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, REPORT_PATH, SUMMARY=1"
+    desc "Directly validate JSON-LD on one or more URLs. ENV: URL (semicolon-separated), DEBUG=1, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, REPORT_PATH, SUMMARY=1"
     task ldjson: :environment do
-      status = Crawlscope::Cli.start(["ldjson"], out: $stdout, err: $stderr)
-      exit(status) unless status.zero?
+      Crawlscope::RakeTasks.ldjson
     end
-    desc "Validate sitemap URLs with the metadata rule. ENV: BASE_URL, SITEMAP, JS=1"
+    desc "Validate URLs with the metadata rule. ENV: URL, SITEMAP, JS=1"
     task metadata: :environment do
-      crawlscope_task_with_rules("metadata")
+      Crawlscope::RakeTasks.validate_rule("metadata")
     end
-    desc "Validate sitemap URLs with the structured_data rule. ENV: BASE_URL, SITEMAP, JS=1"
+    desc "Validate sitemap URLs with the structured_data rule. ENV: URL, SITEMAP, JS=1"
     task structured_data: :environment do
-      crawlscope_task_with_rules("structured_data")
+      Crawlscope::RakeTasks.validate_rule("structured_data")
     end
-    desc "Validate sitemap URLs with the uniqueness rule. ENV: BASE_URL, SITEMAP, JS=1"
+    desc "Validate URLs with the uniqueness rule. ENV: URL, SITEMAP, JS=1"
     task uniqueness: :environment do
-      crawlscope_task_with_rules("uniqueness")
+      Crawlscope::RakeTasks.validate_rule("uniqueness")
     end
-    desc "Validate sitemap URLs with the links rule. ENV: BASE_URL, SITEMAP, JS=1"
+    desc "Validate URLs with the links rule. ENV: URL, SITEMAP, JS=1"
     task links: :environment do
-      crawlscope_task_with_rules("links")
+      Crawlscope::RakeTasks.validate_rule("links")
     end
   end
-  def crawlscope_task_with_rules(rules)
-    original_rules = ENV["RULES"]
-    ENV["RULES"] = rules
-    status = Crawlscope::Cli.start(["validate"], out: $stdout, err: $stderr)
-    exit(status) unless status.zero?
-  ensure
-    ENV["RULES"] = original_rules
-  end
 end

data/test/crawlscope/browser_test.rb ADDED Viewed

@@ -0,0 +1,155 @@
+# frozen_string_literal: true
+require "test_helper"
+class CrawlscopeBrowserTest < Minitest::Test # Exercises Crawlscope::Browser#fetch and #close against in-memory fakes.
+  Response = Data.define(:url, :headers) # Minimal response value handed to FakeNetwork in these tests.
+  class FakeBrowser # Browser double that only records whether quit was called.
+    attr_reader :quit_called
+    def quit
+      @quit_called = true
+    end
+  end
+  class FakeNetwork # Network double: serves a canned response/status and records clear and idle-wait calls.
+    attr_reader :cleared, :idle_waits, :status
+    def initialize(response:, status: 200)
+      @response = response
+      @status = status
+      @cleared = [] # scopes passed to #clear, in call order
+      @idle_waits = [] # one {duration:, timeout:} hash per #wait_for_idle call
+    end
+    def clear(scope)
+      @cleared << scope
+    end
+    attr_reader :response
+    def wait_for_idle(duration:, timeout:)
+      @idle_waits << {duration: duration, timeout: timeout}
+    end
+  end
+  class FakePage # Page double: exposes canned body/current_url/url and records navigation and evaluated scripts.
+    attr_reader :evaluations, :network, :visited_url
+    def initialize(network:, body: "<html></html>", current_url: "", url: "")
+      @network = network
+      @body = body
+      @current_url = current_url
+      @url = url
+      @evaluations = [] # scripts passed to #evaluate, in call order
+    end
+    attr_reader :body
+    attr_reader :current_url
+    def evaluate(script)
+      @evaluations << script
+    end
+    def go_to(url)
+      @visited_url = url
+    end
+    attr_reader :url
+  end
+  def test_fetch_returns_rendered_page # Happy path: response URL becomes final_url, traffic is cleared, nothing is evaluated when scrolling is off.
+    network = FakeNetwork.new(response: Response.new(url: "https://example.com/final", headers: {"content-type" => "text/html"}))
+    page = FakePage.new(network: network, body: "<html><body>Hello</body></html>")
+    browser = browser_with(page: page, scroll_page: false)
+    result = browser.fetch("https://example.com/start")
+    assert_equal "https://example.com/start", page.visited_url
+    assert_equal [:traffic], network.cleared
+    assert_equal "https://example.com/final", result.final_url
+    assert_equal "https://example.com/final", result.normalized_final_url
+    assert_equal 200, result.status
+    assert result.html?
+    assert_equal [], page.evaluations
+  end
+  def test_fetch_scrolls_when_enabled # With scroll_page on, expects 3 script evaluations and 4 idle waits.
+    network = FakeNetwork.new(response: Response.new(url: "", headers: {})) # blank response URL
+    page = FakePage.new(network: network, current_url: "https://example.com/current")
+    browser = browser_with(page: page, scroll_page: true)
+    result = browser.fetch("https://example.com/start")
+    assert_equal "https://example.com/current", result.final_url # expected to come from page.current_url here
+    assert_equal 3, page.evaluations.size
+    assert_equal 4, network.idle_waits.size
+  end
+  def test_fetch_falls_back_to_page_url_and_original_url # No response: final_url is expected to be page.url if set, else the requested URL.
+    page_url_network = FakeNetwork.new(response: nil)
+    page_url = FakePage.new(network: page_url_network, url: "https://example.com/page")
+    page_url_result = browser_with(page: page_url).fetch("https://example.com/start")
+    original_url_network = FakeNetwork.new(response: nil)
+    original_url = FakePage.new(network: original_url_network) # current_url and url both default to ""
+    original_url_result = browser_with(page: original_url).fetch("https://example.com/start")
+    assert_equal "https://example.com/page", page_url_result.final_url
+    assert_equal "https://example.com/start", original_url_result.final_url
+  end
+  def test_fetch_returns_error_page_when_navigation_fails # A Timeout::Error during fetch should yield an error result, not raise.
+    page = Object.new
+    def page.network # singleton method: any access to the network raises
+      raise Timeout::Error, "browser failed"
+    end
+    result = browser_with(page: page).fetch("https://example.com/start")
+    assert_equal "https://example.com/start", result.final_url
+    assert_nil result.status
+    assert_equal "Timeout::Error: browser failed", result.error
+  end
+  def test_fetch_reraises_programmer_errors # NoMethodError must propagate out of fetch rather than become an error result.
+    page = Object.new
+    def page.network
+      raise NoMethodError, "bad call"
+    end
+    browser = browser_with(page: page)
+    assert_raises(NoMethodError) { browser.fetch("https://example.com/start") }
+  end
+  def test_close_quits_browser # close should call quit on the underlying browser.
+    fake_browser = FakeBrowser.new
+    browser = browser_with(browser: fake_browser)
+    browser.close
+    assert fake_browser.quit_called
+  end
+  def test_close_allows_missing_browser # close with a nil browser is expected to return nil without raising.
+    browser = browser_with(browser: nil)
+    assert_nil browser.close
+  end
+  private
+  def browser_with(page: FakePage.new(network: FakeNetwork.new(response: nil)), browser: FakeBrowser.new, scroll_page: false) # Builds a Crawlscope::Browser wired to the given fakes.
+    Crawlscope::Browser.allocate.tap do |instance| # allocate skips initialize, so state is injected by hand below
+      instance.instance_variable_set(:@base_url, "https://example.com")
+      instance.instance_variable_set(:@timeout_seconds, 20)
+      instance.instance_variable_set(:@network_idle_timeout_seconds, 5)
+      instance.instance_variable_set(:@scroll_page, scroll_page)
+      instance.instance_variable_set(:@browser, browser)
+      instance.instance_variable_set(:@page, page)
+    end
+  end
+end

data/test/crawlscope/cli_test.rb CHANGED Viewed

@@ -4,9 +4,10 @@ require "test_helper"
 class CrawlscopeCliTest < Minitest::Test
   class FakeConfiguration
-    attr_accessor :concurrency, :network_idle_timeout_seconds, :output, :renderer, :timeout_seconds
+    attr_accessor :base_url, :concurrency, :network_idle_timeout_seconds, :output, :renderer, :timeout_seconds
     def initialize
+      @base_url = nil
       @concurrency = 10
       @network_idle_timeout_seconds = 5
       @renderer = :http
@@ -19,7 +20,7 @@ class CrawlscopeCliTest < Minitest::Test
   end
   class FakeTask
-    attr_reader :validate_arguments, :ldjson_arguments
+    attr_reader :validate_arguments, :json_ld_arguments
     def validate(base_url:, sitemap_path:, rule_names:)
       @validate_arguments = {
@@ -31,8 +32,8 @@ class CrawlscopeCliTest < Minitest::Test
       success_result
     end
-    def validate_ldjson(urls:, debug:, renderer:, report_path:, summary:, timeout_seconds:)
-      @ldjson_arguments = {
+    def validate_json_ld(urls:, debug:, renderer:, report_path:, summary:, timeout_seconds:)
+      @json_ld_arguments = {
         urls: urls,
         debug: debug,
         renderer: renderer,
@@ -51,6 +52,20 @@ class CrawlscopeCliTest < Minitest::Test
     end
   end
+  class FailingTask < FakeTask
+    private
+    def success_result
+      Struct.new(:ok?).new(false)
+    end
+  end
+  class InvalidTask < FakeTask
+    def validate(base_url:, sitemap_path:, rule_names:)
+      raise Crawlscope::ValidationError, "No URLs found in sitemap: #{sitemap_path}"
+    end
+  end
   def test_version_prints_current_version
     out = StringIO.new
     err = StringIO.new
@@ -70,7 +85,7 @@ class CrawlscopeCliTest < Minitest::Test
     assert_equal 1, status
     assert_includes err.string, "Unknown command: unknown"
-    assert_includes err.string, "crawlscope validate --base-url"
+    assert_includes err.string, "crawlscope validate --url"
   end
   def test_validate_passes_arguments_to_task
@@ -80,7 +95,7 @@ class CrawlscopeCliTest < Minitest::Test
     err = StringIO.new
     status = Crawlscope::Cli.start(
-      ["validate", "--base-url", "https://example.com", "--sitemap", "https://example.com/sitemap-pages.xml", "--rules", "metadata,links", "--renderer", "browser", "--timeout", "30", "--network-idle-timeout", "9", "--concurrency", "3"],
+      ["validate", "--url", "https://example.com", "--sitemap", "https://example.com/sitemap-pages.xml", "--rules", "metadata,links", "--renderer", "browser", "--timeout", "30", "--network-idle-timeout", "9", "--concurrency", "3"],
       out: out,
       err: err,
       configuration: configuration,
@@ -125,12 +140,133 @@ class CrawlscopeCliTest < Minitest::Test
         summary: true,
         timeout_seconds: 20
       },
-      task.ldjson_arguments
+      task.json_ld_arguments
     )
     assert_same out, configuration.output
     assert_empty err.string
   end
+  def test_ldjson_defaults_to_configured_base_url
+    configuration = FakeConfiguration.new
+    configuration.base_url = "https://example.com"
+    task = FakeTask.new
+    status = Crawlscope::Cli.start(["ldjson"], out: StringIO.new, err: StringIO.new, configuration: configuration, task: task)
+    assert_equal 0, status
+    assert_equal ["https://example.com"], task.json_ld_arguments[:urls]
+  end
+  def test_validate_caps_default_browser_concurrency
+    configuration = FakeConfiguration.new
+    task = FakeTask.new
+    out = StringIO.new
+    err = StringIO.new
+    with_env("JS" => "1") do
+      status = Crawlscope::Cli.start(["validate", "--url", "https://example.com"], out: out, err: err, configuration: configuration, task: task)
+      assert_equal 0, status
+    end
+    assert_equal :browser, configuration.renderer
+    assert_equal 4, configuration.concurrency
+    assert_includes out.string, "Default JS concurrency capped at 4"
+  end
+  def test_validate_uses_url_environment_as_base_url_for_default_sitemap
+    configuration = FakeConfiguration.new
+    task = FakeTask.new
+    with_env("URL" => "https://example.com") do
+      status = Crawlscope::Cli.start(["validate"], out: StringIO.new, err: StringIO.new, configuration: configuration, task: task)
+      assert_equal 0, status
+    end
+    assert_equal "https://example.com", task.validate_arguments[:base_url]
+    assert_nil task.validate_arguments[:sitemap_path]
+  end
+  def test_validate_uses_sitemap_mode_when_sitemap_is_configured
+    task = FakeTask.new
+    with_env("URL" => "https://example.com", "SITEMAP" => "https://example.com/sitemap.xml") do
+      status = Crawlscope::Cli.start(["validate"], out: StringIO.new, err: StringIO.new, configuration: FakeConfiguration.new, task: task)
+      assert_equal 0, status
+    end
+    assert_equal "https://example.com", task.validate_arguments[:base_url]
+    assert_equal "https://example.com/sitemap.xml", task.validate_arguments[:sitemap_path]
+  end
+  def test_ldjson_accepts_repeated_urls_and_options
+    configuration = FakeConfiguration.new
+    task = FakeTask.new
+    out = StringIO.new
+    err = StringIO.new
+    status = Crawlscope::Cli.start(
+      ["ldjson", "--url", "https://example.com/a", "--url", "https://example.com/b", "--renderer", "browser", "--timeout", "12", "--network-idle-timeout", "3", "--report-path", "report.json", "--debug", "--summary"],
+      out: out,
+      err: err,
+      configuration: configuration,
+      task: task
+    )
+    assert_equal 0, status
+    assert_equal(
+      {
+        urls: ["https://example.com/a", "https://example.com/b"],
+        debug: true,
+        renderer: :browser,
+        report_path: "report.json",
+        summary: true,
+        timeout_seconds: 12
+      },
+      task.json_ld_arguments
+    )
+    assert_equal 3, configuration.network_idle_timeout_seconds
+  end
+  def test_ldjson_defaults_to_localhost
+    out = StringIO.new
+    err = StringIO.new
+    task = FakeTask.new
+    status = Crawlscope::Cli.start(["ldjson"], out: out, err: err, configuration: FakeConfiguration.new, task: task)
+    assert_equal 0, status
+    assert_equal ["http://localhost:3000"], task.json_ld_arguments[:urls]
+    assert_empty err.string
+  end
+  def test_invalid_integer_option_returns_error
+    out = StringIO.new
+    err = StringIO.new
+    status = Crawlscope::Cli.start(["validate", "--timeout", "0"], out: out, err: err, configuration: FakeConfiguration.new, task: FakeTask.new)
+    assert_equal 1, status
+    assert_includes err.string, "timeout must be >= 1"
+  end
+  def test_failed_result_returns_failed_status
+    status = Crawlscope::Cli.start(["validate"], out: StringIO.new, err: StringIO.new, configuration: FakeConfiguration.new, task: FailingTask.new)
+    assert_equal 1, status
+  end
+  def test_validation_errors_return_failed_status_without_reraising
+    err = StringIO.new
+    status = Crawlscope::Cli.start(["validate"], out: StringIO.new, err: err, configuration: FakeConfiguration.new, task: InvalidTask.new)
+    assert_equal 1, status
+    assert_includes err.string, "No URLs found in sitemap"
+  end
   private
   def with_env(overrides)

data/test/crawlscope/configuration_test.rb CHANGED Viewed

@@ -42,4 +42,53 @@ class CrawlscopeConfigurationTest < Minitest::Test
     assert_equal "Crawlscope sitemap_path is not configured", error.message
   end
+  def test_defaults_are_normalized
+    config = Crawlscope::Configuration.new
+    assert_equal [200, 301, 302], config.allowed_statuses
+    assert_equal 10, config.concurrency
+    assert_equal 4, config.browser_concurrency
+    assert_equal 5, config.network_idle_timeout_seconds
+    assert_equal :http, config.renderer
+    assert_equal 20, config.timeout_seconds
+    assert_equal $stdout, config.output
+    assert config.scroll_page?
+  end
+  def test_configured_values_are_normalized
+    config = Crawlscope::Configuration.new
+    config.allowed_statuses = ["200", "404"]
+    config.concurrency = "2"
+    config.network_idle_timeout_seconds = "7"
+    config.renderer = "browser"
+    config.timeout_seconds = "9"
+    config.scroll_page = false
+    assert_equal [200, 404], config.allowed_statuses
+    assert_equal 2, config.concurrency
+    assert_equal 2, config.browser_concurrency
+    assert_equal 7, config.network_idle_timeout_seconds
+    assert_equal :browser, config.renderer
+    assert_equal 9, config.timeout_seconds
+    refute config.scroll_page?
+  end
+  def test_renderer_must_be_supported
+    config = Crawlscope::Configuration.new
+    config.renderer = "webkit"
+    error = assert_raises(Crawlscope::ConfigurationError) { config.renderer }
+    assert_equal "Crawlscope renderer must be http or browser", error.message
+  end
+  def test_numeric_values_must_be_positive_integers
+    config = Crawlscope::Configuration.new
+    config.concurrency = "0"
+    error = assert_raises(Crawlscope::ConfigurationError) { config.concurrency }
+    assert_equal "Crawlscope concurrency must be an integer >= 1", error.message
+  end
 end

data/test/crawlscope/{audit_test.rb → crawl_test.rb} RENAMED Viewed

@@ -2,7 +2,7 @@
 require "test_helper"
-class CrawlscopeAuditTest < Minitest::Test
+class CrawlscopeCrawlTest < Minitest::Test
   def setup
     @tmp_dir = Dir.mktmpdir
     @sitemap_path = File.join(@tmp_dir, "sitemap.xml")
@@ -31,8 +31,16 @@ class CrawlscopeAuditTest < Minitest::Test
           <html>
             <head>
               <title>Pricing</title>
-              <meta name="description" content="Plans for hotels and restaurants">
+              <meta name="description" content="Plans for hotels and restaurants that need practical software checks, clear metadata, and dependable search previews.">
               <link rel="canonical" href="https://example.com/pricing">
+              <meta property="og:title" content="Pricing">
+              <meta property="og:description" content="Plans for hotels and restaurants that need practical software checks, clear metadata, and dependable search previews.">
+              <meta property="og:url" content="https://example.com/pricing">
+              <meta property="og:type" content="website">
+              <meta property="og:image" content="https://example.com/icon.png">
+              <script type="application/ld+json">
+                {"@context":"https://schema.org","@type":"WebSite","name":"Example","url":"https://example.com"}
+              </script>
             </head>
             <body>
               <main>
@@ -43,7 +51,7 @@ class CrawlscopeAuditTest < Minitest::Test
         HTML
       )
-    result = Crawlscope::Audit.new(
+    result = Crawlscope::Crawl.new(
       base_url: "https://example.com",
       sitemap_path: @sitemap_path,
       rules: Crawlscope::RuleRegistry.default(site_name: "Example").rules,
@@ -84,7 +92,7 @@ class CrawlscopeAuditTest < Minitest::Test
         HTML
       )
-    result = Crawlscope::Audit.new(
+    result = Crawlscope::Crawl.new(
       base_url: "https://example.com",
       sitemap_path: @sitemap_path,
       rules: Crawlscope::RuleRegistry.default(site_name: "Example").rules,
@@ -92,7 +100,7 @@ class CrawlscopeAuditTest < Minitest::Test
     ).call
     refute result.ok?
-    assert_equal %i[meta_description_too_long missing_canonical missing_h1 title_repeats_site_name].sort, result.issues.to_a.map(&:code).uniq.sort
+    assert_equal %i[incomplete_open_graph_tags meta_description_too_long missing_canonical missing_h1 missing_structured_data title_repeats_site_name].sort, result.issues.to_a.map(&:code).uniq.sort
   end
   def test_uses_browser_when_renderer_is_browser
@@ -125,8 +133,16 @@ class CrawlscopeAuditTest < Minitest::Test
           <html>
             <head>
               <title>Pricing</title>
-              <meta name="description" content="Plans for hotels and restaurants">
+              <meta name="description" content="Plans for hotels and restaurants that need practical software checks, clear metadata, and dependable search previews.">
               <link rel="canonical" href="https://example.com/pricing">
+              <meta property="og:title" content="Pricing">
+              <meta property="og:description" content="Plans for hotels and restaurants that need practical software checks, clear metadata, and dependable search previews.">
+              <meta property="og:url" content="https://example.com/pricing">
+              <meta property="og:type" content="website">
+              <meta property="og:image" content="https://example.com/icon.png">
+              <script type="application/ld+json">
+                {"@context":"https://schema.org","@type":"WebSite","name":"Example","url":"https://example.com"}
+              </script>
             </head>
             <body>
               <main>
@@ -149,7 +165,7 @@ class CrawlscopeAuditTest < Minitest::Test
       end
     end.new
-    result = Crawlscope::Audit.new(
+    result = Crawlscope::Crawl.new(
       base_url: "https://example.com",
       sitemap_path: @sitemap_path,
       rules: Crawlscope::RuleRegistry.default(site_name: "Example").rules,

data/test/crawlscope/crawler_test.rb ADDED Viewed

@@ -0,0 +1,34 @@
+# frozen_string_literal: true
+require "test_helper"
+class CrawlscopeCrawlerTest < Minitest::Test # Checks that Crawlscope::Crawler turns a fetcher exception into an error page.
+  class RaisingFetcher # Fetcher double: raises for URLs containing "timeout", otherwise returns a 200 page with empty HTML.
+    def fetch(url)
+      raise Timeout::Error, "fetch timed out" if url.include?("timeout")
+      Crawlscope::Page.new(
+        url: url,
+        normalized_url: url,
+        final_url: url,
+        normalized_final_url: url,
+        status: 200,
+        headers: {},
+        body: "<html></html>",
+        doc: Nokogiri::HTML("<html></html>")
+      )
+    end
+  end
+  def test_returns_error_page_when_fetcher_raises # One failing URL must not drop the other page or abort the crawl.
+    pages = Crawlscope::Crawler.new(page_fetcher: RaisingFetcher.new, concurrency: 2).call(
+      ["https://example.com/ok", "https://example.com/timeout"]
+    )
+    assert_equal 2, pages.size # both URLs produce a page, including the one that raised
+    error_page = pages.find { |page| page.url == "https://example.com/timeout" }
+    assert_nil error_page.status
+    assert_equal "Timeout::Error: fetch timed out", error_page.error
+  end
+end

data/test/crawlscope/http_test.rb ADDED Viewed

@@ -0,0 +1,56 @@
+# frozen_string_literal: true
+require "test_helper"
+class CrawlscopeHttpTest < Minitest::Test # Exercises Crawlscope::Http#fetch against stubbed HTTP requests (stub_request is presumably WebMock via test_helper — confirm).
+  def test_fetch_parses_html_response # text/html response: status is kept and the body is parsed into page.doc.
+    stub_request(:get, "https://example.com/page")
+      .to_return(status: 200, headers: {"Content-Type" => "text/html"}, body: "<html><body>Hello</body></html>")
+    page = Crawlscope::Http.new(base_url: "https://example.com", timeout_seconds: 2).fetch("https://example.com/page")
+    assert_equal 200, page.status
+    assert page.html?
+    assert_equal "Hello", page.doc.at_css("body").text
+  end
+  def test_fetch_parses_responses_without_content_type_as_html # No Content-Type header: the page is still expected to report html?.
+    stub_request(:get, "https://example.com/page")
+      .to_return(status: 200, body: "<html><body>Hello</body></html>")
+    page = Crawlscope::Http.new(base_url: "https://example.com", timeout_seconds: 2).fetch("https://example.com/page")
+    assert page.html?
+  end
+  def test_fetch_leaves_non_html_response_unparsed # application/xml response: not html?, raw body preserved.
+    stub_request(:get, "https://example.com/feed.xml")
+      .to_return(status: 200, headers: {"content-type" => "application/xml"}, body: "<feed></feed>")
+    page = Crawlscope::Http.new(base_url: "https://example.com", timeout_seconds: 2).fetch("https://example.com/feed.xml")
+    assert_equal 200, page.status
+    refute page.html?
+    assert_equal "<feed></feed>", page.body
+  end
+  def test_fetch_returns_error_page_for_failed_requests # A stubbed timeout should yield an error page rather than raise.
+    stub_request(:get, "https://example.com/down").to_timeout
+    page = Crawlscope::Http.new(base_url: "https://example.com", timeout_seconds: 2).fetch("https://example.com/down")
+    assert_nil page.status
+    assert_includes page.error, "Faraday::ConnectionFailed"
+    assert_equal "https://example.com/down", page.final_url # final_url is expected to equal the requested URL on failure
+  end
+  def test_fetch_reraises_programmer_errors # NoMethodError must propagate out of fetch rather than become an error page.
+    http = Crawlscope::Http.new(base_url: "https://example.com", timeout_seconds: 2)
+    def http.connection # singleton override so fetch hits a NoMethodError when it asks for the connection
+      raise NoMethodError, "bad call"
+    end
+    assert_raises(NoMethodError) { http.fetch("https://example.com/down") }
+  end
+end