crawlscope 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +27 -0
  3. data/README.md +32 -0
  4. data/lib/crawlscope/cli.rb +16 -0
  5. data/lib/crawlscope/configuration.rb +10 -1
  6. data/lib/crawlscope/context.rb +1 -1
  7. data/lib/crawlscope/crawl.rb +72 -14
  8. data/lib/crawlscope/crawler.rb +3 -17
  9. data/lib/crawlscope/document_text.rb +7 -2
  10. data/lib/crawlscope/fetch_executor/async.rb +32 -0
  11. data/lib/crawlscope/fetch_executor/threaded.rb +32 -0
  12. data/lib/crawlscope/fetch_executor.rb +43 -0
  13. data/lib/crawlscope/http.rb +7 -1
  14. data/lib/crawlscope/reporter.rb +123 -14
  15. data/lib/crawlscope/result.rb +1 -1
  16. data/lib/crawlscope/rules/content_quality.rb +1 -1
  17. data/lib/crawlscope/rules/indexability.rb +28 -6
  18. data/lib/crawlscope/rules/links.rb +80 -16
  19. data/lib/crawlscope/rules/uniqueness.rb +23 -4
  20. data/lib/crawlscope/sitemap.rb +30 -11
  21. data/lib/crawlscope/version.rb +1 -1
  22. data/lib/tasks/crawlscope_tasks.rake +1 -1
  23. data/test/crawlscope/cli_test.rb +28 -2
  24. data/test/crawlscope/configuration_test.rb +21 -0
  25. data/test/crawlscope/content_quality_rule_test.rb +18 -0
  26. data/test/crawlscope/crawl_test.rb +142 -4
  27. data/test/crawlscope/crawler_test.rb +61 -0
  28. data/test/crawlscope/fetch_executor_test.rb +44 -0
  29. data/test/crawlscope/links_rule_test.rb +101 -0
  30. data/test/crawlscope/reporter_test.rb +136 -11
  31. data/test/crawlscope/result_test.rb +35 -0
  32. data/test/crawlscope/sitemap_test.rb +52 -0
  33. data/test/performance/async_fetch_benchmark.rb +127 -0
  34. data/test/performance/fetch_executor_matrix.rb +162 -0
  35. data/test/performance/sitemap_expansion_benchmark.rb +121 -0
  36. metadata +38 -2
@@ -1,7 +1,11 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "uri"
4
+
3
5
  module Crawlscope
4
6
  class Reporter
7
+ MAX_ISSUES_PER_GROUP = 20
8
+
5
9
  def initialize(io:)
6
10
  @io = io
7
11
  end
@@ -13,36 +17,141 @@ module Crawlscope
13
17
  @io.puts("URLs: #{result.urls.size}")
14
18
  @io.puts("Pages: #{result.pages.size}")
15
19
 
16
- if result.ok?
20
+ if result.issues.size.zero?
17
21
  @io.puts("Status: OK")
18
22
  return
19
23
  end
20
24
 
21
- @io.puts("Status: FAILED")
22
- @io.puts("Issues: #{result.issues.size}")
25
+ @io.puts("Status: #{status_for(result.issues)}")
26
+ @io.puts("Issues: #{result.issues.size} total (#{severity_summary(result.issues)})")
23
27
  @io.puts("")
24
28
 
25
- report_grouped_issues("Severity", result.issues.by_severity)
29
+ report_summary(result.issues)
26
30
  @io.puts("")
27
- report_grouped_issues("Category", result.issues.by_category)
31
+ report_issue_groups(result.issues, base_url: result.base_url)
28
32
  end
29
33
 
30
34
  private
31
35
 
32
- def report_grouped_issues(title, grouped_issues)
33
- @io.puts("#{title}:")
36
+ def status_for(issues)
37
+ grouped = issues.by_severity
38
+
39
+ if grouped.key?(:error)
40
+ "FAILED"
41
+ elsif grouped.key?(:warning)
42
+ "WARNINGS"
43
+ else
44
+ "NOTICES"
45
+ end
46
+ end
47
+
48
+ def severity_summary(issues)
49
+ grouped = issues.by_severity
50
+ return "" if grouped.empty?
51
+
52
+ grouped
53
+ .sort_by { |severity, severity_issues| [-severity_issues.size, severity.to_s] }
54
+ .map { |severity, severity_issues| "#{severity_issues.size} #{pluralize(severity, severity_issues.size)}" }
55
+ .join(", ")
56
+ end
34
57
 
35
- grouped_issues.sort_by { |name, _issues| name.to_s }.each do |name, issues|
36
- @io.puts("#{name}: #{issues.size}")
37
- issues.each do |issue|
38
- @io.puts(" - #{offense(issue)}")
58
+ def report_summary(issues)
59
+ @io.puts("Summary:")
60
+
61
+ issues.by_category
62
+ .sort_by { |category, category_issues| [-category_issues.size, category.to_s] }
63
+ .each do |category, category_issues|
64
+ @io.puts(" #{category.to_s.ljust(16)} #{category_issues.size}")
39
65
  end
66
+ end
67
+
68
+ def report_issue_groups(issues, base_url:)
69
+ grouped = issues.to_a.group_by { |issue| [issue.category, issue.code] }
70
+
71
+ grouped
72
+ .sort_by { |(category, code), grouped_issues| [-grouped_issues.size, category.to_s, code.to_s] }
73
+ .each do |(category, code), grouped_issues|
74
+ @io.puts("#{category} / #{code}: #{grouped_issues.size}")
75
+
76
+ grouped_issues.first(MAX_ISSUES_PER_GROUP).each do |issue|
77
+ @io.puts(" - #{compact_issue(issue, base_url: base_url)}")
78
+ end
79
+
80
+ remaining_count = grouped_issues.size - MAX_ISSUES_PER_GROUP
81
+ @io.puts(" ... #{remaining_count} more") if remaining_count.positive?
82
+ @io.puts("")
83
+ end
84
+ end
85
+
86
+ def compact_issue(issue, base_url:)
87
+ parts = []
88
+ parts << relative_url(issue.url, base_url: base_url) if issue.url
89
+
90
+ detail = compact_detail(issue, base_url: base_url)
91
+ parts << detail unless detail.empty?
92
+
93
+ parts.compact.join(" ")
94
+ end
95
+
96
+ def compact_detail(issue, base_url:)
97
+ details = issue.details || {}
98
+ fragments = []
99
+
100
+ inbound = details[:dofollow_inbound_count] || details[:inbound_count]
101
+ fragments << "inbound #{inbound}/#{details[:minimum]}" if inbound && details[:minimum]
102
+
103
+ if details[:ratio] && details[:threshold]
104
+ fragments << "ratio #{format_number(details[:ratio])}/#{format_number(details[:threshold])}"
105
+ end
106
+
107
+ fragments << "count #{details[:count]}" if details[:count]
108
+ fragments << "length #{details[:length]}" if details[:length]
109
+ fragments << "status #{details[:status]}" if details[:status]
110
+ fragments << "final: #{relative_url(details[:final_url], base_url: base_url)}" if details[:final_url]
111
+ fragments << "sources: #{relative_urls(details[:source_urls], base_url: base_url).join(", ")}" if details[:source_urls]&.any?
112
+ fragments << "source: #{relative_url(details[:source_url], base_url: base_url)}" if details[:source_url]
113
+ fragments << "targets: #{relative_urls(details[:target_urls], base_url: base_url).join(", ")}" if details[:target_urls]&.any?
114
+
115
+ return issue.message if fragments.empty?
116
+
117
+ case issue.code
118
+ when :low_dofollow_inlinks, :low_inbound_anchor_links, :low_unique_token_ratio, :low_visible_text_ratio
119
+ fragments.join(" ")
120
+ else
121
+ ([issue.message] + fragments).join(" ")
40
122
  end
41
123
  end
42
124
 
43
- def offense(issue)
44
- parts = ["[#{issue.severity}]", issue.code, issue.url, issue.message]
45
- parts.compact.join(" ")
125
+ def relative_urls(urls, base_url:)
126
+ Array(urls).map { |url| relative_url(url, base_url: base_url) }
127
+ end
128
+
129
+ def relative_url(url, base_url:)
130
+ return url unless url && base_url
131
+
132
+ uri = URI.parse(url)
133
+ base_uri = URI.parse(base_url)
134
+
135
+ return url unless uri.host == base_uri.host && uri.scheme == base_uri.scheme && uri.port == base_uri.port
136
+
137
+ relative = uri.path.to_s.empty? ? "/" : uri.path
138
+ relative += "?#{uri.query}" if uri.query
139
+ relative += "##{uri.fragment}" if uri.fragment
140
+ relative
141
+ rescue URI::InvalidURIError
142
+ url
143
+ end
144
+
145
+ def format_number(value)
146
+ return format("%.3f", value) if value.is_a?(Float)
147
+
148
+ value.to_s
149
+ end
150
+
151
+ def pluralize(word, count)
152
+ return word.to_s if count == 1
153
+
154
+ "#{word}s"
46
155
  end
47
156
  end
48
157
  end
@@ -3,7 +3,7 @@
3
3
  module Crawlscope
4
4
  Result = Data.define(:base_url, :sitemap_path, :urls, :pages, :issues) do
5
5
  def ok?
6
- issues.none?(&:error?) && issues.none?(&:warning?) && issues.none?(&:notice?)
6
+ issues.none?(&:error?)
7
7
  end
8
8
  end
9
9
  end
@@ -55,7 +55,7 @@ module Crawlscope
55
55
  end
56
56
 
57
57
  def validate_visible_text_ratio(page, issues)
58
- html_bytes = DocumentText.html_for(page.doc).bytesize
58
+ html_bytes = DocumentText.content_ratio_html_for(page.doc).bytesize
59
59
  return if html_bytes.zero?
60
60
 
61
61
  visible_text = DocumentText.text_for(page.doc)
@@ -6,6 +6,31 @@ module Crawlscope
6
6
  ROBOTS_META_SELECTOR = 'meta[name="robots"], meta[name="googlebot"]'
7
7
  X_ROBOTS_TAG_HEADER = "x-robots-tag"
8
8
 
9
+ def self.noindex_header?(headers)
10
+ noindex?(header_value(headers, X_ROBOTS_TAG_HEADER))
11
+ end
12
+
13
+ def self.noindex_meta?(doc)
14
+ return false unless doc
15
+
16
+ doc.css(ROBOTS_META_SELECTOR).any? { |tag| noindex?(tag["content"].to_s) }
17
+ end
18
+
19
+ def self.header_value(headers, name)
20
+ headers.find { |key, _value| key.to_s.casecmp?(name) }&.last.to_s
21
+ end
22
+
23
+ def self.directives(value)
24
+ value
25
+ .split(",")
26
+ .map { |directive| directive.split(":", 2).last.to_s.strip }
27
+ .reject(&:empty?)
28
+ end
29
+
30
+ def self.noindex?(value)
31
+ directives(value).any? { |directive| directive.casecmp?("noindex") || directive.casecmp?("none") }
32
+ end
33
+
9
34
  attr_reader :code
10
35
 
11
36
  def initialize
@@ -28,18 +53,15 @@ module Crawlscope
28
53
  end
29
54
 
30
55
  def header_value(page, name)
31
- page.headers.find { |key, _value| key.to_s.casecmp?(name) }&.last.to_s
56
+ self.class.header_value(page.headers, name)
32
57
  end
33
58
 
34
59
  def directives(value)
35
- value
36
- .split(",")
37
- .map { |directive| directive.split(":", 2).last.to_s.strip }
38
- .reject(&:empty?)
60
+ self.class.directives(value)
39
61
  end
40
62
 
41
63
  def noindex?(value)
42
- directives(value).any? { |directive| directive.casecmp?("noindex") || directive.casecmp?("none") }
64
+ self.class.noindex?(value)
43
65
  end
44
66
 
45
67
  def follow?(value)
@@ -21,8 +21,13 @@ module Crawlscope
21
21
  def call(urls:, pages:, issues:, context:)
22
22
  @allowed_statuses = context.fetch(:allowed_statuses)
23
23
  @base_url = context.fetch(:base_url)
24
+ @concurrency = context_value(context, :concurrency, default: 1)
25
+ @fetch_executor = context_value(context, :fetch_executor)
24
26
  @resolve_target = context.fetch(:resolve_target)
27
+ @resolve_targets = context.resolve_targets if context.respond_to?(:resolve_targets)
28
+ @resolve_targets ||= context[:resolve_targets] if context.respond_to?(:[])
25
29
  @base_host = URI.parse(@base_url).host
30
+ @resolved_targets_by_url = {}
26
31
 
27
32
  links = extract_links(pages)
28
33
  validate_url_hygiene(urls, links, issues)
@@ -42,7 +47,12 @@ module Crawlscope
42
47
  end
43
48
 
44
49
  def extract_links(pages)
45
- pages.select(&:html?).flat_map { |page| page_links(page) }
50
+ html_pages = pages.select(&:html?)
51
+ FetchExecutor.map(
52
+ name: @fetch_executor,
53
+ concurrency: @concurrency,
54
+ items: html_pages
55
+ ) { |page| page_links(page) }.flatten
46
56
  end
47
57
 
48
58
  def page_links(page)
@@ -175,9 +185,12 @@ module Crawlscope
175
185
 
176
186
  def resolve_links(links, issues)
177
187
  resolved_links = []
188
+ grouped_links_by_target = links.group_by { |link| link[:target_url] }
189
+ targets_by_url = resolve_targets(grouped_links_by_target.keys)
190
+ @resolved_targets_by_url.merge!(targets_by_url)
178
191
 
179
- links.group_by { |link| link[:target_url] }.each do |target_url, grouped_links|
180
- target = resolve_target(target_url)
192
+ grouped_links_by_target.each do |target_url, grouped_links|
193
+ target = target_for(target_url)
181
194
 
182
195
  if target.unresolved?
183
196
  report_unresolved_target(target_url, grouped_links, issues, target.resolution)
@@ -221,6 +234,23 @@ module Crawlscope
221
234
  LinkTarget.new(target_url: target_url, resolution: resolution)
222
235
  end
223
236
 
237
+ def resolve_targets(target_urls)
238
+ if @resolve_targets
239
+ @resolve_targets.call(target_urls).to_h do |target_url, resolution|
240
+ [target_url, LinkTarget.new(target_url: target_url, resolution: resolution)]
241
+ end
242
+ else
243
+ target_urls.to_h { |target_url| [target_url, resolve_target(target_url)] }
244
+ end
245
+ end
246
+
247
+ def target_for(target_url)
248
+ @resolved_targets_by_url.fetch(target_url) do
249
+ target = resolve_target(target_url)
250
+ @resolved_targets_by_url[target_url] = target
251
+ end
252
+ end
253
+
224
254
  LinkTarget = Data.define(:target_url, :resolution) do
225
255
  def allowed?(statuses)
226
256
  statuses.include?(status)
@@ -243,6 +273,11 @@ module Crawlscope
243
273
  resolution && resolution[:html]
244
274
  end
245
275
 
276
+ def noindex?
277
+ Crawlscope::Rules::Indexability.noindex_header?(resolution[:headers] || {}) ||
278
+ Crawlscope::Rules::Indexability.noindex_meta?(resolution[:doc])
279
+ end
280
+
246
281
  def status
247
282
  resolution && resolution[:status]
248
283
  end
@@ -419,8 +454,9 @@ module Crawlscope
419
454
  next if reported_urls.include?(final_url)
420
455
  next unless crawlable_path?(link[:final_path])
421
456
 
422
- target = resolve_target(final_url)
457
+ target = target_for(final_url)
423
458
  next unless target.allowed?(@allowed_statuses) && target.html?
459
+ next if target.noindex?
424
460
 
425
461
  reported_urls << final_url
426
462
 
@@ -500,16 +536,14 @@ module Crawlscope
500
536
  return if sitemap_pages.size < 2
501
537
 
502
538
  dofollow_counts_by_path = dofollow_counts_by_final_path(resolved_links)
503
-
504
- sitemap_pages.each do |page|
505
- canonical_url = canonical_url_for(page)
506
- next if canonical_url.nil?
507
-
508
- target_uri = URI.parse(canonical_url)
509
- next if target_uri.host != @base_host
510
-
511
- canonical_path = Url.path(canonical_url)
512
- if canonical_path && dofollow_counts_by_path[canonical_path].zero?
539
+ canonical_entries = canonical_entries_for(sitemap_pages)
540
+ @resolved_targets_by_url.merge!(resolve_targets(canonical_entries.map { |entry| entry.fetch(:canonical_url) }))
541
+
542
+ canonical_entries.each do |entry|
543
+ page = entry.fetch(:page)
544
+ canonical_url = entry.fetch(:canonical_url)
545
+ canonical_path = entry.fetch(:canonical_path)
546
+ if canonical_path && !root_path?(canonical_path) && dofollow_counts_by_path[canonical_path].zero?
513
547
  issues.add(
514
548
  code: :canonical_no_internal_inlinks,
515
549
  severity: :warning,
@@ -521,8 +555,24 @@ module Crawlscope
521
555
  end
522
556
 
523
557
  validate_canonical_target_status(page, canonical_url, issues)
558
+ end
559
+ end
560
+
561
+ def canonical_entries_for(pages)
562
+ pages.filter_map do |page|
563
+ canonical_url = canonical_url_for(page)
564
+ next if canonical_url.nil?
565
+
566
+ target_uri = URI.parse(canonical_url)
567
+ next if target_uri.host != @base_host
568
+
569
+ {
570
+ canonical_path: Url.path(canonical_url),
571
+ canonical_url: canonical_url,
572
+ page: page
573
+ }
524
574
  rescue URI::InvalidURIError
525
- next
575
+ nil
526
576
  end
527
577
  end
528
578
 
@@ -542,8 +592,12 @@ module Crawlscope
542
592
  Url.normalize(canonical, base_url: page.url)
543
593
  end
544
594
 
595
+ def root_path?(path)
596
+ path == "/"
597
+ end
598
+
545
599
  def validate_canonical_target_status(page, canonical_url, issues)
546
- target = resolve_target(canonical_url)
600
+ target = target_for(canonical_url)
547
601
 
548
602
  if target.unresolved? || target.ignored_error?
549
603
  return
@@ -569,6 +623,16 @@ module Crawlscope
569
623
  )
570
624
  end
571
625
  end
626
+
627
+ def context_value(context, name, default: nil)
628
+ if context.respond_to?(name)
629
+ context.public_send(name)
630
+ elsif context.respond_to?(:key?) && context.key?(name)
631
+ context[name]
632
+ else
633
+ default
634
+ end
635
+ end
572
636
  end
573
637
  end
574
638
  end
@@ -26,11 +26,10 @@ module Crawlscope
26
26
  end
27
27
 
28
28
  def call(urls:, pages:, issues:, context:)
29
- page_summaries = pages.filter_map do |page|
30
- next unless page.html?
29
+ @concurrency = context_value(context, :concurrency, default: 1)
30
+ @fetch_executor = context_value(context, :fetch_executor)
31
31
 
32
- summary_for(page)
33
- end
32
+ page_summaries = summarize_pages(pages)
34
33
 
35
34
  validate_duplicates(page_summaries, issues)
36
35
  validate_near_duplicates(page_summaries, issues)
@@ -66,6 +65,16 @@ module Crawlscope
66
65
  }
67
66
  end
68
67
 
68
+ def summarize_pages(pages)
69
+ html_pages = pages.select(&:html?)
70
+
71
+ FetchExecutor.map(
72
+ name: @fetch_executor,
73
+ concurrency: @concurrency,
74
+ items: html_pages
75
+ ) { |page| summary_for(page) }
76
+ end
77
+
69
78
  def validate_duplicates(page_summaries, issues)
70
79
  duplicates_for(page_summaries, :title).each do |value, urls|
71
80
  issues.add(
@@ -177,6 +186,16 @@ module Crawlscope
177
186
 
178
187
  intersection_size.to_f / smaller_set_size
179
188
  end
189
+
190
+ def context_value(context, name, default: nil)
191
+ if context.respond_to?(name)
192
+ context.public_send(name)
193
+ elsif context.respond_to?(:key?) && context.key?(name)
194
+ context[name]
195
+ else
196
+ default
197
+ end
198
+ end
180
199
  end
181
200
  end
182
201
  end
@@ -9,20 +9,31 @@ module Crawlscope
9
9
  class Sitemap
10
10
  SITEMAP_NAMESPACE = {"xmlns" => "http://www.sitemaps.org/schemas/sitemap/0.9"}.freeze
11
11
 
12
- def initialize(path:)
12
+ def initialize(path:, adapter: nil, concurrency: Configuration::DEFAULT_CONCURRENCY, fetch_executor: Configuration::DEFAULT_FETCH_EXECUTOR, timeout_seconds: Configuration::DEFAULT_TIMEOUT_SECONDS)
13
13
  @path = path
14
+ @adapter = adapter
15
+ @concurrency = concurrency
16
+ @fetch_executor = fetch_executor
17
+ @timeout_seconds = timeout_seconds
14
18
  end
15
19
 
16
20
  def urls(base_url:)
17
- collect_urls(@path, base_url: base_url, visited: Set.new).uniq
21
+ collect_urls(@path, base_url: base_url, visited: Set.new, visited_mutex: Mutex.new).uniq
18
22
  end
19
23
 
20
24
  private
21
25
 
22
- def collect_urls(source, base_url:, visited:)
23
- return [] if visited.include?(source)
26
+ def collect_urls(source, base_url:, visited:, visited_mutex:)
27
+ already_visited = visited_mutex.synchronize do
28
+ if visited.include?(source)
29
+ true
30
+ else
31
+ visited.add(source)
32
+ false
33
+ end
34
+ end
35
+ return [] if already_visited
24
36
 
25
- visited.add(source)
26
37
  document = Nokogiri::XML(read(source))
27
38
  root_name = document.root&.name
28
39
  unless %w[sitemapindex urlset].include?(root_name)
@@ -30,10 +41,13 @@ module Crawlscope
30
41
  end
31
42
 
32
43
  if root_name == "sitemapindex"
33
- document.xpath("//xmlns:sitemap/xmlns:loc", SITEMAP_NAMESPACE).flat_map do |node|
34
- child_source = resolve_child_source(source, node.text.to_s.strip, base_url: base_url)
35
- collect_urls(child_source, base_url: base_url, visited: visited)
44
+ child_sources = document.xpath("//xmlns:sitemap/xmlns:loc", SITEMAP_NAMESPACE).map do |node|
45
+ resolve_child_source(source, node.text.to_s.strip, base_url: base_url)
36
46
  end
47
+
48
+ fetch_executor.call(child_sources) do |child_source|
49
+ collect_urls(child_source, base_url: base_url, visited: visited, visited_mutex: visited_mutex)
50
+ end.flatten
37
51
  else
38
52
  document.xpath("//xmlns:url/xmlns:loc", SITEMAP_NAMESPACE).map do |node|
39
53
  Url.normalize_for_base(node.text.to_s.strip, base_url: base_url)
@@ -77,11 +91,16 @@ module Crawlscope
77
91
  end
78
92
 
79
93
  def connection
80
- @connection ||= Faraday.new do |faraday|
94
+ Faraday.new do |faraday|
81
95
  faraday.response :follow_redirects, limit: Http::MAX_REDIRECTS
82
- faraday.options.timeout = 20
83
- faraday.options.open_timeout = 20
96
+ faraday.options.timeout = @timeout_seconds
97
+ faraday.options.open_timeout = @timeout_seconds
98
+ faraday.adapter @adapter if @adapter
84
99
  end
85
100
  end
101
+
102
+ def fetch_executor
103
+ @fetch_executor_instance ||= FetchExecutor.build(name: @fetch_executor, concurrency: @concurrency)
104
+ end
86
105
  end
87
106
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Crawlscope
4
- VERSION = "0.5.0"
4
+ VERSION = "0.6.0"
5
5
  end
@@ -1,5 +1,5 @@
1
1
  namespace :crawlscope do
2
- desc "Validate URLs with all default Crawlscope rules. Args: [url,sitemap,rules]. ENV: URL, SITEMAP, RULES, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, CONCURRENCY"
2
+ desc "Validate URLs with all default Crawlscope rules. Args: [url,sitemap,rules]. ENV: URL, SITEMAP, RULES, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, CONCURRENCY, FETCH_EXECUTOR"
3
3
  task :validate, [:url, :sitemap, :rules] => :environment do |_task, args|
4
4
  Crawlscope::RakeTasks.validate(url: args[:url], sitemap_path: args[:sitemap], rule_names: args[:rules])
5
5
  end
@@ -4,11 +4,12 @@ require "test_helper"
4
4
 
5
5
  class CrawlscopeCliTest < Minitest::Test
6
6
  class FakeConfiguration
7
- attr_accessor :base_url, :concurrency, :network_idle_timeout_seconds, :output, :renderer, :timeout_seconds
7
+ attr_accessor :base_url, :concurrency, :fetch_executor, :network_idle_timeout_seconds, :output, :renderer, :timeout_seconds
8
8
 
9
9
  def initialize
10
10
  @base_url = nil
11
11
  @concurrency = 10
12
+ @fetch_executor = :async
12
13
  @network_idle_timeout_seconds = 5
13
14
  @renderer = :http
14
15
  @timeout_seconds = 20
@@ -95,7 +96,7 @@ class CrawlscopeCliTest < Minitest::Test
95
96
  err = StringIO.new
96
97
 
97
98
  status = Crawlscope::Cli.start(
98
- ["validate", "--url", "https://example.com", "--sitemap", "https://example.com/sitemap-pages.xml", "--rules", "metadata,links", "--renderer", "browser", "--timeout", "30", "--network-idle-timeout", "9", "--concurrency", "3"],
99
+ ["validate", "--url", "https://example.com", "--sitemap", "https://example.com/sitemap-pages.xml", "--rules", "metadata,links", "--renderer", "browser", "--timeout", "30", "--network-idle-timeout", "9", "--concurrency", "3", "--fetch-executor", "async"],
99
100
  out: out,
100
101
  err: err,
101
102
  configuration: configuration,
@@ -115,10 +116,35 @@ class CrawlscopeCliTest < Minitest::Test
115
116
  assert_equal 30, configuration.timeout_seconds
116
117
  assert_equal 9, configuration.network_idle_timeout_seconds
117
118
  assert_equal 3, configuration.concurrency
119
+ assert_equal "async", configuration.fetch_executor
118
120
  assert_same out, configuration.output
119
121
  assert_empty err.string
120
122
  end
121
123
 
124
+ def test_validate_reads_fetch_executor_from_environment
125
+ configuration = FakeConfiguration.new
126
+ task = FakeTask.new
127
+
128
+ with_env("FETCH_EXECUTOR" => "async") do
129
+ status = Crawlscope::Cli.start(["validate", "--url", "https://example.com"], out: StringIO.new, err: StringIO.new, configuration: configuration, task: task)
130
+
131
+ assert_equal 0, status
132
+ end
133
+
134
+ assert_equal "async", configuration.fetch_executor
135
+ end
136
+
137
+ def test_validate_uses_threaded_executor_for_browser_rendering_by_default
138
+ configuration = FakeConfiguration.new
139
+ task = FakeTask.new
140
+
141
+ status = Crawlscope::Cli.start(["validate", "--url", "https://example.com", "--renderer", "browser"], out: StringIO.new, err: StringIO.new, configuration: configuration, task: task)
142
+
143
+ assert_equal 0, status
144
+ assert_equal :browser, configuration.renderer
145
+ assert_equal :threaded, configuration.fetch_executor
146
+ end
147
+
122
148
  def test_ldjson_reads_urls_from_environment
123
149
  configuration = FakeConfiguration.new
124
150
  task = FakeTask.new
@@ -13,6 +13,7 @@ class CrawlscopeConfigurationTest < Minitest::Test
13
13
  config.sitemap_path = -> { "/tmp/sitemap.xml" }
14
14
  config.site_name = -> { "Example" }
15
15
  config.concurrency = -> { 4 }
16
+ config.fetch_executor = -> { :threaded }
16
17
  end
17
18
 
18
19
  audit = Crawlscope.configuration.audit
@@ -20,6 +21,7 @@ class CrawlscopeConfigurationTest < Minitest::Test
20
21
  assert_equal "https://example.com", audit.instance_variable_get(:@base_url)
21
22
  assert_equal "/tmp/sitemap.xml", audit.instance_variable_get(:@sitemap_path)
22
23
  assert_equal 4, audit.instance_variable_get(:@concurrency)
24
+ assert_equal :threaded, audit.instance_variable_get(:@fetch_executor)
23
25
  assert_equal %i[
24
26
  indexability
25
27
  metadata
@@ -55,6 +57,7 @@ class CrawlscopeConfigurationTest < Minitest::Test
55
57
 
56
58
  assert_equal [200, 301, 302], config.allowed_statuses
57
59
  assert_equal 10, config.concurrency
60
+ assert_equal :async, config.fetch_executor
58
61
  assert_equal 4, config.browser_concurrency
59
62
  assert_equal 5, config.network_idle_timeout_seconds
60
63
  assert_equal :http, config.renderer
@@ -63,10 +66,18 @@ class CrawlscopeConfigurationTest < Minitest::Test
63
66
  assert config.scroll_page?
64
67
  end
65
68
 
69
+ def test_browser_renderer_defaults_to_threaded_fetch_executor
70
+ config = Crawlscope::Configuration.new
71
+ config.renderer = :browser
72
+
73
+ assert_equal :threaded, config.fetch_executor
74
+ end
75
+
66
76
  def test_configured_values_are_normalized
67
77
  config = Crawlscope::Configuration.new
68
78
  config.allowed_statuses = ["200", "404"]
69
79
  config.concurrency = "2"
80
+ config.fetch_executor = "async"
70
81
  config.network_idle_timeout_seconds = "7"
71
82
  config.renderer = "browser"
72
83
  config.timeout_seconds = "9"
@@ -74,6 +85,7 @@ class CrawlscopeConfigurationTest < Minitest::Test
74
85
 
75
86
  assert_equal [200, 404], config.allowed_statuses
76
87
  assert_equal 2, config.concurrency
88
+ assert_equal :async, config.fetch_executor
77
89
  assert_equal 2, config.browser_concurrency
78
90
  assert_equal 7, config.network_idle_timeout_seconds
79
91
  assert_equal :browser, config.renderer
@@ -90,6 +102,15 @@ class CrawlscopeConfigurationTest < Minitest::Test
90
102
  assert_equal "Crawlscope renderer must be http or browser", error.message
91
103
  end
92
104
 
105
+ def test_fetch_executor_must_be_supported
106
+ config = Crawlscope::Configuration.new
107
+ config.fetch_executor = "processes"
108
+
109
+ error = assert_raises(Crawlscope::ConfigurationError) { config.fetch_executor }
110
+
111
+ assert_equal "Crawlscope fetch_executor must be threaded or async", error.message
112
+ end
113
+
93
114
  def test_numeric_values_must_be_positive_integers
94
115
  config = Crawlscope::Configuration.new
95
116
  config.concurrency = "0"