crawlscope 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +27 -0
- data/README.md +32 -0
- data/lib/crawlscope/cli.rb +16 -0
- data/lib/crawlscope/configuration.rb +10 -1
- data/lib/crawlscope/context.rb +1 -1
- data/lib/crawlscope/crawl.rb +72 -14
- data/lib/crawlscope/crawler.rb +3 -17
- data/lib/crawlscope/document_text.rb +7 -2
- data/lib/crawlscope/fetch_executor/async.rb +32 -0
- data/lib/crawlscope/fetch_executor/threaded.rb +32 -0
- data/lib/crawlscope/fetch_executor.rb +43 -0
- data/lib/crawlscope/http.rb +7 -1
- data/lib/crawlscope/reporter.rb +123 -14
- data/lib/crawlscope/result.rb +1 -1
- data/lib/crawlscope/rules/content_quality.rb +1 -1
- data/lib/crawlscope/rules/indexability.rb +28 -6
- data/lib/crawlscope/rules/links.rb +80 -16
- data/lib/crawlscope/rules/uniqueness.rb +23 -4
- data/lib/crawlscope/sitemap.rb +30 -11
- data/lib/crawlscope/version.rb +1 -1
- data/lib/tasks/crawlscope_tasks.rake +1 -1
- data/test/crawlscope/cli_test.rb +28 -2
- data/test/crawlscope/configuration_test.rb +21 -0
- data/test/crawlscope/content_quality_rule_test.rb +18 -0
- data/test/crawlscope/crawl_test.rb +142 -4
- data/test/crawlscope/crawler_test.rb +61 -0
- data/test/crawlscope/fetch_executor_test.rb +44 -0
- data/test/crawlscope/links_rule_test.rb +101 -0
- data/test/crawlscope/reporter_test.rb +136 -11
- data/test/crawlscope/result_test.rb +35 -0
- data/test/crawlscope/sitemap_test.rb +52 -0
- data/test/performance/async_fetch_benchmark.rb +127 -0
- data/test/performance/fetch_executor_matrix.rb +162 -0
- data/test/performance/sitemap_expansion_benchmark.rb +121 -0
- metadata +38 -2
data/lib/crawlscope/reporter.rb
CHANGED
|
@@ -1,7 +1,11 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "uri"
|
|
4
|
+
|
|
3
5
|
module Crawlscope
|
|
4
6
|
class Reporter
|
|
7
|
+
MAX_ISSUES_PER_GROUP = 20
|
|
8
|
+
|
|
5
9
|
def initialize(io:)
|
|
6
10
|
@io = io
|
|
7
11
|
end
|
|
@@ -13,36 +17,141 @@ module Crawlscope
|
|
|
13
17
|
@io.puts("URLs: #{result.urls.size}")
|
|
14
18
|
@io.puts("Pages: #{result.pages.size}")
|
|
15
19
|
|
|
16
|
-
if result.
|
|
20
|
+
if result.issues.size.zero?
|
|
17
21
|
@io.puts("Status: OK")
|
|
18
22
|
return
|
|
19
23
|
end
|
|
20
24
|
|
|
21
|
-
@io.puts("Status:
|
|
22
|
-
@io.puts("Issues: #{result.issues.size}")
|
|
25
|
+
@io.puts("Status: #{status_for(result.issues)}")
|
|
26
|
+
@io.puts("Issues: #{result.issues.size} total (#{severity_summary(result.issues)})")
|
|
23
27
|
@io.puts("")
|
|
24
28
|
|
|
25
|
-
|
|
29
|
+
report_summary(result.issues)
|
|
26
30
|
@io.puts("")
|
|
27
|
-
|
|
31
|
+
report_issue_groups(result.issues, base_url: result.base_url)
|
|
28
32
|
end
|
|
29
33
|
|
|
30
34
|
private
|
|
31
35
|
|
|
32
|
-
def
|
|
33
|
-
|
|
36
|
+
def status_for(issues)
|
|
37
|
+
grouped = issues.by_severity
|
|
38
|
+
|
|
39
|
+
if grouped.key?(:error)
|
|
40
|
+
"FAILED"
|
|
41
|
+
elsif grouped.key?(:warning)
|
|
42
|
+
"WARNINGS"
|
|
43
|
+
else
|
|
44
|
+
"NOTICES"
|
|
45
|
+
end
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
def severity_summary(issues)
|
|
49
|
+
grouped = issues.by_severity
|
|
50
|
+
return "" if grouped.empty?
|
|
51
|
+
|
|
52
|
+
grouped
|
|
53
|
+
.sort_by { |severity, severity_issues| [-severity_issues.size, severity.to_s] }
|
|
54
|
+
.map { |severity, severity_issues| "#{severity_issues.size} #{pluralize(severity, severity_issues.size)}" }
|
|
55
|
+
.join(", ")
|
|
56
|
+
end
|
|
34
57
|
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
58
|
+
def report_summary(issues)
|
|
59
|
+
@io.puts("Summary:")
|
|
60
|
+
|
|
61
|
+
issues.by_category
|
|
62
|
+
.sort_by { |category, category_issues| [-category_issues.size, category.to_s] }
|
|
63
|
+
.each do |category, category_issues|
|
|
64
|
+
@io.puts(" #{category.to_s.ljust(16)} #{category_issues.size}")
|
|
39
65
|
end
|
|
66
|
+
end
|
|
67
|
+
|
|
68
|
+
def report_issue_groups(issues, base_url:)
|
|
69
|
+
grouped = issues.to_a.group_by { |issue| [issue.category, issue.code] }
|
|
70
|
+
|
|
71
|
+
grouped
|
|
72
|
+
.sort_by { |(category, code), grouped_issues| [-grouped_issues.size, category.to_s, code.to_s] }
|
|
73
|
+
.each do |(category, code), grouped_issues|
|
|
74
|
+
@io.puts("#{category} / #{code}: #{grouped_issues.size}")
|
|
75
|
+
|
|
76
|
+
grouped_issues.first(MAX_ISSUES_PER_GROUP).each do |issue|
|
|
77
|
+
@io.puts(" - #{compact_issue(issue, base_url: base_url)}")
|
|
78
|
+
end
|
|
79
|
+
|
|
80
|
+
remaining_count = grouped_issues.size - MAX_ISSUES_PER_GROUP
|
|
81
|
+
@io.puts(" ... #{remaining_count} more") if remaining_count.positive?
|
|
82
|
+
@io.puts("")
|
|
83
|
+
end
|
|
84
|
+
end
|
|
85
|
+
|
|
86
|
+
def compact_issue(issue, base_url:)
|
|
87
|
+
parts = []
|
|
88
|
+
parts << relative_url(issue.url, base_url: base_url) if issue.url
|
|
89
|
+
|
|
90
|
+
detail = compact_detail(issue, base_url: base_url)
|
|
91
|
+
parts << detail unless detail.empty?
|
|
92
|
+
|
|
93
|
+
parts.compact.join(" ")
|
|
94
|
+
end
|
|
95
|
+
|
|
96
|
+
def compact_detail(issue, base_url:)
|
|
97
|
+
details = issue.details || {}
|
|
98
|
+
fragments = []
|
|
99
|
+
|
|
100
|
+
inbound = details[:dofollow_inbound_count] || details[:inbound_count]
|
|
101
|
+
fragments << "inbound #{inbound}/#{details[:minimum]}" if inbound && details[:minimum]
|
|
102
|
+
|
|
103
|
+
if details[:ratio] && details[:threshold]
|
|
104
|
+
fragments << "ratio #{format_number(details[:ratio])}/#{format_number(details[:threshold])}"
|
|
105
|
+
end
|
|
106
|
+
|
|
107
|
+
fragments << "count #{details[:count]}" if details[:count]
|
|
108
|
+
fragments << "length #{details[:length]}" if details[:length]
|
|
109
|
+
fragments << "status #{details[:status]}" if details[:status]
|
|
110
|
+
fragments << "final: #{relative_url(details[:final_url], base_url: base_url)}" if details[:final_url]
|
|
111
|
+
fragments << "sources: #{relative_urls(details[:source_urls], base_url: base_url).join(", ")}" if details[:source_urls]&.any?
|
|
112
|
+
fragments << "source: #{relative_url(details[:source_url], base_url: base_url)}" if details[:source_url]
|
|
113
|
+
fragments << "targets: #{relative_urls(details[:target_urls], base_url: base_url).join(", ")}" if details[:target_urls]&.any?
|
|
114
|
+
|
|
115
|
+
return issue.message if fragments.empty?
|
|
116
|
+
|
|
117
|
+
case issue.code
|
|
118
|
+
when :low_dofollow_inlinks, :low_inbound_anchor_links, :low_unique_token_ratio, :low_visible_text_ratio
|
|
119
|
+
fragments.join(" ")
|
|
120
|
+
else
|
|
121
|
+
([issue.message] + fragments).join(" ")
|
|
40
122
|
end
|
|
41
123
|
end
|
|
42
124
|
|
|
43
|
-
def
|
|
44
|
-
|
|
45
|
-
|
|
125
|
+
def relative_urls(urls, base_url:)
|
|
126
|
+
Array(urls).map { |url| relative_url(url, base_url: base_url) }
|
|
127
|
+
end
|
|
128
|
+
|
|
129
|
+
def relative_url(url, base_url:)
|
|
130
|
+
return url unless url && base_url
|
|
131
|
+
|
|
132
|
+
uri = URI.parse(url)
|
|
133
|
+
base_uri = URI.parse(base_url)
|
|
134
|
+
|
|
135
|
+
return url unless uri.host == base_uri.host && uri.scheme == base_uri.scheme && uri.port == base_uri.port
|
|
136
|
+
|
|
137
|
+
relative = uri.path.to_s.empty? ? "/" : uri.path
|
|
138
|
+
relative += "?#{uri.query}" if uri.query
|
|
139
|
+
relative += "##{uri.fragment}" if uri.fragment
|
|
140
|
+
relative
|
|
141
|
+
rescue URI::InvalidURIError
|
|
142
|
+
url
|
|
143
|
+
end
|
|
144
|
+
|
|
145
|
+
def format_number(value)
|
|
146
|
+
return format("%.3f", value) if value.is_a?(Float)
|
|
147
|
+
|
|
148
|
+
value.to_s
|
|
149
|
+
end
|
|
150
|
+
|
|
151
|
+
def pluralize(word, count)
|
|
152
|
+
return word.to_s if count == 1
|
|
153
|
+
|
|
154
|
+
"#{word}s"
|
|
46
155
|
end
|
|
47
156
|
end
|
|
48
157
|
end
|
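data/lib/crawlscope/result.rb
CHANGED
[The +1 -1 hunk for this file is not present in this extract; the hunk below belongs to the next file.]
data/lib/crawlscope/rules/content_quality.rb
CHANGED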
|
@@ -55,7 +55,7 @@ module Crawlscope
|
|
|
55
55
|
end
|
|
56
56
|
|
|
57
57
|
def validate_visible_text_ratio(page, issues)
|
|
58
|
-
html_bytes = DocumentText.
|
|
58
|
+
html_bytes = DocumentText.content_ratio_html_for(page.doc).bytesize
|
|
59
59
|
return if html_bytes.zero?
|
|
60
60
|
|
|
61
61
|
visible_text = DocumentText.text_for(page.doc)
|
|
data/lib/crawlscope/rules/indexability.rb
CHANGED
|
@@ -6,6 +6,31 @@ module Crawlscope
|
|
|
6
6
|
ROBOTS_META_SELECTOR = 'meta[name="robots"], meta[name="googlebot"]'
|
|
7
7
|
X_ROBOTS_TAG_HEADER = "x-robots-tag"
|
|
8
8
|
|
|
9
|
+
def self.noindex_header?(headers)
|
|
10
|
+
noindex?(header_value(headers, X_ROBOTS_TAG_HEADER))
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
def self.noindex_meta?(doc)
|
|
14
|
+
return false unless doc
|
|
15
|
+
|
|
16
|
+
doc.css(ROBOTS_META_SELECTOR).any? { |tag| noindex?(tag["content"].to_s) }
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
def self.header_value(headers, name)
|
|
20
|
+
headers.find { |key, _value| key.to_s.casecmp?(name) }&.last.to_s
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
def self.directives(value)
|
|
24
|
+
value
|
|
25
|
+
.split(",")
|
|
26
|
+
.map { |directive| directive.split(":", 2).last.to_s.strip }
|
|
27
|
+
.reject(&:empty?)
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
def self.noindex?(value)
|
|
31
|
+
directives(value).any? { |directive| directive.casecmp?("noindex") || directive.casecmp?("none") }
|
|
32
|
+
end
|
|
33
|
+
|
|
9
34
|
attr_reader :code
|
|
10
35
|
|
|
11
36
|
def initialize
|
|
@@ -28,18 +53,15 @@ module Crawlscope
|
|
|
28
53
|
end
|
|
29
54
|
|
|
30
55
|
def header_value(page, name)
|
|
31
|
-
page.headers
|
|
56
|
+
self.class.header_value(page.headers, name)
|
|
32
57
|
end
|
|
33
58
|
|
|
34
59
|
def directives(value)
|
|
35
|
-
value
|
|
36
|
-
.split(",")
|
|
37
|
-
.map { |directive| directive.split(":", 2).last.to_s.strip }
|
|
38
|
-
.reject(&:empty?)
|
|
60
|
+
self.class.directives(value)
|
|
39
61
|
end
|
|
40
62
|
|
|
41
63
|
def noindex?(value)
|
|
42
|
-
|
|
64
|
+
self.class.noindex?(value)
|
|
43
65
|
end
|
|
44
66
|
|
|
45
67
|
def follow?(value)
|
|
data/lib/crawlscope/rules/links.rb
CHANGED
|
@@ -21,8 +21,13 @@ module Crawlscope
|
|
|
21
21
|
def call(urls:, pages:, issues:, context:)
|
|
22
22
|
@allowed_statuses = context.fetch(:allowed_statuses)
|
|
23
23
|
@base_url = context.fetch(:base_url)
|
|
24
|
+
@concurrency = context_value(context, :concurrency, default: 1)
|
|
25
|
+
@fetch_executor = context_value(context, :fetch_executor)
|
|
24
26
|
@resolve_target = context.fetch(:resolve_target)
|
|
27
|
+
@resolve_targets = context.resolve_targets if context.respond_to?(:resolve_targets)
|
|
28
|
+
@resolve_targets ||= context[:resolve_targets] if context.respond_to?(:[])
|
|
25
29
|
@base_host = URI.parse(@base_url).host
|
|
30
|
+
@resolved_targets_by_url = {}
|
|
26
31
|
|
|
27
32
|
links = extract_links(pages)
|
|
28
33
|
validate_url_hygiene(urls, links, issues)
|
|
@@ -42,7 +47,12 @@ module Crawlscope
|
|
|
42
47
|
end
|
|
43
48
|
|
|
44
49
|
def extract_links(pages)
|
|
45
|
-
pages.select(&:html?)
|
|
50
|
+
html_pages = pages.select(&:html?)
|
|
51
|
+
FetchExecutor.map(
|
|
52
|
+
name: @fetch_executor,
|
|
53
|
+
concurrency: @concurrency,
|
|
54
|
+
items: html_pages
|
|
55
|
+
) { |page| page_links(page) }.flatten
|
|
46
56
|
end
|
|
47
57
|
|
|
48
58
|
def page_links(page)
|
|
@@ -175,9 +185,12 @@ module Crawlscope
|
|
|
175
185
|
|
|
176
186
|
def resolve_links(links, issues)
|
|
177
187
|
resolved_links = []
|
|
188
|
+
grouped_links_by_target = links.group_by { |link| link[:target_url] }
|
|
189
|
+
targets_by_url = resolve_targets(grouped_links_by_target.keys)
|
|
190
|
+
@resolved_targets_by_url.merge!(targets_by_url)
|
|
178
191
|
|
|
179
|
-
|
|
180
|
-
target =
|
|
192
|
+
grouped_links_by_target.each do |target_url, grouped_links|
|
|
193
|
+
target = target_for(target_url)
|
|
181
194
|
|
|
182
195
|
if target.unresolved?
|
|
183
196
|
report_unresolved_target(target_url, grouped_links, issues, target.resolution)
|
|
@@ -221,6 +234,23 @@ module Crawlscope
|
|
|
221
234
|
LinkTarget.new(target_url: target_url, resolution: resolution)
|
|
222
235
|
end
|
|
223
236
|
|
|
237
|
+
def resolve_targets(target_urls)
|
|
238
|
+
if @resolve_targets
|
|
239
|
+
@resolve_targets.call(target_urls).to_h do |target_url, resolution|
|
|
240
|
+
[target_url, LinkTarget.new(target_url: target_url, resolution: resolution)]
|
|
241
|
+
end
|
|
242
|
+
else
|
|
243
|
+
target_urls.to_h { |target_url| [target_url, resolve_target(target_url)] }
|
|
244
|
+
end
|
|
245
|
+
end
|
|
246
|
+
|
|
247
|
+
def target_for(target_url)
|
|
248
|
+
@resolved_targets_by_url.fetch(target_url) do
|
|
249
|
+
target = resolve_target(target_url)
|
|
250
|
+
@resolved_targets_by_url[target_url] = target
|
|
251
|
+
end
|
|
252
|
+
end
|
|
253
|
+
|
|
224
254
|
LinkTarget = Data.define(:target_url, :resolution) do
|
|
225
255
|
def allowed?(statuses)
|
|
226
256
|
statuses.include?(status)
|
|
@@ -243,6 +273,11 @@ module Crawlscope
|
|
|
243
273
|
resolution && resolution[:html]
|
|
244
274
|
end
|
|
245
275
|
|
|
276
|
+
def noindex?
|
|
277
|
+
Crawlscope::Rules::Indexability.noindex_header?(resolution[:headers] || {}) ||
|
|
278
|
+
Crawlscope::Rules::Indexability.noindex_meta?(resolution[:doc])
|
|
279
|
+
end
|
|
280
|
+
|
|
246
281
|
def status
|
|
247
282
|
resolution && resolution[:status]
|
|
248
283
|
end
|
|
@@ -419,8 +454,9 @@ module Crawlscope
|
|
|
419
454
|
next if reported_urls.include?(final_url)
|
|
420
455
|
next unless crawlable_path?(link[:final_path])
|
|
421
456
|
|
|
422
|
-
target =
|
|
457
|
+
target = target_for(final_url)
|
|
423
458
|
next unless target.allowed?(@allowed_statuses) && target.html?
|
|
459
|
+
next if target.noindex?
|
|
424
460
|
|
|
425
461
|
reported_urls << final_url
|
|
426
462
|
|
|
@@ -500,16 +536,14 @@ module Crawlscope
|
|
|
500
536
|
return if sitemap_pages.size < 2
|
|
501
537
|
|
|
502
538
|
dofollow_counts_by_path = dofollow_counts_by_final_path(resolved_links)
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
canonical_path = Url.path(canonical_url)
|
|
512
|
-
if canonical_path && dofollow_counts_by_path[canonical_path].zero?
|
|
539
|
+
canonical_entries = canonical_entries_for(sitemap_pages)
|
|
540
|
+
@resolved_targets_by_url.merge!(resolve_targets(canonical_entries.map { |entry| entry.fetch(:canonical_url) }))
|
|
541
|
+
|
|
542
|
+
canonical_entries.each do |entry|
|
|
543
|
+
page = entry.fetch(:page)
|
|
544
|
+
canonical_url = entry.fetch(:canonical_url)
|
|
545
|
+
canonical_path = entry.fetch(:canonical_path)
|
|
546
|
+
if canonical_path && !root_path?(canonical_path) && dofollow_counts_by_path[canonical_path].zero?
|
|
513
547
|
issues.add(
|
|
514
548
|
code: :canonical_no_internal_inlinks,
|
|
515
549
|
severity: :warning,
|
|
@@ -521,8 +555,24 @@ module Crawlscope
|
|
|
521
555
|
end
|
|
522
556
|
|
|
523
557
|
validate_canonical_target_status(page, canonical_url, issues)
|
|
558
|
+
end
|
|
559
|
+
end
|
|
560
|
+
|
|
561
|
+
def canonical_entries_for(pages)
|
|
562
|
+
pages.filter_map do |page|
|
|
563
|
+
canonical_url = canonical_url_for(page)
|
|
564
|
+
next if canonical_url.nil?
|
|
565
|
+
|
|
566
|
+
target_uri = URI.parse(canonical_url)
|
|
567
|
+
next if target_uri.host != @base_host
|
|
568
|
+
|
|
569
|
+
{
|
|
570
|
+
canonical_path: Url.path(canonical_url),
|
|
571
|
+
canonical_url: canonical_url,
|
|
572
|
+
page: page
|
|
573
|
+
}
|
|
524
574
|
rescue URI::InvalidURIError
|
|
525
|
-
|
|
575
|
+
nil
|
|
526
576
|
end
|
|
527
577
|
end
|
|
528
578
|
|
|
@@ -542,8 +592,12 @@ module Crawlscope
|
|
|
542
592
|
Url.normalize(canonical, base_url: page.url)
|
|
543
593
|
end
|
|
544
594
|
|
|
595
|
+
def root_path?(path)
|
|
596
|
+
path == "/"
|
|
597
|
+
end
|
|
598
|
+
|
|
545
599
|
def validate_canonical_target_status(page, canonical_url, issues)
|
|
546
|
-
target =
|
|
600
|
+
target = target_for(canonical_url)
|
|
547
601
|
|
|
548
602
|
if target.unresolved? || target.ignored_error?
|
|
549
603
|
return
|
|
@@ -569,6 +623,16 @@ module Crawlscope
|
|
|
569
623
|
)
|
|
570
624
|
end
|
|
571
625
|
end
|
|
626
|
+
|
|
627
|
+
def context_value(context, name, default: nil)
|
|
628
|
+
if context.respond_to?(name)
|
|
629
|
+
context.public_send(name)
|
|
630
|
+
elsif context.respond_to?(:key?) && context.key?(name)
|
|
631
|
+
context[name]
|
|
632
|
+
else
|
|
633
|
+
default
|
|
634
|
+
end
|
|
635
|
+
end
|
|
572
636
|
end
|
|
573
637
|
end
|
|
574
638
|
end
|
|
data/lib/crawlscope/rules/uniqueness.rb
CHANGED
|
@@ -26,11 +26,10 @@ module Crawlscope
|
|
|
26
26
|
end
|
|
27
27
|
|
|
28
28
|
def call(urls:, pages:, issues:, context:)
|
|
29
|
-
|
|
30
|
-
|
|
29
|
+
@concurrency = context_value(context, :concurrency, default: 1)
|
|
30
|
+
@fetch_executor = context_value(context, :fetch_executor)
|
|
31
31
|
|
|
32
|
-
|
|
33
|
-
end
|
|
32
|
+
page_summaries = summarize_pages(pages)
|
|
34
33
|
|
|
35
34
|
validate_duplicates(page_summaries, issues)
|
|
36
35
|
validate_near_duplicates(page_summaries, issues)
|
|
@@ -66,6 +65,16 @@ module Crawlscope
|
|
|
66
65
|
}
|
|
67
66
|
end
|
|
68
67
|
|
|
68
|
+
def summarize_pages(pages)
|
|
69
|
+
html_pages = pages.select(&:html?)
|
|
70
|
+
|
|
71
|
+
FetchExecutor.map(
|
|
72
|
+
name: @fetch_executor,
|
|
73
|
+
concurrency: @concurrency,
|
|
74
|
+
items: html_pages
|
|
75
|
+
) { |page| summary_for(page) }
|
|
76
|
+
end
|
|
77
|
+
|
|
69
78
|
def validate_duplicates(page_summaries, issues)
|
|
70
79
|
duplicates_for(page_summaries, :title).each do |value, urls|
|
|
71
80
|
issues.add(
|
|
@@ -177,6 +186,16 @@ module Crawlscope
|
|
|
177
186
|
|
|
178
187
|
intersection_size.to_f / smaller_set_size
|
|
179
188
|
end
|
|
189
|
+
|
|
190
|
+
def context_value(context, name, default: nil)
|
|
191
|
+
if context.respond_to?(name)
|
|
192
|
+
context.public_send(name)
|
|
193
|
+
elsif context.respond_to?(:key?) && context.key?(name)
|
|
194
|
+
context[name]
|
|
195
|
+
else
|
|
196
|
+
default
|
|
197
|
+
end
|
|
198
|
+
end
|
|
180
199
|
end
|
|
181
200
|
end
|
|
182
201
|
end
|
data/lib/crawlscope/sitemap.rb
CHANGED
|
@@ -9,20 +9,31 @@ module Crawlscope
|
|
|
9
9
|
class Sitemap
|
|
10
10
|
SITEMAP_NAMESPACE = {"xmlns" => "http://www.sitemaps.org/schemas/sitemap/0.9"}.freeze
|
|
11
11
|
|
|
12
|
-
def initialize(path:)
|
|
12
|
+
def initialize(path:, adapter: nil, concurrency: Configuration::DEFAULT_CONCURRENCY, fetch_executor: Configuration::DEFAULT_FETCH_EXECUTOR, timeout_seconds: Configuration::DEFAULT_TIMEOUT_SECONDS)
|
|
13
13
|
@path = path
|
|
14
|
+
@adapter = adapter
|
|
15
|
+
@concurrency = concurrency
|
|
16
|
+
@fetch_executor = fetch_executor
|
|
17
|
+
@timeout_seconds = timeout_seconds
|
|
14
18
|
end
|
|
15
19
|
|
|
16
20
|
def urls(base_url:)
|
|
17
|
-
collect_urls(@path, base_url: base_url, visited: Set.new).uniq
|
|
21
|
+
collect_urls(@path, base_url: base_url, visited: Set.new, visited_mutex: Mutex.new).uniq
|
|
18
22
|
end
|
|
19
23
|
|
|
20
24
|
private
|
|
21
25
|
|
|
22
|
-
def collect_urls(source, base_url:, visited:)
|
|
23
|
-
|
|
26
|
+
def collect_urls(source, base_url:, visited:, visited_mutex:)
|
|
27
|
+
already_visited = visited_mutex.synchronize do
|
|
28
|
+
if visited.include?(source)
|
|
29
|
+
true
|
|
30
|
+
else
|
|
31
|
+
visited.add(source)
|
|
32
|
+
false
|
|
33
|
+
end
|
|
34
|
+
end
|
|
35
|
+
return [] if already_visited
|
|
24
36
|
|
|
25
|
-
visited.add(source)
|
|
26
37
|
document = Nokogiri::XML(read(source))
|
|
27
38
|
root_name = document.root&.name
|
|
28
39
|
unless %w[sitemapindex urlset].include?(root_name)
|
|
@@ -30,10 +41,13 @@ module Crawlscope
|
|
|
30
41
|
end
|
|
31
42
|
|
|
32
43
|
if root_name == "sitemapindex"
|
|
33
|
-
document.xpath("//xmlns:sitemap/xmlns:loc", SITEMAP_NAMESPACE).
|
|
34
|
-
|
|
35
|
-
collect_urls(child_source, base_url: base_url, visited: visited)
|
|
44
|
+
child_sources = document.xpath("//xmlns:sitemap/xmlns:loc", SITEMAP_NAMESPACE).map do |node|
|
|
45
|
+
resolve_child_source(source, node.text.to_s.strip, base_url: base_url)
|
|
36
46
|
end
|
|
47
|
+
|
|
48
|
+
fetch_executor.call(child_sources) do |child_source|
|
|
49
|
+
collect_urls(child_source, base_url: base_url, visited: visited, visited_mutex: visited_mutex)
|
|
50
|
+
end.flatten
|
|
37
51
|
else
|
|
38
52
|
document.xpath("//xmlns:url/xmlns:loc", SITEMAP_NAMESPACE).map do |node|
|
|
39
53
|
Url.normalize_for_base(node.text.to_s.strip, base_url: base_url)
|
|
@@ -77,11 +91,16 @@ module Crawlscope
|
|
|
77
91
|
end
|
|
78
92
|
|
|
79
93
|
def connection
|
|
80
|
-
|
|
94
|
+
Faraday.new do |faraday|
|
|
81
95
|
faraday.response :follow_redirects, limit: Http::MAX_REDIRECTS
|
|
82
|
-
faraday.options.timeout =
|
|
83
|
-
faraday.options.open_timeout =
|
|
96
|
+
faraday.options.timeout = @timeout_seconds
|
|
97
|
+
faraday.options.open_timeout = @timeout_seconds
|
|
98
|
+
faraday.adapter @adapter if @adapter
|
|
84
99
|
end
|
|
85
100
|
end
|
|
101
|
+
|
|
102
|
+
def fetch_executor
|
|
103
|
+
@fetch_executor_instance ||= FetchExecutor.build(name: @fetch_executor, concurrency: @concurrency)
|
|
104
|
+
end
|
|
86
105
|
end
|
|
87
106
|
end
|
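data/lib/crawlscope/version.rb
CHANGED
[The +1 -1 hunk for this file is not present in this extract; the hunk below belongs to the next file.]
data/lib/tasks/crawlscope_tasks.rake
CHANGED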
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
namespace :crawlscope do
|
|
2
|
-
desc "Validate URLs with all default Crawlscope rules. Args: [url,sitemap,rules]. ENV: URL, SITEMAP, RULES, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, CONCURRENCY"
|
|
2
|
+
desc "Validate URLs with all default Crawlscope rules. Args: [url,sitemap,rules]. ENV: URL, SITEMAP, RULES, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, CONCURRENCY, FETCH_EXECUTOR"
|
|
3
3
|
task :validate, [:url, :sitemap, :rules] => :environment do |_task, args|
|
|
4
4
|
Crawlscope::RakeTasks.validate(url: args[:url], sitemap_path: args[:sitemap], rule_names: args[:rules])
|
|
5
5
|
end
|
data/test/crawlscope/cli_test.rb
CHANGED
|
@@ -4,11 +4,12 @@ require "test_helper"
|
|
|
4
4
|
|
|
5
5
|
class CrawlscopeCliTest < Minitest::Test
|
|
6
6
|
class FakeConfiguration
|
|
7
|
-
attr_accessor :base_url, :concurrency, :network_idle_timeout_seconds, :output, :renderer, :timeout_seconds
|
|
7
|
+
attr_accessor :base_url, :concurrency, :fetch_executor, :network_idle_timeout_seconds, :output, :renderer, :timeout_seconds
|
|
8
8
|
|
|
9
9
|
def initialize
|
|
10
10
|
@base_url = nil
|
|
11
11
|
@concurrency = 10
|
|
12
|
+
@fetch_executor = :async
|
|
12
13
|
@network_idle_timeout_seconds = 5
|
|
13
14
|
@renderer = :http
|
|
14
15
|
@timeout_seconds = 20
|
|
@@ -95,7 +96,7 @@ class CrawlscopeCliTest < Minitest::Test
|
|
|
95
96
|
err = StringIO.new
|
|
96
97
|
|
|
97
98
|
status = Crawlscope::Cli.start(
|
|
98
|
-
["validate", "--url", "https://example.com", "--sitemap", "https://example.com/sitemap-pages.xml", "--rules", "metadata,links", "--renderer", "browser", "--timeout", "30", "--network-idle-timeout", "9", "--concurrency", "3"],
|
|
99
|
+
["validate", "--url", "https://example.com", "--sitemap", "https://example.com/sitemap-pages.xml", "--rules", "metadata,links", "--renderer", "browser", "--timeout", "30", "--network-idle-timeout", "9", "--concurrency", "3", "--fetch-executor", "async"],
|
|
99
100
|
out: out,
|
|
100
101
|
err: err,
|
|
101
102
|
configuration: configuration,
|
|
@@ -115,10 +116,35 @@ class CrawlscopeCliTest < Minitest::Test
|
|
|
115
116
|
assert_equal 30, configuration.timeout_seconds
|
|
116
117
|
assert_equal 9, configuration.network_idle_timeout_seconds
|
|
117
118
|
assert_equal 3, configuration.concurrency
|
|
119
|
+
assert_equal "async", configuration.fetch_executor
|
|
118
120
|
assert_same out, configuration.output
|
|
119
121
|
assert_empty err.string
|
|
120
122
|
end
|
|
121
123
|
|
|
124
|
+
def test_validate_reads_fetch_executor_from_environment
|
|
125
|
+
configuration = FakeConfiguration.new
|
|
126
|
+
task = FakeTask.new
|
|
127
|
+
|
|
128
|
+
with_env("FETCH_EXECUTOR" => "async") do
|
|
129
|
+
status = Crawlscope::Cli.start(["validate", "--url", "https://example.com"], out: StringIO.new, err: StringIO.new, configuration: configuration, task: task)
|
|
130
|
+
|
|
131
|
+
assert_equal 0, status
|
|
132
|
+
end
|
|
133
|
+
|
|
134
|
+
assert_equal "async", configuration.fetch_executor
|
|
135
|
+
end
|
|
136
|
+
|
|
137
|
+
def test_validate_uses_threaded_executor_for_browser_rendering_by_default
|
|
138
|
+
configuration = FakeConfiguration.new
|
|
139
|
+
task = FakeTask.new
|
|
140
|
+
|
|
141
|
+
status = Crawlscope::Cli.start(["validate", "--url", "https://example.com", "--renderer", "browser"], out: StringIO.new, err: StringIO.new, configuration: configuration, task: task)
|
|
142
|
+
|
|
143
|
+
assert_equal 0, status
|
|
144
|
+
assert_equal :browser, configuration.renderer
|
|
145
|
+
assert_equal :threaded, configuration.fetch_executor
|
|
146
|
+
end
|
|
147
|
+
|
|
122
148
|
def test_ldjson_reads_urls_from_environment
|
|
123
149
|
configuration = FakeConfiguration.new
|
|
124
150
|
task = FakeTask.new
|
|
data/test/crawlscope/configuration_test.rb
CHANGED
|
@@ -13,6 +13,7 @@ class CrawlscopeConfigurationTest < Minitest::Test
|
|
|
13
13
|
config.sitemap_path = -> { "/tmp/sitemap.xml" }
|
|
14
14
|
config.site_name = -> { "Example" }
|
|
15
15
|
config.concurrency = -> { 4 }
|
|
16
|
+
config.fetch_executor = -> { :threaded }
|
|
16
17
|
end
|
|
17
18
|
|
|
18
19
|
audit = Crawlscope.configuration.audit
|
|
@@ -20,6 +21,7 @@ class CrawlscopeConfigurationTest < Minitest::Test
|
|
|
20
21
|
assert_equal "https://example.com", audit.instance_variable_get(:@base_url)
|
|
21
22
|
assert_equal "/tmp/sitemap.xml", audit.instance_variable_get(:@sitemap_path)
|
|
22
23
|
assert_equal 4, audit.instance_variable_get(:@concurrency)
|
|
24
|
+
assert_equal :threaded, audit.instance_variable_get(:@fetch_executor)
|
|
23
25
|
assert_equal %i[
|
|
24
26
|
indexability
|
|
25
27
|
metadata
|
|
@@ -55,6 +57,7 @@ class CrawlscopeConfigurationTest < Minitest::Test
|
|
|
55
57
|
|
|
56
58
|
assert_equal [200, 301, 302], config.allowed_statuses
|
|
57
59
|
assert_equal 10, config.concurrency
|
|
60
|
+
assert_equal :async, config.fetch_executor
|
|
58
61
|
assert_equal 4, config.browser_concurrency
|
|
59
62
|
assert_equal 5, config.network_idle_timeout_seconds
|
|
60
63
|
assert_equal :http, config.renderer
|
|
@@ -63,10 +66,18 @@ class CrawlscopeConfigurationTest < Minitest::Test
|
|
|
63
66
|
assert config.scroll_page?
|
|
64
67
|
end
|
|
65
68
|
|
|
69
|
+
def test_browser_renderer_defaults_to_threaded_fetch_executor
|
|
70
|
+
config = Crawlscope::Configuration.new
|
|
71
|
+
config.renderer = :browser
|
|
72
|
+
|
|
73
|
+
assert_equal :threaded, config.fetch_executor
|
|
74
|
+
end
|
|
75
|
+
|
|
66
76
|
def test_configured_values_are_normalized
|
|
67
77
|
config = Crawlscope::Configuration.new
|
|
68
78
|
config.allowed_statuses = ["200", "404"]
|
|
69
79
|
config.concurrency = "2"
|
|
80
|
+
config.fetch_executor = "async"
|
|
70
81
|
config.network_idle_timeout_seconds = "7"
|
|
71
82
|
config.renderer = "browser"
|
|
72
83
|
config.timeout_seconds = "9"
|
|
@@ -74,6 +85,7 @@ class CrawlscopeConfigurationTest < Minitest::Test
|
|
|
74
85
|
|
|
75
86
|
assert_equal [200, 404], config.allowed_statuses
|
|
76
87
|
assert_equal 2, config.concurrency
|
|
88
|
+
assert_equal :async, config.fetch_executor
|
|
77
89
|
assert_equal 2, config.browser_concurrency
|
|
78
90
|
assert_equal 7, config.network_idle_timeout_seconds
|
|
79
91
|
assert_equal :browser, config.renderer
|
|
@@ -90,6 +102,15 @@ class CrawlscopeConfigurationTest < Minitest::Test
|
|
|
90
102
|
assert_equal "Crawlscope renderer must be http or browser", error.message
|
|
91
103
|
end
|
|
92
104
|
|
|
105
|
+
def test_fetch_executor_must_be_supported
|
|
106
|
+
config = Crawlscope::Configuration.new
|
|
107
|
+
config.fetch_executor = "processes"
|
|
108
|
+
|
|
109
|
+
error = assert_raises(Crawlscope::ConfigurationError) { config.fetch_executor }
|
|
110
|
+
|
|
111
|
+
assert_equal "Crawlscope fetch_executor must be threaded or async", error.message
|
|
112
|
+
end
|
|
113
|
+
|
|
93
114
|
def test_numeric_values_must_be_positive_integers
|
|
94
115
|
config = Crawlscope::Configuration.new
|
|
95
116
|
config.concurrency = "0"
|