crawlscope 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -8
- data/README.md +21 -14
- data/lib/crawlscope/browser.rb +8 -0
- data/lib/crawlscope/cli.rb +15 -10
- data/lib/crawlscope/configuration.rb +20 -5
- data/lib/crawlscope/context.rb +9 -0
- data/lib/crawlscope/{audit.rb → crawl.rb} +68 -58
- data/lib/crawlscope/crawler.rb +19 -1
- data/lib/crawlscope/http.rb +1 -1
- data/lib/crawlscope/rake_tasks.rb +28 -0
- data/lib/crawlscope/rules/links.rb +99 -48
- data/lib/crawlscope/rules/metadata.rb +57 -11
- data/lib/crawlscope/rules/structured_data.rb +61 -1
- data/lib/crawlscope/run.rb +60 -0
- data/lib/crawlscope/schema_registry.rb +3 -349
- data/lib/crawlscope/schemas.rb +406 -0
- data/lib/crawlscope/sitemap.rb +18 -6
- data/lib/crawlscope/structured_data/audit.rb +7 -7
- data/lib/crawlscope/structured_data/check.rb +35 -0
- data/lib/crawlscope/structured_data/reporter.rb +69 -0
- data/lib/crawlscope/url.rb +14 -0
- data/lib/crawlscope/version.rb +1 -1
- data/lib/tasks/crawlscope_tasks.rake +12 -23
- data/test/crawlscope/browser_test.rb +155 -0
- data/test/crawlscope/cli_test.rb +143 -7
- data/test/crawlscope/configuration_test.rb +49 -0
- data/test/crawlscope/{audit_test.rb → crawl_test.rb} +23 -7
- data/test/crawlscope/crawler_test.rb +34 -0
- data/test/crawlscope/http_test.rb +56 -0
- data/test/crawlscope/links_rule_test.rb +149 -5
- data/test/crawlscope/metadata_rule_test.rb +77 -0
- data/test/crawlscope/rule_registry_test.rb +32 -0
- data/test/crawlscope/{task_test.rb → run_test.rb} +28 -33
- data/test/crawlscope/schema_registry_test.rb +19 -0
- data/test/crawlscope/sitemap_test.rb +55 -0
- data/test/crawlscope/structured_data_document_test.rb +36 -0
- data/test/crawlscope/structured_data_report_test.rb +3 -3
- data/test/crawlscope/structured_data_reporter_test.rb +2 -2
- data/test/crawlscope/structured_data_rule_test.rb +111 -0
- data/test/crawlscope/structured_data_writer_test.rb +2 -2
- data/test/crawlscope/url_test.rb +31 -0
- metadata +15 -5
- data/lib/crawlscope/task.rb +0 -131
|
@@ -5,7 +5,7 @@ require "uri"
|
|
|
5
5
|
module Crawlscope
|
|
6
6
|
module Rules
|
|
7
7
|
class Links
|
|
8
|
-
|
|
8
|
+
LINK_SELECTORS = "a[href]"
|
|
9
9
|
INTERNAL_PATH_PREFIXES_TO_SKIP = ["/rails/", "/cdn-cgi/"].freeze
|
|
10
10
|
LINK_SCHEMES_TO_SKIP = ["mailto:", "tel:", "javascript:", "data:"].freeze
|
|
11
11
|
MAX_SOURCES_IN_ERROR = 3
|
|
@@ -33,48 +33,53 @@ module Crawlscope
|
|
|
33
33
|
private
|
|
34
34
|
|
|
35
35
|
def contextual_links(doc)
|
|
36
|
-
|
|
37
|
-
return links unless links.empty?
|
|
38
|
-
|
|
39
|
-
doc.css("a[href]")
|
|
36
|
+
doc.css(LINK_SELECTORS)
|
|
40
37
|
end
|
|
41
38
|
|
|
42
39
|
def extract_links(pages)
|
|
43
|
-
|
|
40
|
+
pages.select(&:html?).flat_map { |page| page_links(page) }
|
|
41
|
+
end
|
|
44
42
|
|
|
45
|
-
|
|
46
|
-
|
|
43
|
+
def page_links(page)
|
|
44
|
+
source_path = Url.path(page.normalized_url)
|
|
45
|
+
return [] unless crawlable_source_path?(source_path)
|
|
47
46
|
|
|
48
|
-
|
|
49
|
-
|
|
47
|
+
contextual_links(page.doc).filter_map do |node|
|
|
48
|
+
link_for(page: page, source_path: source_path, node: node)
|
|
49
|
+
end
|
|
50
|
+
end
|
|
50
51
|
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
next if href.start_with?("#")
|
|
55
|
-
next if LINK_SCHEMES_TO_SKIP.any? { |prefix| href.start_with?(prefix) }
|
|
52
|
+
def link_for(page:, source_path:, node:)
|
|
53
|
+
href = node["href"].to_s.strip
|
|
54
|
+
return unless crawlable_href?(href)
|
|
56
55
|
|
|
57
|
-
|
|
58
|
-
|
|
56
|
+
anchor_text = normalize_anchor_text(node.text)
|
|
57
|
+
return if anchor_text.empty?
|
|
59
58
|
|
|
60
|
-
|
|
61
|
-
|
|
59
|
+
target_url = normalize_internal_link(page.normalized_url, href)
|
|
60
|
+
return if target_url.nil?
|
|
62
61
|
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
next if skip_internal_path?(target_path)
|
|
62
|
+
target_path = Url.path(target_url)
|
|
63
|
+
return unless crawlable_path?(target_path)
|
|
66
64
|
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
65
|
+
{
|
|
66
|
+
anchor_text: anchor_text,
|
|
67
|
+
source_path: source_path,
|
|
68
|
+
source_url: page.normalized_url,
|
|
69
|
+
target_path: target_path,
|
|
70
|
+
target_url: target_url
|
|
71
|
+
}
|
|
72
|
+
end
|
|
73
|
+
|
|
74
|
+
# Decides whether an anchor's href is worth resolving as a link target.
#
# Blank hrefs and same-page fragments ("#...") are rejected, as is any href
# that starts with one of LINK_SCHEMES_TO_SKIP. URI schemes are
# case-insensitive, so the prefix check runs against a downcased copy:
# "MAILTO:a@b.c" is skipped exactly like "mailto:a@b.c".
#
# @param href [String] the href attribute value (link_for strips it first)
# @return [Boolean] true when the href should be normalized and checked
def crawlable_href?(href)
  return false if href.empty? || href.start_with?("#")

  !href.downcase.start_with?(*LINK_SCHEMES_TO_SKIP)
end
|
|
76
80
|
|
|
77
|
-
|
|
81
|
+
# True when a path is present and not excluded by skip_internal_path?.
#
# @param path [String, nil]
# @return [Boolean]
def crawlable_path?(path)
  return false if path.nil?

  !skip_internal_path?(path)
end
|
|
79
84
|
|
|
80
85
|
def normalize_anchor_text(text)
|
|
@@ -122,39 +127,85 @@ module Crawlscope
|
|
|
122
127
|
resolved_links = []
|
|
123
128
|
|
|
124
129
|
links.group_by { |link| link[:target_url] }.each do |target_url, grouped_links|
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
130
|
+
target = resolve_target(target_url)
|
|
131
|
+
|
|
132
|
+
if target.unresolved?
|
|
133
|
+
report_unresolved_target(target_url, grouped_links, issues, target.resolution)
|
|
128
134
|
next
|
|
129
135
|
end
|
|
130
136
|
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
if status.nil?
|
|
134
|
-
next if resolution[:crawled] && resolution[:error]
|
|
135
|
-
|
|
136
|
-
report_unresolved_target(target_url, grouped_links, issues, resolution)
|
|
137
|
+
if target.ignored_error?
|
|
137
138
|
next
|
|
138
139
|
end
|
|
139
140
|
|
|
140
|
-
unless
|
|
141
|
-
report_broken_target(target_url, grouped_links, issues, status)
|
|
141
|
+
unless target.allowed?(@allowed_statuses)
|
|
142
|
+
report_broken_target(target_url, grouped_links, issues, target.status)
|
|
142
143
|
next
|
|
143
144
|
end
|
|
144
145
|
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
next if final_path.nil?
|
|
148
|
-
next if skip_internal_path?(final_path)
|
|
146
|
+
report_redirect_target(target_url, grouped_links, issues, target) if target.redirect?
|
|
147
|
+
next unless crawlable_path?(target.final_path)
|
|
149
148
|
|
|
150
149
|
grouped_links.each do |link|
|
|
151
|
-
resolved_links << link.merge(final_path: final_path, final_url: final_url)
|
|
150
|
+
resolved_links << link.merge(final_path: target.final_path, final_url: target.final_url)
|
|
152
151
|
end
|
|
153
152
|
end
|
|
154
153
|
|
|
155
154
|
resolved_links
|
|
156
155
|
end
|
|
157
156
|
|
|
157
|
+
# Records an :internal_link_redirects warning for a link target that ends up
# at a different URL.
#
# At most MAX_SOURCES_IN_ERROR distinct source URLs are listed, both in the
# message and in the details hash.
#
# @param target_url [String] the URL the links point at
# @param grouped_links [Array<Hash>] links sharing that target; each has :source_url
# @param issues [#add] issue collector
# @param target [#final_url, #status] the resolved target
def report_redirect_target(target_url, grouped_links, issues, target)
  destination = target.final_url
  sources = grouped_links.map { |link| link[:source_url] }.uniq.take(MAX_SOURCES_IN_ERROR)
  listed_sources = sources.join(", ")

  issues.add(
    code: :internal_link_redirects,
    severity: :warning,
    category: :links,
    url: target_url,
    message: "internal link redirects to #{destination} (sources: #{listed_sources})",
    details: {final_url: destination, source_urls: sources, status: target.status}
  )
end
|
|
168
|
+
|
|
169
|
+
# Runs the injected @resolve_target callable for a URL and wraps whatever it
# returns in a LinkTarget.
#
# @param target_url [String]
# @return [LinkTarget]
def resolve_target(target_url)
  LinkTarget.new(target_url: target_url, resolution: @resolve_target.call(target_url))
end
|
|
173
|
+
|
|
174
|
+
# Immutable view over the outcome of resolving one link target.
#
# `resolution` is whatever the resolver returned: nil, or a hash read through
# the keys :status, :final_url, :crawled and :error. Every reader tolerates a
# nil resolution, so the predicates can be called in any order.
LinkTarget = Data.define(:target_url, :resolution) do
  # @param statuses [#include?] acceptable HTTP statuses
  # @return [Boolean] whether the resolved status is one of them
  def allowed?(statuses)
    statuses.include?(status)
  end

  # Path component of #final_url, as computed by Url.path.
  def final_path
    Url.path(final_url)
  end

  # The URL the target ended up at; falls back to target_url when the
  # resolution is missing or carries no (or a blank) :final_url.
  def final_url
    value = (resolution && resolution[:final_url]).to_s
    value.empty? ? target_url : value
  end

  # True for a crawled page that produced an error and no status; the caller
  # skips these without reporting a link issue.
  #
  # @return [Boolean]
  def ignored_error?
    return false if resolution.nil?

    status.nil? && !!resolution[:crawled] && !!resolution[:error]
  end

  # @return [Object, nil] the resolution's :status, or nil without a resolution
  def status
    resolution && resolution[:status]
  end

  # True when the status is 3xx or the final URL differs from the link URL.
  #
  # @return [Boolean]
  def redirect?
    return true if status && (300..399).cover?(status.to_i)

    final_url != target_url
  end

  # True when there is no resolution at all, or no status and the failure is
  # not an ignorable crawl error.
  #
  # @return [Boolean]
  def unresolved?
    resolution.nil? || (status.nil? && !ignored_error?)
  end
end
|
|
204
|
+
|
|
205
|
+
# True when a source page's path is present and does not start with any of
# INTERNAL_PATH_PREFIXES_TO_SKIP.
#
# @param path [String, nil]
# @return [Boolean]
def crawlable_source_path?(path)
  return false if path.nil?

  !path.start_with?(*INTERNAL_PATH_PREFIXES_TO_SKIP)
end
|
|
208
|
+
|
|
158
209
|
def skip_internal_path?(path)
|
|
159
210
|
return true if path == "/"
|
|
160
211
|
|
|
@@ -1,10 +1,14 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "uri"
|
|
4
|
+
|
|
3
5
|
module Crawlscope
|
|
4
6
|
module Rules
|
|
5
7
|
class Metadata
|
|
6
8
|
TITLE_MAX_LENGTH = 72
|
|
9
|
+
DESCRIPTION_MIN_LENGTH = 110
|
|
7
10
|
DESCRIPTION_MAX_LENGTH = 160
|
|
11
|
+
REQUIRED_OPEN_GRAPH_PROPERTIES = %w[og:title og:description og:url og:type og:image].freeze
|
|
8
12
|
|
|
9
13
|
attr_reader :code
|
|
10
14
|
|
|
@@ -21,22 +25,35 @@ module Crawlscope
|
|
|
21
25
|
validate_title(page, issues)
|
|
22
26
|
validate_description(page, issues)
|
|
23
27
|
validate_canonical(page, issues)
|
|
28
|
+
validate_open_graph(page, issues)
|
|
24
29
|
end
|
|
25
30
|
end
|
|
26
31
|
|
|
27
32
|
private
|
|
28
33
|
|
|
29
34
|
def validate_h1(page, issues)
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
35
|
+
h1s = page.doc.css("h1")
|
|
36
|
+
return if h1s.one?
|
|
37
|
+
|
|
38
|
+
if h1s.empty?
|
|
39
|
+
issues.add(
|
|
40
|
+
code: :missing_h1,
|
|
41
|
+
severity: :warning,
|
|
42
|
+
category: :metadata,
|
|
43
|
+
url: page.url,
|
|
44
|
+
message: "missing <h1>",
|
|
45
|
+
details: {}
|
|
46
|
+
)
|
|
47
|
+
else
|
|
48
|
+
issues.add(
|
|
49
|
+
code: :multiple_h1,
|
|
50
|
+
severity: :warning,
|
|
51
|
+
category: :metadata,
|
|
52
|
+
url: page.url,
|
|
53
|
+
message: "multiple <h1> tags (#{h1s.size})",
|
|
54
|
+
details: {count: h1s.size}
|
|
55
|
+
)
|
|
56
|
+
end
|
|
40
57
|
end
|
|
41
58
|
|
|
42
59
|
def validate_title(page, issues)
|
|
@@ -56,6 +73,8 @@ module Crawlscope
|
|
|
56
73
|
|
|
57
74
|
if description.empty?
|
|
58
75
|
issues.add(code: :missing_meta_description, severity: :warning, category: :metadata, url: page.url, message: "missing meta description", details: {})
|
|
76
|
+
elsif description.length < DESCRIPTION_MIN_LENGTH
|
|
77
|
+
issues.add(code: :meta_description_too_short, severity: :warning, category: :metadata, url: page.url, message: "meta description too short (#{description.length})", details: {length: description.length, minimum: DESCRIPTION_MIN_LENGTH})
|
|
59
78
|
elsif description.length > DESCRIPTION_MAX_LENGTH
|
|
60
79
|
issues.add(code: :meta_description_too_long, severity: :warning, category: :metadata, url: page.url, message: "meta description too long (#{description.length})", details: {length: description.length})
|
|
61
80
|
end
|
|
@@ -71,7 +90,7 @@ module Crawlscope
|
|
|
71
90
|
|
|
72
91
|
normalized_canonical = Url.normalize(canonical, base_url: page.url)
|
|
73
92
|
normalized_page_url = Url.normalize(page.url, base_url: page.url)
|
|
74
|
-
return if normalized_canonical
|
|
93
|
+
return if canonical_matches_page?(normalized_canonical, normalized_page_url)
|
|
75
94
|
|
|
76
95
|
issues.add(
|
|
77
96
|
code: :canonical_mismatch,
|
|
@@ -88,6 +107,33 @@ module Crawlscope
|
|
|
88
107
|
|
|
89
108
|
title.split(/[^[:alnum:]]+/).count { |token| token.casecmp?(@site_name) } > 1
|
|
90
109
|
end
|
|
110
|
+
|
|
111
|
+
# Warns when any of REQUIRED_OPEN_GRAPH_PROPERTIES is absent from the page.
#
# A property counts as present only when at least one matching <meta> tag has
# non-blank content; `content=""` is treated as missing, in line with how a
# blank meta description is reported as missing elsewhere in this rule.
#
# @param page [#doc, #url] page whose doc answers CSS queries
# @param issues [#add] issue collector
# @return [void]
def validate_open_graph(page, issues)
  missing = REQUIRED_OPEN_GRAPH_PROPERTIES.reject do |property|
    page.doc.css(%(meta[property="#{property}"][content])).any? do |tag|
      !tag["content"].to_s.strip.empty?
    end
  end
  return if missing.empty?

  issues.add(
    code: :incomplete_open_graph_tags,
    severity: :warning,
    category: :metadata,
    url: page.url,
    message: "Open Graph tags incomplete (missing #{missing.join(", ")})",
    details: {missing: missing}
  )
end
|
|
126
|
+
|
|
127
|
+
# A canonical matches when it equals the page URL outright, or, for pages
# served from a local host, when only the paths need to agree.
#
# @param canonical [String, nil] normalized canonical URL
# @param page_url [String, nil] normalized page URL
# @return [Boolean]
def canonical_matches_page?(canonical, page_url)
  return true if canonical == page_url
  return false unless local_url?(page_url)

  Url.path(canonical) == Url.path(page_url)
end
|
|
130
|
+
|
|
131
|
+
# Whether a URL points at a loopback/any-address development host.
#
# Uses URI#hostname rather than URI#host: for IPv6 literals `host` keeps the
# surrounding brackets ("[::1]"), so the "::1" entry could never match.
#
# @param url [#to_s] URL to inspect; nil and blank values are not local
# @return [Boolean] false for non-local or unparseable URLs
def local_url?(url)
  hostname = URI.parse(url.to_s).hostname.to_s
  %w[localhost 127.0.0.1 0.0.0.0 ::1].include?(hostname)
rescue URI::InvalidURIError
  false
end
|
|
91
137
|
end
|
|
92
138
|
end
|
|
93
139
|
end
|
|
@@ -3,6 +3,8 @@
|
|
|
3
3
|
module Crawlscope
|
|
4
4
|
module Rules
|
|
5
5
|
class StructuredData
|
|
6
|
+
CAREER_DETAIL_PATH = %r{/careers/[^/]+/?\z}
|
|
7
|
+
|
|
6
8
|
attr_reader :code
|
|
7
9
|
|
|
8
10
|
def initialize
|
|
@@ -23,8 +25,21 @@ module Crawlscope
|
|
|
23
25
|
|
|
24
26
|
def validate_page(page, issues, schema_registry)
|
|
25
27
|
document = Crawlscope::StructuredData::Document.new(html: page.body)
|
|
28
|
+
items = document.items
|
|
29
|
+
|
|
30
|
+
if items.empty?
|
|
31
|
+
issues.add(
|
|
32
|
+
code: :missing_structured_data,
|
|
33
|
+
severity: :warning,
|
|
34
|
+
category: :structured_data,
|
|
35
|
+
url: page.url,
|
|
36
|
+
message: "no structured data found; add JSON-LD or microdata markup",
|
|
37
|
+
details: {expected_sources: ["json-ld", "microdata"]}
|
|
38
|
+
)
|
|
39
|
+
return
|
|
40
|
+
end
|
|
26
41
|
|
|
27
|
-
|
|
42
|
+
items.each do |item|
|
|
28
43
|
data = item.data
|
|
29
44
|
source = item.source
|
|
30
45
|
|
|
@@ -52,6 +67,51 @@ module Crawlscope
|
|
|
52
67
|
details: {errors: errors, source: source}
|
|
53
68
|
)
|
|
54
69
|
end
|
|
70
|
+
|
|
71
|
+
validate_job_posting_count(page, items, issues)
|
|
72
|
+
end
|
|
73
|
+
|
|
74
|
+
# Checks how many structured-data items on the page declare a JobPosting.
#
# Exactly one is fine. More than one yields :multiple_job_postings. None
# yields :missing_job_posting, but only when the page URL is a career detail
# page.
#
# @param page [#url]
# @param items [Array<#data>] structured-data items extracted from the page
# @param issues [#add] issue collector
def validate_job_posting_count(page, items, issues)
  posting_count = items.count { |item| structured_data_types(item.data).include?("JobPosting") }

  if posting_count > 1
    issues.add(
      code: :multiple_job_postings,
      severity: :warning,
      category: :structured_data,
      url: page.url,
      message: "multiple JobPosting structured data blocks found",
      details: {count: posting_count}
    )
  elsif posting_count.zero? && career_detail_page?(page.url)
    issues.add(
      code: :missing_job_posting,
      severity: :warning,
      category: :structured_data,
      url: page.url,
      message: "career detail page missing JobPosting structured data",
      details: {expected_type: "JobPosting"}
    )
  end
end
|
|
98
|
+
|
|
99
|
+
# Collects every "@type" declared by a structured-data node, including the
# nodes nested under "@graph".
#
# "@graph" may hold an array of nodes or a single node object; both are
# walked recursively. Anything that is not a Hash contributes no types.
#
# @param data [Object] parsed structured-data value
# @return [Array<String>] type names, own types first, then graph types
def structured_data_types(data)
  return [] unless data.is_a?(Hash)

  graph = data["@graph"]
  graph = [graph] if graph.is_a?(Hash) # single-node graph
  nested_types = graph.is_a?(Array) ? graph.flat_map { |entry| structured_data_types(entry) } : []

  Array(data["@type"]).map(&:to_s) + nested_types
end
|
|
110
|
+
|
|
111
|
+
# Whether the URL's path matches CAREER_DETAIL_PATH.
#
# The url is stringified before parsing and the path before matching, so a
# nil url or an opaque URI with no path (e.g. "mailto:...") returns false
# instead of raising.
#
# @param url [#to_s]
# @return [Boolean] false for non-matching or unparseable URLs
def career_detail_page?(url)
  URI.parse(url.to_s).path.to_s.match?(CAREER_DETAIL_PATH)
rescue URI::InvalidURIError
  false
end
|
|
56
116
|
end
|
|
57
117
|
end
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# frozen_string_literal: true

require "uri"

module Crawlscope
  # Runs validations against a configuration: builds the crawl, executes it
  # and hands the result to the reporter.
  class Run
    # Base URL used when neither the caller nor the configuration supplies one.
    DEFAULT_BASE_URL = "http://localhost:3000"
    # Hosts for which a sitemap on disk (public/sitemap.xml) is preferred.
    LOCAL_SITEMAP_HOSTS = %w[localhost 127.0.0.1].freeze

    # @param configuration [Object] defaults to Crawlscope.configuration
    # @param reporter [#report] defaults to a Reporter writing to configuration.output
    def initialize(configuration: Crawlscope.configuration, reporter: Reporter.new(io: configuration.output))
      @configuration = configuration
      @reporter = reporter
    end

    # Builds a crawl from the configuration, runs it and reports the result.
    #
    # Blank (nil, empty or whitespace-only) base_url / sitemap_path arguments
    # are treated as "not given" and fall back to the configured or default
    # values, the same way blank configuration values already do.
    #
    # @param base_url [String, nil]
    # @param sitemap_path [String, nil]
    # @param rule_names [Object, nil] passed through to the configuration unchanged
    # @return [Object] whatever the crawl's #call returns
    def validate(base_url: nil, sitemap_path: nil, rule_names: nil)
      resolved_base_url = presence(base_url) || default_base_url
      crawl = @configuration.audit(
        base_url: resolved_base_url,
        sitemap_path: presence(sitemap_path) || default_sitemap_path(base_url: resolved_base_url),
        rule_names: rule_names
      )

      result = crawl.call
      @reporter.report(result)
      result
    end

    # Delegates a JSON-LD check of the given URLs to StructuredData::Check.
    # renderer and timeout_seconds default to the configuration's values.
    def validate_json_ld(urls:, debug: false, renderer: @configuration.renderer, timeout_seconds: @configuration.timeout_seconds, report_path: nil, summary: false)
      StructuredData::Check.new(configuration: @configuration).call(
        urls: urls,
        debug: debug,
        renderer: renderer,
        timeout_seconds: timeout_seconds,
        report_path: report_path,
        summary: summary
      )
    end

    private

    # Returns value unless it is nil or blank once stringified and stripped.
    def presence(value)
      value unless value.to_s.strip.empty?
    end

    # Configured base URL, or DEFAULT_BASE_URL when that is blank.
    def default_base_url
      presence(@configuration.base_url) || DEFAULT_BASE_URL
    end

    # Configured sitemap path if present. Otherwise, for a local base URL with
    # an existing public/sitemap.xml under the working directory, that file;
    # failing that, "<base_url>/sitemap.xml".
    def default_sitemap_path(base_url:)
      configured = presence(@configuration.sitemap_path)
      return configured if configured

      local_path = File.expand_path("public/sitemap.xml", Dir.pwd)
      return local_path if local_path_default?(base_url: base_url) && File.exist?(local_path)

      "#{base_url.to_s.chomp("/")}/sitemap.xml"
    end

    # Whether base_url's host is one of LOCAL_SITEMAP_HOSTS; unparseable URLs
    # are not local.
    def local_path_default?(base_url:)
      LOCAL_SITEMAP_HOSTS.include?(URI.parse(base_url.to_s).host.to_s)
    rescue URI::InvalidURIError
      false
    end
  end
end
|