crawlscope 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +7 -8
  3. data/README.md +21 -14
  4. data/lib/crawlscope/browser.rb +8 -0
  5. data/lib/crawlscope/cli.rb +15 -10
  6. data/lib/crawlscope/configuration.rb +20 -5
  7. data/lib/crawlscope/context.rb +9 -0
  8. data/lib/crawlscope/{audit.rb → crawl.rb} +68 -58
  9. data/lib/crawlscope/crawler.rb +19 -1
  10. data/lib/crawlscope/http.rb +1 -1
  11. data/lib/crawlscope/rake_tasks.rb +28 -0
  12. data/lib/crawlscope/rules/links.rb +99 -48
  13. data/lib/crawlscope/rules/metadata.rb +57 -11
  14. data/lib/crawlscope/rules/structured_data.rb +61 -1
  15. data/lib/crawlscope/run.rb +60 -0
  16. data/lib/crawlscope/schema_registry.rb +3 -349
  17. data/lib/crawlscope/schemas.rb +406 -0
  18. data/lib/crawlscope/sitemap.rb +18 -6
  19. data/lib/crawlscope/structured_data/audit.rb +7 -7
  20. data/lib/crawlscope/structured_data/check.rb +35 -0
  21. data/lib/crawlscope/structured_data/reporter.rb +69 -0
  22. data/lib/crawlscope/url.rb +14 -0
  23. data/lib/crawlscope/version.rb +1 -1
  24. data/lib/tasks/crawlscope_tasks.rake +12 -23
  25. data/test/crawlscope/browser_test.rb +155 -0
  26. data/test/crawlscope/cli_test.rb +143 -7
  27. data/test/crawlscope/configuration_test.rb +49 -0
  28. data/test/crawlscope/{audit_test.rb → crawl_test.rb} +23 -7
  29. data/test/crawlscope/crawler_test.rb +34 -0
  30. data/test/crawlscope/http_test.rb +56 -0
  31. data/test/crawlscope/links_rule_test.rb +149 -5
  32. data/test/crawlscope/metadata_rule_test.rb +77 -0
  33. data/test/crawlscope/rule_registry_test.rb +32 -0
  34. data/test/crawlscope/{task_test.rb → run_test.rb} +28 -33
  35. data/test/crawlscope/schema_registry_test.rb +19 -0
  36. data/test/crawlscope/sitemap_test.rb +55 -0
  37. data/test/crawlscope/structured_data_document_test.rb +36 -0
  38. data/test/crawlscope/structured_data_report_test.rb +3 -3
  39. data/test/crawlscope/structured_data_reporter_test.rb +2 -2
  40. data/test/crawlscope/structured_data_rule_test.rb +111 -0
  41. data/test/crawlscope/structured_data_writer_test.rb +2 -2
  42. data/test/crawlscope/url_test.rb +31 -0
  43. metadata +15 -5
  44. data/lib/crawlscope/task.rb +0 -131
@@ -5,7 +5,7 @@ require "uri"
5
5
  module Crawlscope
6
6
  module Rules
7
7
  class Links
8
- CONTEXTUAL_LINK_SELECTORS = "main a[href], article a[href]"
8
+ LINK_SELECTORS = "a[href]"
9
9
  INTERNAL_PATH_PREFIXES_TO_SKIP = ["/rails/", "/cdn-cgi/"].freeze
10
10
  LINK_SCHEMES_TO_SKIP = ["mailto:", "tel:", "javascript:", "data:"].freeze
11
11
  MAX_SOURCES_IN_ERROR = 3
@@ -33,48 +33,53 @@ module Crawlscope
33
33
  private
34
34
 
35
35
  def contextual_links(doc)
36
- links = doc.css(CONTEXTUAL_LINK_SELECTORS)
37
- return links unless links.empty?
38
-
39
- doc.css("a[href]")
36
+ doc.css(LINK_SELECTORS)
40
37
  end
41
38
 
42
39
  def extract_links(pages)
43
- links = []
40
+ pages.select(&:html?).flat_map { |page| page_links(page) }
41
+ end
44
42
 
45
- pages.each do |page|
46
- next unless page.html?
43
+ def page_links(page)
44
+ source_path = Url.path(page.normalized_url)
45
+ return [] unless crawlable_source_path?(source_path)
47
46
 
48
- source_path = Url.path(page.normalized_url)
49
- next if source_path.nil?
47
+ contextual_links(page.doc).filter_map do |node|
48
+ link_for(page: page, source_path: source_path, node: node)
49
+ end
50
+ end
50
51
 
51
- contextual_links(page.doc).each do |node|
52
- href = node["href"].to_s.strip
53
- next if href.empty?
54
- next if href.start_with?("#")
55
- next if LINK_SCHEMES_TO_SKIP.any? { |prefix| href.start_with?(prefix) }
52
+ def link_for(page:, source_path:, node:)
53
+ href = node["href"].to_s.strip
54
+ return unless crawlable_href?(href)
56
55
 
57
- anchor_text = normalize_anchor_text(node.text)
58
- next if anchor_text.empty?
56
+ anchor_text = normalize_anchor_text(node.text)
57
+ return if anchor_text.empty?
59
58
 
60
- target_url = normalize_internal_link(page.normalized_url, href)
61
- next if target_url.nil?
59
+ target_url = normalize_internal_link(page.normalized_url, href)
60
+ return if target_url.nil?
62
61
 
63
- target_path = Url.path(target_url)
64
- next if target_path.nil?
65
- next if skip_internal_path?(target_path)
62
+ target_path = Url.path(target_url)
63
+ return unless crawlable_path?(target_path)
66
64
 
67
- links << {
68
- anchor_text: anchor_text,
69
- source_path: source_path,
70
- source_url: page.normalized_url,
71
- target_path: target_path,
72
- target_url: target_url
73
- }
74
- end
75
- end
65
+ {
66
+ anchor_text: anchor_text,
67
+ source_path: source_path,
68
+ source_url: page.normalized_url,
69
+ target_path: target_path,
70
+ target_url: target_url
71
+ }
72
+ end
73
+
74
+ def crawlable_href?(href)
75
+ return false if href.empty?
76
+ return false if href.start_with?("#")
77
+
78
+ LINK_SCHEMES_TO_SKIP.none? { |prefix| href.start_with?(prefix) }
79
+ end
76
80
 
77
- links
81
+ def crawlable_path?(path)
82
+ !path.nil? && !skip_internal_path?(path)
78
83
  end
79
84
 
80
85
  def normalize_anchor_text(text)
@@ -122,39 +127,85 @@ module Crawlscope
122
127
  resolved_links = []
123
128
 
124
129
  links.group_by { |link| link[:target_url] }.each do |target_url, grouped_links|
125
- resolution = @resolve_target.call(target_url)
126
- if resolution.nil?
127
- report_unresolved_target(target_url, grouped_links, issues, resolution)
130
+ target = resolve_target(target_url)
131
+
132
+ if target.unresolved?
133
+ report_unresolved_target(target_url, grouped_links, issues, target.resolution)
128
134
  next
129
135
  end
130
136
 
131
- status = resolution[:status]
132
-
133
- if status.nil?
134
- next if resolution[:crawled] && resolution[:error]
135
-
136
- report_unresolved_target(target_url, grouped_links, issues, resolution)
137
+ if target.ignored_error?
137
138
  next
138
139
  end
139
140
 
140
- unless @allowed_statuses.include?(status)
141
- report_broken_target(target_url, grouped_links, issues, status)
141
+ unless target.allowed?(@allowed_statuses)
142
+ report_broken_target(target_url, grouped_links, issues, target.status)
142
143
  next
143
144
  end
144
145
 
145
- final_url = resolution[:final_url].to_s.empty? ? target_url : resolution[:final_url]
146
- final_path = Url.path(final_url)
147
- next if final_path.nil?
148
- next if skip_internal_path?(final_path)
146
+ report_redirect_target(target_url, grouped_links, issues, target) if target.redirect?
147
+ next unless crawlable_path?(target.final_path)
149
148
 
150
149
  grouped_links.each do |link|
151
- resolved_links << link.merge(final_path: final_path, final_url: final_url)
150
+ resolved_links << link.merge(final_path: target.final_path, final_url: target.final_url)
152
151
  end
153
152
  end
154
153
 
155
154
  resolved_links
156
155
  end
157
156
 
157
+ def report_redirect_target(target_url, grouped_links, issues, target)
158
+ source_urls = grouped_links.map { |link| link[:source_url] }.uniq.first(MAX_SOURCES_IN_ERROR)
159
+ issues.add(
160
+ code: :internal_link_redirects,
161
+ severity: :warning,
162
+ category: :links,
163
+ url: target_url,
164
+ message: "internal link redirects to #{target.final_url} (sources: #{source_urls.join(", ")})",
165
+ details: {final_url: target.final_url, source_urls: source_urls, status: target.status}
166
+ )
167
+ end
168
+
169
+ def resolve_target(target_url)
170
+ resolution = @resolve_target.call(target_url)
171
+ LinkTarget.new(target_url: target_url, resolution: resolution)
172
+ end
173
+
174
+ LinkTarget = Data.define(:target_url, :resolution) do
175
+ def allowed?(statuses)
176
+ statuses.include?(status)
177
+ end
178
+
179
+ def final_path
180
+ Url.path(final_url)
181
+ end
182
+
183
+ def final_url
184
+ value = resolution[:final_url].to_s
185
+ value.empty? ? target_url : value
186
+ end
187
+
188
+ def ignored_error?
189
+ resolution && status.nil? && resolution[:crawled] && resolution[:error]
190
+ end
191
+
192
+ def status
193
+ resolution && resolution[:status]
194
+ end
195
+
196
+ def redirect?
197
+ (status && (300..399).cover?(status.to_i)) || final_url != target_url
198
+ end
199
+
200
+ def unresolved?
201
+ resolution.nil? || (status.nil? && !ignored_error?)
202
+ end
203
+ end
204
+
205
+ def crawlable_source_path?(path)
206
+ !path.nil? && INTERNAL_PATH_PREFIXES_TO_SKIP.none? { |prefix| path.start_with?(prefix) }
207
+ end
208
+
158
209
  def skip_internal_path?(path)
159
210
  return true if path == "/"
160
211
 
@@ -1,10 +1,14 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "uri"
4
+
3
5
  module Crawlscope
4
6
  module Rules
5
7
  class Metadata
6
8
  TITLE_MAX_LENGTH = 72
9
+ DESCRIPTION_MIN_LENGTH = 110
7
10
  DESCRIPTION_MAX_LENGTH = 160
11
+ REQUIRED_OPEN_GRAPH_PROPERTIES = %w[og:title og:description og:url og:type og:image].freeze
8
12
 
9
13
  attr_reader :code
10
14
 
@@ -21,22 +25,35 @@ module Crawlscope
21
25
  validate_title(page, issues)
22
26
  validate_description(page, issues)
23
27
  validate_canonical(page, issues)
28
+ validate_open_graph(page, issues)
24
29
  end
25
30
  end
26
31
 
27
32
  private
28
33
 
29
34
  def validate_h1(page, issues)
30
- return unless page.doc.at_css("h1").nil?
31
-
32
- issues.add(
33
- code: :missing_h1,
34
- severity: :warning,
35
- category: :metadata,
36
- url: page.url,
37
- message: "missing <h1>",
38
- details: {}
39
- )
35
+ h1s = page.doc.css("h1")
36
+ return if h1s.one?
37
+
38
+ if h1s.empty?
39
+ issues.add(
40
+ code: :missing_h1,
41
+ severity: :warning,
42
+ category: :metadata,
43
+ url: page.url,
44
+ message: "missing <h1>",
45
+ details: {}
46
+ )
47
+ else
48
+ issues.add(
49
+ code: :multiple_h1,
50
+ severity: :warning,
51
+ category: :metadata,
52
+ url: page.url,
53
+ message: "multiple <h1> tags (#{h1s.size})",
54
+ details: {count: h1s.size}
55
+ )
56
+ end
40
57
  end
41
58
 
42
59
  def validate_title(page, issues)
@@ -56,6 +73,8 @@ module Crawlscope
56
73
 
57
74
  if description.empty?
58
75
  issues.add(code: :missing_meta_description, severity: :warning, category: :metadata, url: page.url, message: "missing meta description", details: {})
76
+ elsif description.length < DESCRIPTION_MIN_LENGTH
77
+ issues.add(code: :meta_description_too_short, severity: :warning, category: :metadata, url: page.url, message: "meta description too short (#{description.length})", details: {length: description.length, minimum: DESCRIPTION_MIN_LENGTH})
59
78
  elsif description.length > DESCRIPTION_MAX_LENGTH
60
79
  issues.add(code: :meta_description_too_long, severity: :warning, category: :metadata, url: page.url, message: "meta description too long (#{description.length})", details: {length: description.length})
61
80
  end
@@ -71,7 +90,7 @@ module Crawlscope
71
90
 
72
91
  normalized_canonical = Url.normalize(canonical, base_url: page.url)
73
92
  normalized_page_url = Url.normalize(page.url, base_url: page.url)
74
- return if normalized_canonical == normalized_page_url
93
+ return if canonical_matches_page?(normalized_canonical, normalized_page_url)
75
94
 
76
95
  issues.add(
77
96
  code: :canonical_mismatch,
@@ -88,6 +107,33 @@ module Crawlscope
88
107
 
89
108
  title.split(/[^[:alnum:]]+/).count { |token| token.casecmp?(@site_name) } > 1
90
109
  end
110
+
111
+ def validate_open_graph(page, issues)
112
+ missing = REQUIRED_OPEN_GRAPH_PROPERTIES.reject do |property|
113
+ page.doc.at_css(%(meta[property="#{property}"][content]))
114
+ end
115
+ return if missing.empty?
116
+
117
+ issues.add(
118
+ code: :incomplete_open_graph_tags,
119
+ severity: :warning,
120
+ category: :metadata,
121
+ url: page.url,
122
+ message: "Open Graph tags incomplete (missing #{missing.join(", ")})",
123
+ details: {missing: missing}
124
+ )
125
+ end
126
+
127
+ def canonical_matches_page?(canonical, page_url)
128
+ canonical == page_url || (local_url?(page_url) && Url.path(canonical) == Url.path(page_url))
129
+ end
130
+
131
+ def local_url?(url)
132
+ host = URI.parse(url.to_s).host.to_s
133
+ ["localhost", "127.0.0.1", "0.0.0.0", "::1"].include?(host)
134
+ rescue URI::InvalidURIError
135
+ false
136
+ end
91
137
  end
92
138
  end
93
139
  end
@@ -3,6 +3,8 @@
3
3
  module Crawlscope
4
4
  module Rules
5
5
  class StructuredData
6
+ CAREER_DETAIL_PATH = %r{/careers/[^/]+/?\z}
7
+
6
8
  attr_reader :code
7
9
 
8
10
  def initialize
@@ -23,8 +25,21 @@ module Crawlscope
23
25
 
24
26
  def validate_page(page, issues, schema_registry)
25
27
  document = Crawlscope::StructuredData::Document.new(html: page.body)
28
+ items = document.items
29
+
30
+ if items.empty?
31
+ issues.add(
32
+ code: :missing_structured_data,
33
+ severity: :warning,
34
+ category: :structured_data,
35
+ url: page.url,
36
+ message: "no structured data found; add JSON-LD or microdata markup",
37
+ details: {expected_sources: ["json-ld", "microdata"]}
38
+ )
39
+ return
40
+ end
26
41
 
27
- document.items.each do |item|
42
+ items.each do |item|
28
43
  data = item.data
29
44
  source = item.source
30
45
 
@@ -52,6 +67,51 @@ module Crawlscope
52
67
  details: {errors: errors, source: source}
53
68
  )
54
69
  end
70
+
71
+ validate_job_posting_count(page, items, issues)
72
+ end
73
+
74
+ def validate_job_posting_count(page, items, issues)
75
+ job_postings = items.select { |item| structured_data_types(item.data).include?("JobPosting") }
76
+ return if job_postings.size == 1
77
+
78
+ if job_postings.size > 1
79
+ issues.add(
80
+ code: :multiple_job_postings,
81
+ severity: :warning,
82
+ category: :structured_data,
83
+ url: page.url,
84
+ message: "multiple JobPosting structured data blocks found",
85
+ details: {count: job_postings.size}
86
+ )
87
+ elsif career_detail_page?(page.url)
88
+ issues.add(
89
+ code: :missing_job_posting,
90
+ severity: :warning,
91
+ category: :structured_data,
92
+ url: page.url,
93
+ message: "career detail page missing JobPosting structured data",
94
+ details: {expected_type: "JobPosting"}
95
+ )
96
+ end
97
+ end
98
+
99
+ def structured_data_types(data)
100
+ return [] unless data.is_a?(Hash)
101
+
102
+ types = Array(data["@type"]).map(&:to_s)
103
+
104
+ if data["@graph"].is_a?(Array)
105
+ types.concat(data["@graph"].flat_map { |entry| structured_data_types(entry) })
106
+ end
107
+
108
+ types
109
+ end
110
+
111
+ def career_detail_page?(url)
112
+ URI(url).path.match?(CAREER_DETAIL_PATH)
113
+ rescue URI::InvalidURIError
114
+ false
55
115
  end
56
116
  end
57
117
  end
@@ -0,0 +1,60 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Crawlscope
4
+ class Run
5
+ def initialize(configuration: Crawlscope.configuration, reporter: Reporter.new(io: configuration.output))
6
+ @configuration = configuration
7
+ @reporter = reporter
8
+ end
9
+
10
+ def validate(base_url: nil, sitemap_path: nil, rule_names: nil)
11
+ resolved_base_url = base_url || default_base_url
12
+ crawl = @configuration.audit(
13
+ base_url: resolved_base_url,
14
+ sitemap_path: sitemap_path || default_sitemap_path(base_url: resolved_base_url),
15
+ rule_names: rule_names
16
+ )
17
+
18
+ result = crawl.call
19
+ @reporter.report(result)
20
+ result
21
+ end
22
+
23
+ def validate_json_ld(urls:, debug: false, renderer: @configuration.renderer, timeout_seconds: @configuration.timeout_seconds, report_path: nil, summary: false)
24
+ StructuredData::Check.new(configuration: @configuration).call(
25
+ urls: urls,
26
+ debug: debug,
27
+ renderer: renderer,
28
+ timeout_seconds: timeout_seconds,
29
+ report_path: report_path,
30
+ summary: summary
31
+ )
32
+ end
33
+
34
+ private
35
+
36
+ def default_base_url
37
+ value = @configuration.base_url
38
+ return value unless value.to_s.strip.empty?
39
+
40
+ "http://localhost:3000"
41
+ end
42
+
43
+ def default_sitemap_path(base_url:)
44
+ value = @configuration.sitemap_path
45
+ return value unless value.to_s.strip.empty?
46
+
47
+ local_path = File.expand_path("public/sitemap.xml", Dir.pwd)
48
+ return local_path if local_path_default?(base_url: base_url) && File.exist?(local_path)
49
+
50
+ "#{base_url.to_s.chomp("/")}/sitemap.xml"
51
+ end
52
+
53
+ def local_path_default?(base_url:)
54
+ host = URI.parse(base_url.to_s).host.to_s
55
+ ["localhost", "127.0.0.1"].include?(host)
56
+ rescue URI::InvalidURIError
57
+ false
58
+ end
59
+ end
60
+ end