crawlscope 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +31 -0
  3. data/LICENSE.txt +21 -0
  4. data/README.md +323 -0
  5. data/exe/crawlscope +6 -0
  6. data/lib/crawlscope/audit.rb +128 -0
  7. data/lib/crawlscope/browser.rb +88 -0
  8. data/lib/crawlscope/cli.rb +245 -0
  9. data/lib/crawlscope/configuration.rb +123 -0
  10. data/lib/crawlscope/crawler.rb +28 -0
  11. data/lib/crawlscope/http.rb +77 -0
  12. data/lib/crawlscope/issue.rb +17 -0
  13. data/lib/crawlscope/issue_collection.rb +41 -0
  14. data/lib/crawlscope/page.rb +23 -0
  15. data/lib/crawlscope/railtie.rb +9 -0
  16. data/lib/crawlscope/reporter.rb +33 -0
  17. data/lib/crawlscope/result.rb +9 -0
  18. data/lib/crawlscope/rule_registry.rb +39 -0
  19. data/lib/crawlscope/rules/links.rb +220 -0
  20. data/lib/crawlscope/rules/metadata.rb +93 -0
  21. data/lib/crawlscope/rules/structured_data.rb +58 -0
  22. data/lib/crawlscope/rules/uniqueness.rb +88 -0
  23. data/lib/crawlscope/schema_registry.rb +431 -0
  24. data/lib/crawlscope/sitemap.rb +67 -0
  25. data/lib/crawlscope/structured_data/audit.rb +150 -0
  26. data/lib/crawlscope/structured_data/document.rb +93 -0
  27. data/lib/crawlscope/structured_data/report.rb +77 -0
  28. data/lib/crawlscope/structured_data/reporter.rb +73 -0
  29. data/lib/crawlscope/structured_data/writer.rb +26 -0
  30. data/lib/crawlscope/task.rb +131 -0
  31. data/lib/crawlscope/url.rb +43 -0
  32. data/lib/crawlscope/version.rb +5 -0
  33. data/lib/crawlscope.rb +34 -0
  34. data/lib/tasks/crawlscope_tasks.rake +44 -0
  35. data/test/crawlscope/audit_test.rb +165 -0
  36. data/test/crawlscope/cli_test.rb +157 -0
  37. data/test/crawlscope/configuration_test.rb +45 -0
  38. data/test/crawlscope/links_rule_test.rb +87 -0
  39. data/test/crawlscope/loader_test.rb +11 -0
  40. data/test/crawlscope/reporter_test.rb +50 -0
  41. data/test/crawlscope/schema_registry_test.rb +89 -0
  42. data/test/crawlscope/sitemap_test.rb +51 -0
  43. data/test/crawlscope/structured_data_audit_test.rb +118 -0
  44. data/test/crawlscope/structured_data_document_test.rb +28 -0
  45. data/test/crawlscope/structured_data_report_test.rb +37 -0
  46. data/test/crawlscope/structured_data_reporter_test.rb +32 -0
  47. data/test/crawlscope/structured_data_rule_test.rb +78 -0
  48. data/test/crawlscope/structured_data_writer_test.rb +32 -0
  49. data/test/crawlscope/task_test.rb +206 -0
  50. data/test/crawlscope/uniqueness_rule_test.rb +46 -0
  51. data/test/test_helper.rb +23 -0
  52. metadata +271 -0
@@ -0,0 +1,220 @@
# frozen_string_literal: true

require "set"
require "uri"

module Crawlscope
  module Rules
    # Link rule. Collects anchor links from crawled HTML pages, asks the
    # injected resolver for the status of every same-host target, reports
    # targets that are broken or cannot be validated, and finally reports
    # sitemap pages that receive fewer than MIN_INBOUND_ANCHOR_LINKS links.
    class Links
      # Links inside <main>/<article> are preferred; see #contextual_links.
      CONTEXTUAL_LINK_SELECTORS = "main a[href], article a[href]"
      # Target paths starting with one of these prefixes are ignored.
      INTERNAL_PATH_PREFIXES_TO_SKIP = ["/rails/", "/cdn-cgi/"].freeze
      # Hrefs starting with one of these schemes are ignored.
      LINK_SCHEMES_TO_SKIP = ["mailto:", "tel:", "javascript:", "data:"].freeze
      # Upper bound on the number of source URLs quoted in an issue.
      MAX_SOURCES_IN_ERROR = 3
      # Minimum number of inbound anchor links a sitemap page should have.
      MIN_INBOUND_ANCHOR_LINKS = 1

      attr_reader :code

      def initialize
        @code = :links
      end

      # Runs the rule over a crawl.
      #
      # @param urls [Enumerable<String>] URLs whose pages are expected to
      #   receive inbound links (the code treats them as sitemap entries)
      # @param pages [Enumerable] objects responding to +html?+, +doc+,
      #   +normalized_url+ and +normalized_final_url+
      # @param issues [#add] collector that receives keyword-argument issues
      # @param context [Hash] must provide +:allowed_statuses+, +:base_url+
      #   and +:resolve_target+ (a callable taking a target URL)
      # @raise [KeyError] when a required context key is missing
      def call(urls:, pages:, issues:, context:)
        @allowed_statuses = context.fetch(:allowed_statuses)
        @base_url = context.fetch(:base_url)
        @resolve_target = context.fetch(:resolve_target)
        @base_host = URI.parse(@base_url).host

        links = extract_links(pages)
        # With no links at all, the inbound-count check is skipped as well.
        return if links.empty?

        resolved_links = resolve_links(links, issues)
        validate_inbound_counts(urls, pages, resolved_links, issues)
      end

      private

      # Returns anchors inside <main>/<article>, or every anchor in the
      # document when none are found there.
      def contextual_links(doc)
        links = doc.css(CONTEXTUAL_LINK_SELECTORS)
        return links unless links.empty?

        doc.css("a[href]")
      end

      # Builds one link hash per usable anchor on every HTML page.
      def extract_links(pages)
        pages.each_with_object([]) do |page, links|
          next unless page.html?

          source_path = Url.path(page.normalized_url)
          next if source_path.nil?

          contextual_links(page.doc).each do |node|
            link = build_link(page, source_path, node)
            links << link if link
          end
        end
      end

      # Turns a single anchor node into a link hash, or nil when the anchor is
      # empty, fragment-only, uses a skipped scheme, has no visible text,
      # points off-host, or targets a skipped path.
      def build_link(page, source_path, node)
        href = node["href"].to_s.strip
        return if href.empty? || href.start_with?("#")
        return if href.start_with?(*LINK_SCHEMES_TO_SKIP)

        anchor_text = normalize_anchor_text(node.text)
        return if anchor_text.empty?

        target_url = normalize_internal_link(page.normalized_url, href)
        return if target_url.nil?

        target_path = Url.path(target_url)
        return if target_path.nil? || skip_internal_path?(target_path)

        {
          anchor_text: anchor_text,
          source_path: source_path,
          source_url: page.normalized_url,
          target_path: target_path,
          target_url: target_url
        }
      end

      # Collapses runs of whitespace and trims the result.
      def normalize_anchor_text(text)
        text.to_s.gsub(/\s+/, " ").strip
      end

      # Resolves +href+ against +source_url+ and returns the normalized URL
      # without its fragment, or nil when the link leaves the base host or
      # cannot be resolved.
      def normalize_internal_link(source_url, href)
        uri = URI.join(source_url, href)
        return if uri.host != @base_host

        uri.fragment = nil
        Url.normalize(uri.to_s, base_url: @base_url)
      rescue URI::Error
        # URI.join raises InvalidURIError for malformed input and BadURIError
        # when the source URL is relative; either way the link is unusable and
        # must not abort the rest of the audit.
        nil
      end

      # First few distinct source URLs of a group of links.
      def sample_source_urls(grouped_links)
        grouped_links.map { |link| link[:source_url] }.uniq.first(MAX_SOURCES_IN_ERROR)
      end

      # Reports a target whose status is not in the allowed list.
      def report_broken_target(target_url, grouped_links, issues, status)
        source_urls = sample_source_urls(grouped_links)
        issues.add(
          code: :broken_internal_link,
          severity: :warning,
          category: :links,
          url: target_url,
          message: "broken internal link (HTTP #{status}, sources: #{source_urls.join(", ")})",
          details: {source_urls: source_urls, status: status}
        )
      end

      # Reports a target for which no status could be obtained.
      # +resolution+ may be nil.
      def report_unresolved_target(target_url, grouped_links, issues, resolution)
        source_urls = sample_source_urls(grouped_links)
        error = resolution && resolution[:error]
        suffix = error ? " (#{error})" : ""

        issues.add(
          code: :unresolved_internal_link,
          severity: :warning,
          category: :links,
          url: target_url,
          message: "unable to validate internal link#{suffix} (sources: #{source_urls.join(", ")})",
          details: {error: error, source_urls: source_urls}
        )
      end

      # Resolves each distinct target once and returns the links whose target
      # answered with an allowed status, enriched with +:final_url+ and
      # +:final_path+. Broken and unresolved targets are reported to +issues+.
      def resolve_links(links, issues)
        resolved_links = []

        links.group_by { |link| link[:target_url] }.each do |target_url, grouped_links|
          resolution = @resolve_target.call(target_url)
          if resolution.nil?
            report_unresolved_target(target_url, grouped_links, issues, resolution)
            next
          end

          status = resolution[:status]

          if status.nil?
            # NOTE(review): a crawled target that carries an error is skipped
            # silently - presumably the crawl already reported it; confirm
            # against the resolver.
            next if resolution[:crawled] && resolution[:error]

            report_unresolved_target(target_url, grouped_links, issues, resolution)
            next
          end

          unless @allowed_statuses.include?(status)
            report_broken_target(target_url, grouped_links, issues, status)
            next
          end

          final_url = resolution[:final_url].to_s.empty? ? target_url : resolution[:final_url]
          final_path = Url.path(final_url)
          next if final_path.nil? || skip_internal_path?(final_path)

          grouped_links.each do |link|
            resolved_links << link.merge(final_path: final_path, final_url: final_url)
          end
        end

        resolved_links
      end

      # True for the root path and for paths under a skipped prefix.
      def skip_internal_path?(path)
        path == "/" || path.start_with?(*INTERNAL_PATH_PREFIXES_TO_SKIP)
      end

      # Maps each checkable sitemap path to its normalized URL.
      def sitemap_paths_for(urls)
        urls.each_with_object({}) do |url, memo|
          normalized_url = Url.normalize(url, base_url: @base_url)
          path = Url.path(normalized_url)
          next if path.nil? || skip_internal_path?(path)

          memo[path] = normalized_url
        end
      end

      # Set of paths (requested and final) of every crawled HTML page.
      def html_paths_for(pages)
        pages.each_with_object(Set.new) do |page, result|
          next unless page.html?

          [page.normalized_url, page.normalized_final_url].compact.each do |url|
            path = Url.path(url)
            next if path.nil? || skip_internal_path?(path)

            result << path
          end
        end
      end

      # Reports every sitemap path that was crawled as HTML but receives fewer
      # than MIN_INBOUND_ANCHOR_LINKS links from other pages.
      def validate_inbound_counts(urls, pages, resolved_links, issues)
        sitemap_paths = sitemap_paths_for(urls)
        html_paths = html_paths_for(pages)

        inbound_anchor_counts = Hash.new(0)
        sample_sources_by_target = Hash.new { |hash, key| hash[key] = [] }

        resolved_links.each do |link|
          target_path = link[:final_path]
          next unless sitemap_paths.key?(target_path)
          # Links from a page to itself do not count as inbound.
          next if link[:source_path] == target_path

          inbound_anchor_counts[target_path] += 1
          source_samples = sample_sources_by_target[target_path]
          source_samples << link[:source_url] unless source_samples.include?(link[:source_url])
        end

        sitemap_paths.each do |path, target_url|
          next unless html_paths.include?(path)

          inbound_count = inbound_anchor_counts[path]
          next if inbound_count >= MIN_INBOUND_ANCHOR_LINKS

          source_samples = sample_sources_by_target[path].first(MAX_SOURCES_IN_ERROR)
          source_info = source_samples.any? ? " (sources: #{source_samples.join(", ")})" : ""

          issues.add(
            code: :low_inbound_anchor_links,
            severity: :warning,
            category: :links,
            url: target_url,
            message: "inbound anchor links #{inbound_count} below #{MIN_INBOUND_ANCHOR_LINKS}#{source_info}",
            details: {inbound_count: inbound_count, minimum: MIN_INBOUND_ANCHOR_LINKS, source_urls: source_samples}
          )
        end
      end
    end
  end
end
@@ -0,0 +1,93 @@
# frozen_string_literal: true

module Crawlscope
  module Rules
    # Metadata rule. For every HTML page it checks that an <h1>, a <title>, a
    # meta description and a canonical link are present, that title and
    # description stay within their length limits, that the title does not
    # repeat the site name, and that the canonical URL matches the page URL.
    class Metadata
      TITLE_MAX_LENGTH = 72
      DESCRIPTION_MAX_LENGTH = 160

      attr_reader :code

      # @param site_name [String, nil] name used by the "title repeats site
      #   name" check; the check is disabled when blank
      def initialize(site_name: nil)
        @site_name = site_name.to_s.strip
        @site_name_pattern = build_site_name_pattern(@site_name)
        @code = :metadata
      end

      # Runs every metadata check against each HTML page.
      #
      # @param urls [Object] unused by this rule
      # @param pages [Enumerable] objects responding to +html?+, +doc+, +url+
      # @param issues [#add] collector that receives keyword-argument issues
      # @param context [Object] unused by this rule
      def call(urls:, pages:, issues:, context: nil)
        pages.each do |page|
          next unless page.html?

          validate_h1(page, issues)
          validate_title(page, issues)
          validate_description(page, issues)
          validate_canonical(page, issues)
        end
      end

      private

      # Adds a metadata warning for +page+.
      def add_issue(issues, page, code, message, details = {})
        issues.add(
          code: code,
          severity: :warning,
          category: :metadata,
          url: page.url,
          message: message,
          details: details
        )
      end

      def validate_h1(page, issues)
        return unless page.doc.at_css("h1").nil?

        add_issue(issues, page, :missing_h1, "missing <h1>")
      end

      # Reports at most one title problem: missing, too long, or repeating
      # the site name (checked in that order).
      def validate_title(page, issues)
        title = page.doc.at_css("title")&.text.to_s.strip

        if title.empty?
          add_issue(issues, page, :missing_title, "missing <title>")
        elsif title.length > TITLE_MAX_LENGTH
          add_issue(issues, page, :title_too_long, "title too long (#{title.length})", {length: title.length})
        elsif repeated_site_name?(title)
          add_issue(issues, page, :title_repeats_site_name, "title repeats #{@site_name}", {site_name: @site_name})
        end
      end

      def validate_description(page, issues)
        description = page.doc.at_css('meta[name="description"]')&.[]("content").to_s.strip

        if description.empty?
          add_issue(issues, page, :missing_meta_description, "missing meta description")
        elsif description.length > DESCRIPTION_MAX_LENGTH
          add_issue(
            issues,
            page,
            :meta_description_too_long,
            "meta description too long (#{description.length})",
            {length: description.length}
          )
        end
      end

      # Reports a missing canonical link, or one that differs from the page
      # URL after both are normalized against the page URL.
      def validate_canonical(page, issues)
        canonical = page.doc.at_css('link[rel="canonical"]')&.[]("href").to_s.strip

        if canonical.empty?
          add_issue(issues, page, :missing_canonical, "missing canonical link")
          return
        end

        normalized_canonical = Url.normalize(canonical, base_url: page.url)
        normalized_page_url = Url.normalize(page.url, base_url: page.url)
        return if normalized_canonical == normalized_page_url

        add_issue(issues, page, :canonical_mismatch, "canonical mismatch (#{canonical})", {canonical: canonical})
      end

      # Case-insensitive pattern matching the whole site name only when it is
      # not glued to other letters or digits, or nil for a blank name.
      def build_site_name_pattern(name)
        return if name.empty?

        /(?<![[:alnum:]])#{Regexp.escape(name)}(?![[:alnum:]])/i
      end

      # True when the site name occurs more than once in +title+.
      #
      # The whole name is matched rather than single tokens, so multi-word
      # names such as "Acme Corp" are detected as well.
      def repeated_site_name?(title)
        return false if @site_name_pattern.nil?

        title.scan(@site_name_pattern).size > 1
      end
    end
  end
end
@@ -0,0 +1,58 @@
# frozen_string_literal: true

# Array#to_json / Hash#to_json are only defined once json is loaded.
require "json"

module Crawlscope
  module Rules
    # Structured-data rule. Extracts the structured-data items of every HTML
    # page and reports items that failed to parse or that the schema registry
    # rejects.
    class StructuredData
      attr_reader :code

      def initialize
        @code = :structured_data
      end

      # @param urls [Object] unused by this rule
      # @param pages [Enumerable] objects responding to +html?+, +body+, +url+
      # @param issues [#add] collector that receives keyword-argument issues
      # @param context [Hash] must provide +:schema_registry+, an object whose
      #   +validate(data)+ returns a collection of errors
      # @raise [KeyError] when +:schema_registry+ is missing
      def call(urls:, pages:, issues:, context:)
        schema_registry = context.fetch(:schema_registry)

        pages.each do |page|
          next unless page.html?

          validate_page(page, issues, schema_registry)
        end
      end

      private

      # Checks every structured-data item found in the page body.
      def validate_page(page, issues, schema_registry)
        document = Crawlscope::StructuredData::Document.new(html: page.body)

        document.items.each do |item|
          data = item.data
          source = item.source

          if parse_error?(data)
            report_parse_error(page, issues, source, data)
          else
            report_schema_errors(page, issues, source, schema_registry.validate(data))
          end
        end
      end

      # An item whose data is a Hash with a truthy +:error+ entry is treated
      # as a parse failure and is not validated against the schema.
      def parse_error?(data)
        data.is_a?(Hash) && data[:error]
      end

      def report_parse_error(page, issues, source, data)
        issues.add(
          code: :structured_data_parse_error,
          severity: :warning,
          category: :structured_data,
          url: page.url,
          message: "#{source} parse error: #{data[:message]}",
          details: {source: source}
        )
      end

      # Adds one issue carrying all schema errors of an item; does nothing
      # when +errors+ is empty.
      def report_schema_errors(page, issues, source, errors)
        return if errors.empty?

        issues.add(
          code: :structured_data_schema_error,
          severity: :warning,
          category: :structured_data,
          url: page.url,
          message: "#{source} schema errors: #{errors.to_json}",
          details: {errors: errors, source: source}
        )
      end
    end
  end
end
@@ -0,0 +1,88 @@
# frozen_string_literal: true

require "digest"

module Crawlscope
  module Rules
    # Uniqueness rule. Reports titles, meta descriptions and main-content
    # fingerprints that are shared by more than one distinct page URL.
    class Uniqueness
      # Normalized page text shorter than this is not fingerprinted.
      MIN_FINGERPRINT_LENGTH = 200

      attr_reader :code

      def initialize
        @code = :uniqueness
      end

      # @param urls [Object] unused by this rule
      # @param pages [Enumerable] objects responding to +html?+, +doc+, +url+
      # @param issues [#add] collector that receives keyword-argument issues
      # @param context [Object] unused by this rule
      def call(urls:, pages:, issues:, context:)
        page_summaries = pages.select(&:html?).map { |page| summary_for(page) }

        validate_duplicates(page_summaries, issues)
      end

      private

      # SHA-256 of the whitespace-normalized text of <main> (or of <body> when
      # <main> has no text); nil when that text is shorter than
      # MIN_FINGERPRINT_LENGTH.
      def content_fingerprint_digest(doc)
        text = doc.at_css("main")&.text.to_s
        text = doc.at_css("body")&.text.to_s if text.empty?
        normalized = text.gsub(/\s+/, " ").strip
        return if normalized.length < MIN_FINGERPRINT_LENGTH

        Digest::SHA256.hexdigest(normalized)
      end

      # Groups summaries by +field+ and returns +{value => urls}+ for values
      # shared by more than one distinct URL. Blank values are ignored, and a
      # URL that appears several times in the crawl is counted once so that a
      # page is never reported as a duplicate of itself.
      def duplicates_for(pages, field)
        pages
          .reject { |page| page[field].to_s.empty? }
          .group_by { |page| page[field] }
          .transform_values { |items| items.map { |item| item[:url] }.uniq }
          .select { |_value, urls| urls.size > 1 }
      end

      # Extracts the fields compared across pages.
      def summary_for(page)
        doc = page.doc

        {
          content_fingerprint_digest: content_fingerprint_digest(doc),
          description: doc.at_css('meta[name="description"]')&.[]("content").to_s.strip,
          title: doc.at_css("title")&.text.to_s.strip,
          url: page.url
        }
      end

      # Adds a uniqueness warning that is not tied to a single URL.
      def add_duplicate_issue(issues, code, message, details)
        issues.add(
          code: code,
          severity: :warning,
          category: :uniqueness,
          url: nil,
          message: message,
          details: details
        )
      end

      # Reports duplicate titles, then descriptions, then content fingerprints.
      def validate_duplicates(page_summaries, issues)
        duplicates_for(page_summaries, :title).each do |value, urls|
          add_duplicate_issue(
            issues,
            :duplicate_title,
            "duplicate title '#{value}' => #{urls.join(", ")}",
            {urls: urls, value: value}
          )
        end

        duplicates_for(page_summaries, :description).each do |value, urls|
          add_duplicate_issue(
            issues,
            :duplicate_meta_description,
            "duplicate meta description '#{value}' => #{urls.join(", ")}",
            {urls: urls, value: value}
          )
        end

        duplicates_for(page_summaries, :content_fingerprint_digest).each_value do |urls|
          add_duplicate_issue(
            issues,
            :duplicate_content_fingerprint,
            "duplicate page content fingerprint => #{urls.join(", ")}",
            {urls: urls}
          )
        end
      end
    end
  end
end
+ end