crawlscope 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +31 -0
  3. data/LICENSE.txt +21 -0
  4. data/README.md +323 -0
  5. data/exe/crawlscope +6 -0
  6. data/lib/crawlscope/audit.rb +128 -0
  7. data/lib/crawlscope/browser.rb +88 -0
  8. data/lib/crawlscope/cli.rb +245 -0
  9. data/lib/crawlscope/configuration.rb +123 -0
  10. data/lib/crawlscope/crawler.rb +28 -0
  11. data/lib/crawlscope/http.rb +77 -0
  12. data/lib/crawlscope/issue.rb +17 -0
  13. data/lib/crawlscope/issue_collection.rb +41 -0
  14. data/lib/crawlscope/page.rb +23 -0
  15. data/lib/crawlscope/railtie.rb +9 -0
  16. data/lib/crawlscope/reporter.rb +33 -0
  17. data/lib/crawlscope/result.rb +9 -0
  18. data/lib/crawlscope/rule_registry.rb +39 -0
  19. data/lib/crawlscope/rules/links.rb +220 -0
  20. data/lib/crawlscope/rules/metadata.rb +93 -0
  21. data/lib/crawlscope/rules/structured_data.rb +58 -0
  22. data/lib/crawlscope/rules/uniqueness.rb +88 -0
  23. data/lib/crawlscope/schema_registry.rb +431 -0
  24. data/lib/crawlscope/sitemap.rb +67 -0
  25. data/lib/crawlscope/structured_data/audit.rb +150 -0
  26. data/lib/crawlscope/structured_data/document.rb +93 -0
  27. data/lib/crawlscope/structured_data/report.rb +77 -0
  28. data/lib/crawlscope/structured_data/reporter.rb +73 -0
  29. data/lib/crawlscope/structured_data/writer.rb +26 -0
  30. data/lib/crawlscope/task.rb +131 -0
  31. data/lib/crawlscope/url.rb +43 -0
  32. data/lib/crawlscope/version.rb +5 -0
  33. data/lib/crawlscope.rb +34 -0
  34. data/lib/tasks/crawlscope_tasks.rake +44 -0
  35. data/test/crawlscope/audit_test.rb +165 -0
  36. data/test/crawlscope/cli_test.rb +157 -0
  37. data/test/crawlscope/configuration_test.rb +45 -0
  38. data/test/crawlscope/links_rule_test.rb +87 -0
  39. data/test/crawlscope/loader_test.rb +11 -0
  40. data/test/crawlscope/reporter_test.rb +50 -0
  41. data/test/crawlscope/schema_registry_test.rb +89 -0
  42. data/test/crawlscope/sitemap_test.rb +51 -0
  43. data/test/crawlscope/structured_data_audit_test.rb +118 -0
  44. data/test/crawlscope/structured_data_document_test.rb +28 -0
  45. data/test/crawlscope/structured_data_report_test.rb +37 -0
  46. data/test/crawlscope/structured_data_reporter_test.rb +32 -0
  47. data/test/crawlscope/structured_data_rule_test.rb +78 -0
  48. data/test/crawlscope/structured_data_writer_test.rb +32 -0
  49. data/test/crawlscope/task_test.rb +206 -0
  50. data/test/crawlscope/uniqueness_rule_test.rb +46 -0
  51. data/test/test_helper.rb +23 -0
  52. metadata +271 -0
@@ -0,0 +1,245 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "optparse"
4
+
5
module Crawlscope
  # Command-line front end for Crawlscope.
  #
  # The first argument selects a sub-command (+validate+, +ldjson+, +version+,
  # +help+). Each sub-command seeds its options from environment variables and
  # then lets command-line flags override them. Every public entry point
  # returns 0 on success and 1 on failure.
  class Cli
    # Convenience wrapper: builds a Cli and runs it.
    #
    # @param argv [Array<String>] raw command-line arguments
    # @param out [#puts] stream for regular output
    # @param err [#puts] stream for error output
    # @param options [Hash] forwarded to {#initialize} (+configuration:+, +task:+)
    # @return [Integer] 0 or 1
    def self.start(argv, out: $stdout, err: $stderr, **options)
      new(argv, out: out, err: err, **options).call
    end

    # @param argv [Array<String>, nil] arguments; copied so the caller's array is not mutated
    # @param out [#puts] stream for regular output (also assigned to +configuration.output+)
    # @param err [#puts] stream for error output
    # @param configuration [Configuration] settings object that the flags write into
    # @param task [#validate, #validate_ldjson, nil] collaborator that performs the work;
    #   built lazily from the configuration when nil
    def initialize(argv, out:, err:, configuration: Configuration.new, task: nil)
      @argv = Array(argv).dup
      @out = out
      @err = err
      @configuration = configuration
      @configuration.output = out
      @task = task
    end

    # Dispatches the sub-command.
    #
    # Any option-parsing failure, configuration error or invalid numeric value
    # is reported on the error stream together with the general usage text.
    #
    # @return [Integer] 0 on success, 1 on failure
    def call
      command = @argv.shift.to_s

      case command
      when "help", ""
        @out.puts(general_usage)
        0
      when "validate"
        run_validate
      when "ldjson"
        run_ldjson
      when "version", "--version", "-v"
        @out.puts(Crawlscope::VERSION)
        0
      else
        @err.puts("Unknown command: #{command}")
        @err.puts("")
        @err.puts(general_usage)
        1
      end
    # OptionParser::ParseError is the common ancestor of InvalidOption,
    # MissingArgument, InvalidArgument (e.g. `--timeout abc`), AmbiguousOption
    # and NeedlessArgument, so no parse failure escapes as a raw exception.
    rescue OptionParser::ParseError, ConfigurationError, ArgumentError => error
      @err.puts(error.message)
      @err.puts("")
      @err.puts(general_usage)
      1
    end

    private

    # Top-level help text.
    #
    # NOTE(review): the diff view this file was recovered from collapses
    # whitespace, so the relative indentation/column alignment of these lines
    # could not be determined — confirm against the released file.
    def general_usage
      <<~TEXT
        Usage:
        crawlscope validate --base-url https://example.com [options]
        crawlscope ldjson --url https://example.com/page [options]
        crawlscope version

        Commands:
        validate Audit sitemap URLs for metadata, structured data, uniqueness, and links
        ldjson Validate structured data on one or more URLs
        version Print the gem version
      TEXT
    end

    # Runs the +ldjson+ sub-command.
    #
    # Defaults come from DEBUG, RENDERER/JS, REPORT_PATH, SUMMARY, TIMEOUT and
    # URL (semicolon-separated); flags override or extend them.
    #
    # @return [Integer] 0 when the task result is ok, 1 otherwise
    # @raise [ConfigurationError] when no URL was supplied
    def run_ldjson
      options = {
        debug: env_enabled?("DEBUG"),
        renderer: resolved_renderer,
        report_path: normalized_string(ENV["REPORT_PATH"]),
        summary: env_enabled?("SUMMARY"),
        timeout_seconds: resolved_integer("TIMEOUT", default: @configuration.timeout_seconds, minimum: 1),
        urls: resolved_urls_from_env
      }

      parser = OptionParser.new do |opts|
        opts.banner = "Usage: crawlscope ldjson --url https://example.com/page [options]"

        opts.on("--url URL", "Validate one URL (repeatable)") do |value|
          options[:urls] << value.strip
        end

        opts.on("--debug", "Print detected structured data") do
          options[:debug] = true
        end

        opts.on("--summary", "Print grouped summary output") do
          options[:summary] = true
        end

        opts.on("--report-path PATH", "Write a JSON report to PATH") do |value|
          options[:report_path] = value
        end

        opts.on("--renderer NAME", "Use http or browser rendering") do |value|
          options[:renderer] = value.to_sym
        end

        opts.on("--timeout SECONDS", Integer, "Set request timeout") do |value|
          options[:timeout_seconds] = integer_option(value, minimum: 1, name: "timeout")
        end

        opts.on("--network-idle-timeout SECONDS", Integer, "Set browser network idle timeout") do |value|
          @configuration.network_idle_timeout_seconds = integer_option(value, minimum: 1, name: "network-idle-timeout")
        end
      end

      parser.parse!(@argv)

      urls = options[:urls].map(&:strip).reject(&:empty?)
      raise ConfigurationError, "Crawlscope URL is not configured" if urls.empty?

      configure_renderer(options[:renderer])

      result = task.validate_ldjson(
        urls: urls,
        debug: options[:debug],
        renderer: options[:renderer],
        report_path: options[:report_path],
        summary: options[:summary],
        timeout_seconds: options[:timeout_seconds]
      )

      result.ok? ? 0 : 1
    end

    # Runs the +validate+ sub-command.
    #
    # Defaults come from BASE_URL, RULES, SITEMAP, RENDERER/JS, CONCURRENCY,
    # NETWORK_IDLE_TIMEOUT and TIMEOUT; flags override them. Renderer,
    # concurrency and timeouts are written straight into the configuration.
    #
    # @return [Integer] 0 when the task result is ok, 1 otherwise
    def run_validate
      options = {
        base_url: normalized_string(ENV["BASE_URL"]),
        rule_names: normalized_string(ENV["RULES"]),
        sitemap_path: normalized_string(ENV["SITEMAP"])
      }

      # The renderer must be set before resolving concurrency, because the
      # browser renderer caps the default concurrency.
      configure_renderer(resolved_renderer)
      @configuration.concurrency = resolved_concurrency
      @configuration.network_idle_timeout_seconds = resolved_integer("NETWORK_IDLE_TIMEOUT", default: @configuration.network_idle_timeout_seconds, minimum: 1)
      @configuration.timeout_seconds = resolved_integer("TIMEOUT", default: @configuration.timeout_seconds, minimum: 1)

      parser = OptionParser.new do |opts|
        opts.banner = "Usage: crawlscope validate --base-url https://example.com [options]"

        opts.on("--base-url URL", "Set the site base URL") do |value|
          options[:base_url] = value
        end

        opts.on("--sitemap PATH_OR_URL", "Set the sitemap path or URL") do |value|
          options[:sitemap_path] = value
        end

        opts.on("--rules CSV", "Run a subset of rules, for example metadata,links") do |value|
          options[:rule_names] = value
        end

        opts.on("--renderer NAME", "Use http or browser rendering") do |value|
          configure_renderer(value.to_sym)
        end

        opts.on("--timeout SECONDS", Integer, "Set request timeout") do |value|
          @configuration.timeout_seconds = integer_option(value, minimum: 1, name: "timeout")
        end

        opts.on("--network-idle-timeout SECONDS", Integer, "Set browser network idle timeout") do |value|
          @configuration.network_idle_timeout_seconds = integer_option(value, minimum: 1, name: "network-idle-timeout")
        end

        opts.on("--concurrency COUNT", Integer, "Set crawl concurrency") do |value|
          @configuration.concurrency = integer_option(value, minimum: 1, name: "concurrency")
        end
      end

      parser.parse!(@argv)

      result = task.validate(
        base_url: options[:base_url],
        sitemap_path: options[:sitemap_path],
        rule_names: options[:rule_names]
      )

      result.ok? ? 0 : 1
    end

    # Stores the renderer on the configuration.
    def configure_renderer(renderer)
      @configuration.renderer = renderer
    end

    # True only when the environment variable is exactly "1".
    def env_enabled?(name)
      ENV[name].to_s == "1"
    end

    # Coerces +value+ to a base-10 Integer and enforces a lower bound.
    #
    # @param value [Integer, String] already-parsed flag value or raw env string
    # @param minimum [Integer] smallest accepted value
    # @param name [String] option name used in the error message
    # @return [Integer]
    # @raise [ArgumentError] when the value is not an integer or is below +minimum+
    def integer_option(value, minimum:, name:)
      integer = value.is_a?(Integer) ? value : Integer(value, 10, exception: false)
      raise ArgumentError, "#{name} must be an integer >= #{minimum}" if integer.nil?
      raise ArgumentError, "#{name} must be >= #{minimum}" if integer < minimum

      integer
    end

    # Strips +value+ and returns nil when nothing is left.
    def normalized_string(value)
      normalized = value.to_s.strip
      normalized.empty? ? nil : normalized
    end

    # Concurrency from CONCURRENCY or the configuration. When the browser
    # renderer is active and CONCURRENCY is not set, the value is capped at the
    # configuration's browser concurrency and a notice is printed.
    def resolved_concurrency
      configured_concurrency = resolved_integer("CONCURRENCY", default: @configuration.concurrency, minimum: 1)

      if @configuration.renderer == :browser && normalized_string(ENV["CONCURRENCY"]).nil?
        browser_concurrency = @configuration.browser_concurrency

        if configured_concurrency > browser_concurrency
          @configuration.output.puts("Default JS concurrency capped at #{browser_concurrency}. Set CONCURRENCY to override.")
          browser_concurrency
        else
          configured_concurrency
        end
      else
        configured_concurrency
      end
    end

    # Reads an integer from the environment, falling back to +default+ when the
    # variable is unset or blank.
    #
    # @raise [ArgumentError] when the variable is set but invalid
    def resolved_integer(name, default:, minimum:)
      raw_value = normalized_string(ENV[name])
      return default if raw_value.nil?

      integer_option(raw_value, minimum: minimum, name: name.downcase.tr("_", "-"))
    end

    # Renderer from RENDERER if set; otherwise :browser when JS=1, else :http.
    def resolved_renderer
      renderer = normalized_string(ENV["RENDERER"])
      return renderer.to_sym if renderer

      env_enabled?("JS") ? :browser : :http
    end

    # URLs from the semicolon-separated URL environment variable.
    #
    # @return [Array<String>] a fresh array (callers append to it)
    def resolved_urls_from_env
      raw_urls = normalized_string(ENV["URL"])
      return [] if raw_urls.nil?

      raw_urls.split(";").map(&:strip).reject(&:empty?)
    end

    # The injected task, or a default Task reporting to the output stream.
    def task
      @task ||= Task.new(configuration: @configuration, reporter: Reporter.new(io: @out))
    end
  end
end
@@ -0,0 +1,123 @@
1
+ # frozen_string_literal: true
2
+
3
module Crawlscope
  # Settings container for a crawl.
  #
  # Every setting can be assigned either a plain value or an object that
  # responds to +call+; callables are invoked on each read, so their result is
  # not cached. Unset (nil) settings fall back to the DEFAULT_* constants where
  # one exists.
  class Configuration
    DEFAULT_ALLOWED_STATUSES = [200, 301, 302].freeze
    DEFAULT_BROWSER_CONCURRENCY = 4
    DEFAULT_BROWSER_NETWORK_IDLE_TIMEOUT_SECONDS = 5
    DEFAULT_BROWSER_SCROLL_PAGE = true
    DEFAULT_CONCURRENCY = 10
    DEFAULT_TIMEOUT_SECONDS = 20

    attr_writer :allowed_statuses, :base_url, :browser_factory, :concurrency, :network_idle_timeout_seconds, :output, :renderer, :rule_registry, :schema_registry, :scroll_page, :site_name, :sitemap_path, :timeout_seconds

    # @return [Array<Integer>] statuses, each coerced with +to_i+
    def allowed_statuses
      Array(setting(@allowed_statuses, DEFAULT_ALLOWED_STATUSES)).map(&:to_i)
    end

    def base_url
      resolve(@base_url)
    end

    # NOTE(review): like every setting, a callable assigned here is invoked on
    # read — confirm that is what consumers of the factory expect.
    def browser_factory
      resolve(@browser_factory)
    end

    # @return [Integer] configured concurrency, or DEFAULT_CONCURRENCY when unset
    def concurrency
      setting(@concurrency, DEFAULT_CONCURRENCY).to_i
    end

    # @return [Integer] the smaller of {#concurrency} and DEFAULT_BROWSER_CONCURRENCY
    def browser_concurrency
      [concurrency, DEFAULT_BROWSER_CONCURRENCY].min
    end

    def network_idle_timeout_seconds
      setting(@network_idle_timeout_seconds, DEFAULT_BROWSER_NETWORK_IDLE_TIMEOUT_SECONDS).to_i
    end

    # @return [#puts] configured output, or $stdout when unset
    def output
      setting(@output, $stdout)
    end

    # @return [Symbol] stripped renderer name; :http when unset or blank
    def renderer
      name = resolve(@renderer).to_s.strip
      (name.empty? ? "http" : name).to_sym
    end

    # @return [RuleRegistry] the configured registry, or the default one built
    #   with the current site name
    def rule_registry
      configured = resolve(@rule_registry)
      configured.nil? ? RuleRegistry.default(site_name: site_name) : configured
    end

    # Builds an Audit from the current settings.
    #
    # @param base_url [String] defaults to the configured base URL
    # @param sitemap_path [String] defaults to the configured sitemap path
    # @param rule_names [String, Array, nil] passed to the registry's +rules_for+
    # @raise [ConfigurationError] when base_url or sitemap_path is blank
    def audit(base_url: self.base_url, sitemap_path: self.sitemap_path, rule_names: nil)
      raise ConfigurationError, "Crawlscope base_url is not configured" if base_url.to_s.strip.empty?
      raise ConfigurationError, "Crawlscope sitemap_path is not configured" if sitemap_path.to_s.strip.empty?

      Audit.new(
        base_url: base_url,
        sitemap_path: sitemap_path,
        browser_factory: browser_factory,
        concurrency: concurrency,
        network_idle_timeout_seconds: network_idle_timeout_seconds,
        renderer: renderer,
        timeout_seconds: timeout_seconds,
        allowed_statuses: allowed_statuses,
        rules: rule_registry.rules_for(rule_names),
        schema_registry: schema_registry,
        scroll_page: scroll_page?
      )
    end

    # @return [SchemaRegistry] the configured registry, or SchemaRegistry.default
    def schema_registry
      configured = resolve(@schema_registry)
      configured.nil? ? SchemaRegistry.default : configured
    end

    def site_name
      resolve(@site_name)
    end

    # Only nil falls back to the default, so an explicit false is honoured.
    def scroll_page?
      setting(@scroll_page, DEFAULT_BROWSER_SCROLL_PAGE)
    end

    def sitemap_path
      resolve(@sitemap_path)
    end

    def timeout_seconds
      setting(@timeout_seconds, DEFAULT_TIMEOUT_SECONDS).to_i
    end

    private

    # Resolves +raw+ and substitutes +fallback+ only when the result is nil.
    def setting(raw, fallback)
      resolved = resolve(raw)
      resolved.nil? ? fallback : resolved
    end

    # Invokes callables; returns anything else untouched.
    def resolve(value)
      return value.call if value.respond_to?(:call)

      value
    end
  end
end
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "concurrent"
4
+
5
module Crawlscope
  # Fetches a list of URLs concurrently on a fixed-size thread pool.
  class Crawler
    # @param page_fetcher [#fetch] object whose +fetch(url)+ returns a page
    # @param concurrency [Integer] number of worker threads in the pool
    def initialize(page_fetcher:, concurrency:)
      @page_fetcher = page_fetcher
      @concurrency = concurrency
    end

    # Fetches every URL and returns the pages in the same order as +urls+.
    #
    # Each task writes into its own pre-allocated slot instead of appending,
    # so the result order no longer depends on which fetch finishes first.
    # Slots that were never filled are removed from the result.
    # NOTE(review): a slot stays empty when its fetch raises, presumably
    # because the pool does not propagate task exceptions — confirm against
    # concurrent-ruby.
    #
    # @param urls [Enumerable<String>] URLs to fetch
    # @return [Array] fetched pages, in input order
    def call(urls)
      targets = urls.to_a
      pages = Concurrent::Array.new(targets.size)
      pool = Concurrent::FixedThreadPool.new(@concurrency)

      begin
        targets.each_with_index do |url, index|
          pool.post do
            pages[index] = @page_fetcher.fetch(url)
          end
        end
      ensure
        # Always release the pool, even if posting a task raised.
        pool.shutdown
        pool.wait_for_termination
      end

      pages.to_a.compact
    end
  end
end
@@ -0,0 +1,77 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "concurrent"
4
+ require "faraday"
5
+ require "faraday/follow_redirects"
6
+ require "nokogiri"
7
+
8
module Crawlscope
  # Plain-HTTP page fetcher built on Faraday.
  #
  # Keeps one Faraday connection per calling thread and turns every outcome,
  # including exceptions, into a Page.
  class Http
    MAX_REDIRECTS = 5
    USER_AGENT = "Mozilla/5.0 (compatible; Crawlscope/1.0)"

    # @param base_url [String] site base URL, passed to Url.normalize
    # @param timeout_seconds [Integer] used for both the open and the overall request timeout
    def initialize(base_url:, timeout_seconds:)
      @base_url = base_url
      @timeout_seconds = timeout_seconds
      @connections_by_thread = Concurrent::Map.new
    end

    # Forgets the cached per-thread connections (they are dropped, not
    # explicitly closed).
    def close
      @connections_by_thread.clear
    end

    # Fetches +url+, following up to MAX_REDIRECTS redirects.
    #
    # The body is parsed with Nokogiri only for a 200 response whose
    # Content-Type is HTML or absent. Any StandardError raised while fetching
    # is captured in the returned Page's +error+ instead of propagating.
    #
    # @param url [String]
    # @return [Page]
    def fetch(url)
      response = connection.get(url) do |request|
        request.headers["User-Agent"] = USER_AGENT
      end

      final_url = response.env.url.to_s
      final_url = url if final_url.empty?
      headers = response.headers.to_h
      body = response.body.to_s
      doc = Nokogiri::HTML(body) if response.status == 200 && html_response?(headers)

      Page.new(
        url: url,
        normalized_url: Url.normalize(url, base_url: @base_url),
        final_url: final_url,
        normalized_final_url: Url.normalize(final_url, base_url: @base_url),
        status: response.status,
        headers: headers,
        body: body,
        doc: doc
      )
    rescue => error
      Page.new(
        url: url,
        normalized_url: Url.normalize(url, base_url: @base_url),
        final_url: url,
        normalized_final_url: Url.normalize(url, base_url: @base_url),
        status: nil,
        headers: {},
        body: nil,
        doc: nil,
        error: "#{error.class}: #{error.message}"
      )
    end

    private

    # Connection for the current thread, created on first use.
    def connection
      @connections_by_thread.compute_if_absent(Thread.current.object_id) do
        Faraday.new do |faraday|
          faraday.response :follow_redirects, limit: MAX_REDIRECTS
          faraday.options.timeout = @timeout_seconds
          faraday.options.open_timeout = @timeout_seconds
        end
      end
    end

    # True when the Content-Type header is missing/empty or names text/html.
    #
    # The header name is looked up case-insensitively, and the value is
    # compared case-insensitively as well, because media types are not
    # case-sensitive ("Text/HTML" is HTML).
    #
    # @param headers [Hash] response headers
    # @return [Boolean]
    def html_response?(headers)
      raw = headers["content-type"] ||
        headers.find { |key, _value| key.to_s.casecmp?("content-type") }&.last
      content_type = raw.to_s.downcase

      content_type.empty? || content_type.include?("text/html")
    end
  end
end
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
module Crawlscope
  # Immutable record of a single finding. +severity+ is compared against
  # :error, :warning and :notice by the predicates below.
  Issue = Data.define(:code, :severity, :category, :url, :message, :details) do
    # @return [Boolean] whether severity is :error
    def error? = severity == :error

    # @return [Boolean] whether severity is :warning
    def warning? = severity == :warning

    # @return [Boolean] whether severity is :notice
    def notice? = severity == :notice
  end
end
@@ -0,0 +1,41 @@
1
+ # frozen_string_literal: true
2
+
3
module Crawlscope
  # Ordered, enumerable list of issues.
  class IssueCollection
    include Enumerable

    # @param issues [Array] initial issues; copied so the caller's array is not shared
    def initialize(issues = [])
      @issues = issues.dup
    end

    # Appends an issue, building one from +attributes+ when none is given.
    #
    # @param issue [Issue, nil] a ready-made issue
    # @param attributes [Hash] keyword arguments for Issue.new, used when +issue+ is nil
    # @return [Issue] the issue that was added
    def add(issue = nil, **attributes)
      issue ||= Issue.new(**attributes)
      @issues << issue
      issue
    end

    # Same contract as Enumerable#any?: without arguments it reports whether
    # the collection is non-empty; with a pattern or block it tests the issues
    # against it. The pattern and block must be forwarded explicitly —
    # otherwise `any? { ... }` would silently ignore the block and answer
    # "non-empty".
    #
    # @return [Boolean]
    def any?(*pattern, &block)
      @issues.any?(*pattern, &block)
    end

    # Yields each issue in insertion order.
    def each(&block)
      @issues.each(&block)
    end

    # @return [Integer] number of issues
    def size
      @issues.size
    end

    # @return [Array] a copy of the underlying list
    def to_a
      @issues.dup
    end

    # @return [Hash] issues grouped by their +category+
    def by_category
      @issues.group_by(&:category)
    end

    # @return [Hash] issues grouped by their +severity+
    def by_severity
      @issues.group_by(&:severity)
    end
  end
end
@@ -0,0 +1,23 @@
1
+ # frozen_string_literal: true
2
+
3
module Crawlscope
  # Read-only record of one fetched URL: what was requested, where it ended
  # up, the response that came back and — for failures — an error description.
  class Page
    attr_reader :url, :normalized_url, :final_url, :normalized_final_url
    attr_reader :status, :headers, :body, :doc, :error

    # @param url [String] URL as requested
    # @param normalized_url [String] normalized form of +url+
    # @param final_url [String] URL of the final response
    # @param normalized_final_url [String] normalized form of +final_url+
    # @param status [Integer, nil] response status
    # @param headers [Hash, nil] response headers; nil is stored as an empty hash
    # @param body [String, nil] response body
    # @param doc [Object, nil] parsed document, when one was built
    # @param error [String, nil] error description
    def initialize(url:, normalized_url:, final_url:, normalized_final_url:, status:, headers:, body:, doc:, error: nil)
      # Request identity
      @url = url
      @normalized_url = normalized_url
      @final_url = final_url
      @normalized_final_url = normalized_final_url

      # Response payload
      @status = status
      @headers = headers || {}
      @body = body
      @doc = doc

      @error = error
    end

    # @return [Boolean] true when a parsed document is attached
    def html? = !doc.nil?
  end
end
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
module Crawlscope
  # Rails integration: registers a rake_tasks block that loads the gem's rake
  # task file.
  class Railtie < Rails::Railtie
    rake_tasks do
      # Resolved relative to this file's directory.
      tasks_file = File.expand_path("../tasks/crawlscope_tasks.rake", __dir__)
      load tasks_file
    end
  end
end
@@ -0,0 +1,33 @@
1
+ # frozen_string_literal: true
2
+
3
module Crawlscope
  # Writes a plain-text summary of a validation result to an IO-like object.
  class Reporter
    # @param io [#puts] destination for the report
    def initialize(io:)
      @io = io
    end

    # Prints the run overview, then either "Status: OK" or the failure
    # breakdown (a count per severity followed by one line per issue).
    #
    # @param result [#base_url, #sitemap_path, #urls, #pages, #issues, #ok?]
    # @return [nil, Object] nil for an ok result; otherwise whatever
    #   +result.issues.each+ returns
    def report(result)
      write_overview(result)
      return write_failures(result.issues) unless result.ok?

      @io.puts("Status: OK")
      nil
    end

    private

    # Header lines shared by passing and failing runs.
    def write_overview(result)
      @io.puts("Crawlscope validation")
      @io.puts("Base URL: #{result.base_url}")
      @io.puts("Sitemap: #{result.sitemap_path}")
      @io.puts("URLs: #{result.urls.size}")
      @io.puts("Pages: #{result.pages.size}")
    end

    # Failure section: totals, per-severity counts sorted by severity name,
    # then every issue in collection order.
    def write_failures(issues)
      @io.puts("Status: FAILED")
      @io.puts("Issues: #{issues.size}")

      counts = issues.by_severity.sort_by { |level, _group| level.to_s }
      counts.each { |level, group| @io.puts("#{level}: #{group.size}") }

      issues.each do |issue|
        @io.puts("- [#{issue.severity}] #{issue.url} #{issue.message}")
      end
    end
  end
end
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
module Crawlscope
  # Immutable outcome of a validation run.
  Result = Data.define(:base_url, :sitemap_path, :urls, :pages, :issues) do
    # A result is ok when no issue is an error, a warning or a notice.
    #
    # @return [Boolean]
    def ok?
      issues.none? { |issue| issue.error? || issue.warning? || issue.notice? }
    end
  end
end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Crawlscope
4
+ class RuleRegistry
5
+ attr_reader :default_codes, :rules
6
+
7
+ def initialize(rules:, default_codes: nil)
8
+ @rules = Array(rules)
9
+ @default_codes = Array(default_codes).map(&:to_sym)
10
+ end
11
+
12
+ def self.default(site_name: nil)
13
+ new(
14
+ rules: [
15
+ Rules::Metadata.new(site_name: site_name),
16
+ Rules::StructuredData.new,
17
+ Rules::Uniqueness.new,
18
+ Rules::Links.new
19
+ ],
20
+ default_codes: %i[metadata structured_data uniqueness links]
21
+ )
22
+ end
23
+
24
+ def codes
25
+ @rules.map(&:code)
26
+ end
27
+
28
+ def rules_for(names)
29
+ normalized_names = Array(names).flat_map { |value| value.to_s.split(",") }.map(&:strip).reject(&:empty?)
30
+ normalized_names = @default_codes.map(&:to_s) if normalized_names.empty?
31
+
32
+ selected_rules = @rules.select { |rule| normalized_names.include?(rule.code.to_s) }
33
+ missing_rules = normalized_names - selected_rules.map { |rule| rule.code.to_s }
34
+ return selected_rules if missing_rules.empty?
35
+
36
+ raise ConfigurationError, "Unknown Crawlscope rules: #{missing_rules.join(", ")}"
37
+ end
38
+ end
39
+ end