crawlscope 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +31 -0
- data/LICENSE.txt +21 -0
- data/README.md +323 -0
- data/exe/crawlscope +6 -0
- data/lib/crawlscope/audit.rb +128 -0
- data/lib/crawlscope/browser.rb +88 -0
- data/lib/crawlscope/cli.rb +245 -0
- data/lib/crawlscope/configuration.rb +123 -0
- data/lib/crawlscope/crawler.rb +28 -0
- data/lib/crawlscope/http.rb +77 -0
- data/lib/crawlscope/issue.rb +17 -0
- data/lib/crawlscope/issue_collection.rb +41 -0
- data/lib/crawlscope/page.rb +23 -0
- data/lib/crawlscope/railtie.rb +9 -0
- data/lib/crawlscope/reporter.rb +33 -0
- data/lib/crawlscope/result.rb +9 -0
- data/lib/crawlscope/rule_registry.rb +39 -0
- data/lib/crawlscope/rules/links.rb +220 -0
- data/lib/crawlscope/rules/metadata.rb +93 -0
- data/lib/crawlscope/rules/structured_data.rb +58 -0
- data/lib/crawlscope/rules/uniqueness.rb +88 -0
- data/lib/crawlscope/schema_registry.rb +431 -0
- data/lib/crawlscope/sitemap.rb +67 -0
- data/lib/crawlscope/structured_data/audit.rb +150 -0
- data/lib/crawlscope/structured_data/document.rb +93 -0
- data/lib/crawlscope/structured_data/report.rb +77 -0
- data/lib/crawlscope/structured_data/reporter.rb +73 -0
- data/lib/crawlscope/structured_data/writer.rb +26 -0
- data/lib/crawlscope/task.rb +131 -0
- data/lib/crawlscope/url.rb +43 -0
- data/lib/crawlscope/version.rb +5 -0
- data/lib/crawlscope.rb +34 -0
- data/lib/tasks/crawlscope_tasks.rake +44 -0
- data/test/crawlscope/audit_test.rb +165 -0
- data/test/crawlscope/cli_test.rb +157 -0
- data/test/crawlscope/configuration_test.rb +45 -0
- data/test/crawlscope/links_rule_test.rb +87 -0
- data/test/crawlscope/loader_test.rb +11 -0
- data/test/crawlscope/reporter_test.rb +50 -0
- data/test/crawlscope/schema_registry_test.rb +89 -0
- data/test/crawlscope/sitemap_test.rb +51 -0
- data/test/crawlscope/structured_data_audit_test.rb +118 -0
- data/test/crawlscope/structured_data_document_test.rb +28 -0
- data/test/crawlscope/structured_data_report_test.rb +37 -0
- data/test/crawlscope/structured_data_reporter_test.rb +32 -0
- data/test/crawlscope/structured_data_rule_test.rb +78 -0
- data/test/crawlscope/structured_data_writer_test.rb +32 -0
- data/test/crawlscope/task_test.rb +206 -0
- data/test/crawlscope/uniqueness_rule_test.rb +46 -0
- data/test/test_helper.rb +23 -0
- metadata +271 -0
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "optparse"
|
|
4
|
+
|
|
5
|
+
module Crawlscope
|
|
6
|
+
# Command-line front end for Crawlscope.
#
# The first argument selects a sub-command (`validate`, `ldjson`, `version`,
# `help`). Settings are read from environment variables first and then
# overridden by command-line flags. Every entry point returns a process exit
# status: 0 on success, 1 on failure.
class Cli
  # Builds a Cli and runs it.
  #
  # @param argv [Array<String>] raw command-line arguments
  # @param out [#puts] stream for normal output
  # @param err [#puts] stream for error output
  # @param options [Hash] extra keywords forwarded to {#initialize}
  # @return [Integer] exit status
  def self.start(argv, out: $stdout, err: $stderr, **options)
    new(argv, out: out, err: err, **options).call
  end

  # @param argv [Array<String>, nil] arguments; copied so the caller's array is not mutated
  # @param out [#puts] stream for normal output (also installed as the configuration's output)
  # @param err [#puts] stream for error output
  # @param configuration [#output=] settings object that flags and ENV values are written to
  # @param task [#validate, #validate_ldjson, nil] task runner; built lazily when nil
  def initialize(argv, out:, err:, configuration: Configuration.new, task: nil)
    @argv = Array(argv).dup
    @out = out
    @err = err
    @configuration = configuration
    @configuration.output = out
    @task = task
  end

  # Dispatches to the requested sub-command.
  #
  # Any option-parsing problem, configuration problem, or invalid argument is
  # reported on the error stream together with the usage text.
  #
  # @return [Integer] 0 on success, 1 on failure
  def call
    command = @argv.shift.to_s

    case command
    when "help", ""
      @out.puts(general_usage)
      0
    when "validate"
      run_validate
    when "ldjson"
      run_ldjson
    when "version", "--version", "-v"
      @out.puts(Crawlscope::VERSION)
      0
    else
      @err.puts("Unknown command: #{command}")
      @err.puts("")
      @err.puts(general_usage)
      1
    end
  rescue OptionParser::ParseError, ConfigurationError, ArgumentError => error
    # ParseError is the superclass of InvalidOption, MissingArgument,
    # InvalidArgument (e.g. `--timeout abc`), AmbiguousOption and friends, so
    # every malformed command line ends up here instead of crashing.
    @err.puts(error.message)
    @err.puts("")
    @err.puts(general_usage)
    1
  end

  private

  # @return [String] top-level usage text
  def general_usage
    <<~TEXT
      Usage:
      crawlscope validate --base-url https://example.com [options]
      crawlscope ldjson --url https://example.com/page [options]
      crawlscope version

      Commands:
      validate Audit sitemap URLs for metadata, structured data, uniqueness, and links
      ldjson Validate structured data on one or more URLs
      version Print the gem version
    TEXT
  end

  # Runs the `ldjson` sub-command: validates structured data on explicit URLs.
  #
  # Defaults come from DEBUG, RENDERER/JS, REPORT_PATH, SUMMARY, TIMEOUT and
  # URL (semicolon-separated); flags override or extend them.
  #
  # @return [Integer] exit status
  # @raise [ConfigurationError] when no URL was supplied
  def run_ldjson
    options = {
      debug: env_enabled?("DEBUG"),
      renderer: resolved_renderer,
      report_path: normalized_string(ENV["REPORT_PATH"]),
      summary: env_enabled?("SUMMARY"),
      timeout_seconds: resolved_integer("TIMEOUT", default: @configuration.timeout_seconds, minimum: 1),
      urls: resolved_urls_from_env
    }

    parser = OptionParser.new do |opts|
      opts.banner = "Usage: crawlscope ldjson --url https://example.com/page [options]"

      opts.on("--url URL", "Validate one URL (repeatable)") do |value|
        options[:urls] << value.strip
      end

      opts.on("--debug", "Print detected structured data") do
        options[:debug] = true
      end

      opts.on("--summary", "Print grouped summary output") do
        options[:summary] = true
      end

      opts.on("--report-path PATH", "Write a JSON report to PATH") do |value|
        options[:report_path] = value
      end

      opts.on("--renderer NAME", "Use http or browser rendering") do |value|
        options[:renderer] = value.to_sym
      end

      opts.on("--timeout SECONDS", Integer, "Set request timeout") do |value|
        options[:timeout_seconds] = integer_option(value, minimum: 1, name: "timeout")
      end

      opts.on("--network-idle-timeout SECONDS", Integer, "Set browser network idle timeout") do |value|
        @configuration.network_idle_timeout_seconds = integer_option(value, minimum: 1, name: "network-idle-timeout")
      end
    end

    parser.parse!(@argv)

    urls = options[:urls].map(&:strip).reject(&:empty?)
    raise ConfigurationError, "Crawlscope URL is not configured" if urls.empty?

    configure_renderer(options[:renderer])

    result = task.validate_ldjson(
      urls: urls,
      debug: options[:debug],
      renderer: options[:renderer],
      report_path: options[:report_path],
      summary: options[:summary],
      timeout_seconds: options[:timeout_seconds]
    )

    result.ok? ? 0 : 1
  end

  # Runs the `validate` sub-command: audits the URLs listed in a sitemap.
  #
  # Defaults come from BASE_URL, RULES, SITEMAP, RENDERER/JS, CONCURRENCY,
  # NETWORK_IDLE_TIMEOUT and TIMEOUT; flags override them.
  #
  # @return [Integer] exit status
  def run_validate
    options = {
      base_url: normalized_string(ENV["BASE_URL"]),
      rule_names: normalized_string(ENV["RULES"]),
      sitemap_path: normalized_string(ENV["SITEMAP"])
    }

    # The renderer must be set before resolving concurrency, because the
    # default concurrency is capped when the browser renderer is active.
    # NOTE(review): a later `--renderer browser` flag does not re-apply that
    # cap here -- confirm whether that is intended.
    configure_renderer(resolved_renderer)
    @configuration.concurrency = resolved_concurrency
    @configuration.network_idle_timeout_seconds = resolved_integer("NETWORK_IDLE_TIMEOUT", default: @configuration.network_idle_timeout_seconds, minimum: 1)
    @configuration.timeout_seconds = resolved_integer("TIMEOUT", default: @configuration.timeout_seconds, minimum: 1)

    parser = OptionParser.new do |opts|
      opts.banner = "Usage: crawlscope validate --base-url https://example.com [options]"

      opts.on("--base-url URL", "Set the site base URL") do |value|
        options[:base_url] = value
      end

      opts.on("--sitemap PATH_OR_URL", "Set the sitemap path or URL") do |value|
        options[:sitemap_path] = value
      end

      opts.on("--rules CSV", "Run a subset of rules, for example metadata,links") do |value|
        options[:rule_names] = value
      end

      opts.on("--renderer NAME", "Use http or browser rendering") do |value|
        configure_renderer(value.to_sym)
      end

      opts.on("--timeout SECONDS", Integer, "Set request timeout") do |value|
        @configuration.timeout_seconds = integer_option(value, minimum: 1, name: "timeout")
      end

      opts.on("--network-idle-timeout SECONDS", Integer, "Set browser network idle timeout") do |value|
        @configuration.network_idle_timeout_seconds = integer_option(value, minimum: 1, name: "network-idle-timeout")
      end

      opts.on("--concurrency COUNT", Integer, "Set crawl concurrency") do |value|
        @configuration.concurrency = integer_option(value, minimum: 1, name: "concurrency")
      end
    end

    parser.parse!(@argv)

    result = task.validate(
      base_url: options[:base_url],
      sitemap_path: options[:sitemap_path],
      rule_names: options[:rule_names]
    )

    result.ok? ? 0 : 1
  end

  # Stores the chosen renderer on the configuration.
  def configure_renderer(renderer)
    @configuration.renderer = renderer
  end

  # @return [Boolean] true only when the environment variable is exactly "1"
  def env_enabled?(name)
    ENV[name].to_s == "1"
  end

  # Coerces +value+ to an Integer and enforces a lower bound.
  #
  # @param value [Integer, String, Object] raw value; strings are parsed as base 10
  # @param minimum [Integer] smallest accepted value
  # @param name [String] option name used in error messages
  # @return [Integer]
  # @raise [ArgumentError] when the value is not an integer or is below +minimum+
  def integer_option(value, minimum:, name:)
    # `exception: false` yields nil for anything unparsable, which avoids
    # rescuing ArgumentError and telling our own error apart by its message.
    integer = value.is_a?(Integer) ? value : Integer(value, 10, exception: false)
    raise ArgumentError, "#{name} must be an integer >= #{minimum}" if integer.nil?
    raise ArgumentError, "#{name} must be >= #{minimum}" if integer < minimum

    integer
  end

  # @return [String, nil] the stripped string, or nil when it is blank
  def normalized_string(value)
    normalized = value.to_s.strip
    normalized.empty? ? nil : normalized
  end

  # Resolves crawl concurrency from CONCURRENCY or the configuration default.
  # With the browser renderer and no explicit CONCURRENCY, the value is capped
  # at the configuration's browser concurrency and a notice is printed.
  #
  # @return [Integer]
  def resolved_concurrency
    configured_concurrency = resolved_integer("CONCURRENCY", default: @configuration.concurrency, minimum: 1)

    explicit = !normalized_string(ENV["CONCURRENCY"]).nil?
    return configured_concurrency if explicit || @configuration.renderer != :browser

    browser_concurrency = @configuration.browser_concurrency
    return configured_concurrency unless configured_concurrency > browser_concurrency

    @configuration.output.puts("Default JS concurrency capped at #{browser_concurrency}. Set CONCURRENCY to override.")
    browser_concurrency
  end

  # Reads an integer setting from the environment.
  #
  # @param name [String] environment variable name
  # @param default [Integer] value returned when the variable is unset or blank
  # @param minimum [Integer] smallest accepted value
  # @return [Integer]
  # @raise [ArgumentError] when the variable is set but invalid
  def resolved_integer(name, default:, minimum:)
    raw_value = normalized_string(ENV[name])
    return default if raw_value.nil?

    # Error messages use the flag spelling, e.g. NETWORK_IDLE_TIMEOUT -> network-idle-timeout.
    integer_option(raw_value, minimum: minimum, name: name.downcase.tr("_", "-"))
  end

  # @return [Symbol] RENDERER when set; otherwise :browser if JS=1, else :http
  def resolved_renderer
    renderer = normalized_string(ENV["RENDERER"])
    return renderer.to_sym if renderer

    env_enabled?("JS") ? :browser : :http
  end

  # @return [Array<String>] URLs from the semicolon-separated URL variable
  def resolved_urls_from_env
    raw_urls = normalized_string(ENV["URL"])
    return [] if raw_urls.nil?

    raw_urls.split(";").map(&:strip).reject(&:empty?)
  end

  # @return [Object] the injected task, or a lazily built Task that reports to the output stream
  def task
    @task ||= Task.new(configuration: @configuration, reporter: Reporter.new(io: @out))
  end
end
|
|
245
|
+
end
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Crawlscope
|
|
4
|
+
# Settings container for a Crawlscope run.
#
# Every setting is write-only through its `attr_writer` and read through a
# method that first "resolves" the stored value: if the value responds to
# `call` it is invoked and its result is used, otherwise the value is used
# as-is. Unset (nil) settings fall back to the DEFAULT_* constants where one
# exists.
class Configuration
  DEFAULT_ALLOWED_STATUSES = [200, 301, 302].freeze
  DEFAULT_BROWSER_CONCURRENCY = 4
  DEFAULT_BROWSER_NETWORK_IDLE_TIMEOUT_SECONDS = 5
  DEFAULT_BROWSER_SCROLL_PAGE = true
  DEFAULT_CONCURRENCY = 10
  DEFAULT_TIMEOUT_SECONDS = 20

  attr_writer :allowed_statuses, :base_url, :browser_factory, :concurrency, :network_idle_timeout_seconds, :output, :renderer, :rule_registry, :schema_registry, :scroll_page, :site_name, :sitemap_path, :timeout_seconds

  # @return [Array<Integer>] HTTP statuses treated as acceptable
  def allowed_statuses
    Array(resolved_or(@allowed_statuses, DEFAULT_ALLOWED_STATUSES)).map(&:to_i)
  end

  # @return [Object, nil] the resolved base URL setting
  def base_url
    resolve(@base_url)
  end

  # @return [Object, nil] the resolved browser factory setting
  # NOTE(review): a callable stored here is invoked on read, so the reader
  # returns the call's result rather than the callable -- confirm intended.
  def browser_factory
    resolve(@browser_factory)
  end

  # @return [Integer] crawl concurrency (DEFAULT_CONCURRENCY when unset)
  def concurrency
    resolved_or(@concurrency, DEFAULT_CONCURRENCY).to_i
  end

  # @return [Integer] concurrency limited to DEFAULT_BROWSER_CONCURRENCY
  def browser_concurrency
    [concurrency, DEFAULT_BROWSER_CONCURRENCY].min
  end

  # @return [Integer] network idle timeout in seconds
  def network_idle_timeout_seconds
    resolved_or(@network_idle_timeout_seconds, DEFAULT_BROWSER_NETWORK_IDLE_TIMEOUT_SECONDS).to_i
  end

  # @return [Object] output stream ($stdout when unset)
  def output
    resolved_or(@output, $stdout)
  end

  # @return [Symbol] renderer name; blank or unset values become :http
  def renderer
    name = resolve(@renderer).to_s.strip
    name = "http" if name.empty?
    name.to_sym
  end

  # @return [Object] the configured rule registry, or the default one built for site_name
  def rule_registry
    registry = resolve(@rule_registry)
    registry.nil? ? RuleRegistry.default(site_name: site_name) : registry
  end

  # Builds an Audit from the current settings.
  #
  # @param base_url [Object] overrides the configured base URL
  # @param sitemap_path [Object] overrides the configured sitemap path
  # @param rule_names [Object, nil] rule selection handed to the rule registry
  # @raise [ConfigurationError] when base_url or sitemap_path is blank
  def audit(base_url: self.base_url, sitemap_path: self.sitemap_path, rule_names: nil)
    raise ConfigurationError, "Crawlscope base_url is not configured" if base_url.to_s.strip.empty?
    raise ConfigurationError, "Crawlscope sitemap_path is not configured" if sitemap_path.to_s.strip.empty?

    Audit.new(
      base_url: base_url,
      sitemap_path: sitemap_path,
      browser_factory: browser_factory,
      concurrency: concurrency,
      network_idle_timeout_seconds: network_idle_timeout_seconds,
      renderer: renderer,
      timeout_seconds: timeout_seconds,
      allowed_statuses: allowed_statuses,
      rules: rule_registry.rules_for(rule_names),
      schema_registry: schema_registry,
      scroll_page: scroll_page?
    )
  end

  # @return [Object] the configured schema registry, or SchemaRegistry.default
  def schema_registry
    registry = resolve(@schema_registry)
    registry.nil? ? SchemaRegistry.default : registry
  end

  # @return [Object, nil] the resolved site name setting
  def site_name
    resolve(@site_name)
  end

  # @return [Object] scroll-page setting (DEFAULT_BROWSER_SCROLL_PAGE when unset)
  def scroll_page?
    resolved_or(@scroll_page, DEFAULT_BROWSER_SCROLL_PAGE)
  end

  # @return [Object, nil] the resolved sitemap path setting
  def sitemap_path
    resolve(@sitemap_path)
  end

  # @return [Integer] request timeout in seconds
  def timeout_seconds
    resolved_or(@timeout_seconds, DEFAULT_TIMEOUT_SECONDS).to_i
  end

  private

  # Invokes callables, passes every other value through unchanged.
  def resolve(value)
    return value.call if value.respond_to?(:call)

    value
  end

  # Resolves +setting+ and substitutes +fallback+ only for nil, so an
  # explicit `false` is preserved.
  def resolved_or(setting, fallback)
    resolved = resolve(setting)
    resolved.nil? ? fallback : resolved
  end
end
|
|
123
|
+
end
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "concurrent"
|
|
4
|
+
|
|
5
|
+
module Crawlscope
|
|
6
|
+
# Fetches a list of URLs concurrently on a fixed-size thread pool.
class Crawler
  # @param page_fetcher [#fetch] object whose `fetch(url)` returns a page
  # @param concurrency [Integer] number of worker threads in the pool
  def initialize(page_fetcher:, concurrency:)
    @page_fetcher = page_fetcher
    @concurrency = concurrency
  end

  # Fetches every URL and returns the fetched pages in the same order as
  # +urls+, regardless of which request finishes first.
  #
  # @param urls [Enumerable<String>] URLs to fetch
  # @return [Array] one page per task that completed
  def call(urls)
    indexed_pages = Concurrent::Array.new
    pool = Concurrent::FixedThreadPool.new(@concurrency)

    begin
      urls.each_with_index do |url, index|
        # Remember the input position so completion order does not leak
        # into the result.
        pool.post do
          indexed_pages << [index, @page_fetcher.fetch(url)]
        end
      end
    ensure
      # Always drain and stop the pool, even if enumerating +urls+ raised.
      pool.shutdown
      pool.wait_for_termination
    end

    indexed_pages.sort_by(&:first).map(&:last)
  end
end
|
|
28
|
+
end
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "concurrent"
|
|
4
|
+
require "faraday"
|
|
5
|
+
require "faraday/follow_redirects"
|
|
6
|
+
require "nokogiri"
|
|
7
|
+
|
|
8
|
+
module Crawlscope
|
|
9
|
+
# Page fetcher that performs plain HTTP GET requests through Faraday and
# wraps each outcome in a Page.
class Http
  MAX_REDIRECTS = 5
  USER_AGENT = "Mozilla/5.0 (compatible; Crawlscope/1.0)"

  # @param base_url [Object] base URL handed to Url.normalize
  # @param timeout_seconds [Numeric] used for both the open and read timeouts
  def initialize(base_url:, timeout_seconds:)
    @base_url = base_url
    @timeout_seconds = timeout_seconds
    @connections_by_thread = Concurrent::Map.new
  end

  # Drops all cached per-thread connections.
  def close
    @connections_by_thread.clear
  end

  # Requests +url+ and returns a Page describing the response. Any
  # StandardError raised along the way is captured in the returned Page's
  # `error` field instead of propagating.
  #
  # @param url [String] URL to request
  # @return [Page]
  def fetch(url)
    response = connection.get(url) { |request| request.headers["User-Agent"] = USER_AGENT }

    landed_url = response.env.url.to_s
    landed_url = url if landed_url.empty?
    response_headers = response.headers.to_h
    payload = response.body.to_s
    # Only successful HTML responses are parsed; everything else has no doc.
    parsed = Nokogiri::HTML(payload) if response.status == 200 && html_response?(response_headers)

    build_page(url, final_url: landed_url, status: response.status, headers: response_headers, body: payload, doc: parsed)
  rescue => error
    # NOTE(review): this path normalizes the URL again; if Url.normalize was
    # what raised, it would presumably raise a second time here -- confirm.
    build_page(url, final_url: url, status: nil, headers: {}, body: nil, doc: nil, error: "#{error.class}: #{error.message}")
  end

  private

  # Assembles a Page, normalizing both the requested and the final URL
  # against the base URL.
  def build_page(url, final_url:, status:, headers:, body:, doc:, error: nil)
    Page.new(
      url: url,
      normalized_url: Url.normalize(url, base_url: @base_url),
      final_url: final_url,
      normalized_final_url: Url.normalize(final_url, base_url: @base_url),
      status: status,
      headers: headers,
      body: body,
      doc: doc,
      error: error
    )
  end

  # Returns the Faraday connection cached for the current thread, creating
  # it on first use.
  def connection
    @connections_by_thread.compute_if_absent(Thread.current.object_id) do
      Faraday.new do |faraday|
        faraday.response :follow_redirects, limit: MAX_REDIRECTS
        faraday.options.timeout = @timeout_seconds
        faraday.options.open_timeout = @timeout_seconds
      end
    end
  end

  # True when the Content-Type header (looked up case-insensitively) is
  # missing, empty, or mentions "text/html".
  def html_response?(headers)
    content_type = headers["content-type"]
    content_type ||= headers.find { |key, _value| key.to_s.casecmp("content-type").zero? }&.last.to_s
    content_type.empty? || content_type.include?("text/html")
  end
end
|
|
77
|
+
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Crawlscope
|
|
4
|
+
# Immutable record of a single audit finding, with one predicate per
# severity level.
Issue = Data.define(:code, :severity, :category, :url, :message, :details) do
  # @return [Boolean] whether severity is :error
  def error? = severity == :error

  # @return [Boolean] whether severity is :warning
  def warning? = severity == :warning

  # @return [Boolean] whether severity is :notice
  def notice? = severity == :notice
end
|
|
17
|
+
end
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Crawlscope
|
|
4
|
+
# Ordered, enumerable list of issues.
class IssueCollection
  include Enumerable

  # @param issues [Array] initial issues; copied so the caller's array is not shared
  def initialize(issues = [])
    @issues = issues.dup
  end

  # Appends an issue. Either pass a ready-made issue, or pass the attributes
  # from which an Issue is built.
  #
  # @param issue [Object, nil] issue to append
  # @param attributes [Hash] Issue attributes, used only when +issue+ is nil
  # @return [Object] the appended issue
  def add(issue = nil, **attributes)
    issue ||= Issue.new(**attributes)
    @issues << issue
    issue
  end

  # Same contract as Enumerable#any?. The pattern and block are forwarded;
  # without this, an override taking no arguments would silently drop the
  # block and report true for every non-empty collection.
  #
  # @return [Boolean]
  def any?(*pattern, &block)
    @issues.any?(*pattern, &block)
  end

  # Yields each issue in insertion order.
  def each(&block)
    @issues.each(&block)
  end

  # @return [Integer] number of issues
  def size
    @issues.size
  end

  # @return [Array] a copy of the issues
  def to_a
    @issues.dup
  end

  # @return [Hash] issues grouped by their category
  def by_category
    @issues.group_by(&:category)
  end

  # @return [Hash] issues grouped by their severity
  def by_severity
    @issues.group_by(&:severity)
  end
end
|
|
41
|
+
end
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Crawlscope
|
|
4
|
+
# Read-only snapshot of one fetched URL: the requested and final URLs (raw
# and normalized), response status, headers, body, parsed document, and an
# optional error description.
class Page
  attr_reader :body, :doc, :error, :final_url, :headers, :normalized_final_url, :normalized_url, :status, :url

  # @param headers [Hash, nil] response headers; nil is stored as an empty hash
  # @param doc [Object, nil] parsed document, or nil when nothing was parsed
  # @param error [String, nil] failure description, nil when the fetch succeeded
  def initialize(url:, normalized_url:, final_url:, normalized_final_url:, status:, headers:, body:, doc:, error: nil)
    @url, @normalized_url = url, normalized_url
    @final_url, @normalized_final_url = final_url, normalized_final_url
    @status = status
    @headers = headers || {}
    @body, @doc, @error = body, doc, error
  end

  # @return [Boolean] true when a parsed document is present
  def html? = !doc.nil?
end
|
|
23
|
+
end
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Crawlscope
|
|
4
|
+
# Prints a plain-text summary of a validation result to an IO-like object.
class Reporter
  # @param io [#puts] destination for the report
  def initialize(io:)
    @io = io
  end

  # Writes the run overview, then either "Status: OK" or the failure
  # breakdown (issue count, per-severity counts, one line per issue).
  #
  # @param result [#base_url, #sitemap_path, #urls, #pages, #issues, #ok?]
  def report(result)
    write_overview(result)

    if result.ok?
      @io.puts("Status: OK")
      return
    end

    write_failures(result.issues)
  end

  private

  # Header lines shared by successful and failed runs.
  def write_overview(result)
    @io.puts("Crawlscope validation")
    @io.puts("Base URL: #{result.base_url}")
    @io.puts("Sitemap: #{result.sitemap_path}")
    @io.puts("URLs: #{result.urls.size}")
    @io.puts("Pages: #{result.pages.size}")
  end

  # Failure section: totals first, then severities sorted by name, then
  # every issue in collection order.
  def write_failures(issues)
    @io.puts("Status: FAILED")
    @io.puts("Issues: #{issues.size}")

    severity_groups = issues.by_severity.sort_by { |severity, _group| severity.to_s }
    severity_groups.each { |severity, group| @io.puts("#{severity}: #{group.size}") }

    issues.each { |issue| @io.puts("- [#{issue.severity}] #{issue.url} #{issue.message}") }
  end
end
|
|
33
|
+
end
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Crawlscope
|
|
4
|
+
# Catalogue of audit rules, plus the codes of the rules that run when the
# caller does not ask for specific ones.
class RuleRegistry
  attr_reader :default_codes, :rules

  # @param rules [Array, Object] rule objects, each responding to `code`
  # @param default_codes [Array, nil] codes selected when no names are given
  def initialize(rules:, default_codes: nil)
    @rules = Array(rules)
    @default_codes = Array(default_codes).map(&:to_sym)
  end

  # Registry holding the four built-in rules, all enabled by default.
  #
  # @param site_name [Object, nil] passed to the metadata rule
  # @return [RuleRegistry]
  def self.default(site_name: nil)
    built_in_rules = [
      Rules::Metadata.new(site_name: site_name),
      Rules::StructuredData.new,
      Rules::Uniqueness.new,
      Rules::Links.new
    ]

    new(rules: built_in_rules, default_codes: %i[metadata structured_data uniqueness links])
  end

  # @return [Array] the code of every registered rule
  def codes
    @rules.map(&:code)
  end

  # Selects rules by name, in registration order.
  #
  # @param names [String, Array, nil] rule codes; each entry may itself be a
  #   comma-separated list. Blank input selects the default codes.
  # @return [Array] the matching rules
  # @raise [ConfigurationError] when a requested name matches no rule
  def rules_for(names)
    requested = Array(names)
      .flat_map { |entry| entry.to_s.split(",") }
      .map(&:strip)
      .reject(&:empty?)
    requested = @default_codes.map(&:to_s) if requested.empty?

    matched = @rules.select { |rule| requested.include?(rule.code.to_s) }
    unknown = requested - matched.map { |rule| rule.code.to_s }

    unless unknown.empty?
      raise ConfigurationError, "Unknown Crawlscope rules: #{unknown.join(", ")}"
    end

    matched
  end
end
|
|
39
|
+
end
|