crawlscope 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -8
- data/README.md +21 -14
- data/lib/crawlscope/browser.rb +8 -0
- data/lib/crawlscope/cli.rb +15 -10
- data/lib/crawlscope/configuration.rb +20 -5
- data/lib/crawlscope/context.rb +9 -0
- data/lib/crawlscope/{audit.rb → crawl.rb} +68 -58
- data/lib/crawlscope/crawler.rb +19 -1
- data/lib/crawlscope/http.rb +1 -1
- data/lib/crawlscope/rake_tasks.rb +28 -0
- data/lib/crawlscope/rules/links.rb +99 -48
- data/lib/crawlscope/rules/metadata.rb +57 -11
- data/lib/crawlscope/rules/structured_data.rb +61 -1
- data/lib/crawlscope/run.rb +60 -0
- data/lib/crawlscope/schema_registry.rb +3 -349
- data/lib/crawlscope/schemas.rb +406 -0
- data/lib/crawlscope/sitemap.rb +18 -6
- data/lib/crawlscope/structured_data/audit.rb +7 -7
- data/lib/crawlscope/structured_data/check.rb +35 -0
- data/lib/crawlscope/structured_data/reporter.rb +69 -0
- data/lib/crawlscope/url.rb +14 -0
- data/lib/crawlscope/version.rb +1 -1
- data/lib/tasks/crawlscope_tasks.rake +12 -23
- data/test/crawlscope/browser_test.rb +155 -0
- data/test/crawlscope/cli_test.rb +143 -7
- data/test/crawlscope/configuration_test.rb +49 -0
- data/test/crawlscope/{audit_test.rb → crawl_test.rb} +23 -7
- data/test/crawlscope/crawler_test.rb +34 -0
- data/test/crawlscope/http_test.rb +56 -0
- data/test/crawlscope/links_rule_test.rb +149 -5
- data/test/crawlscope/metadata_rule_test.rb +77 -0
- data/test/crawlscope/rule_registry_test.rb +32 -0
- data/test/crawlscope/{task_test.rb → run_test.rb} +28 -33
- data/test/crawlscope/schema_registry_test.rb +19 -0
- data/test/crawlscope/sitemap_test.rb +55 -0
- data/test/crawlscope/structured_data_document_test.rb +36 -0
- data/test/crawlscope/structured_data_report_test.rb +3 -3
- data/test/crawlscope/structured_data_reporter_test.rb +2 -2
- data/test/crawlscope/structured_data_rule_test.rb +111 -0
- data/test/crawlscope/structured_data_writer_test.rb +2 -2
- data/test/crawlscope/url_test.rb +31 -0
- metadata +15 -5
- data/lib/crawlscope/task.rb +0 -131
|
@@ -5,9 +5,9 @@ require "test_helper"
|
|
|
5
5
|
|
|
6
6
|
class CrawlscopeStructuredDataWriterTest < Minitest::Test
|
|
7
7
|
def test_writes_json_report
|
|
8
|
-
result = Crawlscope::StructuredData::Audit::
|
|
8
|
+
result = Crawlscope::StructuredData::Audit::Outcome.new(
|
|
9
9
|
entries: [
|
|
10
|
-
Crawlscope::StructuredData::Audit::
|
|
10
|
+
Crawlscope::StructuredData::Audit::Page.new(
|
|
11
11
|
url: "https://example.com/article",
|
|
12
12
|
status: 200,
|
|
13
13
|
structured_items: [{source: "json-ld", data: {"@type" => "Article"}}],
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "test_helper"
|
|
4
|
+
|
|
5
|
+
class CrawlscopeUrlTest < Minitest::Test
|
|
6
|
+
def test_normalize_resolves_relative_urls_and_removes_trailing_slash
|
|
7
|
+
assert_equal "https://example.com/pricing", Crawlscope::Url.normalize("/pricing/", base_url: "https://example.com")
|
|
8
|
+
end
|
|
9
|
+
|
|
10
|
+
def test_normalize_preserves_non_default_port
|
|
11
|
+
assert_equal "http://localhost:3000/pricing", Crawlscope::Url.normalize("/pricing", base_url: "http://localhost:3000")
|
|
12
|
+
end
|
|
13
|
+
|
|
14
|
+
def test_normalize_for_base_rebases_absolute_urls
|
|
15
|
+
assert_equal(
|
|
16
|
+
"http://localhost:3000/features",
|
|
17
|
+
Crawlscope::Url.normalize_for_base("https://www.example.com/features", base_url: "http://localhost:3000")
|
|
18
|
+
)
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
def test_path_normalizes_blank_and_trailing_slash
|
|
22
|
+
assert_equal "/", Crawlscope::Url.path("https://example.com")
|
|
23
|
+
assert_equal "/features", Crawlscope::Url.path("https://example.com/features/")
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
def test_invalid_urls_are_returned_or_ignored
|
|
27
|
+
assert_equal "http:// bad", Crawlscope::Url.normalize("http:// bad", base_url: "https://example.com")
|
|
28
|
+
assert_nil Crawlscope::Url.path("http:// bad")
|
|
29
|
+
refute Crawlscope::Url.remote?("http:// bad")
|
|
30
|
+
end
|
|
31
|
+
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: crawlscope
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.0
|
|
4
|
+
version: 0.3.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Paulo Fidalgo
|
|
@@ -193,16 +193,18 @@ files:
|
|
|
193
193
|
- README.md
|
|
194
194
|
- exe/crawlscope
|
|
195
195
|
- lib/crawlscope.rb
|
|
196
|
-
- lib/crawlscope/audit.rb
|
|
197
196
|
- lib/crawlscope/browser.rb
|
|
198
197
|
- lib/crawlscope/cli.rb
|
|
199
198
|
- lib/crawlscope/configuration.rb
|
|
199
|
+
- lib/crawlscope/context.rb
|
|
200
|
+
- lib/crawlscope/crawl.rb
|
|
200
201
|
- lib/crawlscope/crawler.rb
|
|
201
202
|
- lib/crawlscope/http.rb
|
|
202
203
|
- lib/crawlscope/issue.rb
|
|
203
204
|
- lib/crawlscope/issue_collection.rb
|
|
204
205
|
- lib/crawlscope/page.rb
|
|
205
206
|
- lib/crawlscope/railtie.rb
|
|
207
|
+
- lib/crawlscope/rake_tasks.rb
|
|
206
208
|
- lib/crawlscope/reporter.rb
|
|
207
209
|
- lib/crawlscope/result.rb
|
|
208
210
|
- lib/crawlscope/rule_registry.rb
|
|
@@ -210,23 +212,31 @@ files:
|
|
|
210
212
|
- lib/crawlscope/rules/metadata.rb
|
|
211
213
|
- lib/crawlscope/rules/structured_data.rb
|
|
212
214
|
- lib/crawlscope/rules/uniqueness.rb
|
|
215
|
+
- lib/crawlscope/run.rb
|
|
213
216
|
- lib/crawlscope/schema_registry.rb
|
|
217
|
+
- lib/crawlscope/schemas.rb
|
|
214
218
|
- lib/crawlscope/sitemap.rb
|
|
215
219
|
- lib/crawlscope/structured_data/audit.rb
|
|
220
|
+
- lib/crawlscope/structured_data/check.rb
|
|
216
221
|
- lib/crawlscope/structured_data/document.rb
|
|
217
222
|
- lib/crawlscope/structured_data/report.rb
|
|
218
223
|
- lib/crawlscope/structured_data/reporter.rb
|
|
219
224
|
- lib/crawlscope/structured_data/writer.rb
|
|
220
|
-
- lib/crawlscope/task.rb
|
|
221
225
|
- lib/crawlscope/url.rb
|
|
222
226
|
- lib/crawlscope/version.rb
|
|
223
227
|
- lib/tasks/crawlscope_tasks.rake
|
|
224
|
-
- test/crawlscope/audit_test.rb
|
|
228
|
+
- test/crawlscope/browser_test.rb
|
|
225
229
|
- test/crawlscope/cli_test.rb
|
|
226
230
|
- test/crawlscope/configuration_test.rb
|
|
231
|
+
- test/crawlscope/crawl_test.rb
|
|
232
|
+
- test/crawlscope/crawler_test.rb
|
|
233
|
+
- test/crawlscope/http_test.rb
|
|
227
234
|
- test/crawlscope/links_rule_test.rb
|
|
228
235
|
- test/crawlscope/loader_test.rb
|
|
236
|
+
- test/crawlscope/metadata_rule_test.rb
|
|
229
237
|
- test/crawlscope/reporter_test.rb
|
|
238
|
+
- test/crawlscope/rule_registry_test.rb
|
|
239
|
+
- test/crawlscope/run_test.rb
|
|
230
240
|
- test/crawlscope/schema_registry_test.rb
|
|
231
241
|
- test/crawlscope/sitemap_test.rb
|
|
232
242
|
- test/crawlscope/structured_data_audit_test.rb
|
|
@@ -235,8 +245,8 @@ files:
|
|
|
235
245
|
- test/crawlscope/structured_data_reporter_test.rb
|
|
236
246
|
- test/crawlscope/structured_data_rule_test.rb
|
|
237
247
|
- test/crawlscope/structured_data_writer_test.rb
|
|
238
|
-
- test/crawlscope/task_test.rb
|
|
239
248
|
- test/crawlscope/uniqueness_rule_test.rb
|
|
249
|
+
- test/crawlscope/url_test.rb
|
|
240
250
|
- test/test_helper.rb
|
|
241
251
|
homepage: https://www.ethos-link.com/opensource/crawlscope
|
|
242
252
|
licenses:
|
data/lib/crawlscope/task.rb
DELETED
|
@@ -1,131 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require "json"
|
|
4
|
-
|
|
5
|
-
module Crawlscope
|
|
6
|
-
class Task
|
|
7
|
-
def initialize(configuration: Crawlscope.configuration, reporter: Reporter.new(io: configuration.output))
|
|
8
|
-
@configuration = configuration
|
|
9
|
-
@reporter = reporter
|
|
10
|
-
end
|
|
11
|
-
|
|
12
|
-
def validate(base_url: nil, sitemap_path: nil, rule_names: nil)
|
|
13
|
-
resolved_base_url = base_url || default_base_url
|
|
14
|
-
audit = @configuration.audit(
|
|
15
|
-
base_url: resolved_base_url,
|
|
16
|
-
sitemap_path: sitemap_path || default_sitemap_path(base_url: resolved_base_url),
|
|
17
|
-
rule_names: rule_names
|
|
18
|
-
)
|
|
19
|
-
|
|
20
|
-
result = audit.call
|
|
21
|
-
@reporter.report(result)
|
|
22
|
-
result
|
|
23
|
-
end
|
|
24
|
-
|
|
25
|
-
def validate_ldjson(urls:, debug: false, renderer: @configuration.renderer, timeout_seconds: @configuration.timeout_seconds, report_path: nil, summary: false)
|
|
26
|
-
audit = StructuredData::Audit.new(
|
|
27
|
-
browser_factory: @configuration.browser_factory,
|
|
28
|
-
network_idle_timeout_seconds: @configuration.network_idle_timeout_seconds,
|
|
29
|
-
renderer: renderer,
|
|
30
|
-
schema_registry: @configuration.schema_registry,
|
|
31
|
-
scroll_page: @configuration.scroll_page?,
|
|
32
|
-
timeout_seconds: timeout_seconds
|
|
33
|
-
)
|
|
34
|
-
result = audit.call(urls: urls)
|
|
35
|
-
|
|
36
|
-
report_ldjson_result(result, debug: debug, renderer: renderer)
|
|
37
|
-
StructuredData::Writer.new(path: report_path).write(result) if report_path
|
|
38
|
-
StructuredData::Reporter.new(io: @configuration.output, report_path: report_path).report(result) if summary
|
|
39
|
-
result
|
|
40
|
-
end
|
|
41
|
-
|
|
42
|
-
private
|
|
43
|
-
|
|
44
|
-
def default_base_url
|
|
45
|
-
value = @configuration.base_url
|
|
46
|
-
return value unless value.to_s.strip.empty?
|
|
47
|
-
|
|
48
|
-
"http://localhost:3000"
|
|
49
|
-
end
|
|
50
|
-
|
|
51
|
-
def default_sitemap_path(base_url:)
|
|
52
|
-
value = @configuration.sitemap_path
|
|
53
|
-
return value unless value.to_s.strip.empty?
|
|
54
|
-
|
|
55
|
-
local_path = File.expand_path("public/sitemap.xml", Dir.pwd)
|
|
56
|
-
if local_path_default?(base_url: base_url) && File.exist?(local_path)
|
|
57
|
-
return local_path
|
|
58
|
-
end
|
|
59
|
-
|
|
60
|
-
"#{base_url.to_s.chomp("/")}/sitemap.xml"
|
|
61
|
-
end
|
|
62
|
-
|
|
63
|
-
def local_path_default?(base_url:)
|
|
64
|
-
host = URI.parse(base_url.to_s).host.to_s
|
|
65
|
-
["localhost", "127.0.0.1"].include?(host)
|
|
66
|
-
rescue URI::InvalidURIError
|
|
67
|
-
false
|
|
68
|
-
end
|
|
69
|
-
|
|
70
|
-
def report_ldjson_result(result, debug:, renderer:)
|
|
71
|
-
if renderer == :browser
|
|
72
|
-
@configuration.output.puts("JavaScript mode enabled (Ferrum)")
|
|
73
|
-
end
|
|
74
|
-
|
|
75
|
-
@configuration.output.puts("Validating JSON-LD on #{result.entries.size} URL(s)")
|
|
76
|
-
@configuration.output.puts("")
|
|
77
|
-
|
|
78
|
-
result.entries.each do |entry|
|
|
79
|
-
@configuration.output.puts("=" * 80)
|
|
80
|
-
@configuration.output.puts("URL: #{entry.url}")
|
|
81
|
-
@configuration.output.puts("=" * 80)
|
|
82
|
-
|
|
83
|
-
if entry.fetch_error
|
|
84
|
-
@configuration.output.puts("Error: #{entry.fetch_error}")
|
|
85
|
-
@configuration.output.puts("")
|
|
86
|
-
next
|
|
87
|
-
end
|
|
88
|
-
|
|
89
|
-
if entry.status
|
|
90
|
-
@configuration.output.puts("Status: #{entry.status}")
|
|
91
|
-
else
|
|
92
|
-
@configuration.output.puts("Status: JS runtime fetch")
|
|
93
|
-
end
|
|
94
|
-
|
|
95
|
-
@configuration.output.puts("Structured data found: #{entry.structured_items.size} (JSON-LD: #{entry.json_ld_count}, Microdata: #{entry.microdata_count})")
|
|
96
|
-
|
|
97
|
-
if debug && entry.structured_items.any?
|
|
98
|
-
@configuration.output.puts("")
|
|
99
|
-
@configuration.output.puts("--- Detected Structured Data ---")
|
|
100
|
-
|
|
101
|
-
entry.structured_items.each_with_index do |item, index|
|
|
102
|
-
@configuration.output.puts("")
|
|
103
|
-
@configuration.output.puts("## Item #{index + 1} [#{item[:source]}]")
|
|
104
|
-
@configuration.output.puts(JSON.pretty_generate(item[:data]))
|
|
105
|
-
end
|
|
106
|
-
|
|
107
|
-
@configuration.output.puts("")
|
|
108
|
-
@configuration.output.puts("--- End ---")
|
|
109
|
-
end
|
|
110
|
-
|
|
111
|
-
@configuration.output.puts("")
|
|
112
|
-
@configuration.output.puts("Validation results:")
|
|
113
|
-
|
|
114
|
-
if entry.errors.empty?
|
|
115
|
-
@configuration.output.puts(" All valid!")
|
|
116
|
-
else
|
|
117
|
-
entry.errors.each do |error|
|
|
118
|
-
@configuration.output.puts(" #{error[:type]}: INVALID [#{error[:source]}]")
|
|
119
|
-
error[:errors].each do |validation_error|
|
|
120
|
-
@configuration.output.puts(" - field: #{validation_error[:field]}, issue: #{validation_error[:issue]}")
|
|
121
|
-
end
|
|
122
|
-
end
|
|
123
|
-
end
|
|
124
|
-
|
|
125
|
-
@configuration.output.puts("")
|
|
126
|
-
end
|
|
127
|
-
|
|
128
|
-
@configuration.output.puts("STATUS: #{result.ok? ? "OK" : "FAILED"}")
|
|
129
|
-
end
|
|
130
|
-
end
|
|
131
|
-
end
|