crawlscope 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -11
- data/README.md +20 -13
- data/lib/crawlscope/browser.rb +8 -0
- data/lib/crawlscope/cli.rb +10 -10
- data/lib/crawlscope/configuration.rb +20 -5
- data/lib/crawlscope/context.rb +9 -0
- data/lib/crawlscope/{audit.rb → crawl.rb} +62 -58
- data/lib/crawlscope/crawler.rb +19 -1
- data/lib/crawlscope/http.rb +1 -1
- data/lib/crawlscope/rake_tasks.rb +28 -0
- data/lib/crawlscope/rules/links.rb +76 -43
- data/lib/crawlscope/rules/structured_data.rb +14 -1
- data/lib/crawlscope/run.rb +60 -0
- data/lib/crawlscope/schema_registry.rb +3 -349
- data/lib/crawlscope/schemas.rb +355 -0
- data/lib/crawlscope/sitemap.rb +18 -6
- data/lib/crawlscope/structured_data/audit.rb +7 -7
- data/lib/crawlscope/structured_data/check.rb +35 -0
- data/lib/crawlscope/structured_data/reporter.rb +69 -0
- data/lib/crawlscope/url.rb +14 -0
- data/lib/crawlscope/version.rb +1 -1
- data/lib/tasks/crawlscope_tasks.rake +12 -23
- data/test/crawlscope/browser_test.rb +155 -0
- data/test/crawlscope/cli_test.rb +128 -6
- data/test/crawlscope/configuration_test.rb +49 -0
- data/test/crawlscope/{audit_test.rb → crawl_test.rb} +11 -5
- data/test/crawlscope/crawler_test.rb +34 -0
- data/test/crawlscope/http_test.rb +56 -0
- data/test/crawlscope/links_rule_test.rb +110 -5
- data/test/crawlscope/rule_registry_test.rb +32 -0
- data/test/crawlscope/{task_test.rb → run_test.rb} +28 -33
- data/test/crawlscope/schema_registry_test.rb +19 -0
- data/test/crawlscope/sitemap_test.rb +55 -0
- data/test/crawlscope/structured_data_document_test.rb +36 -0
- data/test/crawlscope/structured_data_report_test.rb +3 -3
- data/test/crawlscope/structured_data_reporter_test.rb +2 -2
- data/test/crawlscope/structured_data_rule_test.rb +20 -0
- data/test/crawlscope/structured_data_writer_test.rb +2 -2
- data/test/crawlscope/url_test.rb +31 -0
- metadata +14 -5
- data/lib/crawlscope/task.rb +0 -131
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "test_helper"
|
|
4
|
+
|
|
5
|
+
class CrawlscopeRuleRegistryTest < Minitest::Test
|
|
6
|
+
Rule = Data.define(:code)
|
|
7
|
+
|
|
8
|
+
def test_rules_for_returns_defaults_when_names_are_blank
|
|
9
|
+
metadata = Rule.new(:metadata)
|
|
10
|
+
links = Rule.new(:links)
|
|
11
|
+
registry = Crawlscope::RuleRegistry.new(rules: [metadata, links], default_codes: %i[links])
|
|
12
|
+
|
|
13
|
+
assert_equal [links], registry.rules_for(nil)
|
|
14
|
+
assert_equal [links], registry.rules_for("")
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
def test_rules_for_accepts_csv_and_arrays
|
|
18
|
+
metadata = Rule.new(:metadata)
|
|
19
|
+
links = Rule.new(:links)
|
|
20
|
+
registry = Crawlscope::RuleRegistry.new(rules: [metadata, links])
|
|
21
|
+
|
|
22
|
+
assert_equal [metadata, links], registry.rules_for(["metadata, links"])
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
def test_rules_for_rejects_unknown_rules
|
|
26
|
+
registry = Crawlscope::RuleRegistry.new(rules: [Rule.new(:metadata)])
|
|
27
|
+
|
|
28
|
+
error = assert_raises(Crawlscope::ConfigurationError) { registry.rules_for("links") }
|
|
29
|
+
|
|
30
|
+
assert_equal "Unknown Crawlscope rules: links", error.message
|
|
31
|
+
end
|
|
32
|
+
end
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
require "test_helper"
|
|
4
4
|
|
|
5
|
-
class CrawlscopeTaskTest < Minitest::Test
|
|
5
|
+
class CrawlscopeRunTest < Minitest::Test
|
|
6
6
|
FakeResult = Data.define(:reported) do
|
|
7
7
|
def ok?
|
|
8
8
|
true
|
|
@@ -47,16 +47,29 @@ class CrawlscopeTaskTest < Minitest::Test
|
|
|
47
47
|
end
|
|
48
48
|
end
|
|
49
49
|
|
|
50
|
-
class
|
|
50
|
+
class JsonLdConfiguration
|
|
51
51
|
attr_reader :output
|
|
52
52
|
|
|
53
|
-
def initialize(output:,
|
|
53
|
+
def initialize(output:, page:)
|
|
54
54
|
@output = output
|
|
55
|
-
@
|
|
55
|
+
@page = page
|
|
56
|
+
@closed = false
|
|
56
57
|
end
|
|
57
58
|
|
|
58
59
|
def browser_factory
|
|
59
|
-
-> {
|
|
60
|
+
-> { self }
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
def close
|
|
64
|
+
@closed = true
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
def closed?
|
|
68
|
+
@closed
|
|
69
|
+
end
|
|
70
|
+
|
|
71
|
+
def fetch(_url)
|
|
72
|
+
@page
|
|
60
73
|
end
|
|
61
74
|
|
|
62
75
|
def network_idle_timeout_seconds
|
|
@@ -80,30 +93,13 @@ class CrawlscopeTaskTest < Minitest::Test
|
|
|
80
93
|
end
|
|
81
94
|
end
|
|
82
95
|
|
|
83
|
-
class FakeBrowser
|
|
84
|
-
attr_reader :closed
|
|
85
|
-
|
|
86
|
-
def initialize(page:)
|
|
87
|
-
@page = page
|
|
88
|
-
@closed = false
|
|
89
|
-
end
|
|
90
|
-
|
|
91
|
-
def close
|
|
92
|
-
@closed = true
|
|
93
|
-
end
|
|
94
|
-
|
|
95
|
-
def fetch(_url)
|
|
96
|
-
@page
|
|
97
|
-
end
|
|
98
|
-
end
|
|
99
|
-
|
|
100
96
|
def test_validate_passes_rule_names_to_configuration_audit
|
|
101
97
|
result = FakeResult.new(reported: true)
|
|
102
98
|
configuration = FakeConfiguration.new(result: result)
|
|
103
99
|
reporter = FakeReporter.new
|
|
104
100
|
|
|
105
|
-
|
|
106
|
-
returned_result =
|
|
101
|
+
run = Crawlscope::Run.new(configuration: configuration, reporter: reporter)
|
|
102
|
+
returned_result = run.validate(rule_names: "links")
|
|
107
103
|
|
|
108
104
|
assert_equal(
|
|
109
105
|
{
|
|
@@ -122,7 +118,7 @@ class CrawlscopeTaskTest < Minitest::Test
|
|
|
122
118
|
configuration = FakeConfiguration.new(result: result, base_url: "https://example.com", sitemap_path: nil)
|
|
123
119
|
reporter = FakeReporter.new
|
|
124
120
|
|
|
125
|
-
Crawlscope::Task.new(configuration: configuration, reporter: reporter).validate
|
|
121
|
+
Crawlscope::Run.new(configuration: configuration, reporter: reporter).validate
|
|
126
122
|
|
|
127
123
|
assert_equal(
|
|
128
124
|
{
|
|
@@ -144,7 +140,7 @@ class CrawlscopeTaskTest < Minitest::Test
|
|
|
144
140
|
File.write(sitemap_path, "<urlset></urlset>")
|
|
145
141
|
|
|
146
142
|
Dir.chdir(tmp_dir) do
|
|
147
|
-
Crawlscope::Task.new(configuration: configuration, reporter: reporter).validate
|
|
143
|
+
Crawlscope::Run.new(configuration: configuration, reporter: reporter).validate
|
|
148
144
|
end
|
|
149
145
|
|
|
150
146
|
assert_equal(
|
|
@@ -159,7 +155,7 @@ class CrawlscopeTaskTest < Minitest::Test
|
|
|
159
155
|
FileUtils.rm_rf(tmp_dir) if tmp_dir
|
|
160
156
|
end
|
|
161
157
|
|
|
162
|
-
def
|
|
158
|
+
def test_validate_json_ld_reports_valid_structured_data
|
|
163
159
|
body = <<~HTML
|
|
164
160
|
<html>
|
|
165
161
|
<head>
|
|
@@ -177,15 +173,14 @@ class CrawlscopeTaskTest < Minitest::Test
|
|
|
177
173
|
status: 200,
|
|
178
174
|
headers: {"content-type" => "text/html"},
|
|
179
175
|
body: body,
|
|
180
|
-
doc:
|
|
176
|
+
doc: nil
|
|
181
177
|
)
|
|
182
|
-
browser = FakeBrowser.new(page: page)
|
|
183
178
|
output = StringIO.new
|
|
184
|
-
configuration =
|
|
179
|
+
configuration = JsonLdConfiguration.new(output: output, page: page)
|
|
185
180
|
report_dir = Dir.mktmpdir
|
|
186
181
|
report_path = File.join(report_dir, "structured-data.json")
|
|
187
182
|
|
|
188
|
-
result = Crawlscope::Task.new(configuration: configuration).validate_ldjson(
|
|
183
|
+
result = Crawlscope::Run.new(configuration: configuration).validate_json_ld(
|
|
189
184
|
urls: [page.url],
|
|
190
185
|
debug: true,
|
|
191
186
|
report_path: report_path,
|
|
@@ -193,9 +188,9 @@ class CrawlscopeTaskTest < Minitest::Test
|
|
|
193
188
|
)
|
|
194
189
|
|
|
195
190
|
assert result.ok?
|
|
196
|
-
|
|
191
|
+
assert_predicate configuration, :closed?
|
|
197
192
|
assert File.exist?(report_path)
|
|
198
|
-
|
|
193
|
+
assert_equal ["https://example.com"], JSON.parse(File.read(report_path)).fetch("results").keys
|
|
199
194
|
assert_includes output.string, "JavaScript mode enabled (Ferrum)"
|
|
200
195
|
assert_includes output.string, "Validating JSON-LD on 1 URL(s)"
|
|
201
196
|
assert_includes output.string, "All valid!"
|
|
@@ -86,4 +86,23 @@ class CrawlscopeSchemaRegistryTest < Minitest::Test
|
|
|
86
86
|
|
|
87
87
|
assert_equal "Unknown Crawlscope rules: unknown", error.message
|
|
88
88
|
end
|
|
89
|
+
|
|
90
|
+
def test_validate_accepts_arrays_graphs_unknown_types_and_non_hashes
|
|
91
|
+
registry = Crawlscope::SchemaRegistry.default
|
|
92
|
+
|
|
93
|
+
errors = registry.validate(
|
|
94
|
+
[
|
|
95
|
+
"ignored",
|
|
96
|
+
{"@type" => "UnknownThing"},
|
|
97
|
+
{
|
|
98
|
+
"@graph" => [
|
|
99
|
+
{"@type" => "Article"},
|
|
100
|
+
{"@type" => "WebSite", "name" => "Example"}
|
|
101
|
+
]
|
|
102
|
+
}
|
|
103
|
+
]
|
|
104
|
+
)
|
|
105
|
+
|
|
106
|
+
assert_equal ["Article"], errors.map { |error| error[:type] }
|
|
107
|
+
end
|
|
89
108
|
end
|
|
@@ -48,4 +48,59 @@ class CrawlscopeSitemapTest < Minitest::Test
|
|
|
48
48
|
|
|
49
49
|
assert_equal ["https://www.example.com/features/reviews"], parser.urls(base_url: "https://www.example.com")
|
|
50
50
|
end
|
|
51
|
+
|
|
52
|
+
def test_rebases_remote_sitemap_index_children_to_base_url
|
|
53
|
+
stub_request(:get, "http://localhost:3000/sitemap.xml")
|
|
54
|
+
.to_return(
|
|
55
|
+
status: 200,
|
|
56
|
+
body: <<~XML
|
|
57
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
|
58
|
+
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
|
59
|
+
<sitemap><loc>https://www.example.com/sitemap-marketing.xml</loc></sitemap>
|
|
60
|
+
</sitemapindex>
|
|
61
|
+
XML
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
stub_request(:get, "http://localhost:3000/sitemap-marketing.xml")
|
|
65
|
+
.to_return(
|
|
66
|
+
status: 200,
|
|
67
|
+
body: <<~XML
|
|
68
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
|
69
|
+
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
|
70
|
+
<url><loc>https://www.example.com/features/reviews</loc></url>
|
|
71
|
+
</urlset>
|
|
72
|
+
XML
|
|
73
|
+
)
|
|
74
|
+
|
|
75
|
+
parser = Crawlscope::Sitemap.new(path: "http://localhost:3000/sitemap.xml")
|
|
76
|
+
|
|
77
|
+
assert_equal ["http://localhost:3000/features/reviews"], parser.urls(base_url: "http://localhost:3000")
|
|
78
|
+
end
|
|
79
|
+
|
|
80
|
+
def test_parses_local_sitemap_index_with_absolute_child_sitemap_loc
|
|
81
|
+
Dir.mktmpdir do |dir|
|
|
82
|
+
File.write(
|
|
83
|
+
File.join(dir, "sitemap.xml"),
|
|
84
|
+
<<~XML
|
|
85
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
|
86
|
+
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
|
87
|
+
<sitemap><loc>https://www.example.com/sitemap-pages.xml</loc></sitemap>
|
|
88
|
+
</sitemapindex>
|
|
89
|
+
XML
|
|
90
|
+
)
|
|
91
|
+
File.write(
|
|
92
|
+
File.join(dir, "sitemap-pages.xml"),
|
|
93
|
+
<<~XML
|
|
94
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
|
95
|
+
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
|
96
|
+
<url><loc>https://www.example.com/features/reviews</loc></url>
|
|
97
|
+
</urlset>
|
|
98
|
+
XML
|
|
99
|
+
)
|
|
100
|
+
|
|
101
|
+
parser = Crawlscope::Sitemap.new(path: File.join(dir, "sitemap.xml"))
|
|
102
|
+
|
|
103
|
+
assert_equal ["http://localhost:3000/features/reviews"], parser.urls(base_url: "http://localhost:3000")
|
|
104
|
+
end
|
|
105
|
+
end
|
|
51
106
|
end
|
|
@@ -25,4 +25,40 @@ class CrawlscopeStructuredDataDocumentTest < Minitest::Test
|
|
|
25
25
|
assert_equal ["json-ld", "microdata"], items.map(&:source)
|
|
26
26
|
assert_equal "Hotel Test", document.json_ld_items.first["name"]
|
|
27
27
|
end
|
|
28
|
+
|
|
29
|
+
def test_json_ld_handles_arrays_invalid_json_and_non_object_entries
|
|
30
|
+
html = <<~HTML
|
|
31
|
+
<script type="application/ld+json">
|
|
32
|
+
[{"@type":"WebSite","name":"Example"}, "ignored"]
|
|
33
|
+
</script>
|
|
34
|
+
<script type="application/ld+json">
|
|
35
|
+
{"@type":
|
|
36
|
+
</script>
|
|
37
|
+
HTML
|
|
38
|
+
|
|
39
|
+
document = Crawlscope::StructuredData::Document.new(html: html)
|
|
40
|
+
|
|
41
|
+
assert_equal 2, document.items.size
|
|
42
|
+
assert_equal ["WebSite"], document.json_ld_items.map { |item| item["@type"] }
|
|
43
|
+
assert_equal "Invalid JSON-LD", document.items.last.data[:error]
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
def test_microdata_extracts_common_value_attributes
|
|
47
|
+
html = <<~HTML
|
|
48
|
+
<div itemscope itemtype="https://schema.org/Event">
|
|
49
|
+
<meta itemprop="name" content="Launch">
|
|
50
|
+
<time itemprop="startDate" datetime="2026-04-24T10:00:00Z"></time>
|
|
51
|
+
<a itemprop="url" href="https://example.com/event">Event</a>
|
|
52
|
+
<data itemprop="position" value="1"></data>
|
|
53
|
+
</div>
|
|
54
|
+
HTML
|
|
55
|
+
|
|
56
|
+
item = Crawlscope::StructuredData::Document.new(html: html).items.first.data
|
|
57
|
+
|
|
58
|
+
assert_equal "Event", item["@type"]
|
|
59
|
+
assert_equal "Launch", item["name"]
|
|
60
|
+
assert_equal "2026-04-24T10:00:00Z", item["startDate"]
|
|
61
|
+
assert_equal "https://example.com/event", item["url"]
|
|
62
|
+
assert_equal "1", item["position"]
|
|
63
|
+
end
|
|
28
64
|
end
|
|
@@ -4,9 +4,9 @@ require "test_helper"
|
|
|
4
4
|
|
|
5
5
|
class CrawlscopeStructuredDataReportTest < Minitest::Test
|
|
6
6
|
def test_results_maps_validation_errors_and_skips
|
|
7
|
-
result = Crawlscope::StructuredData::Audit::
|
|
7
|
+
result = Crawlscope::StructuredData::Audit::Outcome.new(
|
|
8
8
|
entries: [
|
|
9
|
-
Crawlscope::StructuredData::Audit::
|
|
9
|
+
Crawlscope::StructuredData::Audit::Page.new(
|
|
10
10
|
url: "https://example.com/article",
|
|
11
11
|
status: 200,
|
|
12
12
|
structured_items: [{source: "json-ld", data: {"@type" => "Article"}}],
|
|
@@ -15,7 +15,7 @@ class CrawlscopeStructuredDataReportTest < Minitest::Test
|
|
|
15
15
|
content_type: "text/html",
|
|
16
16
|
skipped_reason: nil
|
|
17
17
|
),
|
|
18
|
-
Crawlscope::StructuredData::Audit::
|
|
18
|
+
Crawlscope::StructuredData::Audit::Page.new(
|
|
19
19
|
url: "https://example.com/feed.xml",
|
|
20
20
|
status: 200,
|
|
21
21
|
structured_items: [],
|
|
@@ -5,9 +5,9 @@ require "test_helper"
|
|
|
5
5
|
|
|
6
6
|
class CrawlscopeStructuredDataReporterTest < Minitest::Test
|
|
7
7
|
def test_reports_failures_and_report_path
|
|
8
|
-
result = Crawlscope::StructuredData::Audit::
|
|
8
|
+
result = Crawlscope::StructuredData::Audit::Outcome.new(
|
|
9
9
|
entries: [
|
|
10
|
-
Crawlscope::StructuredData::Audit::
|
|
10
|
+
Crawlscope::StructuredData::Audit::Page.new(
|
|
11
11
|
url: "https://example.com/article",
|
|
12
12
|
status: 200,
|
|
13
13
|
structured_items: [{source: "json-ld", data: {"@type" => "Article"}}],
|
|
@@ -59,6 +59,26 @@ class CrawlscopeStructuredDataRuleTest < Minitest::Test
|
|
|
59
59
|
assert_equal [:structured_data_parse_error], issues.to_a.map(&:code)
|
|
60
60
|
end
|
|
61
61
|
|
|
62
|
+
def test_reports_missing_structured_data_for_html_pages
|
|
63
|
+
issues = Crawlscope::IssueCollection.new
|
|
64
|
+
rule = Crawlscope::Rules::StructuredData.new
|
|
65
|
+
page = page(
|
|
66
|
+
url: "https://example.com/articles/test",
|
|
67
|
+
body: "<html><body><main><h1>Article</h1></main></body></html>"
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
rule.call(
|
|
71
|
+
urls: [page.url],
|
|
72
|
+
pages: [page],
|
|
73
|
+
issues: issues,
|
|
74
|
+
context: {schema_registry: Crawlscope::SchemaRegistry.default}
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
assert_equal [:missing_structured_data], issues.to_a.map(&:code)
|
|
78
|
+
assert_equal "no structured data found; add JSON-LD or microdata markup", issues.to_a.first.message
|
|
79
|
+
assert_equal ["json-ld", "microdata"], issues.to_a.first.details[:expected_sources]
|
|
80
|
+
end
|
|
81
|
+
|
|
62
82
|
private
|
|
63
83
|
|
|
64
84
|
def page(url:, body:)
|
|
@@ -5,9 +5,9 @@ require "test_helper"
|
|
|
5
5
|
|
|
6
6
|
class CrawlscopeStructuredDataWriterTest < Minitest::Test
|
|
7
7
|
def test_writes_json_report
|
|
8
|
-
result = Crawlscope::StructuredData::Audit::
|
|
8
|
+
result = Crawlscope::StructuredData::Audit::Outcome.new(
|
|
9
9
|
entries: [
|
|
10
|
-
Crawlscope::StructuredData::Audit::
|
|
10
|
+
Crawlscope::StructuredData::Audit::Page.new(
|
|
11
11
|
url: "https://example.com/article",
|
|
12
12
|
status: 200,
|
|
13
13
|
structured_items: [{source: "json-ld", data: {"@type" => "Article"}}],
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "test_helper"
|
|
4
|
+
|
|
5
|
+
class CrawlscopeUrlTest < Minitest::Test
|
|
6
|
+
def test_normalize_resolves_relative_urls_and_removes_trailing_slash
|
|
7
|
+
assert_equal "https://example.com/pricing", Crawlscope::Url.normalize("/pricing/", base_url: "https://example.com")
|
|
8
|
+
end
|
|
9
|
+
|
|
10
|
+
def test_normalize_preserves_non_default_port
|
|
11
|
+
assert_equal "http://localhost:3000/pricing", Crawlscope::Url.normalize("/pricing", base_url: "http://localhost:3000")
|
|
12
|
+
end
|
|
13
|
+
|
|
14
|
+
def test_normalize_for_base_rebases_absolute_urls
|
|
15
|
+
assert_equal(
|
|
16
|
+
"http://localhost:3000/features",
|
|
17
|
+
Crawlscope::Url.normalize_for_base("https://www.example.com/features", base_url: "http://localhost:3000")
|
|
18
|
+
)
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
def test_path_normalizes_blank_and_trailing_slash
|
|
22
|
+
assert_equal "/", Crawlscope::Url.path("https://example.com")
|
|
23
|
+
assert_equal "/features", Crawlscope::Url.path("https://example.com/features/")
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
def test_invalid_urls_are_returned_or_ignored
|
|
27
|
+
assert_equal "http:// bad", Crawlscope::Url.normalize("http:// bad", base_url: "https://example.com")
|
|
28
|
+
assert_nil Crawlscope::Url.path("http:// bad")
|
|
29
|
+
refute Crawlscope::Url.remote?("http:// bad")
|
|
30
|
+
end
|
|
31
|
+
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: crawlscope
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.0
|
|
4
|
+
version: 0.2.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Paulo Fidalgo
|
|
@@ -193,16 +193,18 @@ files:
|
|
|
193
193
|
- README.md
|
|
194
194
|
- exe/crawlscope
|
|
195
195
|
- lib/crawlscope.rb
|
|
196
|
-
- lib/crawlscope/audit.rb
|
|
197
196
|
- lib/crawlscope/browser.rb
|
|
198
197
|
- lib/crawlscope/cli.rb
|
|
199
198
|
- lib/crawlscope/configuration.rb
|
|
199
|
+
- lib/crawlscope/context.rb
|
|
200
|
+
- lib/crawlscope/crawl.rb
|
|
200
201
|
- lib/crawlscope/crawler.rb
|
|
201
202
|
- lib/crawlscope/http.rb
|
|
202
203
|
- lib/crawlscope/issue.rb
|
|
203
204
|
- lib/crawlscope/issue_collection.rb
|
|
204
205
|
- lib/crawlscope/page.rb
|
|
205
206
|
- lib/crawlscope/railtie.rb
|
|
207
|
+
- lib/crawlscope/rake_tasks.rb
|
|
206
208
|
- lib/crawlscope/reporter.rb
|
|
207
209
|
- lib/crawlscope/result.rb
|
|
208
210
|
- lib/crawlscope/rule_registry.rb
|
|
@@ -210,23 +212,30 @@ files:
|
|
|
210
212
|
- lib/crawlscope/rules/metadata.rb
|
|
211
213
|
- lib/crawlscope/rules/structured_data.rb
|
|
212
214
|
- lib/crawlscope/rules/uniqueness.rb
|
|
215
|
+
- lib/crawlscope/run.rb
|
|
213
216
|
- lib/crawlscope/schema_registry.rb
|
|
217
|
+
- lib/crawlscope/schemas.rb
|
|
214
218
|
- lib/crawlscope/sitemap.rb
|
|
215
219
|
- lib/crawlscope/structured_data/audit.rb
|
|
220
|
+
- lib/crawlscope/structured_data/check.rb
|
|
216
221
|
- lib/crawlscope/structured_data/document.rb
|
|
217
222
|
- lib/crawlscope/structured_data/report.rb
|
|
218
223
|
- lib/crawlscope/structured_data/reporter.rb
|
|
219
224
|
- lib/crawlscope/structured_data/writer.rb
|
|
220
|
-
- lib/crawlscope/task.rb
|
|
221
225
|
- lib/crawlscope/url.rb
|
|
222
226
|
- lib/crawlscope/version.rb
|
|
223
227
|
- lib/tasks/crawlscope_tasks.rake
|
|
224
|
-
- test/crawlscope/audit_test.rb
|
|
228
|
+
- test/crawlscope/browser_test.rb
|
|
225
229
|
- test/crawlscope/cli_test.rb
|
|
226
230
|
- test/crawlscope/configuration_test.rb
|
|
231
|
+
- test/crawlscope/crawl_test.rb
|
|
232
|
+
- test/crawlscope/crawler_test.rb
|
|
233
|
+
- test/crawlscope/http_test.rb
|
|
227
234
|
- test/crawlscope/links_rule_test.rb
|
|
228
235
|
- test/crawlscope/loader_test.rb
|
|
229
236
|
- test/crawlscope/reporter_test.rb
|
|
237
|
+
- test/crawlscope/rule_registry_test.rb
|
|
238
|
+
- test/crawlscope/run_test.rb
|
|
230
239
|
- test/crawlscope/schema_registry_test.rb
|
|
231
240
|
- test/crawlscope/sitemap_test.rb
|
|
232
241
|
- test/crawlscope/structured_data_audit_test.rb
|
|
@@ -235,8 +244,8 @@ files:
|
|
|
235
244
|
- test/crawlscope/structured_data_reporter_test.rb
|
|
236
245
|
- test/crawlscope/structured_data_rule_test.rb
|
|
237
246
|
- test/crawlscope/structured_data_writer_test.rb
|
|
238
|
-
- test/crawlscope/task_test.rb
|
|
239
247
|
- test/crawlscope/uniqueness_rule_test.rb
|
|
248
|
+
- test/crawlscope/url_test.rb
|
|
240
249
|
- test/test_helper.rb
|
|
241
250
|
homepage: https://www.ethos-link.com/opensource/crawlscope
|
|
242
251
|
licenses:
|
data/lib/crawlscope/task.rb
DELETED
|
@@ -1,131 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require "json"
|
|
4
|
-
|
|
5
|
-
module Crawlscope
|
|
6
|
-
class Task
|
|
7
|
-
def initialize(configuration: Crawlscope.configuration, reporter: Reporter.new(io: configuration.output))
|
|
8
|
-
@configuration = configuration
|
|
9
|
-
@reporter = reporter
|
|
10
|
-
end
|
|
11
|
-
|
|
12
|
-
def validate(base_url: nil, sitemap_path: nil, rule_names: nil)
|
|
13
|
-
resolved_base_url = base_url || default_base_url
|
|
14
|
-
audit = @configuration.audit(
|
|
15
|
-
base_url: resolved_base_url,
|
|
16
|
-
sitemap_path: sitemap_path || default_sitemap_path(base_url: resolved_base_url),
|
|
17
|
-
rule_names: rule_names
|
|
18
|
-
)
|
|
19
|
-
|
|
20
|
-
result = audit.call
|
|
21
|
-
@reporter.report(result)
|
|
22
|
-
result
|
|
23
|
-
end
|
|
24
|
-
|
|
25
|
-
def validate_ldjson(urls:, debug: false, renderer: @configuration.renderer, timeout_seconds: @configuration.timeout_seconds, report_path: nil, summary: false)
|
|
26
|
-
audit = StructuredData::Audit.new(
|
|
27
|
-
browser_factory: @configuration.browser_factory,
|
|
28
|
-
network_idle_timeout_seconds: @configuration.network_idle_timeout_seconds,
|
|
29
|
-
renderer: renderer,
|
|
30
|
-
schema_registry: @configuration.schema_registry,
|
|
31
|
-
scroll_page: @configuration.scroll_page?,
|
|
32
|
-
timeout_seconds: timeout_seconds
|
|
33
|
-
)
|
|
34
|
-
result = audit.call(urls: urls)
|
|
35
|
-
|
|
36
|
-
report_ldjson_result(result, debug: debug, renderer: renderer)
|
|
37
|
-
StructuredData::Writer.new(path: report_path).write(result) if report_path
|
|
38
|
-
StructuredData::Reporter.new(io: @configuration.output, report_path: report_path).report(result) if summary
|
|
39
|
-
result
|
|
40
|
-
end
|
|
41
|
-
|
|
42
|
-
private
|
|
43
|
-
|
|
44
|
-
def default_base_url
|
|
45
|
-
value = @configuration.base_url
|
|
46
|
-
return value unless value.to_s.strip.empty?
|
|
47
|
-
|
|
48
|
-
"http://localhost:3000"
|
|
49
|
-
end
|
|
50
|
-
|
|
51
|
-
def default_sitemap_path(base_url:)
|
|
52
|
-
value = @configuration.sitemap_path
|
|
53
|
-
return value unless value.to_s.strip.empty?
|
|
54
|
-
|
|
55
|
-
local_path = File.expand_path("public/sitemap.xml", Dir.pwd)
|
|
56
|
-
if local_path_default?(base_url: base_url) && File.exist?(local_path)
|
|
57
|
-
return local_path
|
|
58
|
-
end
|
|
59
|
-
|
|
60
|
-
"#{base_url.to_s.chomp("/")}/sitemap.xml"
|
|
61
|
-
end
|
|
62
|
-
|
|
63
|
-
def local_path_default?(base_url:)
|
|
64
|
-
host = URI.parse(base_url.to_s).host.to_s
|
|
65
|
-
["localhost", "127.0.0.1"].include?(host)
|
|
66
|
-
rescue URI::InvalidURIError
|
|
67
|
-
false
|
|
68
|
-
end
|
|
69
|
-
|
|
70
|
-
def report_ldjson_result(result, debug:, renderer:)
|
|
71
|
-
if renderer == :browser
|
|
72
|
-
@configuration.output.puts("JavaScript mode enabled (Ferrum)")
|
|
73
|
-
end
|
|
74
|
-
|
|
75
|
-
@configuration.output.puts("Validating JSON-LD on #{result.entries.size} URL(s)")
|
|
76
|
-
@configuration.output.puts("")
|
|
77
|
-
|
|
78
|
-
result.entries.each do |entry|
|
|
79
|
-
@configuration.output.puts("=" * 80)
|
|
80
|
-
@configuration.output.puts("URL: #{entry.url}")
|
|
81
|
-
@configuration.output.puts("=" * 80)
|
|
82
|
-
|
|
83
|
-
if entry.fetch_error
|
|
84
|
-
@configuration.output.puts("Error: #{entry.fetch_error}")
|
|
85
|
-
@configuration.output.puts("")
|
|
86
|
-
next
|
|
87
|
-
end
|
|
88
|
-
|
|
89
|
-
if entry.status
|
|
90
|
-
@configuration.output.puts("Status: #{entry.status}")
|
|
91
|
-
else
|
|
92
|
-
@configuration.output.puts("Status: JS runtime fetch")
|
|
93
|
-
end
|
|
94
|
-
|
|
95
|
-
@configuration.output.puts("Structured data found: #{entry.structured_items.size} (JSON-LD: #{entry.json_ld_count}, Microdata: #{entry.microdata_count})")
|
|
96
|
-
|
|
97
|
-
if debug && entry.structured_items.any?
|
|
98
|
-
@configuration.output.puts("")
|
|
99
|
-
@configuration.output.puts("--- Detected Structured Data ---")
|
|
100
|
-
|
|
101
|
-
entry.structured_items.each_with_index do |item, index|
|
|
102
|
-
@configuration.output.puts("")
|
|
103
|
-
@configuration.output.puts("## Item #{index + 1} [#{item[:source]}]")
|
|
104
|
-
@configuration.output.puts(JSON.pretty_generate(item[:data]))
|
|
105
|
-
end
|
|
106
|
-
|
|
107
|
-
@configuration.output.puts("")
|
|
108
|
-
@configuration.output.puts("--- End ---")
|
|
109
|
-
end
|
|
110
|
-
|
|
111
|
-
@configuration.output.puts("")
|
|
112
|
-
@configuration.output.puts("Validation results:")
|
|
113
|
-
|
|
114
|
-
if entry.errors.empty?
|
|
115
|
-
@configuration.output.puts(" All valid!")
|
|
116
|
-
else
|
|
117
|
-
entry.errors.each do |error|
|
|
118
|
-
@configuration.output.puts(" #{error[:type]}: INVALID [#{error[:source]}]")
|
|
119
|
-
error[:errors].each do |validation_error|
|
|
120
|
-
@configuration.output.puts(" - field: #{validation_error[:field]}, issue: #{validation_error[:issue]}")
|
|
121
|
-
end
|
|
122
|
-
end
|
|
123
|
-
end
|
|
124
|
-
|
|
125
|
-
@configuration.output.puts("")
|
|
126
|
-
end
|
|
127
|
-
|
|
128
|
-
@configuration.output.puts("STATUS: #{result.ok? ? "OK" : "FAILED"}")
|
|
129
|
-
end
|
|
130
|
-
end
|
|
131
|
-
end
|