crawlscope 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +7 -8
  3. data/README.md +21 -14
  4. data/lib/crawlscope/browser.rb +8 -0
  5. data/lib/crawlscope/cli.rb +15 -10
  6. data/lib/crawlscope/configuration.rb +20 -5
  7. data/lib/crawlscope/context.rb +9 -0
  8. data/lib/crawlscope/{audit.rb → crawl.rb} +68 -58
  9. data/lib/crawlscope/crawler.rb +19 -1
  10. data/lib/crawlscope/http.rb +1 -1
  11. data/lib/crawlscope/rake_tasks.rb +28 -0
  12. data/lib/crawlscope/rules/links.rb +99 -48
  13. data/lib/crawlscope/rules/metadata.rb +57 -11
  14. data/lib/crawlscope/rules/structured_data.rb +61 -1
  15. data/lib/crawlscope/run.rb +60 -0
  16. data/lib/crawlscope/schema_registry.rb +3 -349
  17. data/lib/crawlscope/schemas.rb +406 -0
  18. data/lib/crawlscope/sitemap.rb +18 -6
  19. data/lib/crawlscope/structured_data/audit.rb +7 -7
  20. data/lib/crawlscope/structured_data/check.rb +35 -0
  21. data/lib/crawlscope/structured_data/reporter.rb +69 -0
  22. data/lib/crawlscope/url.rb +14 -0
  23. data/lib/crawlscope/version.rb +1 -1
  24. data/lib/tasks/crawlscope_tasks.rake +12 -23
  25. data/test/crawlscope/browser_test.rb +155 -0
  26. data/test/crawlscope/cli_test.rb +143 -7
  27. data/test/crawlscope/configuration_test.rb +49 -0
  28. data/test/crawlscope/{audit_test.rb → crawl_test.rb} +23 -7
  29. data/test/crawlscope/crawler_test.rb +34 -0
  30. data/test/crawlscope/http_test.rb +56 -0
  31. data/test/crawlscope/links_rule_test.rb +149 -5
  32. data/test/crawlscope/metadata_rule_test.rb +77 -0
  33. data/test/crawlscope/rule_registry_test.rb +32 -0
  34. data/test/crawlscope/{task_test.rb → run_test.rb} +28 -33
  35. data/test/crawlscope/schema_registry_test.rb +19 -0
  36. data/test/crawlscope/sitemap_test.rb +55 -0
  37. data/test/crawlscope/structured_data_document_test.rb +36 -0
  38. data/test/crawlscope/structured_data_report_test.rb +3 -3
  39. data/test/crawlscope/structured_data_reporter_test.rb +2 -2
  40. data/test/crawlscope/structured_data_rule_test.rb +111 -0
  41. data/test/crawlscope/structured_data_writer_test.rb +2 -2
  42. data/test/crawlscope/url_test.rb +31 -0
  43. metadata +15 -5
  44. data/lib/crawlscope/task.rb +0 -131
@@ -5,9 +5,9 @@ require "test_helper"
5
5
 
6
6
  class CrawlscopeStructuredDataWriterTest < Minitest::Test
7
7
  def test_writes_json_report
8
- result = Crawlscope::StructuredData::Audit::Result.new(
8
+ result = Crawlscope::StructuredData::Audit::Outcome.new(
9
9
  entries: [
10
- Crawlscope::StructuredData::Audit::Entry.new(
10
+ Crawlscope::StructuredData::Audit::Page.new(
11
11
  url: "https://example.com/article",
12
12
  status: 200,
13
13
  structured_items: [{source: "json-ld", data: {"@type" => "Article"}}],
@@ -0,0 +1,31 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "test_helper"
4
+
5
+ class CrawlscopeUrlTest < Minitest::Test
6
+ def test_normalize_resolves_relative_urls_and_removes_trailing_slash
7
+ assert_equal "https://example.com/pricing", Crawlscope::Url.normalize("/pricing/", base_url: "https://example.com")
8
+ end
9
+
10
+ def test_normalize_preserves_non_default_port
11
+ assert_equal "http://localhost:3000/pricing", Crawlscope::Url.normalize("/pricing", base_url: "http://localhost:3000")
12
+ end
13
+
14
+ def test_normalize_for_base_rebases_absolute_urls
15
+ assert_equal(
16
+ "http://localhost:3000/features",
17
+ Crawlscope::Url.normalize_for_base("https://www.example.com/features", base_url: "http://localhost:3000")
18
+ )
19
+ end
20
+
21
+ def test_path_normalizes_blank_and_trailing_slash
22
+ assert_equal "/", Crawlscope::Url.path("https://example.com")
23
+ assert_equal "/features", Crawlscope::Url.path("https://example.com/features/")
24
+ end
25
+
26
+ def test_invalid_urls_are_returned_or_ignored
27
+ assert_equal "http:// bad", Crawlscope::Url.normalize("http:// bad", base_url: "https://example.com")
28
+ assert_nil Crawlscope::Url.path("http:// bad")
29
+ refute Crawlscope::Url.remote?("http:// bad")
30
+ end
31
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: crawlscope
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Paulo Fidalgo
@@ -193,16 +193,18 @@ files:
193
193
  - README.md
194
194
  - exe/crawlscope
195
195
  - lib/crawlscope.rb
196
- - lib/crawlscope/audit.rb
197
196
  - lib/crawlscope/browser.rb
198
197
  - lib/crawlscope/cli.rb
199
198
  - lib/crawlscope/configuration.rb
199
+ - lib/crawlscope/context.rb
200
+ - lib/crawlscope/crawl.rb
200
201
  - lib/crawlscope/crawler.rb
201
202
  - lib/crawlscope/http.rb
202
203
  - lib/crawlscope/issue.rb
203
204
  - lib/crawlscope/issue_collection.rb
204
205
  - lib/crawlscope/page.rb
205
206
  - lib/crawlscope/railtie.rb
207
+ - lib/crawlscope/rake_tasks.rb
206
208
  - lib/crawlscope/reporter.rb
207
209
  - lib/crawlscope/result.rb
208
210
  - lib/crawlscope/rule_registry.rb
@@ -210,23 +212,31 @@ files:
210
212
  - lib/crawlscope/rules/metadata.rb
211
213
  - lib/crawlscope/rules/structured_data.rb
212
214
  - lib/crawlscope/rules/uniqueness.rb
215
+ - lib/crawlscope/run.rb
213
216
  - lib/crawlscope/schema_registry.rb
217
+ - lib/crawlscope/schemas.rb
214
218
  - lib/crawlscope/sitemap.rb
215
219
  - lib/crawlscope/structured_data/audit.rb
220
+ - lib/crawlscope/structured_data/check.rb
216
221
  - lib/crawlscope/structured_data/document.rb
217
222
  - lib/crawlscope/structured_data/report.rb
218
223
  - lib/crawlscope/structured_data/reporter.rb
219
224
  - lib/crawlscope/structured_data/writer.rb
220
- - lib/crawlscope/task.rb
221
225
  - lib/crawlscope/url.rb
222
226
  - lib/crawlscope/version.rb
223
227
  - lib/tasks/crawlscope_tasks.rake
224
- - test/crawlscope/audit_test.rb
228
+ - test/crawlscope/browser_test.rb
225
229
  - test/crawlscope/cli_test.rb
226
230
  - test/crawlscope/configuration_test.rb
231
+ - test/crawlscope/crawl_test.rb
232
+ - test/crawlscope/crawler_test.rb
233
+ - test/crawlscope/http_test.rb
227
234
  - test/crawlscope/links_rule_test.rb
228
235
  - test/crawlscope/loader_test.rb
236
+ - test/crawlscope/metadata_rule_test.rb
229
237
  - test/crawlscope/reporter_test.rb
238
+ - test/crawlscope/rule_registry_test.rb
239
+ - test/crawlscope/run_test.rb
230
240
  - test/crawlscope/schema_registry_test.rb
231
241
  - test/crawlscope/sitemap_test.rb
232
242
  - test/crawlscope/structured_data_audit_test.rb
@@ -235,8 +245,8 @@ files:
235
245
  - test/crawlscope/structured_data_reporter_test.rb
236
246
  - test/crawlscope/structured_data_rule_test.rb
237
247
  - test/crawlscope/structured_data_writer_test.rb
238
- - test/crawlscope/task_test.rb
239
248
  - test/crawlscope/uniqueness_rule_test.rb
249
+ - test/crawlscope/url_test.rb
240
250
  - test/test_helper.rb
241
251
  homepage: https://www.ethos-link.com/opensource/crawlscope
242
252
  licenses:
@@ -1,131 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require "json"
4
-
5
- module Crawlscope
6
- class Task
7
- def initialize(configuration: Crawlscope.configuration, reporter: Reporter.new(io: configuration.output))
8
- @configuration = configuration
9
- @reporter = reporter
10
- end
11
-
12
- def validate(base_url: nil, sitemap_path: nil, rule_names: nil)
13
- resolved_base_url = base_url || default_base_url
14
- audit = @configuration.audit(
15
- base_url: resolved_base_url,
16
- sitemap_path: sitemap_path || default_sitemap_path(base_url: resolved_base_url),
17
- rule_names: rule_names
18
- )
19
-
20
- result = audit.call
21
- @reporter.report(result)
22
- result
23
- end
24
-
25
- def validate_ldjson(urls:, debug: false, renderer: @configuration.renderer, timeout_seconds: @configuration.timeout_seconds, report_path: nil, summary: false)
26
- audit = StructuredData::Audit.new(
27
- browser_factory: @configuration.browser_factory,
28
- network_idle_timeout_seconds: @configuration.network_idle_timeout_seconds,
29
- renderer: renderer,
30
- schema_registry: @configuration.schema_registry,
31
- scroll_page: @configuration.scroll_page?,
32
- timeout_seconds: timeout_seconds
33
- )
34
- result = audit.call(urls: urls)
35
-
36
- report_ldjson_result(result, debug: debug, renderer: renderer)
37
- StructuredData::Writer.new(path: report_path).write(result) if report_path
38
- StructuredData::Reporter.new(io: @configuration.output, report_path: report_path).report(result) if summary
39
- result
40
- end
41
-
42
- private
43
-
44
- def default_base_url
45
- value = @configuration.base_url
46
- return value unless value.to_s.strip.empty?
47
-
48
- "http://localhost:3000"
49
- end
50
-
51
- def default_sitemap_path(base_url:)
52
- value = @configuration.sitemap_path
53
- return value unless value.to_s.strip.empty?
54
-
55
- local_path = File.expand_path("public/sitemap.xml", Dir.pwd)
56
- if local_path_default?(base_url: base_url) && File.exist?(local_path)
57
- return local_path
58
- end
59
-
60
- "#{base_url.to_s.chomp("/")}/sitemap.xml"
61
- end
62
-
63
- def local_path_default?(base_url:)
64
- host = URI.parse(base_url.to_s).host.to_s
65
- ["localhost", "127.0.0.1"].include?(host)
66
- rescue URI::InvalidURIError
67
- false
68
- end
69
-
70
- def report_ldjson_result(result, debug:, renderer:)
71
- if renderer == :browser
72
- @configuration.output.puts("JavaScript mode enabled (Ferrum)")
73
- end
74
-
75
- @configuration.output.puts("Validating JSON-LD on #{result.entries.size} URL(s)")
76
- @configuration.output.puts("")
77
-
78
- result.entries.each do |entry|
79
- @configuration.output.puts("=" * 80)
80
- @configuration.output.puts("URL: #{entry.url}")
81
- @configuration.output.puts("=" * 80)
82
-
83
- if entry.fetch_error
84
- @configuration.output.puts("Error: #{entry.fetch_error}")
85
- @configuration.output.puts("")
86
- next
87
- end
88
-
89
- if entry.status
90
- @configuration.output.puts("Status: #{entry.status}")
91
- else
92
- @configuration.output.puts("Status: JS runtime fetch")
93
- end
94
-
95
- @configuration.output.puts("Structured data found: #{entry.structured_items.size} (JSON-LD: #{entry.json_ld_count}, Microdata: #{entry.microdata_count})")
96
-
97
- if debug && entry.structured_items.any?
98
- @configuration.output.puts("")
99
- @configuration.output.puts("--- Detected Structured Data ---")
100
-
101
- entry.structured_items.each_with_index do |item, index|
102
- @configuration.output.puts("")
103
- @configuration.output.puts("## Item #{index + 1} [#{item[:source]}]")
104
- @configuration.output.puts(JSON.pretty_generate(item[:data]))
105
- end
106
-
107
- @configuration.output.puts("")
108
- @configuration.output.puts("--- End ---")
109
- end
110
-
111
- @configuration.output.puts("")
112
- @configuration.output.puts("Validation results:")
113
-
114
- if entry.errors.empty?
115
- @configuration.output.puts(" All valid!")
116
- else
117
- entry.errors.each do |error|
118
- @configuration.output.puts(" #{error[:type]}: INVALID [#{error[:source]}]")
119
- error[:errors].each do |validation_error|
120
- @configuration.output.puts(" - field: #{validation_error[:field]}, issue: #{validation_error[:issue]}")
121
- end
122
- end
123
- end
124
-
125
- @configuration.output.puts("")
126
- end
127
-
128
- @configuration.output.puts("STATUS: #{result.ok? ? "OK" : "FAILED"}")
129
- end
130
- end
131
- end