crawlscope 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42):
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +7 -11
  3. data/README.md +20 -13
  4. data/lib/crawlscope/browser.rb +8 -0
  5. data/lib/crawlscope/cli.rb +10 -10
  6. data/lib/crawlscope/configuration.rb +20 -5
  7. data/lib/crawlscope/context.rb +9 -0
  8. data/lib/crawlscope/{audit.rb → crawl.rb} +62 -58
  9. data/lib/crawlscope/crawler.rb +19 -1
  10. data/lib/crawlscope/http.rb +1 -1
  11. data/lib/crawlscope/rake_tasks.rb +28 -0
  12. data/lib/crawlscope/rules/links.rb +76 -43
  13. data/lib/crawlscope/rules/structured_data.rb +14 -1
  14. data/lib/crawlscope/run.rb +60 -0
  15. data/lib/crawlscope/schema_registry.rb +3 -349
  16. data/lib/crawlscope/schemas.rb +355 -0
  17. data/lib/crawlscope/sitemap.rb +18 -6
  18. data/lib/crawlscope/structured_data/audit.rb +7 -7
  19. data/lib/crawlscope/structured_data/check.rb +35 -0
  20. data/lib/crawlscope/structured_data/reporter.rb +69 -0
  21. data/lib/crawlscope/url.rb +14 -0
  22. data/lib/crawlscope/version.rb +1 -1
  23. data/lib/tasks/crawlscope_tasks.rake +12 -23
  24. data/test/crawlscope/browser_test.rb +155 -0
  25. data/test/crawlscope/cli_test.rb +128 -6
  26. data/test/crawlscope/configuration_test.rb +49 -0
  27. data/test/crawlscope/{audit_test.rb → crawl_test.rb} +11 -5
  28. data/test/crawlscope/crawler_test.rb +34 -0
  29. data/test/crawlscope/http_test.rb +56 -0
  30. data/test/crawlscope/links_rule_test.rb +110 -5
  31. data/test/crawlscope/rule_registry_test.rb +32 -0
  32. data/test/crawlscope/{task_test.rb → run_test.rb} +28 -33
  33. data/test/crawlscope/schema_registry_test.rb +19 -0
  34. data/test/crawlscope/sitemap_test.rb +55 -0
  35. data/test/crawlscope/structured_data_document_test.rb +36 -0
  36. data/test/crawlscope/structured_data_report_test.rb +3 -3
  37. data/test/crawlscope/structured_data_reporter_test.rb +2 -2
  38. data/test/crawlscope/structured_data_rule_test.rb +20 -0
  39. data/test/crawlscope/structured_data_writer_test.rb +2 -2
  40. data/test/crawlscope/url_test.rb +31 -0
  41. metadata +14 -5
  42. data/lib/crawlscope/task.rb +0 -131
@@ -0,0 +1,355 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Crawlscope
4
+ class Schemas
5
+ FAQ_PAGE = {
6
+ "type" => "object",
7
+ "required" => ["@context", "@type", "mainEntity"],
8
+ "properties" => {
9
+ "@context" => {"const" => "https://schema.org"},
10
+ "@type" => {"const" => "FAQPage"},
11
+ "mainEntity" => {
12
+ "type" => "array",
13
+ "minItems" => 1,
14
+ "items" => {"$ref" => "#/definitions/Question"}
15
+ }
16
+ },
17
+ "definitions" => {
18
+ "Question" => {
19
+ "type" => "object",
20
+ "required" => ["@type", "name", "acceptedAnswer"],
21
+ "properties" => {
22
+ "@type" => {"const" => "Question"},
23
+ "name" => {"type" => "string"},
24
+ "acceptedAnswer" => {"$ref" => "#/definitions/Answer"}
25
+ }
26
+ },
27
+ "Answer" => {
28
+ "type" => "object",
29
+ "required" => ["@type", "text"],
30
+ "properties" => {
31
+ "@type" => {"const" => "Answer"},
32
+ "text" => {"type" => "string"}
33
+ }
34
+ }
35
+ }
36
+ }.freeze
37
+
38
+ ARTICLE = {
39
+ type: "object",
40
+ required: ["@type", "headline"],
41
+ properties: {
42
+ "@type" => {enum: ["Article", "NewsArticle", "BlogPosting"]},
43
+ :headline => {type: "string", maxLength: 110},
44
+ :image => {type: "string", format: "uri"},
45
+ :datePublished => {type: "string", format: "date-time"},
46
+ :dateModified => {type: "string", format: "date-time"},
47
+ :author => {type: "object"},
48
+ :publisher => {type: "object"}
49
+ }
50
+ }.freeze
51
+
52
+ ORGANIZATION = {
53
+ type: "object",
54
+ required: ["@type", "name"],
55
+ properties: {
56
+ "@type" => {const: "Organization"},
57
+ :name => {type: "string"},
58
+ :url => {type: "string", format: "uri"},
59
+ :logo => {
60
+ anyOf: [
61
+ {type: "string", format: "uri"},
62
+ {
63
+ type: "object",
64
+ required: ["@type", "url"],
65
+ properties: {
66
+ "@type" => {const: "ImageObject"},
67
+ :url => {type: "string", format: "uri"}
68
+ }
69
+ }
70
+ ]
71
+ },
72
+ :description => {type: "string"}
73
+ }
74
+ }.freeze
75
+
76
+ IMAGE_OBJECT = {
77
+ type: "object",
78
+ required: ["@type"],
79
+ properties: {
80
+ "@type" => {const: "ImageObject"},
81
+ :url => {type: "string", format: "uri"},
82
+ :contentUrl => {type: "string", format: "uri"},
83
+ :thumbnail => {type: ["string", "object"]}
84
+ }
85
+ }.freeze
86
+
87
+ OFFER = {
88
+ type: "object",
89
+ additionalProperties: true,
90
+ required: ["@type"],
91
+ properties: {
92
+ "@type" => {const: "Offer"},
93
+ :name => {type: ["string", "null"]},
94
+ :price => {type: ["string", "number"]},
95
+ :priceCurrency => {type: ["string", "null"]},
96
+ :priceSpecification => {type: ["object", "null"]},
97
+ :availability => {type: "string"},
98
+ :shippingDetails => {type: "object"},
99
+ :hasMerchantReturnPolicy => {type: "boolean"},
100
+ :merchantReturnPolicy => {type: "object"},
101
+ :url => {type: "string", format: "uri"},
102
+ :eligibleQuantity => {type: "object"},
103
+ :additionalProperty => {type: "array", items: {type: "object"}}
104
+ }
105
+ }.freeze
106
+
107
+ RATING = {
108
+ type: "object",
109
+ required: ["@type", "ratingValue"],
110
+ properties: {
111
+ "@type" => {const: "Rating"},
112
+ :ratingValue => {type: ["string", "number"]},
113
+ :bestRating => {type: ["string", "number"]},
114
+ :worstRating => {type: ["string", "number"]}
115
+ }
116
+ }.freeze
117
+
118
+ REVIEW = {
119
+ type: "object",
120
+ required: ["@type", "itemReviewed"],
121
+ properties: {
122
+ "@type" => {const: "Review"},
123
+ :itemReviewed => {type: "object"},
124
+ :reviewRating => RATING,
125
+ :author => {type: ["object", "string"]},
126
+ :datePublished => {type: "string", format: "date-time"},
127
+ :reviewBody => {type: "string"}
128
+ }
129
+ }.freeze
130
+
131
+ REVIEW_SNIPPET = {
132
+ type: "object",
133
+ required: ["@type", "reviewRating"],
134
+ properties: {
135
+ "@type" => {const: "Review"},
136
+ :reviewRating => RATING,
137
+ :author => {type: ["object", "string"]},
138
+ :reviewBody => {type: "string"},
139
+ :datePublished => {type: "string", format: "date-time"}
140
+ }
141
+ }.freeze
142
+
143
+ AGGREGATE_RATING = {
144
+ type: "object",
145
+ required: ["@type"],
146
+ properties: {
147
+ "@type" => {const: "AggregateRating"},
148
+ :ratingValue => {type: ["string", "number"]},
149
+ :ratingCount => {type: "integer"},
150
+ :reviewCount => {type: "integer"},
151
+ :bestRating => {type: ["string", "number"]},
152
+ :worstRating => {type: ["string", "number"]}
153
+ }
154
+ }.freeze
155
+
156
+ SOFTWARE_APPLICATION = {
157
+ type: "object",
158
+ required: ["@type", "name"],
159
+ properties: {
160
+ "@type" => {const: "SoftwareApplication"},
161
+ :name => {type: "string"},
162
+ :applicationCategory => {type: "string"},
163
+ :description => {type: "string"},
164
+ :offers => {
165
+ anyOf: [
166
+ OFFER,
167
+ {type: "array", items: OFFER}
168
+ ]
169
+ },
170
+ :featureList => {type: ["string", "array"]},
171
+ :aggregateRating => AGGREGATE_RATING,
172
+ :review => REVIEW_SNIPPET
173
+ }
174
+ }.freeze
175
+
176
+ WEB_APPLICATION = {
177
+ type: "object",
178
+ required: ["@type", "name"],
179
+ properties: {
180
+ "@type" => {const: "WebApplication"},
181
+ :name => {type: "string"},
182
+ :applicationCategory => {type: "string"},
183
+ :description => {type: "string"},
184
+ :operatingSystem => {type: "string"},
185
+ :url => {type: "string", format: "uri"},
186
+ :offers => {
187
+ anyOf: [
188
+ OFFER,
189
+ {type: "array", items: OFFER}
190
+ ]
191
+ },
192
+ :featureList => {type: ["string", "array"]},
193
+ :aggregateRating => AGGREGATE_RATING,
194
+ :review => REVIEW_SNIPPET
195
+ }
196
+ }.freeze
197
+
198
+ HOW_TO = {
199
+ type: "object",
200
+ required: ["@type", "name", "step"],
201
+ properties: {
202
+ "@type" => {const: "HowTo"},
203
+ :name => {type: "string"},
204
+ :description => {type: "string"},
205
+ :step => {
206
+ type: "array",
207
+ minItems: 1,
208
+ items: {
209
+ type: "object",
210
+ required: ["@type", "name", "text"],
211
+ properties: {
212
+ "@type" => {const: "HowToStep"},
213
+ :name => {type: "string"},
214
+ :text => {type: "string"},
215
+ :position => {type: "integer", minimum: 1}
216
+ }
217
+ }
218
+ }
219
+ }
220
+ }.freeze
221
+
222
+ CONTACT_PAGE = {
223
+ type: "object",
224
+ required: ["@type", "name"],
225
+ properties: {
226
+ "@type" => {const: "ContactPage"},
227
+ :name => {type: "string"},
228
+ :description => {type: "string"},
229
+ :url => {type: "string", format: "uri"}
230
+ }
231
+ }.freeze
232
+
233
+ PRODUCT = {
234
+ type: "object",
235
+ required: ["@type", "name"],
236
+ properties: {
237
+ "@type" => {const: "Product"},
238
+ :name => {type: "string"},
239
+ :image => {
240
+ anyOf: [
241
+ {type: "string", format: "uri"},
242
+ IMAGE_OBJECT,
243
+ {type: "array", items: {anyOf: [{type: "string", format: "uri"}, IMAGE_OBJECT]}}
244
+ ]
245
+ },
246
+ :description => {type: "string"},
247
+ :offers => {
248
+ anyOf: [
249
+ OFFER,
250
+ {type: "array", items: OFFER}
251
+ ]
252
+ }
253
+ }
254
+ }.freeze
255
+
256
+ RECIPE = {
257
+ type: "object",
258
+ required: ["@type", "name"],
259
+ properties: {
260
+ "@type" => {const: "Recipe"},
261
+ :name => {type: "string"},
262
+ :image => {type: ["string", "array"]},
263
+ :recipeIngredient => {type: "array", items: {type: "string"}},
264
+ :recipeInstructions => {type: ["string", "array"]}
265
+ }
266
+ }.freeze
267
+
268
+ EVENT = {
269
+ type: "object",
270
+ required: ["@type", "name", "startDate"],
271
+ properties: {
272
+ "@type" => {const: "Event"},
273
+ :name => {type: "string"},
274
+ :startDate => {type: "string", format: "date-time"},
275
+ :endDate => {type: "string", format: "date-time"},
276
+ :location => {type: "object"}
277
+ }
278
+ }.freeze
279
+
280
+ VIDEO_OBJECT = {
281
+ type: "object",
282
+ required: ["@type", "name", "description"],
283
+ properties: {
284
+ "@type" => {const: "VideoObject"},
285
+ :name => {type: "string"},
286
+ :description => {type: "string"},
287
+ :thumbnailUrl => {type: "string", format: "uri"},
288
+ :uploadDate => {type: "string", format: "date-time"}
289
+ }
290
+ }.freeze
291
+
292
+ WEBSITE = {
293
+ type: "object",
294
+ required: ["@type"],
295
+ properties: {
296
+ "@type" => {const: "WebSite"},
297
+ :name => {type: "string"},
298
+ :url => {type: "string", format: "uri"},
299
+ :potentialAction => {type: "object"}
300
+ }
301
+ }.freeze
302
+
303
+ BREADCRUMB_LIST = {
304
+ type: "object",
305
+ required: ["@type", "itemListElement"],
306
+ properties: {
307
+ "@type" => {const: "BreadcrumbList"},
308
+ :itemListElement => {
309
+ type: "array",
310
+ minItems: 1,
311
+ items: {
312
+ type: "object",
313
+ required: ["@type", "position", "name", "item"],
314
+ properties: {
315
+ "@type" => {const: "ListItem"},
316
+ :position => {type: "integer", minimum: 1},
317
+ :name => {type: "string"},
318
+ :item => {type: "string", format: "uri"}
319
+ }
320
+ }
321
+ }
322
+ }
323
+ }.freeze
324
+
325
+ WEB_PAGE = {
326
+ type: "object",
327
+ required: ["@type"],
328
+ properties: {
329
+ "@type" => {const: "WebPage"}
330
+ }
331
+ }.freeze
332
+
333
+ def self.schemas
334
+ {
335
+ "FAQPage" => FAQ_PAGE,
336
+ "Article" => ARTICLE,
337
+ "NewsArticle" => ARTICLE,
338
+ "BlogPosting" => ARTICLE,
339
+ "Organization" => ORGANIZATION,
340
+ "SoftwareApplication" => SOFTWARE_APPLICATION,
341
+ "WebApplication" => WEB_APPLICATION,
342
+ "HowTo" => HOW_TO,
343
+ "ContactPage" => CONTACT_PAGE,
344
+ "Product" => PRODUCT,
345
+ "Review" => REVIEW,
346
+ "WebSite" => WEBSITE,
347
+ "BreadcrumbList" => BREADCRUMB_LIST,
348
+ "Recipe" => RECIPE,
349
+ "Event" => EVENT,
350
+ "VideoObject" => VIDEO_OBJECT,
351
+ "WebPage" => WEB_PAGE
352
+ }
353
+ end
354
+ end
355
+ end
@@ -28,12 +28,12 @@ module Crawlscope
28
28
 
29
29
  if root_name == "sitemapindex"
30
30
  document.xpath("//xmlns:sitemap/xmlns:loc", SITEMAP_NAMESPACE).flat_map do |node|
31
- child_source = resolve_child_source(source, node.text.to_s.strip)
31
+ child_source = resolve_child_source(source, node.text.to_s.strip, base_url: base_url)
32
32
  collect_urls(child_source, base_url: base_url, visited: visited)
33
33
  end
34
34
  else
35
35
  document.xpath("//xmlns:url/xmlns:loc", SITEMAP_NAMESPACE).map do |node|
36
- Url.normalize(node.text.to_s.strip, base_url: base_url)
36
+ Url.normalize_for_base(node.text.to_s.strip, base_url: base_url)
37
37
  end
38
38
  end
39
39
  end
@@ -46,16 +46,28 @@ module Crawlscope
46
46
  end
47
47
  end
48
48
 
49
- def resolve_child_source(parent_source, child_loc)
50
- return child_loc if Url.remote?(child_loc)
51
-
49
+ def resolve_child_source(parent_source, child_loc, base_url:)
52
50
  if Url.remote?(parent_source)
53
- URI.join(parent_source, child_loc).to_s
51
+ Url.normalize_for_base(URI.join(parent_source, child_loc).to_s, base_url: base_url)
52
+ elsif (local_child_path = local_child_path(parent_source, child_loc))
53
+ local_child_path
54
+ elsif Url.remote?(child_loc)
55
+ child_loc
54
56
  else
55
57
  File.expand_path(child_loc, File.dirname(parent_source))
56
58
  end
57
59
  end
58
60
 
61
+ def local_child_path(parent_source, child_loc)
62
+ basename = File.basename(URI.parse(child_loc).path.to_s)
63
+ return if basename.empty?
64
+
65
+ path = File.expand_path(basename, File.dirname(parent_source))
66
+ path if File.file?(path)
67
+ rescue URI::InvalidURIError
68
+ nil
69
+ end
70
+
59
71
  def connection
60
72
  @connection ||= Faraday.new do |faraday|
61
73
  faraday.response :follow_redirects, limit: Http::MAX_REDIRECTS
@@ -3,7 +3,7 @@
3
3
  module Crawlscope
4
4
  module StructuredData
5
5
  class Audit
6
- Entry = Data.define(:url, :status, :structured_items, :errors, :fetch_error, :content_type, :skipped_reason) do
6
+ Page = Data.define(:url, :status, :structured_items, :errors, :fetch_error, :content_type, :skipped_reason) do
7
7
  def json_ld_count
8
8
  structured_items.count { |item| item[:source] == "json-ld" }
9
9
  end
@@ -21,7 +21,7 @@ module Crawlscope
21
21
  end
22
22
  end
23
23
 
24
- Result = Data.define(:entries) do
24
+ Outcome = Data.define(:entries) do
25
25
  def ok?
26
26
  entries.all?(&:ok?)
27
27
  end
@@ -40,7 +40,7 @@ module Crawlscope
40
40
  fetcher = build_fetcher(urls)
41
41
  entries = urls.map { |url| validate_url(url, fetcher) }
42
42
 
43
- Result.new(entries: entries)
43
+ Outcome.new(entries: entries)
44
44
  ensure
45
45
  fetcher&.close
46
46
  end
@@ -111,9 +111,9 @@ module Crawlscope
111
111
  content_type = page.headers["content-type"].to_s
112
112
 
113
113
  if page.error
114
- Entry.new(url: url, status: page.status, structured_items: [], errors: [], fetch_error: page.error, content_type: content_type, skipped_reason: nil)
114
+ Page.new(url: url, status: page.status, structured_items: [], errors: [], fetch_error: page.error, content_type: content_type, skipped_reason: nil)
115
115
  elsif page.status && !(200..299).cover?(page.status.to_i)
116
- Entry.new(
116
+ Page.new(
117
117
  url: url,
118
118
  status: page.status,
119
119
  structured_items: [],
@@ -123,7 +123,7 @@ module Crawlscope
123
123
  skipped_reason: nil
124
124
  )
125
125
  elsif !content_type.empty? && !content_type.include?("text/html")
126
- Entry.new(
126
+ Page.new(
127
127
  url: url,
128
128
  status: page.status,
129
129
  structured_items: [],
@@ -134,7 +134,7 @@ module Crawlscope
134
134
  )
135
135
  else
136
136
  structured_items, errors = build_validation_errors(page)
137
- Entry.new(
137
+ Page.new(
138
138
  url: url,
139
139
  status: page.status,
140
140
  structured_items: structured_items,
@@ -0,0 +1,35 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Crawlscope
4
+ module StructuredData
5
+ class Check
6
+ def initialize(configuration:)
7
+ @configuration = configuration
8
+ end
9
+
10
+ def call(urls:, debug: false, renderer: @configuration.renderer, timeout_seconds: @configuration.timeout_seconds, report_path: nil, summary: false)
11
+ result = audit(renderer: renderer, timeout_seconds: timeout_seconds).call(urls: urls)
12
+ reporter = Reporter.new(io: @configuration.output, report_path: report_path)
13
+
14
+ reporter.details(result, debug: debug, renderer: renderer)
15
+ Writer.new(path: report_path).write(result) if report_path
16
+ reporter.report(result) if summary
17
+
18
+ result
19
+ end
20
+
21
+ private
22
+
23
+ def audit(renderer:, timeout_seconds:)
24
+ Audit.new(
25
+ browser_factory: @configuration.browser_factory,
26
+ network_idle_timeout_seconds: @configuration.network_idle_timeout_seconds,
27
+ renderer: renderer,
28
+ schema_registry: @configuration.schema_registry,
29
+ scroll_page: @configuration.scroll_page?,
30
+ timeout_seconds: timeout_seconds
31
+ )
32
+ end
33
+ end
34
+ end
35
+ end
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require "uri"
4
+ require "json"
4
5
 
5
6
  module Crawlscope
6
7
  module StructuredData
@@ -21,6 +22,18 @@ module Crawlscope
21
22
  end
22
23
  end
23
24
 
25
+ def details(result, debug:, renderer:)
26
+ @io.puts("JavaScript mode enabled (Ferrum)") if renderer == :browser
27
+ @io.puts("Validating JSON-LD on #{result.entries.size} URL(s)")
28
+ @io.puts("")
29
+
30
+ result.entries.each do |entry|
31
+ report_entry(entry, debug: debug)
32
+ end
33
+
34
+ @io.puts("STATUS: #{result.ok? ? "OK" : "FAILED"}")
35
+ end
36
+
24
37
  private
25
38
 
26
39
  def extract_path(url)
@@ -68,6 +81,62 @@ module Crawlscope
68
81
 
69
82
  @io.puts("#{report.failure_count} of #{report.total} URLs failed validation.")
70
83
  end
84
+
85
+ def report_entry(entry, debug:)
86
+ @io.puts("=" * 80)
87
+ @io.puts("URL: #{entry.url}")
88
+ @io.puts("=" * 80)
89
+
90
+ if entry.fetch_error
91
+ @io.puts("Error: #{entry.fetch_error}")
92
+ @io.puts("")
93
+ return
94
+ end
95
+
96
+ report_status(entry)
97
+ @io.puts("Structured data found: #{entry.structured_items.size} (JSON-LD: #{entry.json_ld_count}, Microdata: #{entry.microdata_count})")
98
+ report_debug(entry) if debug && entry.structured_items.any?
99
+ report_validation(entry)
100
+ @io.puts("")
101
+ end
102
+
103
+ def report_status(entry)
104
+ if entry.status
105
+ @io.puts("Status: #{entry.status}")
106
+ else
107
+ @io.puts("Status: JS runtime fetch")
108
+ end
109
+ end
110
+
111
+ def report_debug(entry)
112
+ @io.puts("")
113
+ @io.puts("--- Detected Structured Data ---")
114
+
115
+ entry.structured_items.each_with_index do |item, index|
116
+ @io.puts("")
117
+ @io.puts("## Item #{index + 1} [#{item[:source]}]")
118
+ @io.puts(JSON.pretty_generate(item[:data]))
119
+ end
120
+
121
+ @io.puts("")
122
+ @io.puts("--- End ---")
123
+ end
124
+
125
+ def report_validation(entry)
126
+ @io.puts("")
127
+ @io.puts("Validation results:")
128
+
129
+ if entry.errors.empty?
130
+ @io.puts(" All valid!")
131
+ else
132
+ entry.errors.each do |error|
133
+ @io.puts(" #{error[:type]}: INVALID [#{error[:source]}]")
134
+ error[:errors].each do |validation_error|
135
+ @io.puts(" - field: #{validation_error[:field]}, issue: #{validation_error[:issue]}")
136
+ end
137
+ end
138
+ end
139
+ end
71
140
  end
72
141
  end
73
142
  end
@@ -23,6 +23,20 @@ module Crawlscope
23
23
  url.to_s
24
24
  end
25
25
 
26
+ def normalize_for_base(url, base_url:)
27
+ uri = URI.parse(normalize(url, base_url: base_url))
28
+ base_uri = URI.parse(base_url.to_s)
29
+ unless base_uri.host.to_s.empty?
30
+ uri.scheme = base_uri.scheme
31
+ uri.host = base_uri.host
32
+ uri.port = base_uri.port
33
+ end
34
+
35
+ normalize(uri.to_s, base_url: base_url)
36
+ rescue URI::InvalidURIError
37
+ url.to_s
38
+ end
39
+
26
40
  def path(url)
27
41
  uri = URI.parse(url.to_s)
28
42
  value = uri.path.to_s
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Crawlscope
4
- VERSION = "0.1.0"
4
+ VERSION = "0.2.0"
5
5
  end
@@ -1,44 +1,33 @@
1
1
  namespace :crawlscope do
2
- desc "Validate sitemap URLs with the default Crawlscope rules. ENV: BASE_URL, SITEMAP, RULES, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, CONCURRENCY"
2
+ desc "Validate URLs with all default Crawlscope rules. ENV: URL, SITEMAP, RULES, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, CONCURRENCY"
3
3
  task validate: :environment do
4
- status = Crawlscope::Cli.start(["validate"], out: $stdout, err: $stderr)
5
- exit(status) unless status.zero?
4
+ Crawlscope::RakeTasks.validate
6
5
  end
7
6
 
8
7
  namespace :validate do
9
- desc "Validate JSON-LD on one or more URLs. ENV: URL (required, semicolon-separated), DEBUG=1, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, REPORT_PATH, SUMMARY=1"
8
+ desc "Directly validate JSON-LD on one or more URLs. ENV: URL (required, semicolon-separated), DEBUG=1, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, REPORT_PATH, SUMMARY=1"
10
9
  task ldjson: :environment do
11
- status = Crawlscope::Cli.start(["ldjson"], out: $stdout, err: $stderr)
12
- exit(status) unless status.zero?
10
+ Crawlscope::RakeTasks.ldjson
13
11
  end
14
12
 
15
- desc "Validate sitemap URLs with the metadata rule. ENV: BASE_URL, SITEMAP, JS=1"
13
+ desc "Validate URLs with the metadata rule. ENV: URL, SITEMAP, JS=1"
16
14
  task metadata: :environment do
17
- crawlscope_task_with_rules("metadata")
15
+ Crawlscope::RakeTasks.validate_rule("metadata")
18
16
  end
19
17
 
20
- desc "Validate sitemap URLs with the structured_data rule. ENV: BASE_URL, SITEMAP, JS=1"
18
+ desc "Validate sitemap URLs with the structured_data rule. ENV: URL, SITEMAP, JS=1"
21
19
  task structured_data: :environment do
22
- crawlscope_task_with_rules("structured_data")
20
+ Crawlscope::RakeTasks.validate_rule("structured_data")
23
21
  end
24
22
 
25
- desc "Validate sitemap URLs with the uniqueness rule. ENV: BASE_URL, SITEMAP, JS=1"
23
+ desc "Validate URLs with the uniqueness rule. ENV: URL, SITEMAP, JS=1"
26
24
  task uniqueness: :environment do
27
- crawlscope_task_with_rules("uniqueness")
25
+ Crawlscope::RakeTasks.validate_rule("uniqueness")
28
26
  end
29
27
 
30
- desc "Validate sitemap URLs with the links rule. ENV: BASE_URL, SITEMAP, JS=1"
28
+ desc "Validate URLs with the links rule. ENV: URL, SITEMAP, JS=1"
31
29
  task links: :environment do
32
- crawlscope_task_with_rules("links")
30
+ Crawlscope::RakeTasks.validate_rule("links")
33
31
  end
34
32
  end
35
-
36
- def crawlscope_task_with_rules(rules)
37
- original_rules = ENV["RULES"]
38
- ENV["RULES"] = rules
39
- status = Crawlscope::Cli.start(["validate"], out: $stdout, err: $stderr)
40
- exit(status) unless status.zero?
41
- ensure
42
- ENV["RULES"] = original_rules
43
- end
44
33
  end