crawlscope 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +7 -11
  3. data/README.md +20 -13
  4. data/lib/crawlscope/browser.rb +8 -0
  5. data/lib/crawlscope/cli.rb +10 -10
  6. data/lib/crawlscope/configuration.rb +20 -5
  7. data/lib/crawlscope/context.rb +9 -0
  8. data/lib/crawlscope/{audit.rb → crawl.rb} +62 -58
  9. data/lib/crawlscope/crawler.rb +19 -1
  10. data/lib/crawlscope/http.rb +1 -1
  11. data/lib/crawlscope/rake_tasks.rb +28 -0
  12. data/lib/crawlscope/rules/links.rb +76 -43
  13. data/lib/crawlscope/rules/structured_data.rb +14 -1
  14. data/lib/crawlscope/run.rb +60 -0
  15. data/lib/crawlscope/schema_registry.rb +3 -349
  16. data/lib/crawlscope/schemas.rb +355 -0
  17. data/lib/crawlscope/sitemap.rb +18 -6
  18. data/lib/crawlscope/structured_data/audit.rb +7 -7
  19. data/lib/crawlscope/structured_data/check.rb +35 -0
  20. data/lib/crawlscope/structured_data/reporter.rb +69 -0
  21. data/lib/crawlscope/url.rb +14 -0
  22. data/lib/crawlscope/version.rb +1 -1
  23. data/lib/tasks/crawlscope_tasks.rake +12 -23
  24. data/test/crawlscope/browser_test.rb +155 -0
  25. data/test/crawlscope/cli_test.rb +128 -6
  26. data/test/crawlscope/configuration_test.rb +49 -0
  27. data/test/crawlscope/{audit_test.rb → crawl_test.rb} +11 -5
  28. data/test/crawlscope/crawler_test.rb +34 -0
  29. data/test/crawlscope/http_test.rb +56 -0
  30. data/test/crawlscope/links_rule_test.rb +110 -5
  31. data/test/crawlscope/rule_registry_test.rb +32 -0
  32. data/test/crawlscope/{task_test.rb → run_test.rb} +28 -33
  33. data/test/crawlscope/schema_registry_test.rb +19 -0
  34. data/test/crawlscope/sitemap_test.rb +55 -0
  35. data/test/crawlscope/structured_data_document_test.rb +36 -0
  36. data/test/crawlscope/structured_data_report_test.rb +3 -3
  37. data/test/crawlscope/structured_data_reporter_test.rb +2 -2
  38. data/test/crawlscope/structured_data_rule_test.rb +20 -0
  39. data/test/crawlscope/structured_data_writer_test.rb +2 -2
  40. data/test/crawlscope/url_test.rb +31 -0
  41. metadata +14 -5
  42. data/lib/crawlscope/task.rb +0 -131
@@ -40,41 +40,49 @@ module Crawlscope
40
40
  end
41
41
 
42
42
  def extract_links(pages)
43
- links = []
43
+ pages.select(&:html?).flat_map { |page| page_links(page) }
44
+ end
44
45
 
45
- pages.each do |page|
46
- next unless page.html?
46
+ def page_links(page)
47
+ source_path = Url.path(page.normalized_url)
48
+ return [] unless crawlable_path?(source_path)
47
49
 
48
- source_path = Url.path(page.normalized_url)
49
- next if source_path.nil?
50
+ contextual_links(page.doc).filter_map do |node|
51
+ link_for(page: page, source_path: source_path, node: node)
52
+ end
53
+ end
50
54
 
51
- contextual_links(page.doc).each do |node|
52
- href = node["href"].to_s.strip
53
- next if href.empty?
54
- next if href.start_with?("#")
55
- next if LINK_SCHEMES_TO_SKIP.any? { |prefix| href.start_with?(prefix) }
55
+ def link_for(page:, source_path:, node:)
56
+ href = node["href"].to_s.strip
57
+ return unless crawlable_href?(href)
56
58
 
57
- anchor_text = normalize_anchor_text(node.text)
58
- next if anchor_text.empty?
59
+ anchor_text = normalize_anchor_text(node.text)
60
+ return if anchor_text.empty?
59
61
 
60
- target_url = normalize_internal_link(page.normalized_url, href)
61
- next if target_url.nil?
62
+ target_url = normalize_internal_link(page.normalized_url, href)
63
+ return if target_url.nil?
62
64
 
63
- target_path = Url.path(target_url)
64
- next if target_path.nil?
65
- next if skip_internal_path?(target_path)
65
+ target_path = Url.path(target_url)
66
+ return unless crawlable_path?(target_path)
66
67
 
67
- links << {
68
- anchor_text: anchor_text,
69
- source_path: source_path,
70
- source_url: page.normalized_url,
71
- target_path: target_path,
72
- target_url: target_url
73
- }
74
- end
75
- end
68
+ {
69
+ anchor_text: anchor_text,
70
+ source_path: source_path,
71
+ source_url: page.normalized_url,
72
+ target_path: target_path,
73
+ target_url: target_url
74
+ }
75
+ end
76
76
 
77
- links
77
+ def crawlable_href?(href)
78
+ return false if href.empty?
79
+ return false if href.start_with?("#")
80
+
81
+ LINK_SCHEMES_TO_SKIP.none? { |prefix| href.start_with?(prefix) }
82
+ end
83
+
84
+ def crawlable_path?(path)
85
+ !path.nil? && !skip_internal_path?(path)
78
86
  end
79
87
 
80
88
  def normalize_anchor_text(text)
@@ -122,39 +130,64 @@ module Crawlscope
122
130
  resolved_links = []
123
131
 
124
132
  links.group_by { |link| link[:target_url] }.each do |target_url, grouped_links|
125
- resolution = @resolve_target.call(target_url)
126
- if resolution.nil?
127
- report_unresolved_target(target_url, grouped_links, issues, resolution)
133
+ target = resolve_target(target_url)
134
+
135
+ if target.unresolved?
136
+ report_unresolved_target(target_url, grouped_links, issues, target.resolution)
128
137
  next
129
138
  end
130
139
 
131
- status = resolution[:status]
132
-
133
- if status.nil?
134
- next if resolution[:crawled] && resolution[:error]
135
-
136
- report_unresolved_target(target_url, grouped_links, issues, resolution)
140
+ if target.ignored_error?
137
141
  next
138
142
  end
139
143
 
140
- unless @allowed_statuses.include?(status)
141
- report_broken_target(target_url, grouped_links, issues, status)
144
+ unless target.allowed?(@allowed_statuses)
145
+ report_broken_target(target_url, grouped_links, issues, target.status)
142
146
  next
143
147
  end
144
148
 
145
- final_url = resolution[:final_url].to_s.empty? ? target_url : resolution[:final_url]
146
- final_path = Url.path(final_url)
147
- next if final_path.nil?
148
- next if skip_internal_path?(final_path)
149
+ next unless crawlable_path?(target.final_path)
149
150
 
150
151
  grouped_links.each do |link|
151
- resolved_links << link.merge(final_path: final_path, final_url: final_url)
152
+ resolved_links << link.merge(final_path: target.final_path, final_url: target.final_url)
152
153
  end
153
154
  end
154
155
 
155
156
  resolved_links
156
157
  end
157
158
 
159
+ def resolve_target(target_url)
160
+ resolution = @resolve_target.call(target_url)
161
+ LinkTarget.new(target_url: target_url, resolution: resolution)
162
+ end
163
+
164
+ LinkTarget = Data.define(:target_url, :resolution) do
165
+ def allowed?(statuses)
166
+ statuses.include?(status)
167
+ end
168
+
169
+ def final_path
170
+ Url.path(final_url)
171
+ end
172
+
173
+ def final_url
174
+ value = resolution[:final_url].to_s
175
+ value.empty? ? target_url : value
176
+ end
177
+
178
+ def ignored_error?
179
+ resolution && status.nil? && resolution[:crawled] && resolution[:error]
180
+ end
181
+
182
+ def status
183
+ resolution && resolution[:status]
184
+ end
185
+
186
+ def unresolved?
187
+ resolution.nil? || (status.nil? && !ignored_error?)
188
+ end
189
+ end
190
+
158
191
  def skip_internal_path?(path)
159
192
  return true if path == "/"
160
193
 
@@ -23,8 +23,21 @@ module Crawlscope
23
23
 
24
24
  def validate_page(page, issues, schema_registry)
25
25
  document = Crawlscope::StructuredData::Document.new(html: page.body)
26
+ items = document.items
26
27
 
27
- document.items.each do |item|
28
+ if items.empty?
29
+ issues.add(
30
+ code: :missing_structured_data,
31
+ severity: :warning,
32
+ category: :structured_data,
33
+ url: page.url,
34
+ message: "no structured data found; add JSON-LD or microdata markup",
35
+ details: {expected_sources: ["json-ld", "microdata"]}
36
+ )
37
+ return
38
+ end
39
+
40
+ items.each do |item|
28
41
  data = item.data
29
42
  source = item.source
30
43
 
@@ -0,0 +1,60 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Crawlscope
4
+ class Run
5
+ def initialize(configuration: Crawlscope.configuration, reporter: Reporter.new(io: configuration.output))
6
+ @configuration = configuration
7
+ @reporter = reporter
8
+ end
9
+
10
+ def validate(base_url: nil, sitemap_path: nil, rule_names: nil)
11
+ resolved_base_url = base_url || default_base_url
12
+ crawl = @configuration.audit(
13
+ base_url: resolved_base_url,
14
+ sitemap_path: sitemap_path || default_sitemap_path(base_url: resolved_base_url),
15
+ rule_names: rule_names
16
+ )
17
+
18
+ result = crawl.call
19
+ @reporter.report(result)
20
+ result
21
+ end
22
+
23
+ def validate_json_ld(urls:, debug: false, renderer: @configuration.renderer, timeout_seconds: @configuration.timeout_seconds, report_path: nil, summary: false)
24
+ StructuredData::Check.new(configuration: @configuration).call(
25
+ urls: urls,
26
+ debug: debug,
27
+ renderer: renderer,
28
+ timeout_seconds: timeout_seconds,
29
+ report_path: report_path,
30
+ summary: summary
31
+ )
32
+ end
33
+
34
+ private
35
+
36
+ def default_base_url
37
+ value = @configuration.base_url
38
+ return value unless value.to_s.strip.empty?
39
+
40
+ "http://localhost:3000"
41
+ end
42
+
43
+ def default_sitemap_path(base_url:)
44
+ value = @configuration.sitemap_path
45
+ return value unless value.to_s.strip.empty?
46
+
47
+ local_path = File.expand_path("public/sitemap.xml", Dir.pwd)
48
+ return local_path if local_path_default?(base_url: base_url) && File.exist?(local_path)
49
+
50
+ "#{base_url.to_s.chomp("/")}/sitemap.xml"
51
+ end
52
+
53
+ def local_path_default?(base_url:)
54
+ host = URI.parse(base_url.to_s).host.to_s
55
+ ["localhost", "127.0.0.1"].include?(host)
56
+ rescue URI::InvalidURIError
57
+ false
58
+ end
59
+ end
60
+ end
@@ -2,362 +2,16 @@
2
2
 
3
3
  require "json-schema"
4
4
 
5
+ JSON::Validator.use_multi_json = false
6
+
5
7
  module Crawlscope
6
8
  class SchemaRegistry
7
- FAQ_PAGE = {
8
- "type" => "object",
9
- "required" => ["@context", "@type", "mainEntity"],
10
- "properties" => {
11
- "@context" => {"const" => "https://schema.org"},
12
- "@type" => {"const" => "FAQPage"},
13
- "mainEntity" => {
14
- "type" => "array",
15
- "minItems" => 1,
16
- "items" => {"$ref" => "#/definitions/Question"}
17
- }
18
- },
19
- "definitions" => {
20
- "Question" => {
21
- "type" => "object",
22
- "required" => ["@type", "name", "acceptedAnswer"],
23
- "properties" => {
24
- "@type" => {"const" => "Question"},
25
- "name" => {"type" => "string"},
26
- "acceptedAnswer" => {"$ref" => "#/definitions/Answer"}
27
- }
28
- },
29
- "Answer" => {
30
- "type" => "object",
31
- "required" => ["@type", "text"],
32
- "properties" => {
33
- "@type" => {"const" => "Answer"},
34
- "text" => {"type" => "string"}
35
- }
36
- }
37
- }
38
- }.freeze
39
-
40
- ARTICLE = {
41
- type: "object",
42
- required: ["@type", "headline"],
43
- properties: {
44
- "@type" => {enum: ["Article", "NewsArticle", "BlogPosting"]},
45
- :headline => {type: "string", maxLength: 110},
46
- :image => {type: "string", format: "uri"},
47
- :datePublished => {type: "string", format: "date-time"},
48
- :dateModified => {type: "string", format: "date-time"},
49
- :author => {type: "object"},
50
- :publisher => {type: "object"}
51
- }
52
- }.freeze
53
-
54
- ORGANIZATION = {
55
- type: "object",
56
- required: ["@type", "name"],
57
- properties: {
58
- "@type" => {const: "Organization"},
59
- :name => {type: "string"},
60
- :url => {type: "string", format: "uri"},
61
- :logo => {
62
- anyOf: [
63
- {type: "string", format: "uri"},
64
- {
65
- type: "object",
66
- required: ["@type", "url"],
67
- properties: {
68
- "@type" => {const: "ImageObject"},
69
- :url => {type: "string", format: "uri"}
70
- }
71
- }
72
- ]
73
- },
74
- :description => {type: "string"}
75
- }
76
- }.freeze
77
-
78
- IMAGE_OBJECT = {
79
- type: "object",
80
- required: ["@type"],
81
- properties: {
82
- "@type" => {const: "ImageObject"},
83
- :url => {type: "string", format: "uri"},
84
- :contentUrl => {type: "string", format: "uri"},
85
- :thumbnail => {type: ["string", "object"]}
86
- }
87
- }.freeze
88
-
89
- OFFER = {
90
- type: "object",
91
- additionalProperties: true,
92
- required: ["@type"],
93
- properties: {
94
- "@type" => {const: "Offer"},
95
- :name => {type: ["string", "null"]},
96
- :price => {type: ["string", "number"]},
97
- :priceCurrency => {type: ["string", "null"]},
98
- :priceSpecification => {type: ["object", "null"]},
99
- :availability => {type: "string"},
100
- :shippingDetails => {type: "object"},
101
- :hasMerchantReturnPolicy => {type: "boolean"},
102
- :merchantReturnPolicy => {type: "object"},
103
- :url => {type: "string", format: "uri"},
104
- :eligibleQuantity => {type: "object"},
105
- :additionalProperty => {type: "array", items: {type: "object"}}
106
- }
107
- }.freeze
108
-
109
- RATING = {
110
- type: "object",
111
- required: ["@type", "ratingValue"],
112
- properties: {
113
- "@type" => {const: "Rating"},
114
- :ratingValue => {type: ["string", "number"]},
115
- :bestRating => {type: ["string", "number"]},
116
- :worstRating => {type: ["string", "number"]}
117
- }
118
- }.freeze
119
-
120
- REVIEW = {
121
- type: "object",
122
- required: ["@type", "itemReviewed"],
123
- properties: {
124
- "@type" => {const: "Review"},
125
- :itemReviewed => {type: "object"},
126
- :reviewRating => RATING,
127
- :author => {type: ["object", "string"]},
128
- :datePublished => {type: "string", format: "date-time"},
129
- :reviewBody => {type: "string"}
130
- }
131
- }.freeze
132
-
133
- REVIEW_SNIPPET = {
134
- type: "object",
135
- required: ["@type", "reviewRating"],
136
- properties: {
137
- "@type" => {const: "Review"},
138
- :reviewRating => RATING,
139
- :author => {type: ["object", "string"]},
140
- :reviewBody => {type: "string"},
141
- :datePublished => {type: "string", format: "date-time"}
142
- }
143
- }.freeze
144
-
145
- AGGREGATE_RATING = {
146
- type: "object",
147
- required: ["@type"],
148
- properties: {
149
- "@type" => {const: "AggregateRating"},
150
- :ratingValue => {type: ["string", "number"]},
151
- :ratingCount => {type: "integer"},
152
- :reviewCount => {type: "integer"},
153
- :bestRating => {type: ["string", "number"]},
154
- :worstRating => {type: ["string", "number"]}
155
- }
156
- }.freeze
157
-
158
- SOFTWARE_APPLICATION = {
159
- type: "object",
160
- required: ["@type", "name"],
161
- properties: {
162
- "@type" => {const: "SoftwareApplication"},
163
- :name => {type: "string"},
164
- :applicationCategory => {type: "string"},
165
- :description => {type: "string"},
166
- :offers => {
167
- anyOf: [
168
- OFFER,
169
- {type: "array", items: OFFER}
170
- ]
171
- },
172
- :featureList => {type: ["string", "array"]},
173
- :aggregateRating => AGGREGATE_RATING,
174
- :review => REVIEW_SNIPPET
175
- }
176
- }.freeze
177
-
178
- WEB_APPLICATION = {
179
- type: "object",
180
- required: ["@type", "name"],
181
- properties: {
182
- "@type" => {const: "WebApplication"},
183
- :name => {type: "string"},
184
- :applicationCategory => {type: "string"},
185
- :description => {type: "string"},
186
- :operatingSystem => {type: "string"},
187
- :url => {type: "string", format: "uri"},
188
- :offers => {
189
- anyOf: [
190
- OFFER,
191
- {type: "array", items: OFFER}
192
- ]
193
- },
194
- :featureList => {type: ["string", "array"]},
195
- :aggregateRating => AGGREGATE_RATING,
196
- :review => REVIEW_SNIPPET
197
- }
198
- }.freeze
199
-
200
- HOW_TO = {
201
- type: "object",
202
- required: ["@type", "name", "step"],
203
- properties: {
204
- "@type" => {const: "HowTo"},
205
- :name => {type: "string"},
206
- :description => {type: "string"},
207
- :step => {
208
- type: "array",
209
- minItems: 1,
210
- items: {
211
- type: "object",
212
- required: ["@type", "name", "text"],
213
- properties: {
214
- "@type" => {const: "HowToStep"},
215
- :name => {type: "string"},
216
- :text => {type: "string"},
217
- :position => {type: "integer", minimum: 1}
218
- }
219
- }
220
- }
221
- }
222
- }.freeze
223
-
224
- CONTACT_PAGE = {
225
- type: "object",
226
- required: ["@type", "name"],
227
- properties: {
228
- "@type" => {const: "ContactPage"},
229
- :name => {type: "string"},
230
- :description => {type: "string"},
231
- :url => {type: "string", format: "uri"}
232
- }
233
- }.freeze
234
-
235
- PRODUCT = {
236
- type: "object",
237
- required: ["@type", "name"],
238
- properties: {
239
- "@type" => {const: "Product"},
240
- :name => {type: "string"},
241
- :image => {
242
- anyOf: [
243
- {type: "string", format: "uri"},
244
- IMAGE_OBJECT,
245
- {type: "array", items: {anyOf: [{type: "string", format: "uri"}, IMAGE_OBJECT]}}
246
- ]
247
- },
248
- :description => {type: "string"},
249
- :offers => {
250
- anyOf: [
251
- OFFER,
252
- {type: "array", items: OFFER}
253
- ]
254
- }
255
- }
256
- }.freeze
257
-
258
- RECIPE = {
259
- type: "object",
260
- required: ["@type", "name"],
261
- properties: {
262
- "@type" => {const: "Recipe"},
263
- :name => {type: "string"},
264
- :image => {type: ["string", "array"]},
265
- :recipeIngredient => {type: "array", items: {type: "string"}},
266
- :recipeInstructions => {type: ["string", "array"]}
267
- }
268
- }.freeze
269
-
270
- EVENT = {
271
- type: "object",
272
- required: ["@type", "name", "startDate"],
273
- properties: {
274
- "@type" => {const: "Event"},
275
- :name => {type: "string"},
276
- :startDate => {type: "string", format: "date-time"},
277
- :endDate => {type: "string", format: "date-time"},
278
- :location => {type: "object"}
279
- }
280
- }.freeze
281
-
282
- VIDEO_OBJECT = {
283
- type: "object",
284
- required: ["@type", "name", "description"],
285
- properties: {
286
- "@type" => {const: "VideoObject"},
287
- :name => {type: "string"},
288
- :description => {type: "string"},
289
- :thumbnailUrl => {type: "string", format: "uri"},
290
- :uploadDate => {type: "string", format: "date-time"}
291
- }
292
- }.freeze
293
-
294
- WEBSITE = {
295
- type: "object",
296
- required: ["@type"],
297
- properties: {
298
- "@type" => {const: "WebSite"},
299
- :name => {type: "string"},
300
- :url => {type: "string", format: "uri"},
301
- :potentialAction => {type: "object"}
302
- }
303
- }.freeze
304
-
305
- BREADCRUMB_LIST = {
306
- type: "object",
307
- required: ["@type", "itemListElement"],
308
- properties: {
309
- "@type" => {const: "BreadcrumbList"},
310
- :itemListElement => {
311
- type: "array",
312
- minItems: 1,
313
- items: {
314
- type: "object",
315
- required: ["@type", "position", "name", "item"],
316
- properties: {
317
- "@type" => {const: "ListItem"},
318
- :position => {type: "integer", minimum: 1},
319
- :name => {type: "string"},
320
- :item => {type: "string", format: "uri"}
321
- }
322
- }
323
- }
324
- }
325
- }.freeze
326
-
327
- WEB_PAGE = {
328
- type: "object",
329
- required: ["@type"],
330
- properties: {
331
- "@type" => {const: "WebPage"}
332
- }
333
- }.freeze
334
-
335
9
  def initialize(schemas: {})
336
10
  @schemas = schemas.transform_keys(&:to_s).dup
337
11
  end
338
12
 
339
13
  def self.default
340
- new(
341
- schemas: {
342
- "FAQPage" => FAQ_PAGE,
343
- "Article" => ARTICLE,
344
- "NewsArticle" => ARTICLE,
345
- "BlogPosting" => ARTICLE,
346
- "Organization" => ORGANIZATION,
347
- "SoftwareApplication" => SOFTWARE_APPLICATION,
348
- "WebApplication" => WEB_APPLICATION,
349
- "HowTo" => HOW_TO,
350
- "ContactPage" => CONTACT_PAGE,
351
- "Product" => PRODUCT,
352
- "Review" => REVIEW,
353
- "WebSite" => WEBSITE,
354
- "BreadcrumbList" => BREADCRUMB_LIST,
355
- "Recipe" => RECIPE,
356
- "Event" => EVENT,
357
- "VideoObject" => VIDEO_OBJECT,
358
- "WebPage" => WEB_PAGE
359
- }
360
- )
14
+ new(schemas: Schemas.schemas)
361
15
  end
362
16
 
363
17
  def dup