crawlscope 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +7 -8
  3. data/README.md +21 -14
  4. data/lib/crawlscope/browser.rb +8 -0
  5. data/lib/crawlscope/cli.rb +15 -10
  6. data/lib/crawlscope/configuration.rb +20 -5
  7. data/lib/crawlscope/context.rb +9 -0
  8. data/lib/crawlscope/{audit.rb → crawl.rb} +68 -58
  9. data/lib/crawlscope/crawler.rb +19 -1
  10. data/lib/crawlscope/http.rb +1 -1
  11. data/lib/crawlscope/rake_tasks.rb +28 -0
  12. data/lib/crawlscope/rules/links.rb +99 -48
  13. data/lib/crawlscope/rules/metadata.rb +57 -11
  14. data/lib/crawlscope/rules/structured_data.rb +61 -1
  15. data/lib/crawlscope/run.rb +60 -0
  16. data/lib/crawlscope/schema_registry.rb +3 -349
  17. data/lib/crawlscope/schemas.rb +406 -0
  18. data/lib/crawlscope/sitemap.rb +18 -6
  19. data/lib/crawlscope/structured_data/audit.rb +7 -7
  20. data/lib/crawlscope/structured_data/check.rb +35 -0
  21. data/lib/crawlscope/structured_data/reporter.rb +69 -0
  22. data/lib/crawlscope/url.rb +14 -0
  23. data/lib/crawlscope/version.rb +1 -1
  24. data/lib/tasks/crawlscope_tasks.rake +12 -23
  25. data/test/crawlscope/browser_test.rb +155 -0
  26. data/test/crawlscope/cli_test.rb +143 -7
  27. data/test/crawlscope/configuration_test.rb +49 -0
  28. data/test/crawlscope/{audit_test.rb → crawl_test.rb} +23 -7
  29. data/test/crawlscope/crawler_test.rb +34 -0
  30. data/test/crawlscope/http_test.rb +56 -0
  31. data/test/crawlscope/links_rule_test.rb +149 -5
  32. data/test/crawlscope/metadata_rule_test.rb +77 -0
  33. data/test/crawlscope/rule_registry_test.rb +32 -0
  34. data/test/crawlscope/{task_test.rb → run_test.rb} +28 -33
  35. data/test/crawlscope/schema_registry_test.rb +19 -0
  36. data/test/crawlscope/sitemap_test.rb +55 -0
  37. data/test/crawlscope/structured_data_document_test.rb +36 -0
  38. data/test/crawlscope/structured_data_report_test.rb +3 -3
  39. data/test/crawlscope/structured_data_reporter_test.rb +2 -2
  40. data/test/crawlscope/structured_data_rule_test.rb +111 -0
  41. data/test/crawlscope/structured_data_writer_test.rb +2 -2
  42. data/test/crawlscope/url_test.rb +31 -0
  43. metadata +15 -5
  44. data/lib/crawlscope/task.rb +0 -131
@@ -2,362 +2,16 @@
2
2
 
3
3
  require "json-schema"
4
4
 
5
+ JSON::Validator.use_multi_json = false
6
+
5
7
  module Crawlscope
6
8
  class SchemaRegistry
7
- FAQ_PAGE = {
8
- "type" => "object",
9
- "required" => ["@context", "@type", "mainEntity"],
10
- "properties" => {
11
- "@context" => {"const" => "https://schema.org"},
12
- "@type" => {"const" => "FAQPage"},
13
- "mainEntity" => {
14
- "type" => "array",
15
- "minItems" => 1,
16
- "items" => {"$ref" => "#/definitions/Question"}
17
- }
18
- },
19
- "definitions" => {
20
- "Question" => {
21
- "type" => "object",
22
- "required" => ["@type", "name", "acceptedAnswer"],
23
- "properties" => {
24
- "@type" => {"const" => "Question"},
25
- "name" => {"type" => "string"},
26
- "acceptedAnswer" => {"$ref" => "#/definitions/Answer"}
27
- }
28
- },
29
- "Answer" => {
30
- "type" => "object",
31
- "required" => ["@type", "text"],
32
- "properties" => {
33
- "@type" => {"const" => "Answer"},
34
- "text" => {"type" => "string"}
35
- }
36
- }
37
- }
38
- }.freeze
39
-
40
- ARTICLE = {
41
- type: "object",
42
- required: ["@type", "headline"],
43
- properties: {
44
- "@type" => {enum: ["Article", "NewsArticle", "BlogPosting"]},
45
- :headline => {type: "string", maxLength: 110},
46
- :image => {type: "string", format: "uri"},
47
- :datePublished => {type: "string", format: "date-time"},
48
- :dateModified => {type: "string", format: "date-time"},
49
- :author => {type: "object"},
50
- :publisher => {type: "object"}
51
- }
52
- }.freeze
53
-
54
- ORGANIZATION = {
55
- type: "object",
56
- required: ["@type", "name"],
57
- properties: {
58
- "@type" => {const: "Organization"},
59
- :name => {type: "string"},
60
- :url => {type: "string", format: "uri"},
61
- :logo => {
62
- anyOf: [
63
- {type: "string", format: "uri"},
64
- {
65
- type: "object",
66
- required: ["@type", "url"],
67
- properties: {
68
- "@type" => {const: "ImageObject"},
69
- :url => {type: "string", format: "uri"}
70
- }
71
- }
72
- ]
73
- },
74
- :description => {type: "string"}
75
- }
76
- }.freeze
77
-
78
- IMAGE_OBJECT = {
79
- type: "object",
80
- required: ["@type"],
81
- properties: {
82
- "@type" => {const: "ImageObject"},
83
- :url => {type: "string", format: "uri"},
84
- :contentUrl => {type: "string", format: "uri"},
85
- :thumbnail => {type: ["string", "object"]}
86
- }
87
- }.freeze
88
-
89
- OFFER = {
90
- type: "object",
91
- additionalProperties: true,
92
- required: ["@type"],
93
- properties: {
94
- "@type" => {const: "Offer"},
95
- :name => {type: ["string", "null"]},
96
- :price => {type: ["string", "number"]},
97
- :priceCurrency => {type: ["string", "null"]},
98
- :priceSpecification => {type: ["object", "null"]},
99
- :availability => {type: "string"},
100
- :shippingDetails => {type: "object"},
101
- :hasMerchantReturnPolicy => {type: "boolean"},
102
- :merchantReturnPolicy => {type: "object"},
103
- :url => {type: "string", format: "uri"},
104
- :eligibleQuantity => {type: "object"},
105
- :additionalProperty => {type: "array", items: {type: "object"}}
106
- }
107
- }.freeze
108
-
109
- RATING = {
110
- type: "object",
111
- required: ["@type", "ratingValue"],
112
- properties: {
113
- "@type" => {const: "Rating"},
114
- :ratingValue => {type: ["string", "number"]},
115
- :bestRating => {type: ["string", "number"]},
116
- :worstRating => {type: ["string", "number"]}
117
- }
118
- }.freeze
119
-
120
- REVIEW = {
121
- type: "object",
122
- required: ["@type", "itemReviewed"],
123
- properties: {
124
- "@type" => {const: "Review"},
125
- :itemReviewed => {type: "object"},
126
- :reviewRating => RATING,
127
- :author => {type: ["object", "string"]},
128
- :datePublished => {type: "string", format: "date-time"},
129
- :reviewBody => {type: "string"}
130
- }
131
- }.freeze
132
-
133
- REVIEW_SNIPPET = {
134
- type: "object",
135
- required: ["@type", "reviewRating"],
136
- properties: {
137
- "@type" => {const: "Review"},
138
- :reviewRating => RATING,
139
- :author => {type: ["object", "string"]},
140
- :reviewBody => {type: "string"},
141
- :datePublished => {type: "string", format: "date-time"}
142
- }
143
- }.freeze
144
-
145
- AGGREGATE_RATING = {
146
- type: "object",
147
- required: ["@type"],
148
- properties: {
149
- "@type" => {const: "AggregateRating"},
150
- :ratingValue => {type: ["string", "number"]},
151
- :ratingCount => {type: "integer"},
152
- :reviewCount => {type: "integer"},
153
- :bestRating => {type: ["string", "number"]},
154
- :worstRating => {type: ["string", "number"]}
155
- }
156
- }.freeze
157
-
158
- SOFTWARE_APPLICATION = {
159
- type: "object",
160
- required: ["@type", "name"],
161
- properties: {
162
- "@type" => {const: "SoftwareApplication"},
163
- :name => {type: "string"},
164
- :applicationCategory => {type: "string"},
165
- :description => {type: "string"},
166
- :offers => {
167
- anyOf: [
168
- OFFER,
169
- {type: "array", items: OFFER}
170
- ]
171
- },
172
- :featureList => {type: ["string", "array"]},
173
- :aggregateRating => AGGREGATE_RATING,
174
- :review => REVIEW_SNIPPET
175
- }
176
- }.freeze
177
-
178
- WEB_APPLICATION = {
179
- type: "object",
180
- required: ["@type", "name"],
181
- properties: {
182
- "@type" => {const: "WebApplication"},
183
- :name => {type: "string"},
184
- :applicationCategory => {type: "string"},
185
- :description => {type: "string"},
186
- :operatingSystem => {type: "string"},
187
- :url => {type: "string", format: "uri"},
188
- :offers => {
189
- anyOf: [
190
- OFFER,
191
- {type: "array", items: OFFER}
192
- ]
193
- },
194
- :featureList => {type: ["string", "array"]},
195
- :aggregateRating => AGGREGATE_RATING,
196
- :review => REVIEW_SNIPPET
197
- }
198
- }.freeze
199
-
200
- HOW_TO = {
201
- type: "object",
202
- required: ["@type", "name", "step"],
203
- properties: {
204
- "@type" => {const: "HowTo"},
205
- :name => {type: "string"},
206
- :description => {type: "string"},
207
- :step => {
208
- type: "array",
209
- minItems: 1,
210
- items: {
211
- type: "object",
212
- required: ["@type", "name", "text"],
213
- properties: {
214
- "@type" => {const: "HowToStep"},
215
- :name => {type: "string"},
216
- :text => {type: "string"},
217
- :position => {type: "integer", minimum: 1}
218
- }
219
- }
220
- }
221
- }
222
- }.freeze
223
-
224
- CONTACT_PAGE = {
225
- type: "object",
226
- required: ["@type", "name"],
227
- properties: {
228
- "@type" => {const: "ContactPage"},
229
- :name => {type: "string"},
230
- :description => {type: "string"},
231
- :url => {type: "string", format: "uri"}
232
- }
233
- }.freeze
234
-
235
- PRODUCT = {
236
- type: "object",
237
- required: ["@type", "name"],
238
- properties: {
239
- "@type" => {const: "Product"},
240
- :name => {type: "string"},
241
- :image => {
242
- anyOf: [
243
- {type: "string", format: "uri"},
244
- IMAGE_OBJECT,
245
- {type: "array", items: {anyOf: [{type: "string", format: "uri"}, IMAGE_OBJECT]}}
246
- ]
247
- },
248
- :description => {type: "string"},
249
- :offers => {
250
- anyOf: [
251
- OFFER,
252
- {type: "array", items: OFFER}
253
- ]
254
- }
255
- }
256
- }.freeze
257
-
258
- RECIPE = {
259
- type: "object",
260
- required: ["@type", "name"],
261
- properties: {
262
- "@type" => {const: "Recipe"},
263
- :name => {type: "string"},
264
- :image => {type: ["string", "array"]},
265
- :recipeIngredient => {type: "array", items: {type: "string"}},
266
- :recipeInstructions => {type: ["string", "array"]}
267
- }
268
- }.freeze
269
-
270
- EVENT = {
271
- type: "object",
272
- required: ["@type", "name", "startDate"],
273
- properties: {
274
- "@type" => {const: "Event"},
275
- :name => {type: "string"},
276
- :startDate => {type: "string", format: "date-time"},
277
- :endDate => {type: "string", format: "date-time"},
278
- :location => {type: "object"}
279
- }
280
- }.freeze
281
-
282
- VIDEO_OBJECT = {
283
- type: "object",
284
- required: ["@type", "name", "description"],
285
- properties: {
286
- "@type" => {const: "VideoObject"},
287
- :name => {type: "string"},
288
- :description => {type: "string"},
289
- :thumbnailUrl => {type: "string", format: "uri"},
290
- :uploadDate => {type: "string", format: "date-time"}
291
- }
292
- }.freeze
293
-
294
- WEBSITE = {
295
- type: "object",
296
- required: ["@type"],
297
- properties: {
298
- "@type" => {const: "WebSite"},
299
- :name => {type: "string"},
300
- :url => {type: "string", format: "uri"},
301
- :potentialAction => {type: "object"}
302
- }
303
- }.freeze
304
-
305
- BREADCRUMB_LIST = {
306
- type: "object",
307
- required: ["@type", "itemListElement"],
308
- properties: {
309
- "@type" => {const: "BreadcrumbList"},
310
- :itemListElement => {
311
- type: "array",
312
- minItems: 1,
313
- items: {
314
- type: "object",
315
- required: ["@type", "position", "name", "item"],
316
- properties: {
317
- "@type" => {const: "ListItem"},
318
- :position => {type: "integer", minimum: 1},
319
- :name => {type: "string"},
320
- :item => {type: "string", format: "uri"}
321
- }
322
- }
323
- }
324
- }
325
- }.freeze
326
-
327
- WEB_PAGE = {
328
- type: "object",
329
- required: ["@type"],
330
- properties: {
331
- "@type" => {const: "WebPage"}
332
- }
333
- }.freeze
334
-
335
9
  def initialize(schemas: {})
336
10
  @schemas = schemas.transform_keys(&:to_s).dup
337
11
  end
338
12
 
339
13
  def self.default
340
- new(
341
- schemas: {
342
- "FAQPage" => FAQ_PAGE,
343
- "Article" => ARTICLE,
344
- "NewsArticle" => ARTICLE,
345
- "BlogPosting" => ARTICLE,
346
- "Organization" => ORGANIZATION,
347
- "SoftwareApplication" => SOFTWARE_APPLICATION,
348
- "WebApplication" => WEB_APPLICATION,
349
- "HowTo" => HOW_TO,
350
- "ContactPage" => CONTACT_PAGE,
351
- "Product" => PRODUCT,
352
- "Review" => REVIEW,
353
- "WebSite" => WEBSITE,
354
- "BreadcrumbList" => BREADCRUMB_LIST,
355
- "Recipe" => RECIPE,
356
- "Event" => EVENT,
357
- "VideoObject" => VIDEO_OBJECT,
358
- "WebPage" => WEB_PAGE
359
- }
360
- )
14
+ new(schemas: Schemas.schemas)
361
15
  end
362
16
 
363
17
  def dup