crawlscope 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +31 -0
  3. data/LICENSE.txt +21 -0
  4. data/README.md +323 -0
  5. data/exe/crawlscope +6 -0
  6. data/lib/crawlscope/audit.rb +128 -0
  7. data/lib/crawlscope/browser.rb +88 -0
  8. data/lib/crawlscope/cli.rb +245 -0
  9. data/lib/crawlscope/configuration.rb +123 -0
  10. data/lib/crawlscope/crawler.rb +28 -0
  11. data/lib/crawlscope/http.rb +77 -0
  12. data/lib/crawlscope/issue.rb +17 -0
  13. data/lib/crawlscope/issue_collection.rb +41 -0
  14. data/lib/crawlscope/page.rb +23 -0
  15. data/lib/crawlscope/railtie.rb +9 -0
  16. data/lib/crawlscope/reporter.rb +33 -0
  17. data/lib/crawlscope/result.rb +9 -0
  18. data/lib/crawlscope/rule_registry.rb +39 -0
  19. data/lib/crawlscope/rules/links.rb +220 -0
  20. data/lib/crawlscope/rules/metadata.rb +93 -0
  21. data/lib/crawlscope/rules/structured_data.rb +58 -0
  22. data/lib/crawlscope/rules/uniqueness.rb +88 -0
  23. data/lib/crawlscope/schema_registry.rb +431 -0
  24. data/lib/crawlscope/sitemap.rb +67 -0
  25. data/lib/crawlscope/structured_data/audit.rb +150 -0
  26. data/lib/crawlscope/structured_data/document.rb +93 -0
  27. data/lib/crawlscope/structured_data/report.rb +77 -0
  28. data/lib/crawlscope/structured_data/reporter.rb +73 -0
  29. data/lib/crawlscope/structured_data/writer.rb +26 -0
  30. data/lib/crawlscope/task.rb +131 -0
  31. data/lib/crawlscope/url.rb +43 -0
  32. data/lib/crawlscope/version.rb +5 -0
  33. data/lib/crawlscope.rb +34 -0
  34. data/lib/tasks/crawlscope_tasks.rake +44 -0
  35. data/test/crawlscope/audit_test.rb +165 -0
  36. data/test/crawlscope/cli_test.rb +157 -0
  37. data/test/crawlscope/configuration_test.rb +45 -0
  38. data/test/crawlscope/links_rule_test.rb +87 -0
  39. data/test/crawlscope/loader_test.rb +11 -0
  40. data/test/crawlscope/reporter_test.rb +50 -0
  41. data/test/crawlscope/schema_registry_test.rb +89 -0
  42. data/test/crawlscope/sitemap_test.rb +51 -0
  43. data/test/crawlscope/structured_data_audit_test.rb +118 -0
  44. data/test/crawlscope/structured_data_document_test.rb +28 -0
  45. data/test/crawlscope/structured_data_report_test.rb +37 -0
  46. data/test/crawlscope/structured_data_reporter_test.rb +32 -0
  47. data/test/crawlscope/structured_data_rule_test.rb +78 -0
  48. data/test/crawlscope/structured_data_writer_test.rb +32 -0
  49. data/test/crawlscope/task_test.rb +206 -0
  50. data/test/crawlscope/uniqueness_rule_test.rb +46 -0
  51. data/test/test_helper.rb +23 -0
  52. metadata +271 -0
@@ -0,0 +1,431 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json-schema"
4
+
5
+ module Crawlscope
6
+ class SchemaRegistry
7
# JSON Schema for a schema.org FAQPage node: a non-empty "mainEntity" array
# of Question nodes, each carrying a name and an acceptedAnswer (Answer with text).
FAQ_PAGE = {
  "type" => "object",
  # "@context" is deliberately NOT required: #validate checks every "@graph"
  # node on its own, and graph nodes inherit the context from the enclosing
  # document instead of repeating it. Requiring it here reported every
  # FAQPage nested in "@graph" as invalid.
  "required" => ["@type", "mainEntity"],
  "properties" => {
    # When a context is present it must be a schema.org IRI; the http/https
    # and trailing-slash spellings are all in common use.
    "@context" => {
      "enum" => [
        "https://schema.org",
        "https://schema.org/",
        "http://schema.org",
        "http://schema.org/"
      ]
    },
    "@type" => {"const" => "FAQPage"},
    "mainEntity" => {
      "type" => "array",
      "minItems" => 1,
      "items" => {"$ref" => "#/definitions/Question"}
    }
  },
  "definitions" => {
    "Question" => {
      "type" => "object",
      "required" => ["@type", "name", "acceptedAnswer"],
      "properties" => {
        "@type" => {"const" => "Question"},
        "name" => {"type" => "string"},
        "acceptedAnswer" => {"$ref" => "#/definitions/Answer"}
      }
    },
    "Answer" => {
      "type" => "object",
      "required" => ["@type", "text"],
      "properties" => {
        "@type" => {"const" => "Answer"},
        "text" => {"type" => "string"}
      }
    }
  }
}.freeze
39
+
40
+ ARTICLE = {
41
+ type: "object",
42
+ required: ["@type", "headline"],
43
+ properties: {
44
+ "@type" => {enum: ["Article", "NewsArticle", "BlogPosting"]},
45
+ :headline => {type: "string", maxLength: 110},
46
+ :image => {type: "string", format: "uri"},
47
+ :datePublished => {type: "string", format: "date-time"},
48
+ :dateModified => {type: "string", format: "date-time"},
49
+ :author => {type: "object"},
50
+ :publisher => {type: "object"}
51
+ }
52
+ }.freeze
53
+
54
+ ORGANIZATION = {
55
+ type: "object",
56
+ required: ["@type", "name"],
57
+ properties: {
58
+ "@type" => {const: "Organization"},
59
+ :name => {type: "string"},
60
+ :url => {type: "string", format: "uri"},
61
+ :logo => {
62
+ anyOf: [
63
+ {type: "string", format: "uri"},
64
+ {
65
+ type: "object",
66
+ required: ["@type", "url"],
67
+ properties: {
68
+ "@type" => {const: "ImageObject"},
69
+ :url => {type: "string", format: "uri"}
70
+ }
71
+ }
72
+ ]
73
+ },
74
+ :description => {type: "string"}
75
+ }
76
+ }.freeze
77
+
78
+ IMAGE_OBJECT = {
79
+ type: "object",
80
+ required: ["@type"],
81
+ properties: {
82
+ "@type" => {const: "ImageObject"},
83
+ :url => {type: "string", format: "uri"},
84
+ :contentUrl => {type: "string", format: "uri"},
85
+ :thumbnail => {type: ["string", "object"]}
86
+ }
87
+ }.freeze
88
+
89
# JSON Schema fragment for a schema.org Offer node. Only "@type" is required;
# unknown properties are allowed (additionalProperties: true).
OFFER = {
  type: "object",
  additionalProperties: true,
  required: ["@type"],
  properties: {
    "@type" => {const: "Offer"},
    :name => {type: ["string", "null"]},
    :price => {type: ["string", "number"]},
    :priceCurrency => {type: ["string", "null"]},
    :priceSpecification => {type: ["object", "null"]},
    :availability => {type: "string"},
    :shippingDetails => {type: "object"},
    # schema.org defines hasMerchantReturnPolicy as one or more
    # MerchantReturnPolicy objects, so objects and arrays must validate.
    # "boolean" stays in the list so markup accepted before still passes.
    :hasMerchantReturnPolicy => {type: ["object", "array", "boolean"]},
    :merchantReturnPolicy => {type: "object"},
    :url => {type: "string", format: "uri"},
    :eligibleQuantity => {type: "object"},
    :additionalProperty => {type: "array", items: {type: "object"}}
  }
}.freeze
108
+
109
+ RATING = {
110
+ type: "object",
111
+ required: ["@type", "ratingValue"],
112
+ properties: {
113
+ "@type" => {const: "Rating"},
114
+ :ratingValue => {type: ["string", "number"]},
115
+ :bestRating => {type: ["string", "number"]},
116
+ :worstRating => {type: ["string", "number"]}
117
+ }
118
+ }.freeze
119
+
120
+ REVIEW = {
121
+ type: "object",
122
+ required: ["@type", "itemReviewed"],
123
+ properties: {
124
+ "@type" => {const: "Review"},
125
+ :itemReviewed => {type: "object"},
126
+ :reviewRating => RATING,
127
+ :author => {type: ["object", "string"]},
128
+ :datePublished => {type: "string", format: "date-time"},
129
+ :reviewBody => {type: "string"}
130
+ }
131
+ }.freeze
132
+
133
+ REVIEW_SNIPPET = {
134
+ type: "object",
135
+ required: ["@type", "reviewRating"],
136
+ properties: {
137
+ "@type" => {const: "Review"},
138
+ :reviewRating => RATING,
139
+ :author => {type: ["object", "string"]},
140
+ :reviewBody => {type: "string"},
141
+ :datePublished => {type: "string", format: "date-time"}
142
+ }
143
+ }.freeze
144
+
145
+ AGGREGATE_RATING = {
146
+ type: "object",
147
+ required: ["@type"],
148
+ properties: {
149
+ "@type" => {const: "AggregateRating"},
150
+ :ratingValue => {type: ["string", "number"]},
151
+ :ratingCount => {type: "integer"},
152
+ :reviewCount => {type: "integer"},
153
+ :bestRating => {type: ["string", "number"]},
154
+ :worstRating => {type: ["string", "number"]}
155
+ }
156
+ }.freeze
157
+
158
+ SOFTWARE_APPLICATION = {
159
+ type: "object",
160
+ required: ["@type", "name"],
161
+ properties: {
162
+ "@type" => {const: "SoftwareApplication"},
163
+ :name => {type: "string"},
164
+ :applicationCategory => {type: "string"},
165
+ :description => {type: "string"},
166
+ :offers => {
167
+ anyOf: [
168
+ OFFER,
169
+ {type: "array", items: OFFER}
170
+ ]
171
+ },
172
+ :featureList => {type: ["string", "array"]},
173
+ :aggregateRating => AGGREGATE_RATING,
174
+ :review => REVIEW_SNIPPET
175
+ }
176
+ }.freeze
177
+
178
+ WEB_APPLICATION = {
179
+ type: "object",
180
+ required: ["@type", "name"],
181
+ properties: {
182
+ "@type" => {const: "WebApplication"},
183
+ :name => {type: "string"},
184
+ :applicationCategory => {type: "string"},
185
+ :description => {type: "string"},
186
+ :operatingSystem => {type: "string"},
187
+ :url => {type: "string", format: "uri"},
188
+ :offers => {
189
+ anyOf: [
190
+ OFFER,
191
+ {type: "array", items: OFFER}
192
+ ]
193
+ },
194
+ :featureList => {type: ["string", "array"]},
195
+ :aggregateRating => AGGREGATE_RATING,
196
+ :review => REVIEW_SNIPPET
197
+ }
198
+ }.freeze
199
+
200
+ HOW_TO = {
201
+ type: "object",
202
+ required: ["@type", "name", "step"],
203
+ properties: {
204
+ "@type" => {const: "HowTo"},
205
+ :name => {type: "string"},
206
+ :description => {type: "string"},
207
+ :step => {
208
+ type: "array",
209
+ minItems: 1,
210
+ items: {
211
+ type: "object",
212
+ required: ["@type", "name", "text"],
213
+ properties: {
214
+ "@type" => {const: "HowToStep"},
215
+ :name => {type: "string"},
216
+ :text => {type: "string"},
217
+ :position => {type: "integer", minimum: 1}
218
+ }
219
+ }
220
+ }
221
+ }
222
+ }.freeze
223
+
224
+ CONTACT_PAGE = {
225
+ type: "object",
226
+ required: ["@type", "name"],
227
+ properties: {
228
+ "@type" => {const: "ContactPage"},
229
+ :name => {type: "string"},
230
+ :description => {type: "string"},
231
+ :url => {type: "string", format: "uri"}
232
+ }
233
+ }.freeze
234
+
235
+ PRODUCT = {
236
+ type: "object",
237
+ required: ["@type", "name"],
238
+ properties: {
239
+ "@type" => {const: "Product"},
240
+ :name => {type: "string"},
241
+ :image => {
242
+ anyOf: [
243
+ {type: "string", format: "uri"},
244
+ IMAGE_OBJECT,
245
+ {type: "array", items: {anyOf: [{type: "string", format: "uri"}, IMAGE_OBJECT]}}
246
+ ]
247
+ },
248
+ :description => {type: "string"},
249
+ :offers => {
250
+ anyOf: [
251
+ OFFER,
252
+ {type: "array", items: OFFER}
253
+ ]
254
+ }
255
+ }
256
+ }.freeze
257
+
258
+ RECIPE = {
259
+ type: "object",
260
+ required: ["@type", "name"],
261
+ properties: {
262
+ "@type" => {const: "Recipe"},
263
+ :name => {type: "string"},
264
+ :image => {type: ["string", "array"]},
265
+ :recipeIngredient => {type: "array", items: {type: "string"}},
266
+ :recipeInstructions => {type: ["string", "array"]}
267
+ }
268
+ }.freeze
269
+
270
+ EVENT = {
271
+ type: "object",
272
+ required: ["@type", "name", "startDate"],
273
+ properties: {
274
+ "@type" => {const: "Event"},
275
+ :name => {type: "string"},
276
+ :startDate => {type: "string", format: "date-time"},
277
+ :endDate => {type: "string", format: "date-time"},
278
+ :location => {type: "object"}
279
+ }
280
+ }.freeze
281
+
282
+ VIDEO_OBJECT = {
283
+ type: "object",
284
+ required: ["@type", "name", "description"],
285
+ properties: {
286
+ "@type" => {const: "VideoObject"},
287
+ :name => {type: "string"},
288
+ :description => {type: "string"},
289
+ :thumbnailUrl => {type: "string", format: "uri"},
290
+ :uploadDate => {type: "string", format: "date-time"}
291
+ }
292
+ }.freeze
293
+
294
+ WEBSITE = {
295
+ type: "object",
296
+ required: ["@type"],
297
+ properties: {
298
+ "@type" => {const: "WebSite"},
299
+ :name => {type: "string"},
300
+ :url => {type: "string", format: "uri"},
301
+ :potentialAction => {type: "object"}
302
+ }
303
+ }.freeze
304
+
305
+ BREADCRUMB_LIST = {
306
+ type: "object",
307
+ required: ["@type", "itemListElement"],
308
+ properties: {
309
+ "@type" => {const: "BreadcrumbList"},
310
+ :itemListElement => {
311
+ type: "array",
312
+ minItems: 1,
313
+ items: {
314
+ type: "object",
315
+ required: ["@type", "position", "name", "item"],
316
+ properties: {
317
+ "@type" => {const: "ListItem"},
318
+ :position => {type: "integer", minimum: 1},
319
+ :name => {type: "string"},
320
+ :item => {type: "string", format: "uri"}
321
+ }
322
+ }
323
+ }
324
+ }
325
+ }.freeze
326
+
327
+ WEB_PAGE = {
328
+ type: "object",
329
+ required: ["@type"],
330
+ properties: {
331
+ "@type" => {const: "WebPage"}
332
+ }
333
+ }.freeze
334
+
335
# Builds a registry from a type => JSON-schema mapping.
#
# @param schemas [Hash] schemas keyed by type name; keys are stored as
#   strings, and the caller's hash itself is never retained
def initialize(schemas: {})
  @schemas = schemas.each_with_object({}) do |(type, schema), table|
    table[type.to_s] = schema
  end
end
338
+
339
# Returns a new registry preloaded with the built-in schemas.
# Article, NewsArticle and BlogPosting all share the ARTICLE schema.
#
# @return [SchemaRegistry]
def self.default
  article_like = %w[Article NewsArticle BlogPosting].to_h { |type| [type, ARTICLE] }

  builtin = {"FAQPage" => FAQ_PAGE}
    .merge(article_like)
    .merge(
      "Organization" => ORGANIZATION,
      "SoftwareApplication" => SOFTWARE_APPLICATION,
      "WebApplication" => WEB_APPLICATION,
      "HowTo" => HOW_TO,
      "ContactPage" => CONTACT_PAGE,
      "Product" => PRODUCT,
      "Review" => REVIEW,
      "WebSite" => WEBSITE,
      "BreadcrumbList" => BREADCRUMB_LIST,
      "Recipe" => RECIPE,
      "Event" => EVENT,
      "VideoObject" => VIDEO_OBJECT,
      "WebPage" => WEB_PAGE
    )

  new(schemas: builtin)
end
362
+
363
# Returns a new registry of the same class whose schema table is a recursive
# copy (hashes and arrays are rebuilt; other values are shared).
#
# @return [SchemaRegistry]
def dup
  copied_schemas = deep_copy(@schemas)
  self.class.new(schemas: copied_schemas)
end
366
+
367
# Looks up the schema registered for +type+.
#
# @param type [#to_s] type name
# @return [Object] the registered schema
# @raise [KeyError] when no schema is registered under that name
def fetch(type)
  key = type.to_s
  @schemas.fetch(key)
end
370
+
371
# Adds or replaces the schema stored for +type+.
#
# @param type [#to_s] type name; stored as a string
# @param schema [Object] schema to associate with the type
# @return [self] to allow chaining
def register(type, schema)
  @schemas.store(type.to_s, schema)
  self
end
375
+
376
# Tells whether a schema is stored under +type+ (compared as a string).
#
# @param type [#to_s]
# @return [Boolean]
def registered?(type)
  @schemas.include?(type.to_s)
end
379
+
380
# Validates a structured-data item against the schema registered for its
# "@type" and returns a flat list of problems.
#
# * Arrays are validated element by element.
# * A Hash with an Array under "@graph" has every graph node validated too.
# * Items that are not hashes, have no "@type", or whose type has no
#   registered schema contribute no errors of their own.
#
# @param item [Hash, Array, Object] parsed structured-data value
# @return [Array<Hash>] entries of the form {field:, issue:, type:}
def validate(item)
  return item.flat_map { |entry| validate(entry) } if item.is_a?(Array)
  return [] unless item.is_a?(Hash)

  errors = []

  graph = item["@graph"]
  errors.concat(graph.flat_map { |node| validate(node) }) if graph.is_a?(Array)

  type = item["@type"]
  return errors if type.nil?

  schema = @schemas[type.to_s]
  return errors if schema.nil?

  begin
    JSON::Validator.fully_validate(schema, item, errors_as_objects: true).each do |error|
      errors << {
        field: error[:fragment].to_s.sub("#/", ""),
        issue: error[:message],
        type: type
      }
    end
  rescue JSON::Schema::ValidationError => error
    # Append instead of replacing the result, so errors already gathered
    # from "@graph" nodes are not lost when this node's validation raises.
    errors << {field: "unknown", issue: error.message, type: type}
  end

  errors
end
411
+
412
# Returns a shallow copy of the type => schema table; mutating the returned
# hash does not change the registry, but the schema values are shared.
#
# @return [Hash{String => Object}]
def to_h
  {}.merge(@schemas)
end
415
+
416
+ private
417
+
418
# Recursively rebuilds hashes and arrays so the copy shares no containers
# with +value+. Keys and non-container values are reused as-is.
#
# @param value [Object]
# @return [Object] a fresh Hash/Array tree, or +value+ itself for anything else
def deep_copy(value)
  return value.map { |entry| deep_copy(entry) } if value.is_a?(Array)
  return value unless value.is_a?(Hash)

  value.to_h { |key, entry| [key, deep_copy(entry)] }
end
430
+ end
431
+ end
@@ -0,0 +1,67 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "faraday"
4
+ require "faraday/follow_redirects"
5
+ require "nokogiri"
6
+ require "uri"
7
+
8
+ module Crawlscope
9
+ class Sitemap
10
+ SITEMAP_NAMESPACE = {"xmlns" => "http://www.sitemaps.org/schemas/sitemap/0.9"}.freeze
11
+
12
# @param path [String] location of the root sitemap; later read either over
#   HTTP or from disk depending on what Url.remote? reports for it
def initialize(path:)
  @path = path
end
15
+
16
# Collects every page URL reachable from the root sitemap, with duplicates
# removed (first occurrence wins).
#
# @param base_url [String] passed through to URL normalization
# @return [Array] de-duplicated URLs
def urls(base_url:)
  seen_sources = Set.new
  collected = collect_urls(@path, base_url: base_url, visited: seen_sources)
  collected.uniq
end
19
+
20
+ private
21
+
22
# Reads one sitemap document and returns the page URLs it yields.
#
# A document whose root element is "sitemapindex" is treated as an index:
# each child <loc> is resolved relative to +source+ and collected
# recursively. Any other document is treated as a URL set and its <loc>
# values are normalized against +base_url+.
#
# @param source [String] sitemap location
# @param base_url [String] base used by Url.normalize
# @param visited [Set] sources already read; guards against index cycles
# @return [Array] URLs in document order (may contain duplicates)
def collect_urls(source, base_url:, visited:)
  return [] if visited.include?(source)

  visited.add(source)
  document = Nokogiri::XML(read(source))

  unless document.root&.name == "sitemapindex"
    page_locations = document.xpath("//xmlns:url/xmlns:loc", SITEMAP_NAMESPACE)
    return page_locations.map do |node|
      Url.normalize(node.text.to_s.strip, base_url: base_url)
    end
  end

  child_locations = document.xpath("//xmlns:sitemap/xmlns:loc", SITEMAP_NAMESPACE)
  child_locations.flat_map do |node|
    child_source = resolve_child_source(source, node.text.to_s.strip)
    collect_urls(child_source, base_url: base_url, visited: visited)
  end
end
40
+
41
# Returns the raw sitemap body: fetched through the shared connection when
# Url.remote? says the source is remote, otherwise read from the filesystem.
#
# @param source [String]
# @return [String] document body
def read(source)
  return File.read(source) unless Url.remote?(source)

  connection.get(source).body
end
48
+
49
# Turns a <loc> value found in a sitemap index into something #read can open.
#
# @param parent_source [String] location of the index that listed the child
# @param child_loc [String] the child's <loc> text
# @return [String] +child_loc+ unchanged when it is already remote; otherwise
#   a URL joined onto a remote parent, or a path expanded against a local
#   parent's directory
def resolve_child_source(parent_source, child_loc)
  if Url.remote?(child_loc)
    child_loc
  elsif Url.remote?(parent_source)
    URI.join(parent_source, child_loc).to_s
  else
    parent_dir = File.dirname(parent_source)
    File.expand_path(child_loc, parent_dir)
  end
end
58
+
59
# Lazily builds and caches the Faraday connection used for remote sitemaps.
# Redirects are followed up to Http::MAX_REDIRECTS, and both the overall and
# the open timeout are set to 20.
def connection
  return @connection if @connection

  @connection = Faraday.new do |builder|
    builder.response :follow_redirects, limit: Http::MAX_REDIRECTS
    builder.options.timeout = 20
    builder.options.open_timeout = 20
  end
end
66
+ end
67
+ end
@@ -0,0 +1,150 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Crawlscope
4
+ module StructuredData
5
+ class Audit
6
# Immutable per-URL audit record: what was fetched, which structured-data
# items were extracted, and which problems were found.
Entry = Data.define(:url, :status, :structured_items, :errors, :fetch_error, :content_type, :skipped_reason) do
  # @return [Integer] number of items whose :source is "json-ld"
  def json_ld_count
    structured_items.select { |item| item[:source] == "json-ld" }.size
  end

  # @return [Integer] number of items whose :source is "microdata"
  def microdata_count
    structured_items.select { |item| item[:source] == "microdata" }.size
  end

  # @return [Boolean] true when the fetch succeeded and no errors were recorded
  def ok?
    return false unless fetch_error.nil?

    errors.empty?
  end

  # Pages skipped as "non-html" count as found, so they are not reported as
  # missing structured data.
  #
  # @return [Boolean]
  def structured_data_found?
    return true if skipped_reason == "non-html"

    structured_items.any?
  end
end
23
+
24
# Immutable outcome of one audit run: the list of per-URL entries.
Result = Data.define(:entries) do
  # @return [Boolean] true when every entry reports ok? (also true for no entries)
  def ok?
    entries.all? { |entry| entry.ok? }
  end
end
29
+
30
# @param schema_registry [#validate] validates extracted structured data
# @param renderer [#to_sym] :browser selects browser rendering; any other
#   value selects the plain HTTP fetcher
# @param timeout_seconds [Numeric] handed to whichever fetcher is built
# @param browser_factory [#call, nil] optional callable returning a
#   ready-made browser fetcher
# @param network_idle_timeout_seconds [Numeric] handed to the browser fetcher
# @param scroll_page [Boolean] handed to the browser fetcher
def initialize(
  schema_registry:,
  renderer:,
  timeout_seconds:,
  browser_factory: nil,
  network_idle_timeout_seconds: Configuration::DEFAULT_BROWSER_NETWORK_IDLE_TIMEOUT_SECONDS,
  scroll_page: Configuration::DEFAULT_BROWSER_SCROLL_PAGE
)
  @schema_registry = schema_registry
  @renderer = renderer.to_sym
  @timeout_seconds = timeout_seconds
  @browser_factory = browser_factory
  @network_idle_timeout_seconds = network_idle_timeout_seconds
  @scroll_page = scroll_page
end
38
+
39
# Audits every URL with a single shared fetcher and wraps the per-URL
# entries in a Result. The fetcher is closed even when auditing raises.
#
# @param urls [Array] URLs to audit
# @return [Result]
def call(urls:)
  fetcher = build_fetcher(urls)
  Result.new(entries: urls.map { |url| validate_url(url, fetcher) })
ensure
  fetcher&.close
end
47
+
48
+ private
49
+
50
# Returns a browser-backed fetcher: the injected factory's result when a
# factory was given, otherwise a new Crawlscope::Browser configured from
# this audit's settings.
#
# @param base_url [String]
# @raise [ConfigurationError] when building the browser raises LoadError
def build_browser(base_url)
  return @browser_factory.call if @browser_factory

  Crawlscope::Browser.new(
    base_url: base_url,
    timeout_seconds: @timeout_seconds,
    network_idle_timeout_seconds: @network_idle_timeout_seconds,
    scroll_page: @scroll_page
  )
rescue LoadError => error
  raise ConfigurationError, "Browser rendering requires the ferrum gem (#{error.message})"
end
66
+
67
# Chooses the fetcher for this run. The first URL (or
# "http://localhost:3000" when there is none) is used as the base URL.
#
# @param urls [Array] URLs about to be audited
# @return [Object] a browser fetcher when the renderer is :browser,
#   otherwise a Crawlscope::Http instance
def build_fetcher(urls)
  base_url = urls.first.to_s
  base_url = "http://localhost:3000" if base_url.empty?

  return build_browser(base_url) if @renderer == :browser

  Crawlscope::Http.new(base_url: base_url, timeout_seconds: @timeout_seconds)
end
77
+
78
# Extracts structured-data items from the page body and validates each one.
#
# Items whose data is a Hash with a truthy :error key are reported as parse
# failures using the item's :message; every other item is run through the
# schema registry and reported only when it produced errors.
#
# @param page [#body] fetched page
# @return [Array(Array<Hash>, Array<Hash>)] the extracted items as
#   {data:, source:} hashes, and the error groups as {errors:, source:, type:}
def build_validation_errors(page)
  structured_items = Document.new(html: page.body).items.map do |item|
    {data: item.data, source: item.source}
  end

  errors = structured_items.filter_map do |item|
    data = item[:data]
    source = item[:source]

    if data.is_a?(Hash) && data[:error]
      {
        errors: [{field: "parse", issue: data[:message]}],
        source: source,
        type: source
      }
    else
      schema_errors = @schema_registry.validate(data)
      next if schema_errors.empty?

      # Only a Hash can be asked for "@type". The registry also accepts
      # Array data, and indexing an Array with a String raises TypeError,
      # so non-hash data falls back to the source label.
      declared_type = data.is_a?(Hash) ? data["@type"] : nil

      {
        errors: schema_errors,
        source: source,
        type: declared_type || source
      }
    end
  end

  [structured_items, errors]
end
108
+
109
# Fetches one URL and turns the response into an Entry.
#
# Outcomes, checked in this order:
# 1. the page reports an error          -> fetch_error is that error
# 2. a status outside 200..299          -> fetch_error "Non-success status"
# 3. a non-empty, non-HTML content type -> skipped_reason "non-html"
# 4. anything else                      -> structured data is extracted and validated
#
# @param url [String]
# @param fetcher [#fetch] returns a page responding to headers, status,
#   error and body
# @return [Entry]
def validate_url(url, fetcher)
  page = fetcher.fetch(url)
  # NOTE(review): tolerates a page without headers; whether fetchers ever
  # return one is not visible from this file.
  content_type = (page.headers || {})["content-type"].to_s

  # Fields shared by every outcome; each branch overrides what differs.
  attributes = {
    url: url,
    status: page.status,
    structured_items: [],
    errors: [],
    fetch_error: nil,
    content_type: content_type,
    skipped_reason: nil
  }

  if page.error
    Entry.new(**attributes.merge(fetch_error: page.error))
  elsif page.status && !(200..299).cover?(page.status.to_i)
    Entry.new(**attributes.merge(fetch_error: "Non-success status"))
  elsif !content_type.empty? && !content_type.downcase.include?("text/html")
    # Media types are case-insensitive, so "Text/HTML" must count as HTML.
    Entry.new(**attributes.merge(skipped_reason: "non-html"))
  else
    structured_items, errors = build_validation_errors(page)
    Entry.new(**attributes.merge(structured_items: structured_items, errors: errors))
  end
end
148
+ end
149
+ end
150
+ end