reffy 6.2.2 → 6.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "reffy",
3
- "version": "6.2.2",
3
+ "version": "6.3.0",
4
4
  "description": "W3C/WHATWG spec dependencies exploration companion. Features a short set of tools to study spec references as well as WebIDL term definitions and references found in W3C specifications.",
5
5
  "repository": {
6
6
  "type": "git",
@@ -34,7 +34,7 @@
34
34
  "abortcontroller-polyfill": "1.7.3",
35
35
  "browser-specs": "2.27.0",
36
36
  "commander": "9.0.0",
37
- "fetch-filecache-for-crawling": "4.0.2",
37
+ "fetch-filecache-for-crawling": "4.1.0",
38
38
  "puppeteer": "13.1.3",
39
39
  "semver": "^7.3.5",
40
40
  "webidl2": "24.2.0"
@@ -42,10 +42,10 @@
42
42
  "devDependencies": {
43
43
  "chai": "4.3.6",
44
44
  "mocha": "9.2.0",
45
- "nock": "13.2.2",
46
- "respec": "29.0.5",
45
+ "nock": "13.2.4",
46
+ "respec": "29.0.6",
47
47
  "respec-hljs": "2.1.1",
48
- "rollup": "2.67.0"
48
+ "rollup": "2.67.1"
49
49
  },
50
50
  "scripts": {
51
51
  "test": "mocha --recursive tests/"
package/src/lib/fetch.js CHANGED
@@ -33,7 +33,7 @@ catch (err) {
33
33
  * @return {Promise(Response)} Promise to get an HTTP response
34
34
  */
35
35
  async function fetch(url, options) {
36
- options = Object.assign({}, options);
36
+ options = Object.assign({headers: {}}, options);
37
37
  ['cacheFolder', 'resetCache', 'cacheRefresh', 'logToConsole'].forEach(param => {
38
38
  let fetchParam = (param === 'cacheRefresh') ? 'refresh' : param;
39
39
  if (config[param] && !options.hasOwnProperty(fetchParam)) {
@@ -53,4 +53,4 @@ async function fetch(url, options) {
53
53
  }
54
54
 
55
55
 
56
- module.exports = fetch;
56
+ module.exports = fetch;
@@ -98,7 +98,14 @@ nock("https://www.w3.org")
98
98
  .get("/Tools/respec/respec-w3c").replyWithFile(200,
99
99
  path.join(modulesFolder, "respec", "builds", "respec-w3c.js"),
100
100
  { "Content-Type": "application/js" })
101
- .get("/TR/idontexist/").reply(404, '');
101
+ .get("/TR/idontexist/").reply(404, '')
102
+ .get("/TR/ididnotchange/").reply(function() {
103
+ if (this.req.headers['if-modified-since'][0] === "Fri, 11 Feb 2022 00:00:42 GMT") {
104
+ return [304, ''];
105
+ } else {
106
+ return [200, 'Unexpected path'];
107
+ }
108
+ });
102
109
 
103
110
  nock("https://drafts.csswg.org")
104
111
  .persist()
@@ -117,4 +124,4 @@ nock.emitter.on('no match', function(req, options, requestBody) {
117
124
  }
118
125
  });
119
126
 
120
- module.exports = nock;
127
+ module.exports = nock;
@@ -29,6 +29,7 @@ const {
29
29
  createFolderIfNeeded
30
30
  } = require('./util');
31
31
 
32
+ const {version: reffyVersion} = require('../../package.json');
32
33
 
33
34
  /**
34
35
  * Return the spec if crawl succeeded or crawl result from given fallback list
@@ -78,10 +79,15 @@ async function crawlSpec(spec, crawlOptions) {
78
79
  path.dirname(crawlOptions.fallback) : '';
79
80
 
80
81
  if (spec.error) {
81
- return specOrFallback(spec, fallbackFolder, crawlOptions.fallbackData);
82
+ return specOrFallback(spec, fallbackFolder, crawlOptions.fallbackData?.results);
82
83
  }
83
84
 
84
85
  try {
86
+ const fallback = crawlOptions.fallbackData?.results?.find(s => s.url === spec.url);
87
+ let cacheInfo = {};
88
+ if (crawlOptions.fallbackData?.crawler === `reffy-${reffyVersion}`) {
89
+ cacheInfo = Object.assign({}, fallback?.crawlCacheInfo);
90
+ }
85
91
  const result = await processSpecification(
86
92
  spec.crawled,
87
93
  (spec, modules) => {
@@ -97,8 +103,14 @@ async function crawlSpec(spec, crawlOptions) {
97
103
  },
98
104
  [spec, crawlOptions.modules],
99
105
  { quiet: crawlOptions.quiet,
100
- forceLocalFetch: crawlOptions.forceLocalFetch }
106
+ forceLocalFetch: crawlOptions.forceLocalFetch,
107
+ ...cacheInfo}
101
108
  );
109
+ if (result.status === "notmodified" && fallback) {
110
+ crawlOptions.quiet ?? console.warn(`skipping ${spec.url}, no change`);
111
+ const copy = Object.assign({}, fallback);
112
+ return expandSpecResult(copy, fallbackFolder);
113
+ }
102
114
 
103
115
  // Specific rule for IDL extracts:
104
116
  // parse the extracted WebIdl content
@@ -169,6 +181,9 @@ async function crawlSpec(spec, crawlOptions) {
169
181
 
170
182
  // Copy results back into initial spec object
171
183
  spec.crawled = result.crawled;
184
+ if (result.crawlCacheInfo) {
185
+ spec.crawlCacheInfo = result.crawlCacheInfo;
186
+ }
172
187
  crawlOptions.modules.forEach(mod => {
173
188
  if (result[mod.property]) {
174
189
  spec[mod.property] = result[mod.property];
@@ -183,7 +198,7 @@ async function crawlSpec(spec, crawlOptions) {
183
198
  spec.error = err.toString() + (err.stack ? ' ' + err.stack : '');
184
199
  }
185
200
 
186
- return specOrFallback(spec, fallbackFolder, crawlOptions.fallbackData);
201
+ return specOrFallback(spec, fallbackFolder, crawlOptions.fallbackData?.results);
187
202
  }
188
203
 
189
204
 
@@ -351,7 +366,7 @@ async function crawlList(speclist, crawlOptions) {
351
366
  // Load fallback data if necessary
352
367
  if (crawlOptions.fallback) {
353
368
  try {
354
- crawlOptions.fallbackData = JSON.parse(await fs.promises.readFile(crawlOptions.fallback)).results;
369
+ crawlOptions.fallbackData = JSON.parse(await fs.promises.readFile(crawlOptions.fallback));
355
370
  } catch (e) {
356
371
  throw new Error(`Could not parse fallback data file ${crawlOptions.fallback}`);
357
372
  }
@@ -469,12 +484,14 @@ async function saveResults(data, settings) {
469
484
 
470
485
  // Save all results to an index.json file
471
486
  const indexFilename = path.join(settings.output, 'index.json');
487
+
472
488
  const contents = {
473
489
  type: 'crawl',
474
490
  title: 'Reffy crawl',
475
491
  date: (new Date()).toJSON(),
476
492
  options: settings,
477
493
  stats: {},
494
+ crawler: `reffy-${reffyVersion}`,
478
495
  results: data
479
496
  };
480
497
  contents.options.modules = contents.options.modules.map(mod => mod.property);
package/src/lib/util.js CHANGED
@@ -13,7 +13,6 @@ const specEquivalents = require('../specs/spec-equivalents.json');
13
13
 
14
14
  const reffyModules = require('../browserlib/reffy.json');
15
15
 
16
-
17
16
  /**
18
17
  * Maximum depth difference supported between Reffy's install path and custom
19
18
  * modules that may be provided on the command-line
@@ -22,6 +21,7 @@ const reffyModules = require('../browserlib/reffy.json');
22
21
  */
23
22
  const maxPathDepth = 20;
24
23
 
24
+ let prefetchedResponses = {};
25
25
 
26
26
  /**
27
27
  * Returns a range array from 0 to the number provided (not included)
@@ -325,7 +325,8 @@ async function teardownBrowser() {
325
325
  * flag tells the function that all network requests need to be only handled
326
326
  * by Node.js's "fetch" function (as opposed to falling back to Puppeteer's
327
327
  * network and caching logic), which is useful to keep full control of network
328
- * requests in tests.
328
+ * requests in tests. The "etag" and "lastModified" options give input
329
+ * to the conditional fetch request sent for the primary crawled URL.
329
330
  * @return {Promise} The promise to get the results of the processing function
330
331
  */
331
332
  async function processSpecification(spec, processFunction, args, options) {
@@ -409,17 +410,18 @@ async function processSpecification(spec, processFunction, args, options) {
409
410
  await cdp.send('Fetch.continueRequest', { requestId });
410
411
  return;
411
412
  }
413
+ const response = prefetchedResponses[request.url] ?? await fetch(request.url, { signal: controller.signal, headers: request.headers });
412
414
 
413
- const response = await fetch(request.url, { signal: controller.signal });
414
415
  const body = await response.buffer();
416
+
415
417
  await cdp.send('Fetch.fulfillRequest', {
416
418
  requestId,
417
419
  responseCode: response.status,
418
420
  responseHeaders: Object.keys(response.headers.raw()).map(header => {
419
- return {
420
- name: header,
421
- value: response.headers.raw()[header].join(',')
422
- };
421
+ return {
422
+ name: header,
423
+ value: response.headers.raw()[header].join(',')
424
+ };
423
425
  }),
424
426
  body: body.toString('base64')
425
427
  });
@@ -442,8 +444,8 @@ async function processSpecification(spec, processFunction, args, options) {
442
444
  await cdp.send('Fetch.failRequest', { requestId, errorReason: 'Failed' });
443
445
  }
444
446
  else {
445
- options.quiet ?? console.warn(`[warn] Fall back to regular network request for ${request.url}`, err);
446
447
  try {
448
+ options.quiet ?? console.warn(`[warn] Fall back to regular network request for ${request.url}`, err);
447
449
  await cdp.send('Fetch.continueRequest', { requestId });
448
450
  }
449
451
  catch (err) {
@@ -457,6 +459,34 @@ async function processSpecification(spec, processFunction, args, options) {
457
459
  }
458
460
 
459
461
  try {
462
+ // Fetch the spec URL directly unless it is a file:// URL
463
+ // This allows us to skip launching a browser
464
+ // if we have a fallback data source
465
+ // with a defined cache target for the spec
466
+ if (!spec.url.startsWith('file://')) {
467
+ let response;
468
+ // We set a conditional request header
469
+ // Use If-Modified-Since in preference as it is in practice
470
+ // more reliable for conditional requests
471
+ let headers = {'Accept-Encoding': 'gzip, deflate, br', 'Upgrade-Insecure-Requests': 1, 'User-Agent': browser.userAgent()};
472
+ if (options.lastModified) {
473
+ headers["If-Modified-Since"] = options.lastModified;
474
+ } else if (options.etag) {
475
+ headers["If-None-Match"] = options.etag;
476
+ }
477
+ try {
478
+ response = await fetch(spec.url, {headers});
479
+ if (response.status === 304) {
480
+ return {status: "notmodified"};
481
+ }
482
+ prefetchedResponses[spec.url] = response;
483
+ } catch (err) {
484
+ throw new Error(`Loading ${spec.url} triggered network error ${err}`);
485
+ }
486
+ if (response.status !== 200) {
487
+ throw new Error(`Loading ${spec.url} triggered HTTP status ${response.status}`);
488
+ }
489
+ }
460
490
  const page = await browser.newPage();
461
491
 
462
492
  // Disable cache if caller wants to handle all network requests
@@ -497,13 +527,27 @@ async function processSpecification(spec, processFunction, args, options) {
497
527
 
498
528
  // Load the page
499
529
  // (note HTTP status is 0 when `file://` URLs are loaded)
530
+ let cacheInfo;
500
531
  if (spec.html) {
501
532
  await page.setContent(spec.html, loadOptions);
502
533
  }
503
534
  else {
504
- const result = await page.goto(spec.url, loadOptions);
535
+ let result;
536
+ try {
537
+ result = await page.goto(spec.url, loadOptions);
538
+ } catch (err) {
539
+ throw new Error(`Loading ${spec.url} triggered network error ${err}`);
540
+ }
505
541
  if ((result.status() !== 200) && (!spec.url.startsWith('file://') || (result.status() !== 0))) {
506
- throw new Error(`Loading ${spec.url} triggered HTTP status ${result.status()}`);
542
+ throw new Error(`Loading ${spec.url} triggered HTTP status ${result.status()}`);
543
+ }
544
+ const responseHeaders = result.headers();
545
+ // Use Last-Modified in preference as it is in practice
546
+ // more reliable for conditional requests
547
+ if (responseHeaders['last-modified']) {
548
+ cacheInfo = {lastModified: responseHeaders['last-modified']};
549
+ } else if (responseHeaders.etag) {
550
+ cacheInfo = {etag: responseHeaders.etag};
507
551
  }
508
552
  }
509
553
 
@@ -613,7 +657,7 @@ async function processSpecification(spec, processFunction, args, options) {
613
657
 
614
658
  // Run the processFunction method in the browser context
615
659
  const results = await page.evaluate(processFunction, ...args);
616
-
660
+ results.crawlCacheInfo = cacheInfo;
617
661
  // Pending network requests may still be in the queue, flag the page
618
662
  // as closed not to send commands on a CDP session that's no longer
619
663
  // attached to anything